Move sub pixel variance to vpx_dsp
Change-Id: I66bf6720c396c89aa2d1fd26d5d52bf5d5e3dff1
Parent: 155b9416b3
Commit: 6a82f0d7fb
File diff suppressed because it is too large (load diff to view)
File diff suppressed because it is too large (load diff to view)

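Background for the removed files below: the xoffset and yoffset arguments are eighth-pel offsets, and each offset selects a pair of 2-tap bilinear filter weights that sum to 128 (VP8_FILTER_WEIGHT). Offset 4 is the half-pel case with equal weights, which is why several wrappers in this commit special-case (4,0), (0,4) and (4,4). A minimal sketch of that weight table, for orientation only (the canonical table is vp8_bilinear_filters in vp8/common/filter.c; an SSE2 copy of the same values appears further down in this diff):

/* 2-tap bilinear weights indexed by eighth-pel offset; each pair sums to 128. */
static const short bilinear_taps[8][2] = {
    { 128,   0 }, { 112,  16 }, {  96,  32 }, {  80,  48 },
    {  64,  64 }, {  48,  80 }, {  32,  96 }, {  16, 112 }
};
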
@@ -1,137 +0,0 @@
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_config.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/variance.h"
#include "vp8/common/filter.h"

// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
#if CONFIG_VP8_ENCODER

#if HAVE_MEDIA
#include "vp8/common/arm/bilinearfilter_arm.h"

unsigned int vp8_sub_pixel_variance8x8_armv6
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    unsigned short first_pass[10*8];
    unsigned char second_pass[8*8];
    const short *HFilter, *VFilter;

    HFilter = vp8_bilinear_filters[xoffset];
    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
                                            src_pixels_per_line,
                                            9, 8, HFilter);
    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                             8, 8, 8, VFilter);

    return vpx_variance8x8_media(second_pass, 8, dst_ptr,
                                 dst_pixels_per_line, sse);
}

unsigned int vp8_sub_pixel_variance16x16_armv6
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    unsigned short first_pass[36*16];
    unsigned char second_pass[20*16];
    const short *HFilter, *VFilter;
    unsigned int var;

    if (xoffset == 4 && yoffset == 0)
    {
        var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
                                                   dst_ptr, dst_pixels_per_line, sse);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
                                                   dst_ptr, dst_pixels_per_line, sse);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
                                                    dst_ptr, dst_pixels_per_line, sse);
    }
    else
    {
        HFilter = vp8_bilinear_filters[xoffset];
        VFilter = vp8_bilinear_filters[yoffset];

        vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
                                                src_pixels_per_line,
                                                17, 16, HFilter);
        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                                 16, 16, 16, VFilter);

        var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
                                      dst_pixels_per_line, sse);
    }
    return var;
}

#endif  // HAVE_MEDIA


#if HAVE_NEON

extern unsigned int vp8_sub_pixel_variance16x16_neon_func
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
);

unsigned int vp8_sub_pixel_variance16x16_neon
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    if (xoffset == 4 && yoffset == 0)
        return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
    else if (xoffset == 0 && yoffset == 4)
        return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
    else if (xoffset == 4 && yoffset == 4)
        return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
    else
        return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
}

#endif  // HAVE_NEON
#endif  // CONFIG_VP8_ENCODER

@@ -20,7 +20,7 @@
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/postproc.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"

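The next hunk removes the VP8-local sub-pixel variance prototypes from the rtcd definitions. For orientation only, an add_proto/specialize pair like the ones removed below is expanded by the rtcd script into per-ISA declarations plus a runtime-selected function pointer; the following is a hand-written approximation, not the literal generated vp8_rtcd.h:

/* Approximate shape of the dispatch generated from one removed stanza;
 * the _c and _wmt suffixes are the C and SSE2 implementations. */
unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr,
                                         int source_stride, int xoffset,
                                         int yoffset,
                                         const unsigned char *ref_ptr,
                                         int Refstride, unsigned int *sse);
unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr,
                                           int source_stride, int xoffset,
                                           int yoffset,
                                           const unsigned char *ref_ptr,
                                           int Refstride, unsigned int *sse);
RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(
    const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset,
    const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
/* setup_rtcd_internal() then points the pointer at the best implementation
 * the running CPU supports. */
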
@@ -237,47 +237,6 @@ add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch,
specialize qw/vp8_bilinear_predict4x4 mmx media/;
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;

#
# Sub-pixel Variance
#
add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/;
$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt;

add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media/;
$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt;
$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6;

add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/;
$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt;

add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/;
$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt;

add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon_asm/;
$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt;
$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6;
$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;

add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;

add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;

add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;

#
# Encoder functions below this point.
#

@@ -1,92 +0,0 @@
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#ifndef VP8_COMMON_VARIANCE_H_
#define VP8_COMMON_VARIANCE_H_

#include "vpx_config.h"

#include "vpx/vpx_integer.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int(*vpx_sad_fn_t)(
    const uint8_t *src_ptr,
    int source_stride,
    const uint8_t *ref_ptr,
    int ref_stride);

typedef void (*vp8_copy32xn_fn_t)(
    const unsigned char *src_ptr,
    int source_stride,
    unsigned char *ref_ptr,
    int ref_stride,
    int n);

typedef void (*vpx_sad_multi_fn_t)(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_array,
    int ref_stride,
    unsigned int *sad_array);

typedef void (*vpx_sad_multi_d_fn_t)
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char * const ref_array[],
    int ref_stride,
    unsigned int *sad_array
);

typedef unsigned int (*vpx_variance_fn_t)
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int ref_stride,
    unsigned int *sse
);

typedef unsigned int (*vp8_subpixvariance_fn_t)
(
    const unsigned char *src_ptr,
    int source_stride,
    int xoffset,
    int yoffset,
    const unsigned char *ref_ptr,
    int Refstride,
    unsigned int *sse
);

typedef struct variance_vtable
{
    vpx_sad_fn_t            sdf;
    vpx_variance_fn_t       vf;
    vp8_subpixvariance_fn_t svf;
    vpx_variance_fn_t       svf_halfpix_h;
    vpx_variance_fn_t       svf_halfpix_v;
    vpx_variance_fn_t       svf_halfpix_hv;
    vpx_sad_multi_fn_t      sdx3f;
    vpx_sad_multi_fn_t      sdx8f;
    vpx_sad_multi_d_fn_t    sdx4df;
#if ARCH_X86 || ARCH_X86_64
    vp8_copy32xn_fn_t       copymem;
#endif
} vp8_variance_fn_ptr_t;

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VP8_COMMON_VARIANCE_H_

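The struct removed above is the per-block-size function table the VP8 encoder consults during motion search. A minimal usage sketch, illustrative only (the table is filled elsewhere in the encoder; check_candidate is a made-up helper name):

/* Illustrative: evaluate a half-pel-right candidate through the vtable. */
static unsigned int check_candidate(const vp8_variance_fn_ptr_t *fn,
                                    const unsigned char *ref, int ref_stride,
                                    const unsigned char *src, int src_stride) {
    unsigned int sse;
    return fn->svf(ref, ref_stride, 4 /* xoffset */, 0 /* yoffset */,
                   src, src_stride, &sse);
}
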
@@ -1,337 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp8_rtcd.h"
|
||||
#include "filter.h"
|
||||
#include "variance.h"
|
||||
|
||||
/* This is a bad idea.
|
||||
* ctz = count trailing zeros */
|
||||
static int ctz(int a) {
|
||||
int b = 0;
|
||||
while (a != 1) {
|
||||
a >>= 1;
|
||||
b++;
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
static unsigned int variance(
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
const unsigned char *ref_ptr,
|
||||
int recon_stride,
|
||||
int w,
|
||||
int h,
|
||||
unsigned int *sse)
|
||||
{
|
||||
int i, j;
|
||||
int diff, sum;
|
||||
|
||||
sum = 0;
|
||||
*sse = 0;
|
||||
|
||||
for (i = 0; i < h; i++)
|
||||
{
|
||||
for (j = 0; j < w; j++)
|
||||
{
|
||||
diff = src_ptr[j] - ref_ptr[j];
|
||||
sum += diff;
|
||||
*sse += diff * diff;
|
||||
}
|
||||
|
||||
src_ptr += source_stride;
|
||||
ref_ptr += recon_stride;
|
||||
}
|
||||
|
||||
return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
|
||||
}
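For reference, the return expression above is the one-pass identity variance = SSE - sum^2 / (w * h); because w and h are powers of two, the division becomes a right shift by ctz(w) + ctz(h) bits, which is all the ctz helper exists for. A small worked check using only the code above:

/* 4x4 block with every per-pixel difference equal to 2:
 *   sum = 16 * 2 = 32, *sse = 16 * 4 = 64
 *   variance = 64 - ((32 * 32) >> (ctz(4) + ctz(4))) = 64 - (1024 >> 4) = 0,
 * i.e. a constant brightness offset contributes nothing to the variance. */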
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : filter_block2d_bil_first_pass
|
||||
*
|
||||
* INPUTS : UINT8 *src_ptr : Pointer to source block.
|
||||
* UINT32 src_pixels_per_line : Stride of input block.
|
||||
* UINT32 pixel_step : Offset between filter input samples (see notes).
|
||||
* UINT32 output_height : Input block height.
|
||||
* UINT32 output_width : Input block width.
|
||||
* INT32 *vp8_filter : Array of 2 bi-linear filter taps.
|
||||
*
|
||||
* OUTPUTS : INT32 *output_ptr : Pointer to filtered block.
|
||||
*
|
||||
* RETURNS : void
|
||||
*
|
||||
* FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
|
||||
* either horizontal or vertical direction to produce the
|
||||
* filtered output block. Used to implement first-pass
|
||||
* of 2-D separable filter.
|
||||
*
|
||||
* SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
|
||||
* Two filter taps should sum to VP8_FILTER_WEIGHT.
|
||||
* pixel_step defines whether the filter is applied
|
||||
* horizontally (pixel_step=1) or vertically (pixel_step=stride).
|
||||
* It defines the offset required to move from one input
|
||||
* to the next.
|
||||
*
|
||||
****************************************************************************/
|
||||
static void var_filter_block2d_bil_first_pass
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
unsigned short *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const short *vp8_filter
|
||||
)
|
||||
{
|
||||
unsigned int i, j;
|
||||
|
||||
for (i = 0; i < output_height; i++)
|
||||
{
|
||||
for (j = 0; j < output_width; j++)
|
||||
{
|
||||
/* Apply bilinear filter */
|
||||
output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
|
||||
((int)src_ptr[pixel_step] * vp8_filter[1]) +
|
||||
(VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
|
||||
src_ptr++;
|
||||
}
|
||||
|
||||
/* Next row... */
|
||||
src_ptr += src_pixels_per_line - output_width;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
}
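As the block comment above says, pixel_step is what lets the same loop act as a horizontal filter (step 1) or a vertical filter (step equal to the row pitch). In this file the first pass always runs horizontally over the source and the second pass vertically over the intermediate buffer; for the 8x8 case the call pair looks like this (copied from vp8_sub_pixel_variance8x8_c further down):

/* Horizontal pass: 9 rows x 8 columns, neighbouring taps 1 byte apart. */
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
/* Vertical pass: step through the 8-wide intermediate one row (8 shorts) at a time. */
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);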
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : filter_block2d_bil_second_pass
|
||||
*
|
||||
* INPUTS : INT32 *src_ptr : Pointer to source block.
|
||||
* UINT32 src_pixels_per_line : Stride of input block.
|
||||
* UINT32 pixel_step : Offset between filter input samples (see notes).
|
||||
* UINT32 output_height : Input block height.
|
||||
* UINT32 output_width : Input block width.
|
||||
* INT32 *vp8_filter : Array of 2 bi-linear filter taps.
|
||||
*
|
||||
* OUTPUTS : UINT16 *output_ptr : Pointer to filtered block.
|
||||
*
|
||||
* RETURNS : void
|
||||
*
|
||||
* FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
|
||||
* either horizontal or vertical direction to produce the
|
||||
* filtered output block. Used to implement second-pass
|
||||
* of 2-D separable filter.
|
||||
*
|
||||
* SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
|
||||
* Two filter taps should sum to VP8_FILTER_WEIGHT.
|
||||
* pixel_step defines whether the filter is applied
|
||||
* horizontally (pixel_step=1) or vertically (pixel_step=stride).
|
||||
* It defines the offset required to move from one input
|
||||
* to the next.
|
||||
*
|
||||
****************************************************************************/
|
||||
static void var_filter_block2d_bil_second_pass
|
||||
(
|
||||
const unsigned short *src_ptr,
|
||||
unsigned char *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const short *vp8_filter
|
||||
)
|
||||
{
|
||||
unsigned int i, j;
|
||||
int Temp;
|
||||
|
||||
for (i = 0; i < output_height; i++)
|
||||
{
|
||||
for (j = 0; j < output_width; j++)
|
||||
{
|
||||
/* Apply filter */
|
||||
Temp = ((int)src_ptr[0] * vp8_filter[0]) +
|
||||
((int)src_ptr[pixel_step] * vp8_filter[1]) +
|
||||
(VP8_FILTER_WEIGHT / 2);
|
||||
output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
|
||||
src_ptr++;
|
||||
}
|
||||
|
||||
/* Next row... */
|
||||
src_ptr += src_pixels_per_line - output_width;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_sub_pixel_variance4x4_c
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
unsigned char temp2[20*16];
|
||||
const short *HFilter, *VFilter;
|
||||
unsigned short FData3[5*4]; /* Temp data buffer used in filtering */
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
/* First filter 1d Horizontal */
|
||||
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
|
||||
|
||||
/* Now filter Vertically */
|
||||
var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
|
||||
|
||||
return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_sub_pixel_variance8x8_c
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
unsigned short FData3[9*8]; /* Temp data buffer used in filtering */
|
||||
unsigned char temp2[20*16];
|
||||
const short *HFilter, *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
|
||||
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
|
||||
|
||||
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
|
||||
}
|
||||
|
||||
unsigned int vp8_sub_pixel_variance16x16_c
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
unsigned short FData3[17*16]; /* Temp data buffer used in filtering */
|
||||
unsigned char temp2[20*16];
|
||||
const short *HFilter, *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
|
||||
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
|
||||
|
||||
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_variance_halfpixvar16x16_h_c(
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
const unsigned char *ref_ptr,
|
||||
int recon_stride,
|
||||
unsigned int *sse)
|
||||
{
|
||||
return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
|
||||
ref_ptr, recon_stride, sse);
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_variance_halfpixvar16x16_v_c(
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
const unsigned char *ref_ptr,
|
||||
int recon_stride,
|
||||
unsigned int *sse)
|
||||
{
|
||||
return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
|
||||
ref_ptr, recon_stride, sse);
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_variance_halfpixvar16x16_hv_c(
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
const unsigned char *ref_ptr,
|
||||
int recon_stride,
|
||||
unsigned int *sse)
|
||||
{
|
||||
return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
|
||||
ref_ptr, recon_stride, sse);
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_sub_pixel_variance16x8_c
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
unsigned short FData3[16*9]; /* Temp data buffer used in filtering */
|
||||
unsigned char temp2[20*16];
|
||||
const short *HFilter, *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
|
||||
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
|
||||
|
||||
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
|
||||
}
|
||||
|
||||
unsigned int vp8_sub_pixel_variance8x16_c
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
unsigned short FData3[9*16]; /* Temp data buffer used in filtering */
|
||||
unsigned char temp2[20*16];
|
||||
const short *HFilter, *VFilter;
|
||||
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
|
||||
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
|
||||
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
|
||||
|
||||
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
|
||||
}
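The x86 kernels in the next two files follow a different convention from the C code above: they return the raw signed sum and the sum of squared differences through out-parameters, and thin C wrappers turn that pair into a variance. A minimal sketch of the wrapper arithmetic for a 16x16 block (256 pixels, so the divisor is a shift by 8; the real wrappers appear in the SSE2/SSSE3 wrapper file at the end of this diff):

/* Sketch only: combine a kernel's sum / sum-of-squares pair into a variance. */
static unsigned int combine_16x16(int xsum, unsigned int xxsum,
                                  unsigned int *sse) {
    *sse = xxsum;
    return xxsum - (((unsigned int)xsum * xsum) >> 8); /* 16*16 = 1 << 8 */
}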
|
|
@@ -1,972 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
%define xmm_filter_shift 7
|
||||
|
||||
;void vp8_filter_block2d_bil_var_sse2
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; int xoffset,
|
||||
; int yoffset,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared;;
|
||||
;
|
||||
;)
|
||||
global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
|
||||
sym(vp8_filter_block2d_bil_var_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 9
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
push rbx
|
||||
; end prolog
|
||||
|
||||
pxor xmm6, xmm6 ;
|
||||
pxor xmm7, xmm7 ;
|
||||
|
||||
lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
|
||||
movdqa xmm4, XMMWORD PTR [rsi]
|
||||
|
||||
lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
|
||||
movsxd rax, dword ptr arg(5) ; xoffset
|
||||
|
||||
cmp rax, 0 ; skip first_pass filter if xoffset=0
|
||||
je filter_block2d_bil_var_sse2_sp_only
|
||||
|
||||
shl rax, 5 ; point to filter coeff with xoffset
|
||||
lea rax, [rax + rcx] ; HFilter
|
||||
|
||||
movsxd rdx, dword ptr arg(6) ; yoffset
|
||||
|
||||
cmp rdx, 0 ; skip second_pass filter if yoffset=0
|
||||
je filter_block2d_bil_var_sse2_fp_only
|
||||
|
||||
shl rdx, 5
|
||||
lea rdx, [rdx + rcx] ; VFilter
|
||||
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
|
||||
pxor xmm0, xmm0 ;
|
||||
movq xmm1, QWORD PTR [rsi] ;
|
||||
movq xmm3, QWORD PTR [rsi+1] ;
|
||||
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
pmullw xmm1, [rax] ;
|
||||
punpcklbw xmm3, xmm0
|
||||
pmullw xmm3, [rax+16] ;
|
||||
|
||||
paddw xmm1, xmm3 ;
|
||||
paddw xmm1, xmm4 ;
|
||||
psraw xmm1, xmm_filter_shift ;
|
||||
movdqa xmm5, xmm1
|
||||
|
||||
movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
|
||||
lea rsi, [rsi + rbx]
|
||||
%if ABI_IS_32BIT=0
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
%endif
|
||||
|
||||
filter_block2d_bil_var_sse2_loop:
|
||||
movq xmm1, QWORD PTR [rsi] ;
|
||||
movq xmm3, QWORD PTR [rsi+1] ;
|
||||
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
pmullw xmm1, [rax] ;
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
pmullw xmm3, [rax+16] ;
|
||||
|
||||
paddw xmm1, xmm3 ;
|
||||
paddw xmm1, xmm4 ;
|
||||
psraw xmm1, xmm_filter_shift ;
|
||||
|
||||
movdqa xmm3, xmm5 ;
|
||||
movdqa xmm5, xmm1 ;
|
||||
|
||||
pmullw xmm3, [rdx] ;
|
||||
pmullw xmm1, [rdx+16] ;
|
||||
paddw xmm1, xmm3 ;
|
||||
paddw xmm1, xmm4 ;
|
||||
psraw xmm1, xmm_filter_shift ;
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ;
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
|
||||
psubw xmm1, xmm3 ;
|
||||
paddw xmm6, xmm1 ;
|
||||
|
||||
pmaddwd xmm1, xmm1 ;
|
||||
paddd xmm7, xmm1 ;
|
||||
|
||||
lea rsi, [rsi + rbx] ;ref_pixels_per_line
|
||||
%if ABI_IS_32BIT
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line
|
||||
%else
|
||||
lea rdi, [rdi + r9]
|
||||
%endif
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz filter_block2d_bil_var_sse2_loop ;
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_var_sse2_sp_only:
|
||||
movsxd rdx, dword ptr arg(6) ; yoffset
|
||||
|
||||
cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
|
||||
je filter_block2d_bil_var_sse2_full_pixel
|
||||
|
||||
shl rdx, 5
|
||||
lea rdx, [rdx + rcx] ; VFilter
|
||||
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
|
||||
pxor xmm0, xmm0 ;
|
||||
movq xmm1, QWORD PTR [rsi] ;
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
|
||||
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
|
||||
lea rsi, [rsi + rax]
|
||||
|
||||
filter_block2d_bil_sp_only_loop:
|
||||
movq xmm3, QWORD PTR [rsi] ;
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
movdqa xmm5, xmm3
|
||||
|
||||
pmullw xmm1, [rdx] ;
|
||||
pmullw xmm3, [rdx+16] ;
|
||||
paddw xmm1, xmm3 ;
|
||||
paddw xmm1, xmm4 ;
|
||||
psraw xmm1, xmm_filter_shift ;
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ;
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
|
||||
psubw xmm1, xmm3 ;
|
||||
paddw xmm6, xmm1 ;
|
||||
|
||||
pmaddwd xmm1, xmm1 ;
|
||||
paddd xmm7, xmm1 ;
|
||||
|
||||
movdqa xmm1, xmm5 ;
|
||||
lea rsi, [rsi + rax] ;ref_pixels_per_line
|
||||
lea rdi, [rdi + rbx] ;src_pixels_per_line
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz filter_block2d_bil_sp_only_loop ;
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_var_sse2_full_pixel:
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
|
||||
pxor xmm0, xmm0 ;
|
||||
|
||||
filter_block2d_bil_full_pixel_loop:
|
||||
movq xmm1, QWORD PTR [rsi] ;
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
|
||||
movq xmm2, QWORD PTR [rdi] ;
|
||||
punpcklbw xmm2, xmm0 ;
|
||||
|
||||
psubw xmm1, xmm2 ;
|
||||
paddw xmm6, xmm1 ;
|
||||
|
||||
pmaddwd xmm1, xmm1 ;
|
||||
paddd xmm7, xmm1 ;
|
||||
|
||||
lea rsi, [rsi + rax] ;ref_pixels_per_line
|
||||
lea rdi, [rdi + rbx] ;src_pixels_per_line
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz filter_block2d_bil_full_pixel_loop ;
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_var_sse2_fp_only:
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
|
||||
|
||||
pxor xmm0, xmm0 ;
|
||||
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
|
||||
|
||||
filter_block2d_bil_fp_only_loop:
|
||||
movq xmm1, QWORD PTR [rsi] ;
|
||||
movq xmm3, QWORD PTR [rsi+1] ;
|
||||
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
pmullw xmm1, [rax] ;
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
pmullw xmm3, [rax+16] ;
|
||||
|
||||
paddw xmm1, xmm3 ;
|
||||
paddw xmm1, xmm4 ;
|
||||
psraw xmm1, xmm_filter_shift ;
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ;
|
||||
punpcklbw xmm3, xmm0 ;
|
||||
|
||||
psubw xmm1, xmm3 ;
|
||||
paddw xmm6, xmm1 ;
|
||||
|
||||
pmaddwd xmm1, xmm1 ;
|
||||
paddd xmm7, xmm1 ;
|
||||
lea rsi, [rsi + rdx]
|
||||
lea rdi, [rdi + rbx] ;src_pixels_per_line
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz filter_block2d_bil_fp_only_loop ;
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_variance:
|
||||
movdq2q mm6, xmm6 ;
|
||||
movdq2q mm7, xmm7 ;
|
||||
|
||||
psrldq xmm6, 8
|
||||
psrldq xmm7, 8
|
||||
|
||||
movdq2q mm2, xmm6
|
||||
movdq2q mm3, xmm7
|
||||
|
||||
paddw mm6, mm2
|
||||
paddd mm7, mm3
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
pxor mm2, mm2 ;
|
||||
|
||||
punpcklwd mm2, mm6 ;
|
||||
punpckhwd mm3, mm6 ;
|
||||
|
||||
paddd mm2, mm3 ;
|
||||
movq mm6, mm2 ;
|
||||
|
||||
psrlq mm6, 32 ;
|
||||
paddd mm2, mm6 ;
|
||||
|
||||
psrad mm2, 16 ;
|
||||
movq mm4, mm7 ;
|
||||
|
||||
psrlq mm4, 32 ;
|
||||
paddd mm4, mm7 ;
|
||||
|
||||
mov rsi, arg(7) ; sum
|
||||
mov rdi, arg(8) ; sumsquared
|
||||
|
||||
movd [rsi], mm2 ; xsum
|
||||
movd [rdi], mm4 ; xxsum
|
||||
|
||||
; begin epilog
|
||||
pop rbx
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void vp8_half_horiz_vert_variance8x_h_sse2
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;)
|
||||
global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
|
||||
sym(vp8_half_horiz_vert_variance8x_h_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
%if ABI_IS_32BIT=0
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
%endif
|
||||
|
||||
pxor xmm6, xmm6 ; error accumulator
|
||||
pxor xmm7, xmm7 ; sse accumulator
|
||||
mov rsi, arg(0) ;ref_ptr ;
|
||||
|
||||
mov rdi, arg(2) ;src_ptr ;
|
||||
movsxd rcx, dword ptr arg(4) ;Height ;
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
|
||||
pxor xmm0, xmm0 ;
|
||||
|
||||
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
|
||||
movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
|
||||
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
|
||||
%else
|
||||
add rsi, r8
|
||||
%endif
|
||||
|
||||
vp8_half_horiz_vert_variance8x_h_1:
|
||||
|
||||
movq xmm1, QWORD PTR [rsi] ;
|
||||
movq xmm2, QWORD PTR [rsi+1] ;
|
||||
pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
|
||||
|
||||
pavgb xmm5, xmm1 ; xmm = vertical average of the above
|
||||
punpcklbw xmm5, xmm0 ; xmm5 = words of above
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = words of above
|
||||
|
||||
psubw xmm5, xmm3 ; xmm5 -= xmm3
|
||||
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
|
||||
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
|
||||
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
|
||||
|
||||
movdqa xmm5, xmm1 ; save xmm1 for use on the next row
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
|
||||
add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
|
||||
%else
|
||||
add rsi, r8
|
||||
add rdi, r9
|
||||
%endif
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz vp8_half_horiz_vert_variance8x_h_1 ;
|
||||
|
||||
movdq2q mm6, xmm6 ;
|
||||
movdq2q mm7, xmm7 ;
|
||||
|
||||
psrldq xmm6, 8
|
||||
psrldq xmm7, 8
|
||||
|
||||
movdq2q mm2, xmm6
|
||||
movdq2q mm3, xmm7
|
||||
|
||||
paddw mm6, mm2
|
||||
paddd mm7, mm3
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
pxor mm2, mm2 ;
|
||||
|
||||
punpcklwd mm2, mm6 ;
|
||||
punpckhwd mm3, mm6 ;
|
||||
|
||||
paddd mm2, mm3 ;
|
||||
movq mm6, mm2 ;
|
||||
|
||||
psrlq mm6, 32 ;
|
||||
paddd mm2, mm6 ;
|
||||
|
||||
psrad mm2, 16 ;
|
||||
movq mm4, mm7 ;
|
||||
|
||||
psrlq mm4, 32 ;
|
||||
paddd mm4, mm7 ;
|
||||
|
||||
mov rsi, arg(5) ; sum
|
||||
mov rdi, arg(6) ; sumsquared
|
||||
|
||||
movd [rsi], mm2 ;
|
||||
movd [rdi], mm4 ;
|
||||
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vp8_half_horiz_vert_variance16x_h_sse2
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;)
|
||||
global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
|
||||
sym(vp8_half_horiz_vert_variance16x_h_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
pxor xmm6, xmm6 ; error accumulator
|
||||
pxor xmm7, xmm7 ; sse accumulator
|
||||
mov rsi, arg(0) ;ref_ptr ;
|
||||
|
||||
mov rdi, arg(2) ;src_ptr ;
|
||||
movsxd rcx, dword ptr arg(4) ;Height ;
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
|
||||
|
||||
pxor xmm0, xmm0 ;
|
||||
|
||||
movdqu xmm5, XMMWORD PTR [rsi]
|
||||
movdqu xmm3, XMMWORD PTR [rsi+1]
|
||||
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
|
||||
|
||||
lea rsi, [rsi + rax]
|
||||
|
||||
vp8_half_horiz_vert_variance16x_h_1:
|
||||
movdqu xmm1, XMMWORD PTR [rsi] ;
|
||||
movdqu xmm2, XMMWORD PTR [rsi+1] ;
|
||||
pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
|
||||
|
||||
pavgb xmm5, xmm1 ; xmm = vertical average of the above
|
||||
|
||||
movdqa xmm4, xmm5
|
||||
punpcklbw xmm5, xmm0 ; xmm5 = words of above
|
||||
punpckhbw xmm4, xmm0
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = words of above
|
||||
psubw xmm5, xmm3 ; xmm5 -= xmm3
|
||||
|
||||
movq xmm3, QWORD PTR [rdi+8]
|
||||
punpcklbw xmm3, xmm0
|
||||
psubw xmm4, xmm3
|
||||
|
||||
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
|
||||
paddw xmm6, xmm4
|
||||
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
|
||||
pmaddwd xmm4, xmm4
|
||||
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
|
||||
paddd xmm7, xmm4
|
||||
|
||||
movdqa xmm5, xmm1 ; save xmm1 for use on the next row
|
||||
|
||||
lea rsi, [rsi + rax]
|
||||
lea rdi, [rdi + rdx]
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz vp8_half_horiz_vert_variance16x_h_1 ;
|
||||
|
||||
pxor xmm1, xmm1
|
||||
pxor xmm5, xmm5
|
||||
|
||||
punpcklwd xmm0, xmm6
|
||||
punpckhwd xmm1, xmm6
|
||||
psrad xmm0, 16
|
||||
psrad xmm1, 16
|
||||
paddd xmm0, xmm1
|
||||
movdqa xmm1, xmm0
|
||||
|
||||
movdqa xmm6, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
punpckhdq xmm7, xmm5
|
||||
paddd xmm6, xmm7
|
||||
|
||||
punpckldq xmm0, xmm5
|
||||
punpckhdq xmm1, xmm5
|
||||
paddd xmm0, xmm1
|
||||
|
||||
movdqa xmm7, xmm6
|
||||
movdqa xmm1, xmm0
|
||||
|
||||
psrldq xmm7, 8
|
||||
psrldq xmm1, 8
|
||||
|
||||
paddd xmm6, xmm7
|
||||
paddd xmm0, xmm1
|
||||
|
||||
mov rsi, arg(5) ;[Sum]
|
||||
mov rdi, arg(6) ;[SSE]
|
||||
|
||||
movd [rsi], xmm0
|
||||
movd [rdi], xmm6
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void vp8_half_vert_variance8x_h_sse2
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;)
|
||||
global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
|
||||
sym(vp8_half_vert_variance8x_h_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
%if ABI_IS_32BIT=0
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
%endif
|
||||
|
||||
pxor xmm6, xmm6 ; error accumulator
|
||||
pxor xmm7, xmm7 ; sse accumulator
|
||||
mov rsi, arg(0) ;ref_ptr ;
|
||||
|
||||
mov rdi, arg(2) ;src_ptr ;
|
||||
movsxd rcx, dword ptr arg(4) ;Height ;
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
|
||||
pxor xmm0, xmm0 ;
|
||||
vp8_half_vert_variance8x_h_1:
|
||||
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
|
||||
movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9
|
||||
|
||||
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
|
||||
punpcklbw xmm5, xmm0 ; xmm5 = words of above
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = words of above
|
||||
|
||||
psubw xmm5, xmm3 ; xmm5 -= xmm3
|
||||
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
|
||||
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
|
||||
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
|
||||
add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
|
||||
%else
|
||||
add rsi, r8
|
||||
add rdi, r9
|
||||
%endif
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz vp8_half_vert_variance8x_h_1 ;
|
||||
|
||||
movdq2q mm6, xmm6 ;
|
||||
movdq2q mm7, xmm7 ;
|
||||
|
||||
psrldq xmm6, 8
|
||||
psrldq xmm7, 8
|
||||
|
||||
movdq2q mm2, xmm6
|
||||
movdq2q mm3, xmm7
|
||||
|
||||
paddw mm6, mm2
|
||||
paddd mm7, mm3
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
pxor mm2, mm2 ;
|
||||
|
||||
punpcklwd mm2, mm6 ;
|
||||
punpckhwd mm3, mm6 ;
|
||||
|
||||
paddd mm2, mm3 ;
|
||||
movq mm6, mm2 ;
|
||||
|
||||
psrlq mm6, 32 ;
|
||||
paddd mm2, mm6 ;
|
||||
|
||||
psrad mm2, 16 ;
|
||||
movq mm4, mm7 ;
|
||||
|
||||
psrlq mm4, 32 ;
|
||||
paddd mm4, mm7 ;
|
||||
|
||||
mov rsi, arg(5) ; sum
|
||||
mov rdi, arg(6) ; sumsquared
|
||||
|
||||
movd [rsi], mm2 ;
|
||||
movd [rdi], mm4 ;
|
||||
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vp8_half_vert_variance16x_h_sse2
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;)
|
||||
global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
|
||||
sym(vp8_half_vert_variance16x_h_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
pxor xmm6, xmm6 ; error accumulator
|
||||
pxor xmm7, xmm7 ; sse accumulator
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
|
||||
|
||||
movdqu xmm5, XMMWORD PTR [rsi]
|
||||
lea rsi, [rsi + rax ]
|
||||
pxor xmm0, xmm0
|
||||
|
||||
vp8_half_vert_variance16x_h_1:
|
||||
movdqu xmm3, XMMWORD PTR [rsi]
|
||||
|
||||
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
|
||||
movdqa xmm4, xmm5
|
||||
punpcklbw xmm5, xmm0
|
||||
punpckhbw xmm4, xmm0
|
||||
|
||||
movq xmm2, QWORD PTR [rdi]
|
||||
punpcklbw xmm2, xmm0
|
||||
psubw xmm5, xmm2
|
||||
movq xmm2, QWORD PTR [rdi+8]
|
||||
punpcklbw xmm2, xmm0
|
||||
psubw xmm4, xmm2
|
||||
|
||||
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
|
||||
paddw xmm6, xmm4
|
||||
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
|
||||
pmaddwd xmm4, xmm4
|
||||
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
|
||||
paddd xmm7, xmm4
|
||||
|
||||
movdqa xmm5, xmm3
|
||||
|
||||
lea rsi, [rsi + rax]
|
||||
lea rdi, [rdi + rdx]
|
||||
|
||||
sub rcx, 1
|
||||
jnz vp8_half_vert_variance16x_h_1
|
||||
|
||||
pxor xmm1, xmm1
|
||||
pxor xmm5, xmm5
|
||||
|
||||
punpcklwd xmm0, xmm6
|
||||
punpckhwd xmm1, xmm6
|
||||
psrad xmm0, 16
|
||||
psrad xmm1, 16
|
||||
paddd xmm0, xmm1
|
||||
movdqa xmm1, xmm0
|
||||
|
||||
movdqa xmm6, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
punpckhdq xmm7, xmm5
|
||||
paddd xmm6, xmm7
|
||||
|
||||
punpckldq xmm0, xmm5
|
||||
punpckhdq xmm1, xmm5
|
||||
paddd xmm0, xmm1
|
||||
|
||||
movdqa xmm7, xmm6
|
||||
movdqa xmm1, xmm0
|
||||
|
||||
psrldq xmm7, 8
|
||||
psrldq xmm1, 8
|
||||
|
||||
paddd xmm6, xmm7
|
||||
paddd xmm0, xmm1
|
||||
|
||||
mov rsi, arg(5) ;[Sum]
|
||||
mov rdi, arg(6) ;[SSE]
|
||||
|
||||
movd [rsi], xmm0
|
||||
movd [rdi], xmm6
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void vp8_half_horiz_variance8x_h_sse2
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;)
|
||||
global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
|
||||
sym(vp8_half_horiz_variance8x_h_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
%if ABI_IS_32BIT=0
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
%endif
|
||||
|
||||
pxor xmm6, xmm6 ; error accumulator
|
||||
pxor xmm7, xmm7 ; sse accumulator
|
||||
mov rsi, arg(0) ;ref_ptr ;
|
||||
|
||||
mov rdi, arg(2) ;src_ptr ;
|
||||
movsxd rcx, dword ptr arg(4) ;Height ;
|
||||
|
||||
pxor xmm0, xmm0 ;
|
||||
vp8_half_horiz_variance8x_h_1:
|
||||
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
|
||||
movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
|
||||
|
||||
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
|
||||
punpcklbw xmm5, xmm0 ; xmm5 = words of above
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = words of above
|
||||
|
||||
psubw xmm5, xmm3 ; xmm5 -= xmm3
|
||||
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
|
||||
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
|
||||
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
|
||||
add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
|
||||
%else
|
||||
add rsi, r8
|
||||
add rdi, r9
|
||||
%endif
|
||||
sub rcx, 1 ;
|
||||
jnz vp8_half_horiz_variance8x_h_1 ;
|
||||
|
||||
movdq2q mm6, xmm6 ;
|
||||
movdq2q mm7, xmm7 ;
|
||||
|
||||
psrldq xmm6, 8
|
||||
psrldq xmm7, 8
|
||||
|
||||
movdq2q mm2, xmm6
|
||||
movdq2q mm3, xmm7
|
||||
|
||||
paddw mm6, mm2
|
||||
paddd mm7, mm3
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
pxor mm2, mm2 ;
|
||||
|
||||
punpcklwd mm2, mm6 ;
|
||||
punpckhwd mm3, mm6 ;
|
||||
|
||||
paddd mm2, mm3 ;
|
||||
movq mm6, mm2 ;
|
||||
|
||||
psrlq mm6, 32 ;
|
||||
paddd mm2, mm6 ;
|
||||
|
||||
psrad mm2, 16 ;
|
||||
movq mm4, mm7 ;
|
||||
|
||||
psrlq mm4, 32 ;
|
||||
paddd mm4, mm7 ;
|
||||
|
||||
mov rsi, arg(5) ; sum
|
||||
mov rdi, arg(6) ; sumsquared
|
||||
|
||||
movd [rsi], mm2 ;
|
||||
movd [rdi], mm4 ;
|
||||
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vp8_half_horiz_variance16x_h_sse2
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;)
|
||||
global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
|
||||
sym(vp8_half_horiz_variance16x_h_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
pxor xmm6, xmm6 ; error accumulator
|
||||
pxor xmm7, xmm7 ; sse accumulator
|
||||
mov rsi, arg(0) ;ref_ptr ;
|
||||
|
||||
mov rdi, arg(2) ;src_ptr ;
|
||||
movsxd rcx, dword ptr arg(4) ;Height ;
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
|
||||
|
||||
pxor xmm0, xmm0 ;
|
||||
|
||||
vp8_half_horiz_variance16x_h_1:
|
||||
movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
|
||||
movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
|
||||
|
||||
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
|
||||
movdqa xmm1, xmm5
|
||||
punpcklbw xmm5, xmm0 ; xmm5 = words of above
|
||||
punpckhbw xmm1, xmm0
|
||||
|
||||
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = words of above
|
||||
movq xmm2, QWORD PTR [rdi+8]
|
||||
punpcklbw xmm2, xmm0
|
||||
|
||||
psubw xmm5, xmm3 ; xmm5 -= xmm3
|
||||
psubw xmm1, xmm2
|
||||
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
|
||||
paddw xmm6, xmm1
|
||||
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
|
||||
pmaddwd xmm1, xmm1
|
||||
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
|
||||
paddd xmm7, xmm1
|
||||
|
||||
lea rsi, [rsi + rax]
|
||||
lea rdi, [rdi + rdx]
|
||||
|
||||
sub rcx, 1 ;
|
||||
jnz vp8_half_horiz_variance16x_h_1 ;
|
||||
|
||||
pxor xmm1, xmm1
|
||||
pxor xmm5, xmm5
|
||||
|
||||
punpcklwd xmm0, xmm6
|
||||
punpckhwd xmm1, xmm6
|
||||
psrad xmm0, 16
|
||||
psrad xmm1, 16
|
||||
paddd xmm0, xmm1
|
||||
movdqa xmm1, xmm0
|
||||
|
||||
movdqa xmm6, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
punpckhdq xmm7, xmm5
|
||||
paddd xmm6, xmm7
|
||||
|
||||
punpckldq xmm0, xmm5
|
||||
punpckhdq xmm1, xmm5
|
||||
paddd xmm0, xmm1
|
||||
|
||||
movdqa xmm7, xmm6
|
||||
movdqa xmm1, xmm0
|
||||
|
||||
psrldq xmm7, 8
|
||||
psrldq xmm1, 8
|
||||
|
||||
paddd xmm6, xmm7
|
||||
paddd xmm0, xmm1
|
||||
|
||||
mov rsi, arg(5) ;[Sum]
|
||||
mov rdi, arg(6) ;[SSE]
|
||||
|
||||
movd [rsi], xmm0
|
||||
movd [rdi], xmm6
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
SECTION_RODATA
|
||||
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
|
||||
align 16
|
||||
xmm_bi_rd:
|
||||
times 8 dw 64
|
||||
align 16
|
||||
vp8_bilinear_filters_sse2:
|
||||
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
|
||||
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
|
||||
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
|
||||
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
|
||||
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
|
||||
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
|
||||
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
|
|
@@ -1,364 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
%define xmm_filter_shift 7
|
||||
|
||||
|
||||
;void vp8_filter_block2d_bil_var_ssse3
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; int xoffset,
|
||||
; int yoffset,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared;;
|
||||
;
|
||||
;)
|
||||
;Note: The filter coefficient at offset=0 is 128. Since the second register
|
||||
;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
|
||||
global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
|
||||
sym(vp8_filter_block2d_bil_var_ssse3):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 9
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
pxor xmm6, xmm6
|
||||
pxor xmm7, xmm7
|
||||
|
||||
lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
|
||||
movsxd rax, dword ptr arg(5) ; xoffset
|
||||
|
||||
cmp rax, 0 ; skip first_pass filter if xoffset=0
|
||||
je .filter_block2d_bil_var_ssse3_sp_only
|
||||
|
||||
shl rax, 4 ; point to filter coeff with xoffset
|
||||
lea rax, [rax + rcx] ; HFilter
|
||||
|
||||
movsxd rdx, dword ptr arg(6) ; yoffset
|
||||
|
||||
cmp rdx, 0 ; skip second_pass filter if yoffset=0
|
||||
je .filter_block2d_bil_var_ssse3_fp_only
|
||||
|
||||
shl rdx, 4
|
||||
lea rdx, [rdx + rcx] ; VFilter
|
||||
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rsi]
|
||||
movdqu xmm1, XMMWORD PTR [rsi+1]
|
||||
movdqa xmm2, xmm0
|
||||
|
||||
punpcklbw xmm0, xmm1
|
||||
punpckhbw xmm2, xmm1
|
||||
pmaddubsw xmm0, [rax]
|
||||
pmaddubsw xmm2, [rax]
|
||||
|
||||
paddw xmm0, [GLOBAL(xmm_bi_rd)]
|
||||
paddw xmm2, [GLOBAL(xmm_bi_rd)]
|
||||
psraw xmm0, xmm_filter_shift
|
||||
psraw xmm2, xmm_filter_shift
|
||||
|
||||
packuswb xmm0, xmm2
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line
|
||||
%else
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
lea rsi, [rsi + r8]
|
||||
%endif
|
||||
|
||||
.filter_block2d_bil_var_ssse3_loop:
|
||||
movdqu xmm1, XMMWORD PTR [rsi]
|
||||
movdqu xmm2, XMMWORD PTR [rsi+1]
|
||||
movdqa xmm3, xmm1
|
||||
|
||||
punpcklbw xmm1, xmm2
|
||||
punpckhbw xmm3, xmm2
|
||||
pmaddubsw xmm1, [rax]
|
||||
pmaddubsw xmm3, [rax]
|
||||
|
||||
paddw xmm1, [GLOBAL(xmm_bi_rd)]
|
||||
paddw xmm3, [GLOBAL(xmm_bi_rd)]
|
||||
psraw xmm1, xmm_filter_shift
|
||||
psraw xmm3, xmm_filter_shift
|
||||
packuswb xmm1, xmm3
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm0, xmm1
|
||||
movdqa xmm3, xmm2
|
||||
|
||||
punpcklbw xmm2, xmm1
|
||||
punpckhbw xmm3, xmm1
|
||||
pmaddubsw xmm2, [rdx]
|
||||
pmaddubsw xmm3, [rdx]
|
||||
|
||||
paddw xmm2, [GLOBAL(xmm_bi_rd)]
|
||||
paddw xmm3, [GLOBAL(xmm_bi_rd)]
|
||||
psraw xmm2, xmm_filter_shift
|
||||
psraw xmm3, xmm_filter_shift
|
||||
|
||||
movq xmm1, QWORD PTR [rdi]
|
||||
pxor xmm4, xmm4
|
||||
punpcklbw xmm1, xmm4
|
||||
movq xmm5, QWORD PTR [rdi+8]
|
||||
punpcklbw xmm5, xmm4
|
||||
|
||||
psubw xmm2, xmm1
|
||||
psubw xmm3, xmm5
|
||||
paddw xmm6, xmm2
|
||||
paddw xmm6, xmm3
|
||||
pmaddwd xmm2, xmm2
|
||||
pmaddwd xmm3, xmm3
|
||||
paddd xmm7, xmm2
|
||||
paddd xmm7, xmm3
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line
|
||||
%else
|
||||
lea rsi, [rsi + r8]
|
||||
lea rdi, [rdi + r9]
|
||||
%endif
|
||||
|
||||
sub rcx, 1
|
||||
jnz .filter_block2d_bil_var_ssse3_loop
|
||||
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
.filter_block2d_bil_var_ssse3_sp_only:
|
||||
movsxd rdx, dword ptr arg(6) ; yoffset
|
||||
|
||||
cmp rdx, 0 ; Both xoffset =0 and yoffset=0
|
||||
je .filter_block2d_bil_var_ssse3_full_pixel
|
||||
|
||||
shl rdx, 4
|
||||
lea rdx, [rdx + rcx] ; VFilter
|
||||
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
|
||||
movdqu xmm1, XMMWORD PTR [rsi]
|
||||
movdqa xmm0, xmm1
|
||||
|
||||
%if ABI_IS_32BIT=0
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
%endif
|
||||
|
||||
lea rsi, [rsi + rax]
|
||||
|
||||
.filter_block2d_bil_sp_only_loop:
|
||||
movdqu xmm3, XMMWORD PTR [rsi]
|
||||
movdqa xmm2, xmm1
|
||||
movdqa xmm0, xmm3
|
||||
|
||||
punpcklbw xmm1, xmm3
|
||||
punpckhbw xmm2, xmm3
|
||||
pmaddubsw xmm1, [rdx]
|
||||
pmaddubsw xmm2, [rdx]
|
||||
|
||||
paddw xmm1, [GLOBAL(xmm_bi_rd)]
|
||||
paddw xmm2, [GLOBAL(xmm_bi_rd)]
|
||||
psraw xmm1, xmm_filter_shift
|
||||
psraw xmm2, xmm_filter_shift
|
||||
|
||||
movq xmm3, QWORD PTR [rdi]
|
||||
pxor xmm4, xmm4
|
||||
punpcklbw xmm3, xmm4
|
||||
movq xmm5, QWORD PTR [rdi+8]
|
||||
punpcklbw xmm5, xmm4
|
||||
|
||||
psubw xmm1, xmm3
|
||||
psubw xmm2, xmm5
|
||||
paddw xmm6, xmm1
|
||||
paddw xmm6, xmm2
|
||||
pmaddwd xmm1, xmm1
|
||||
pmaddwd xmm2, xmm2
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm7, xmm2
|
||||
|
||||
movdqa xmm1, xmm0
|
||||
lea rsi, [rsi + rax] ;ref_pixels_per_line
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line
|
||||
%else
|
||||
lea rdi, [rdi + r9]
|
||||
%endif
|
||||
|
||||
sub rcx, 1
|
||||
jnz .filter_block2d_bil_sp_only_loop
|
||||
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
.filter_block2d_bil_var_ssse3_full_pixel:
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
|
||||
pxor xmm0, xmm0
|
||||
|
||||
.filter_block2d_bil_full_pixel_loop:
|
||||
movq xmm1, QWORD PTR [rsi]
|
||||
punpcklbw xmm1, xmm0
|
||||
movq xmm2, QWORD PTR [rsi+8]
|
||||
punpcklbw xmm2, xmm0
|
||||
|
||||
movq xmm3, QWORD PTR [rdi]
|
||||
punpcklbw xmm3, xmm0
|
||||
movq xmm4, QWORD PTR [rdi+8]
|
||||
punpcklbw xmm4, xmm0
|
||||
|
||||
psubw xmm1, xmm3
|
||||
psubw xmm2, xmm4
|
||||
paddw xmm6, xmm1
|
||||
paddw xmm6, xmm2
|
||||
pmaddwd xmm1, xmm1
|
||||
pmaddwd xmm2, xmm2
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm7, xmm2
|
||||
|
||||
lea rsi, [rsi + rax] ;ref_pixels_per_line
|
||||
lea rdi, [rdi + rdx] ;src_pixels_per_line
|
||||
sub rcx, 1
|
||||
jnz .filter_block2d_bil_full_pixel_loop
|
||||
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
.filter_block2d_bil_var_ssse3_fp_only:
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
|
||||
|
||||
pxor xmm0, xmm0
|
||||
|
||||
%if ABI_IS_32BIT=0
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
%endif
|
||||
|
||||
.filter_block2d_bil_fp_only_loop:
|
||||
movdqu xmm1, XMMWORD PTR [rsi]
|
||||
movdqu xmm2, XMMWORD PTR [rsi+1]
|
||||
movdqa xmm3, xmm1
|
||||
|
||||
punpcklbw xmm1, xmm2
|
||||
punpckhbw xmm3, xmm2
|
||||
pmaddubsw xmm1, [rax]
|
||||
pmaddubsw xmm3, [rax]
|
||||
|
||||
paddw xmm1, [GLOBAL(xmm_bi_rd)]
|
||||
paddw xmm3, [GLOBAL(xmm_bi_rd)]
|
||||
psraw xmm1, xmm_filter_shift
|
||||
psraw xmm3, xmm_filter_shift
|
||||
|
||||
movq xmm2, XMMWORD PTR [rdi]
|
||||
pxor xmm4, xmm4
|
||||
punpcklbw xmm2, xmm4
|
||||
movq xmm5, QWORD PTR [rdi+8]
|
||||
punpcklbw xmm5, xmm4
|
||||
|
||||
psubw xmm1, xmm2
|
||||
psubw xmm3, xmm5
|
||||
paddw xmm6, xmm1
|
||||
paddw xmm6, xmm3
|
||||
pmaddwd xmm1, xmm1
|
||||
pmaddwd xmm3, xmm3
|
||||
paddd xmm7, xmm1
|
||||
paddd xmm7, xmm3
|
||||
|
||||
lea rsi, [rsi + rdx]
|
||||
%if ABI_IS_32BIT
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line
|
||||
%else
|
||||
lea rdi, [rdi + r9]
|
||||
%endif
|
||||
|
||||
sub rcx, 1
|
||||
jnz .filter_block2d_bil_fp_only_loop
|
||||
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
.filter_block2d_bil_variance:
|
||||
pxor xmm0, xmm0
|
||||
pxor xmm1, xmm1
|
||||
pxor xmm5, xmm5
|
||||
|
||||
punpcklwd xmm0, xmm6
|
||||
punpckhwd xmm1, xmm6
|
||||
psrad xmm0, 16
|
||||
psrad xmm1, 16
|
||||
paddd xmm0, xmm1
|
||||
movdqa xmm1, xmm0
|
||||
|
||||
movdqa xmm6, xmm7
|
||||
punpckldq xmm6, xmm5
|
||||
punpckhdq xmm7, xmm5
|
||||
paddd xmm6, xmm7
|
||||
|
||||
punpckldq xmm0, xmm5
|
||||
punpckhdq xmm1, xmm5
|
||||
paddd xmm0, xmm1
|
||||
|
||||
movdqa xmm7, xmm6
|
||||
movdqa xmm1, xmm0
|
||||
|
||||
psrldq xmm7, 8
|
||||
psrldq xmm1, 8
|
||||
|
||||
paddd xmm6, xmm7
|
||||
paddd xmm0, xmm1
|
||||
|
||||
mov rsi, arg(7) ;[Sum]
|
||||
mov rdi, arg(8) ;[SSE]
|
||||
|
||||
movd [rsi], xmm0
|
||||
movd [rdi], xmm6
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret

SECTION_RODATA
align 16
xmm_bi_rd:
    times 8 dw 64
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 112, 16
    times 8 db 96, 32
    times 8 db 80, 48
    times 8 db 64, 64
    times 8 db 48, 80
    times 8 db 32, 96
    times 8 db 16, 112
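
The table above stores each bilinear filter as eight repeated (w0, w1) byte pairs so that, after a pixel is interleaved with its right neighbour via punpcklbw/punpckhbw, one pmaddubsw produces w0*p[x] + w1*p[x+1] for eight pixels at once; offset 4 is the equal-weight 64/64 entry, i.e. the half-pixel case. A scalar sketch of the per-pixel arithmetic (illustrative only; it assumes the usual 7-bit filter precision implied by the 64 rounding constant in xmm_bi_rd):

    /* Hypothetical helper, not part of the patch: horizontal bilinear tap for
     * a 1/8-pel offset k in 0..7, matching the byte pairs in
     * vp8_bilinear_filters_ssse3 (128 - 16*k, 16*k). */
    static unsigned char bilinear_h(const unsigned char *s, int k) {
      const int w0 = 128 - 16 * k, w1 = 16 * k;
      return (unsigned char)((w0 * s[0] + w1 * s[1] + 64) >> 7);
    }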
@ -1,157 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp8_rtcd.h"
|
||||
#include "vpx_config.h"
|
||||
#include "vp8/common/variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
extern void vp8_half_horiz_vert_variance16x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
extern void vp8_half_horiz_variance16x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
extern void vp8_half_vert_variance16x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
extern void vp8_filter_block2d_bil_var_ssse3
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
|
||||
unsigned int vp8_sub_pixel_variance16x16_ssse3
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0;
    unsigned int xxsum0;

    /* note we could avoid these if statements if the calling function
     * just called the appropriate functions inside.
     */
    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else
    {
        vp8_filter_block2d_bil_var_ssse3(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum0, &xxsum0);
    }

    *sse = xxsum0;
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
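
The return statement is the usual variance identity written with a shift: variance = SSE - sum*sum/N, and for a 16x16 block N = 256 = 2^8, hence the right shift by 8 (the 16x8 variant below shifts by 7 for N = 128). A plain-C sketch of the same computation, shown only to make the identity explicit (not code from this patch):

    /* Reference 16x16 variance: accumulate sum and SSE of the differences,
     * then apply var = sse - sum^2 / 256, exactly as the SIMD wrappers do. */
    static unsigned int variance16x16_ref(const unsigned char *src, int src_stride,
                                          const unsigned char *ref, int ref_stride,
                                          unsigned int *sse) {
      int sum = 0, r, c;
      unsigned int sse_acc = 0;
      for (r = 0; r < 16; ++r) {
        for (c = 0; c < 16; ++c) {
          const int d = src[c] - ref[c];
          sum += d;
          sse_acc += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = sse_acc;
      return sse_acc - (((unsigned int)sum * sum) >> 8);
    }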
|
||||
|
||||
unsigned int vp8_sub_pixel_variance16x8_ssse3
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
|
||||
)
|
||||
{
|
||||
int xsum0;
|
||||
unsigned int xxsum0;
|
||||
|
||||
if (xoffset == 4 && yoffset == 0)
|
||||
{
|
||||
vp8_half_horiz_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
else if (xoffset == 0 && yoffset == 4)
|
||||
{
|
||||
vp8_half_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
else if (xoffset == 4 && yoffset == 4)
|
||||
{
|
||||
vp8_half_horiz_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
else
|
||||
{
|
||||
vp8_filter_block2d_bil_var_ssse3(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
xoffset, yoffset,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
|
||||
}
|
|
@ -1,353 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
%define mmx_filter_shift 7
|
||||
|
||||
;void vp8_filter_block2d_bil4x4_var_mmx
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned short *HFilter,
|
||||
; unsigned short *VFilter,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;)
|
||||
global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
|
||||
sym(vp8_filter_block2d_bil4x4_var_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 8
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
sub rsp, 16
|
||||
; end prolog
|
||||
|
||||
|
||||
pxor mm6, mm6 ;
|
||||
pxor mm7, mm7 ;
|
||||
|
||||
mov rax, arg(4) ;HFilter ;
|
||||
mov rdx, arg(5) ;VFilter ;
|
||||
|
||||
mov rsi, arg(0) ;ref_ptr ;
|
||||
mov rdi, arg(2) ;src_ptr ;
|
||||
|
||||
mov rcx, 4 ;
|
||||
pxor mm0, mm0 ;
|
||||
|
||||
movd mm1, [rsi] ;
|
||||
movd mm3, [rsi+1] ;
|
||||
|
||||
punpcklbw mm1, mm0 ;
|
||||
pmullw mm1, [rax] ;
|
||||
|
||||
punpcklbw mm3, mm0 ;
|
||||
pmullw mm3, [rax+8] ;
|
||||
|
||||
paddw mm1, mm3 ;
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
movq mm5, mm1
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
||||
%else
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
|
||||
add rsi, r8
|
||||
%endif
|
||||
|
||||
.filter_block2d_bil4x4_var_mmx_loop:
|
||||
|
||||
movd mm1, [rsi] ;
|
||||
movd mm3, [rsi+1] ;
|
||||
|
||||
punpcklbw mm1, mm0 ;
|
||||
pmullw mm1, [rax] ;
|
||||
|
||||
punpcklbw mm3, mm0 ;
|
||||
pmullw mm3, [rax+8] ;
|
||||
|
||||
paddw mm1, mm3 ;
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
movq mm3, mm5 ;
|
||||
|
||||
movq mm5, mm1 ;
|
||||
pmullw mm3, [rdx] ;
|
||||
|
||||
pmullw mm1, [rdx+8] ;
|
||||
paddw mm1, mm3 ;
|
||||
|
||||
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
|
||||
movd mm3, [rdi] ;
|
||||
punpcklbw mm3, mm0 ;
|
||||
|
||||
psubw mm1, mm3 ;
|
||||
paddw mm6, mm1 ;
|
||||
|
||||
pmaddwd mm1, mm1 ;
|
||||
paddd mm7, mm1 ;
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
|
||||
%else
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
add rsi, r8
|
||||
add rdi, r9
|
||||
%endif
|
||||
sub rcx, 1 ;
|
||||
jnz .filter_block2d_bil4x4_var_mmx_loop ;
|
||||
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
pxor mm2, mm2 ;
|
||||
|
||||
punpcklwd mm2, mm6 ;
|
||||
punpckhwd mm3, mm6 ;
|
||||
|
||||
paddd mm2, mm3 ;
|
||||
movq mm6, mm2 ;
|
||||
|
||||
psrlq mm6, 32 ;
|
||||
paddd mm2, mm6 ;
|
||||
|
||||
psrad mm2, 16 ;
|
||||
movq mm4, mm7 ;
|
||||
|
||||
psrlq mm4, 32 ;
|
||||
paddd mm4, mm7 ;
|
||||
|
||||
mov rdi, arg(6) ;sum
|
||||
mov rsi, arg(7) ;sumsquared
|
||||
|
||||
movd dword ptr [rdi], mm2 ;
|
||||
movd dword ptr [rsi], mm4 ;
|
||||
|
||||
|
||||
|
||||
; begin epilog
|
||||
add rsp, 16
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
|
||||
|
||||
;void vp8_filter_block2d_bil_var_mmx
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_pixels_per_line,
|
||||
; unsigned char *src_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; unsigned int Height,
|
||||
; unsigned short *HFilter,
|
||||
; unsigned short *VFilter,
|
||||
; int *sum,
|
||||
; unsigned int *sumsquared
|
||||
;)
|
||||
global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
|
||||
sym(vp8_filter_block2d_bil_var_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 9
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
sub rsp, 16
|
||||
; end prolog
|
||||
|
||||
pxor mm6, mm6 ;
|
||||
pxor mm7, mm7 ;
|
||||
mov rax, arg(5) ;HFilter ;
|
||||
|
||||
mov rdx, arg(6) ;VFilter ;
|
||||
mov rsi, arg(0) ;ref_ptr ;
|
||||
|
||||
mov rdi, arg(2) ;src_ptr ;
|
||||
movsxd rcx, dword ptr arg(4) ;Height ;
|
||||
|
||||
pxor mm0, mm0 ;
|
||||
movq mm1, [rsi] ;
|
||||
|
||||
movq mm3, [rsi+1] ;
|
||||
movq mm2, mm1 ;
|
||||
|
||||
movq mm4, mm3 ;
|
||||
punpcklbw mm1, mm0 ;
|
||||
|
||||
punpckhbw mm2, mm0 ;
|
||||
pmullw mm1, [rax] ;
|
||||
|
||||
pmullw mm2, [rax] ;
|
||||
punpcklbw mm3, mm0 ;
|
||||
|
||||
punpckhbw mm4, mm0 ;
|
||||
pmullw mm3, [rax+8] ;
|
||||
|
||||
pmullw mm4, [rax+8] ;
|
||||
paddw mm1, mm3 ;
|
||||
|
||||
paddw mm2, mm4 ;
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
|
||||
|
||||
psraw mm2, mmx_filter_shift ;
|
||||
movq mm5, mm1
|
||||
|
||||
packuswb mm5, mm2 ;
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line
|
||||
%else
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
|
||||
add rsi, r8
|
||||
%endif
|
||||
|
||||
.filter_block2d_bil_var_mmx_loop:
|
||||
|
||||
movq mm1, [rsi] ;
|
||||
movq mm3, [rsi+1] ;
|
||||
|
||||
movq mm2, mm1 ;
|
||||
movq mm4, mm3 ;
|
||||
|
||||
punpcklbw mm1, mm0 ;
|
||||
punpckhbw mm2, mm0 ;
|
||||
|
||||
pmullw mm1, [rax] ;
|
||||
pmullw mm2, [rax] ;
|
||||
|
||||
punpcklbw mm3, mm0 ;
|
||||
punpckhbw mm4, mm0 ;
|
||||
|
||||
pmullw mm3, [rax+8] ;
|
||||
pmullw mm4, [rax+8] ;
|
||||
|
||||
paddw mm1, mm3 ;
|
||||
paddw mm2, mm4 ;
|
||||
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
|
||||
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
|
||||
psraw mm2, mmx_filter_shift ;
|
||||
|
||||
movq mm3, mm5 ;
|
||||
movq mm4, mm5 ;
|
||||
|
||||
punpcklbw mm3, mm0 ;
|
||||
punpckhbw mm4, mm0 ;
|
||||
|
||||
movq mm5, mm1 ;
|
||||
packuswb mm5, mm2 ;
|
||||
|
||||
pmullw mm3, [rdx] ;
|
||||
pmullw mm4, [rdx] ;
|
||||
|
||||
pmullw mm1, [rdx+8] ;
|
||||
pmullw mm2, [rdx+8] ;
|
||||
|
||||
paddw mm1, mm3 ;
|
||||
paddw mm2, mm4 ;
|
||||
|
||||
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
|
||||
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
|
||||
|
||||
psraw mm1, mmx_filter_shift ;
|
||||
psraw mm2, mmx_filter_shift ;
|
||||
|
||||
movq mm3, [rdi] ;
|
||||
movq mm4, mm3 ;
|
||||
|
||||
punpcklbw mm3, mm0 ;
|
||||
punpckhbw mm4, mm0 ;
|
||||
|
||||
psubw mm1, mm3 ;
|
||||
psubw mm2, mm4 ;
|
||||
|
||||
paddw mm6, mm1 ;
|
||||
pmaddwd mm1, mm1 ;
|
||||
|
||||
paddw mm6, mm2 ;
|
||||
pmaddwd mm2, mm2 ;
|
||||
|
||||
paddd mm7, mm1 ;
|
||||
paddd mm7, mm2 ;
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
|
||||
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
|
||||
%else
|
||||
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
|
||||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
|
||||
add rsi, r8
|
||||
add rdi, r9
|
||||
%endif
|
||||
sub rcx, 1 ;
|
||||
jnz .filter_block2d_bil_var_mmx_loop ;
|
||||
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
pxor mm2, mm2 ;
|
||||
|
||||
punpcklwd mm2, mm6 ;
|
||||
punpckhwd mm3, mm6 ;
|
||||
|
||||
paddd mm2, mm3 ;
|
||||
movq mm6, mm2 ;
|
||||
|
||||
psrlq mm6, 32 ;
|
||||
paddd mm2, mm6 ;
|
||||
|
||||
psrad mm2, 16 ;
|
||||
movq mm4, mm7 ;
|
||||
|
||||
psrlq mm4, 32 ;
|
||||
paddd mm4, mm7 ;
|
||||
|
||||
mov rdi, arg(7) ;sum
|
||||
mov rsi, arg(8) ;sumsquared
|
||||
|
||||
movd dword ptr [rdi], mm2 ;
|
||||
movd dword ptr [rsi], mm4 ;
|
||||
|
||||
; begin epilog
|
||||
add rsp, 16
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
SECTION_RODATA
|
||||
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
|
||||
align 16
|
||||
mmx_bi_rd:
|
||||
times 4 dw 64
|
|
@ -1,244 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp8_rtcd.h"
|
||||
#include "vpx_config.h"
|
||||
#include "vp8/common/variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
#include "vp8/common/x86/filter_x86.h"
|
||||
|
||||
extern void filter_block1d_h6_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
unsigned short *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
short *filter
|
||||
);
|
||||
extern void filter_block1d_v6_mmx
|
||||
(
|
||||
const short *src_ptr,
|
||||
unsigned char *output_ptr,
|
||||
unsigned int pixels_per_line,
|
||||
unsigned int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
short *filter
|
||||
);
|
||||
|
||||
extern void vp8_filter_block2d_bil4x4_var_mmx
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
const short *HFilter,
|
||||
const short *VFilter,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
extern void vp8_filter_block2d_bil_var_mmx
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
const short *HFilter,
|
||||
const short *VFilter,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
|
||||
unsigned int vp8_sub_pixel_variance4x4_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse)
|
||||
|
||||
{
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
vp8_filter_block2d_bil4x4_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line,
|
||||
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
|
||||
&xsum, &xxsum
|
||||
);
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_sub_pixel_variance8x8_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
vp8_filter_block2d_bil_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
|
||||
&xsum, &xxsum
|
||||
);
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
|
||||
}
|
||||
|
||||
unsigned int vp8_sub_pixel_variance16x16_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
|
||||
int xsum0, xsum1;
|
||||
unsigned int xxsum0, xxsum1;
|
||||
|
||||
|
||||
vp8_filter_block2d_bil_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
|
||||
&xsum0, &xxsum0
|
||||
);
|
||||
|
||||
|
||||
vp8_filter_block2d_bil_var_mmx(
|
||||
src_ptr + 8, src_pixels_per_line,
|
||||
dst_ptr + 8, dst_pixels_per_line, 16,
|
||||
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
|
||||
&xsum1, &xxsum1
|
||||
);
|
||||
|
||||
xsum0 += xsum1;
|
||||
xxsum0 += xxsum1;
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
|
||||
|
||||
|
||||
}
|
||||
|
||||
unsigned int vp8_sub_pixel_variance16x8_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
int xsum0, xsum1;
|
||||
unsigned int xxsum0, xxsum1;
|
||||
|
||||
|
||||
vp8_filter_block2d_bil_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
|
||||
&xsum0, &xxsum0
|
||||
);
|
||||
|
||||
|
||||
vp8_filter_block2d_bil_var_mmx(
|
||||
src_ptr + 8, src_pixels_per_line,
|
||||
dst_ptr + 8, dst_pixels_per_line, 8,
|
||||
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
|
||||
&xsum1, &xxsum1
|
||||
);
|
||||
|
||||
xsum0 += xsum1;
|
||||
xxsum0 += xxsum1;
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
|
||||
}
|
||||
|
||||
unsigned int vp8_sub_pixel_variance8x16_mmx
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
vp8_filter_block2d_bil_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
|
||||
&xsum, &xxsum
|
||||
);
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_variance_halfpixvar16x16_h_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
                                           ref_ptr, recon_stride, sse);
}


unsigned int vp8_variance_halfpixvar16x16_v_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
                                           ref_ptr, recon_stride, sse);
}


unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
                                           ref_ptr, recon_stride, sse);
}
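
These wrappers just call the generic sub-pixel routine with an offset of 4, which in the 1/8-pel bilinear table is the equal-weight half-pixel position, so the filter collapses to a rounded average of adjacent pixels. A one-line sketch of that degenerate case (illustration only, not libvpx code):

    /* With weights 64/64, rounding 64 and shift 7, the bilinear tap reduces
     * to (a + b + 1) >> 1 -- the half-pixel average the dedicated paths use. */
    static unsigned char half_pel(unsigned char a, unsigned char b) {
      return (unsigned char)((a + b + 1) >> 1);
    }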
|
|
@ -1,403 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp8_rtcd.h"
|
||||
#include "vpx_config.h"
|
||||
#include "vp8/common/variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
#include "vp8/common/x86/filter_x86.h"
|
||||
|
||||
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
|
||||
extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
|
||||
extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
|
||||
extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
|
||||
|
||||
extern void vp8_filter_block2d_bil4x4_var_mmx
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
const short *HFilter,
|
||||
const short *VFilter,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
|
||||
void vp8_filter_block2d_bil_var_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
void vp8_half_horiz_vert_variance8x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
void vp8_half_horiz_vert_variance16x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
void vp8_half_horiz_variance8x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
void vp8_half_horiz_variance16x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
void vp8_half_vert_variance8x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
void vp8_half_vert_variance16x_h_sse2
|
||||
(
|
||||
const unsigned char *ref_ptr,
|
||||
int ref_pixels_per_line,
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
unsigned int Height,
|
||||
int *sum,
|
||||
unsigned int *sumsquared
|
||||
);
|
||||
|
||||
unsigned int vp8_sub_pixel_variance4x4_wmt
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
vp8_filter_block2d_bil4x4_var_mmx(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line,
|
||||
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
|
||||
&xsum, &xxsum
|
||||
);
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_sub_pixel_variance8x8_wmt
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
|
||||
if (xoffset == 4 && yoffset == 0)
|
||||
{
|
||||
vp8_half_horiz_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum, &xxsum);
|
||||
}
|
||||
else if (xoffset == 0 && yoffset == 4)
|
||||
{
|
||||
vp8_half_vert_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum, &xxsum);
|
||||
}
|
||||
else if (xoffset == 4 && yoffset == 4)
|
||||
{
|
||||
vp8_half_horiz_vert_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum, &xxsum);
|
||||
}
|
||||
else
|
||||
{
|
||||
vp8_filter_block2d_bil_var_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
xoffset, yoffset,
|
||||
&xsum, &xxsum);
|
||||
}
|
||||
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
|
||||
}
|
||||
|
||||
unsigned int vp8_sub_pixel_variance16x16_wmt
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
int xsum0, xsum1;
|
||||
unsigned int xxsum0, xxsum1;
|
||||
|
||||
|
||||
/* note we could avoid these if statements if the calling function
|
||||
* just called the appropriate functions inside.
|
||||
*/
|
||||
if (xoffset == 4 && yoffset == 0)
|
||||
{
|
||||
vp8_half_horiz_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
else if (xoffset == 0 && yoffset == 4)
|
||||
{
|
||||
vp8_half_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
else if (xoffset == 4 && yoffset == 4)
|
||||
{
|
||||
vp8_half_horiz_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
else
|
||||
{
|
||||
vp8_filter_block2d_bil_var_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
xoffset, yoffset,
|
||||
&xsum0, &xxsum0
|
||||
);
|
||||
|
||||
vp8_filter_block2d_bil_var_sse2(
|
||||
src_ptr + 8, src_pixels_per_line,
|
||||
dst_ptr + 8, dst_pixels_per_line, 16,
|
||||
xoffset, yoffset,
|
||||
&xsum1, &xxsum1
|
||||
);
|
||||
xsum0 += xsum1;
|
||||
xxsum0 += xxsum1;
|
||||
}
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
|
||||
}
|
||||
|
||||
unsigned int vp8_sub_pixel_variance16x8_wmt
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
|
||||
)
|
||||
{
|
||||
int xsum0, xsum1;
|
||||
unsigned int xxsum0, xxsum1;
|
||||
|
||||
if (xoffset == 4 && yoffset == 0)
|
||||
{
|
||||
vp8_half_horiz_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
else if (xoffset == 0 && yoffset == 4)
|
||||
{
|
||||
vp8_half_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
else if (xoffset == 4 && yoffset == 4)
|
||||
{
|
||||
vp8_half_horiz_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
&xsum0, &xxsum0);
|
||||
}
|
||||
else
|
||||
{
|
||||
vp8_filter_block2d_bil_var_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 8,
|
||||
xoffset, yoffset,
|
||||
&xsum0, &xxsum0);
|
||||
|
||||
vp8_filter_block2d_bil_var_sse2(
|
||||
src_ptr + 8, src_pixels_per_line,
|
||||
dst_ptr + 8, dst_pixels_per_line, 8,
|
||||
xoffset, yoffset,
|
||||
&xsum1, &xxsum1);
|
||||
xsum0 += xsum1;
|
||||
xxsum0 += xxsum1;
|
||||
}
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
|
||||
}
|
||||
|
||||
unsigned int vp8_sub_pixel_variance8x16_wmt
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse
|
||||
)
|
||||
{
|
||||
int xsum;
|
||||
unsigned int xxsum;
|
||||
|
||||
if (xoffset == 4 && yoffset == 0)
|
||||
{
|
||||
vp8_half_horiz_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum, &xxsum);
|
||||
}
|
||||
else if (xoffset == 0 && yoffset == 4)
|
||||
{
|
||||
vp8_half_vert_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum, &xxsum);
|
||||
}
|
||||
else if (xoffset == 4 && yoffset == 4)
|
||||
{
|
||||
vp8_half_horiz_vert_variance8x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum, &xxsum);
|
||||
}
|
||||
else
|
||||
{
|
||||
vp8_filter_block2d_bil_var_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
xoffset, yoffset,
|
||||
&xsum, &xxsum);
|
||||
}
|
||||
|
||||
*sse = xxsum;
|
||||
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse)
|
||||
{
|
||||
int xsum0;
|
||||
unsigned int xxsum0;
|
||||
|
||||
vp8_half_horiz_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse)
|
||||
{
|
||||
int xsum0;
|
||||
unsigned int xxsum0;
|
||||
vp8_half_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
|
||||
const unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
const unsigned char *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse)
|
||||
{
|
||||
int xsum0;
|
||||
unsigned int xxsum0;
|
||||
|
||||
vp8_half_horiz_vert_variance16x_h_sse2(
|
||||
src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line, 16,
|
||||
&xsum0, &xxsum0);
|
||||
|
||||
*sse = xxsum0;
|
||||
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
|
||||
}
|
|
@ -16,7 +16,7 @@
#include "./vpx_scale_rtcd.h"
#include "block.h"
#include "onyx_int.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "encodeintra.h"
#include "vp8/common/setupintrarecon.h"
#include "vp8/common/systemdependent.h"

@ -13,7 +13,7 @@
#define VP8_ENCODER_MCOMP_H_

#include "block.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"

#ifdef __cplusplus
extern "C" {
@ -2132,17 +2132,17 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)

    cpi->fn_ptr[BLOCK_16X16].sdf            = vpx_sad16x16;
    cpi->fn_ptr[BLOCK_16X16].vf             = vpx_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].svf            = vp8_sub_pixel_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vp8_variance_halfpixvar16x16_h;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vp8_variance_halfpixvar16x16_v;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
    cpi->fn_ptr[BLOCK_16X16].svf            = vpx_sub_pixel_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vpx_variance_halfpixvar16x16_h;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vpx_variance_halfpixvar16x16_v;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
    cpi->fn_ptr[BLOCK_16X16].sdx3f          = vpx_sad16x16x3;
    cpi->fn_ptr[BLOCK_16X16].sdx8f          = vpx_sad16x16x8;
    cpi->fn_ptr[BLOCK_16X16].sdx4df         = vpx_sad16x16x4d;

    cpi->fn_ptr[BLOCK_16X8].sdf             = vpx_sad16x8;
    cpi->fn_ptr[BLOCK_16X8].vf              = vpx_variance16x8;
    cpi->fn_ptr[BLOCK_16X8].svf             = vp8_sub_pixel_variance16x8;
    cpi->fn_ptr[BLOCK_16X8].svf             = vpx_sub_pixel_variance16x8;
    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h   = NULL;
    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v   = NULL;
    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv  = NULL;

@ -2152,7 +2152,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)

    cpi->fn_ptr[BLOCK_8X16].sdf             = vpx_sad8x16;
    cpi->fn_ptr[BLOCK_8X16].vf              = vpx_variance8x16;
    cpi->fn_ptr[BLOCK_8X16].svf             = vp8_sub_pixel_variance8x16;
    cpi->fn_ptr[BLOCK_8X16].svf             = vpx_sub_pixel_variance8x16;
    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h   = NULL;
    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v   = NULL;
    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv  = NULL;

@ -2162,7 +2162,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)

    cpi->fn_ptr[BLOCK_8X8].sdf              = vpx_sad8x8;
    cpi->fn_ptr[BLOCK_8X8].vf               = vpx_variance8x8;
    cpi->fn_ptr[BLOCK_8X8].svf              = vp8_sub_pixel_variance8x8;
    cpi->fn_ptr[BLOCK_8X8].svf              = vpx_sub_pixel_variance8x8;
    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h    = NULL;
    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v    = NULL;
    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv   = NULL;

@ -2172,7 +2172,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)

    cpi->fn_ptr[BLOCK_4X4].sdf              = vpx_sad4x4;
    cpi->fn_ptr[BLOCK_4X4].vf               = vpx_variance4x4;
    cpi->fn_ptr[BLOCK_4X4].svf              = vp8_sub_pixel_variance4x4;
    cpi->fn_ptr[BLOCK_4X4].svf              = vpx_sub_pixel_variance4x4;
    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h    = NULL;
    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v    = NULL;
    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv   = NULL;
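
These assignments fill the per-block-size function table that the encoder's motion search and mode selection call through; the patch simply repoints the sub-pixel (svf) and half-pixel (svf_halfpix_*) slots from the vp8_ implementations to the vpx_dsp ones. A minimal sketch of how one slot is consumed (the call site and local names are illustrative assumptions, not code from this change):

    /* fn points at one table entry, e.g. &cpi->fn_ptr[BLOCK_16X16]. */
    unsigned int sse;
    unsigned int var = fn->svf(src_ptr, src_stride,  /* source block           */
                               xoffset, yoffset,     /* 1/8-pel subpel offsets */
                               ref_ptr, ref_stride,  /* reference/prediction   */
                               &sse);                /* SSE returned as well   */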
@ -18,7 +18,7 @@
#include "treewriter.h"
#include "tokenize.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "encodemb.h"
#include "quantize.h"
#include "vp8/common/entropy.h"

@ -22,7 +22,7 @@
#include "encodemb.h"
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"

@ -29,7 +29,7 @@
#include "vp8/common/quant_common.h"
#include "encodemb.h"
#include "quantize.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"

@ -500,9 +500,9 @@ int VP8_UVSSE(MACROBLOCK *x)

    if ((mv_row | mv_col) & 7)
    {
        vp8_sub_pixel_variance8x8(uptr, pre_stride,
        vpx_sub_pixel_variance8x8(uptr, pre_stride,
            mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
        vp8_sub_pixel_variance8x8(vptr, pre_stride,
        vpx_sub_pixel_variance8x8(vptr, pre_stride,
            mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
        sse2 += sse1;
    }
@ -63,8 +63,6 @@ VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
VP8_COMMON_SRCS-yes += common/variance_c.c
VP8_COMMON_SRCS-yes += common/variance.h
VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h

@ -86,8 +84,6 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_mmx.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_impl_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm

@ -96,12 +92,8 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp8_variance_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm

ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm

@ -129,7 +121,6 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c

# common (media)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c

@ -149,9 +140,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)

# common (neon intrinsics)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c

@ -170,6 +158,5 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c

$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
@ -8,7 +8,7 @@
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride, int32_t height) {

@ -9,7 +9,7 @@
 */

#include <string.h>
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t height) {

@ -12,7 +12,7 @@
#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_

#include "vp9/common/vp9_filter.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

extern const uint8_t mc_filt_mask_arr[16 * 3];

@ -13,7 +13,7 @@

#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
  v8i16 k0_m = __msa_fill_h(cnst0); \

@ -9,7 +9,7 @@
 */

#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \
  out0 = __msa_subs_u_h(out0, in0); \

@ -11,7 +11,7 @@
#ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
#define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_

#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
                           p1_out, p0_out, q0_out, q1_out) { \
(Diff of one file not shown because of its large size.)
@ -10,7 +10,7 @@

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
@ -802,88 +802,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
|||
#
|
||||
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
|
||||
|
||||
|
||||
# variance
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance16x16 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance8x8 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
|
||||
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
|
||||
specialize qw/vp9_avg_8x8 sse2 neon msa/;
|
||||
|
||||
|
@ -1085,241 +1003,6 @@ specialize qw/vp9_temporal_filter_apply sse2 msa/;
|
|||
|
||||
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x8/;

add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance4x8/;

add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x4/;

add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x8/;

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x4/;

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x8/;

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x4/;

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;


# ENCODEMB INVOKE

add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";

@@ -9,7 +9,7 @@
 */

#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
  uint32_t sum_out;

@@ -9,7 +9,7 @@
 */

#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr, \

@@ -13,7 +13,7 @@

#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
  v8i16 k0_m = __msa_fill_h(cnst0); \

@@ -9,7 +9,7 @@
 */

#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"

static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
                                            uint32_t stride,

@@ -1023,8 +1023,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x16_bits8,
vpx_highbd_sad32x16_avg_bits8,
vpx_highbd_8_variance32x16,
vp9_highbd_sub_pixel_variance32x16,
vp9_highbd_sub_pixel_avg_variance32x16,
vpx_highbd_8_sub_pixel_variance32x16,
vpx_highbd_8_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits8)
@@ -1033,8 +1033,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x32_bits8,
vpx_highbd_sad16x32_avg_bits8,
vpx_highbd_8_variance16x32,
vp9_highbd_sub_pixel_variance16x32,
vp9_highbd_sub_pixel_avg_variance16x32,
vpx_highbd_8_sub_pixel_variance16x32,
vpx_highbd_8_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits8)
@@ -1043,8 +1043,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x32_bits8,
vpx_highbd_sad64x32_avg_bits8,
vpx_highbd_8_variance64x32,
vp9_highbd_sub_pixel_variance64x32,
vp9_highbd_sub_pixel_avg_variance64x32,
vpx_highbd_8_sub_pixel_variance64x32,
vpx_highbd_8_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits8)
@@ -1053,8 +1053,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x64_bits8,
vpx_highbd_sad32x64_avg_bits8,
vpx_highbd_8_variance32x64,
vp9_highbd_sub_pixel_variance32x64,
vp9_highbd_sub_pixel_avg_variance32x64,
vpx_highbd_8_sub_pixel_variance32x64,
vpx_highbd_8_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits8)
@@ -1063,8 +1063,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x32_bits8,
vpx_highbd_sad32x32_avg_bits8,
vpx_highbd_8_variance32x32,
vp9_highbd_sub_pixel_variance32x32,
vp9_highbd_sub_pixel_avg_variance32x32,
vpx_highbd_8_sub_pixel_variance32x32,
vpx_highbd_8_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits8,
vpx_highbd_sad32x32x8_bits8,
vpx_highbd_sad32x32x4d_bits8)
@@ -1073,8 +1073,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x64_bits8,
vpx_highbd_sad64x64_avg_bits8,
vpx_highbd_8_variance64x64,
vp9_highbd_sub_pixel_variance64x64,
vp9_highbd_sub_pixel_avg_variance64x64,
vpx_highbd_8_sub_pixel_variance64x64,
vpx_highbd_8_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits8,
vpx_highbd_sad64x64x8_bits8,
vpx_highbd_sad64x64x4d_bits8)
@@ -1083,8 +1083,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x16_bits8,
vpx_highbd_sad16x16_avg_bits8,
vpx_highbd_8_variance16x16,
vp9_highbd_sub_pixel_variance16x16,
vp9_highbd_sub_pixel_avg_variance16x16,
vpx_highbd_8_sub_pixel_variance16x16,
vpx_highbd_8_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits8,
vpx_highbd_sad16x16x8_bits8,
vpx_highbd_sad16x16x4d_bits8)
@@ -1093,8 +1093,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x8_bits8,
vpx_highbd_sad16x8_avg_bits8,
vpx_highbd_8_variance16x8,
vp9_highbd_sub_pixel_variance16x8,
vp9_highbd_sub_pixel_avg_variance16x8,
vpx_highbd_8_sub_pixel_variance16x8,
vpx_highbd_8_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits8,
vpx_highbd_sad16x8x8_bits8,
vpx_highbd_sad16x8x4d_bits8)
@@ -1103,8 +1103,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x16_bits8,
vpx_highbd_sad8x16_avg_bits8,
vpx_highbd_8_variance8x16,
vp9_highbd_sub_pixel_variance8x16,
vp9_highbd_sub_pixel_avg_variance8x16,
vpx_highbd_8_sub_pixel_variance8x16,
vpx_highbd_8_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits8,
vpx_highbd_sad8x16x8_bits8,
vpx_highbd_sad8x16x4d_bits8)
@@ -1113,8 +1113,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x8_bits8,
vpx_highbd_sad8x8_avg_bits8,
vpx_highbd_8_variance8x8,
vp9_highbd_sub_pixel_variance8x8,
vp9_highbd_sub_pixel_avg_variance8x8,
vpx_highbd_8_sub_pixel_variance8x8,
vpx_highbd_8_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits8,
vpx_highbd_sad8x8x8_bits8,
vpx_highbd_sad8x8x4d_bits8)
@@ -1123,8 +1123,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x4_bits8,
vpx_highbd_sad8x4_avg_bits8,
vpx_highbd_8_variance8x4,
vp9_highbd_sub_pixel_variance8x4,
vp9_highbd_sub_pixel_avg_variance8x4,
vpx_highbd_8_sub_pixel_variance8x4,
vpx_highbd_8_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits8,
vpx_highbd_sad8x4x4d_bits8)
@@ -1133,8 +1133,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x8_bits8,
vpx_highbd_sad4x8_avg_bits8,
vpx_highbd_8_variance4x8,
vp9_highbd_sub_pixel_variance4x8,
vp9_highbd_sub_pixel_avg_variance4x8,
vpx_highbd_8_sub_pixel_variance4x8,
vpx_highbd_8_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits8,
vpx_highbd_sad4x8x4d_bits8)
@@ -1143,8 +1143,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x4_bits8,
vpx_highbd_sad4x4_avg_bits8,
vpx_highbd_8_variance4x4,
vp9_highbd_sub_pixel_variance4x4,
vp9_highbd_sub_pixel_avg_variance4x4,
vpx_highbd_8_sub_pixel_variance4x4,
vpx_highbd_8_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits8,
vpx_highbd_sad4x4x8_bits8,
vpx_highbd_sad4x4x4d_bits8)
@@ -1155,8 +1155,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x16_bits10,
vpx_highbd_sad32x16_avg_bits10,
vpx_highbd_10_variance32x16,
vp9_highbd_10_sub_pixel_variance32x16,
vp9_highbd_10_sub_pixel_avg_variance32x16,
vpx_highbd_10_sub_pixel_variance32x16,
vpx_highbd_10_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits10)
@@ -1165,8 +1165,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x32_bits10,
vpx_highbd_sad16x32_avg_bits10,
vpx_highbd_10_variance16x32,
vp9_highbd_10_sub_pixel_variance16x32,
vp9_highbd_10_sub_pixel_avg_variance16x32,
vpx_highbd_10_sub_pixel_variance16x32,
vpx_highbd_10_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits10)
@@ -1175,8 +1175,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x32_bits10,
vpx_highbd_sad64x32_avg_bits10,
vpx_highbd_10_variance64x32,
vp9_highbd_10_sub_pixel_variance64x32,
vp9_highbd_10_sub_pixel_avg_variance64x32,
vpx_highbd_10_sub_pixel_variance64x32,
vpx_highbd_10_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits10)
@@ -1185,8 +1185,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x64_bits10,
vpx_highbd_sad32x64_avg_bits10,
vpx_highbd_10_variance32x64,
vp9_highbd_10_sub_pixel_variance32x64,
vp9_highbd_10_sub_pixel_avg_variance32x64,
vpx_highbd_10_sub_pixel_variance32x64,
vpx_highbd_10_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits10)
@@ -1195,8 +1195,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x32_bits10,
vpx_highbd_sad32x32_avg_bits10,
vpx_highbd_10_variance32x32,
vp9_highbd_10_sub_pixel_variance32x32,
vp9_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_10_sub_pixel_variance32x32,
vpx_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits10,
vpx_highbd_sad32x32x8_bits10,
vpx_highbd_sad32x32x4d_bits10)
@@ -1205,8 +1205,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x64_bits10,
vpx_highbd_sad64x64_avg_bits10,
vpx_highbd_10_variance64x64,
vp9_highbd_10_sub_pixel_variance64x64,
vp9_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_10_sub_pixel_variance64x64,
vpx_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits10,
vpx_highbd_sad64x64x8_bits10,
vpx_highbd_sad64x64x4d_bits10)
@@ -1215,8 +1215,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x16_bits10,
vpx_highbd_sad16x16_avg_bits10,
vpx_highbd_10_variance16x16,
vp9_highbd_10_sub_pixel_variance16x16,
vp9_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_10_sub_pixel_variance16x16,
vpx_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits10,
vpx_highbd_sad16x16x8_bits10,
vpx_highbd_sad16x16x4d_bits10)
@@ -1225,8 +1225,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x8_bits10,
vpx_highbd_sad16x8_avg_bits10,
vpx_highbd_10_variance16x8,
vp9_highbd_10_sub_pixel_variance16x8,
vp9_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_10_sub_pixel_variance16x8,
vpx_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits10,
vpx_highbd_sad16x8x8_bits10,
vpx_highbd_sad16x8x4d_bits10)
@@ -1235,8 +1235,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x16_bits10,
vpx_highbd_sad8x16_avg_bits10,
vpx_highbd_10_variance8x16,
vp9_highbd_10_sub_pixel_variance8x16,
vp9_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_10_sub_pixel_variance8x16,
vpx_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits10,
vpx_highbd_sad8x16x8_bits10,
vpx_highbd_sad8x16x4d_bits10)
@@ -1245,8 +1245,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x8_bits10,
vpx_highbd_sad8x8_avg_bits10,
vpx_highbd_10_variance8x8,
vp9_highbd_10_sub_pixel_variance8x8,
vp9_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_10_sub_pixel_variance8x8,
vpx_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits10,
vpx_highbd_sad8x8x8_bits10,
vpx_highbd_sad8x8x4d_bits10)
@@ -1255,8 +1255,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x4_bits10,
vpx_highbd_sad8x4_avg_bits10,
vpx_highbd_10_variance8x4,
vp9_highbd_10_sub_pixel_variance8x4,
vp9_highbd_10_sub_pixel_avg_variance8x4,
vpx_highbd_10_sub_pixel_variance8x4,
vpx_highbd_10_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits10,
vpx_highbd_sad8x4x4d_bits10)
@@ -1265,8 +1265,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x8_bits10,
vpx_highbd_sad4x8_avg_bits10,
vpx_highbd_10_variance4x8,
vp9_highbd_10_sub_pixel_variance4x8,
vp9_highbd_10_sub_pixel_avg_variance4x8,
vpx_highbd_10_sub_pixel_variance4x8,
vpx_highbd_10_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits10,
vpx_highbd_sad4x8x4d_bits10)
@@ -1275,8 +1275,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x4_bits10,
vpx_highbd_sad4x4_avg_bits10,
vpx_highbd_10_variance4x4,
vp9_highbd_10_sub_pixel_variance4x4,
vp9_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_10_sub_pixel_variance4x4,
vpx_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits10,
vpx_highbd_sad4x4x8_bits10,
vpx_highbd_sad4x4x4d_bits10)
@@ -1287,8 +1287,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12,
vpx_highbd_12_variance32x16,
vp9_highbd_12_sub_pixel_variance32x16,
vp9_highbd_12_sub_pixel_avg_variance32x16,
vpx_highbd_12_sub_pixel_variance32x16,
vpx_highbd_12_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits12)
@@ -1297,8 +1297,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x32_bits12,
vpx_highbd_sad16x32_avg_bits12,
vpx_highbd_12_variance16x32,
vp9_highbd_12_sub_pixel_variance16x32,
vp9_highbd_12_sub_pixel_avg_variance16x32,
vpx_highbd_12_sub_pixel_variance16x32,
vpx_highbd_12_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits12)
@@ -1307,8 +1307,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x32_bits12,
vpx_highbd_sad64x32_avg_bits12,
vpx_highbd_12_variance64x32,
vp9_highbd_12_sub_pixel_variance64x32,
vp9_highbd_12_sub_pixel_avg_variance64x32,
vpx_highbd_12_sub_pixel_variance64x32,
vpx_highbd_12_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits12)
@@ -1317,8 +1317,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x64_bits12,
vpx_highbd_sad32x64_avg_bits12,
vpx_highbd_12_variance32x64,
vp9_highbd_12_sub_pixel_variance32x64,
vp9_highbd_12_sub_pixel_avg_variance32x64,
vpx_highbd_12_sub_pixel_variance32x64,
vpx_highbd_12_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits12)
@@ -1327,8 +1327,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x32_bits12,
vpx_highbd_sad32x32_avg_bits12,
vpx_highbd_12_variance32x32,
vp9_highbd_12_sub_pixel_variance32x32,
vp9_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_12_sub_pixel_variance32x32,
vpx_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits12,
vpx_highbd_sad32x32x8_bits12,
vpx_highbd_sad32x32x4d_bits12)
@@ -1337,8 +1337,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x64_bits12,
vpx_highbd_sad64x64_avg_bits12,
vpx_highbd_12_variance64x64,
vp9_highbd_12_sub_pixel_variance64x64,
vp9_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_12_sub_pixel_variance64x64,
vpx_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits12,
vpx_highbd_sad64x64x8_bits12,
vpx_highbd_sad64x64x4d_bits12)
@@ -1347,8 +1347,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x16_bits12,
vpx_highbd_sad16x16_avg_bits12,
vpx_highbd_12_variance16x16,
vp9_highbd_12_sub_pixel_variance16x16,
vp9_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_12_sub_pixel_variance16x16,
vpx_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits12,
vpx_highbd_sad16x16x8_bits12,
vpx_highbd_sad16x16x4d_bits12)
@@ -1357,8 +1357,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x8_bits12,
vpx_highbd_sad16x8_avg_bits12,
vpx_highbd_12_variance16x8,
vp9_highbd_12_sub_pixel_variance16x8,
vp9_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_12_sub_pixel_variance16x8,
vpx_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits12,
vpx_highbd_sad16x8x8_bits12,
vpx_highbd_sad16x8x4d_bits12)
@@ -1367,8 +1367,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x16_bits12,
vpx_highbd_sad8x16_avg_bits12,
vpx_highbd_12_variance8x16,
vp9_highbd_12_sub_pixel_variance8x16,
vp9_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_12_sub_pixel_variance8x16,
vpx_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits12,
vpx_highbd_sad8x16x8_bits12,
vpx_highbd_sad8x16x4d_bits12)
@@ -1377,8 +1377,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x8_bits12,
vpx_highbd_sad8x8_avg_bits12,
vpx_highbd_12_variance8x8,
vp9_highbd_12_sub_pixel_variance8x8,
vp9_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_12_sub_pixel_variance8x8,
vpx_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits12,
vpx_highbd_sad8x8x8_bits12,
vpx_highbd_sad8x8x4d_bits12)
@@ -1387,8 +1387,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x4_bits12,
vpx_highbd_sad8x4_avg_bits12,
vpx_highbd_12_variance8x4,
vp9_highbd_12_sub_pixel_variance8x4,
vp9_highbd_12_sub_pixel_avg_variance8x4,
vpx_highbd_12_sub_pixel_variance8x4,
vpx_highbd_12_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits12,
vpx_highbd_sad8x4x4d_bits12)
@@ -1397,8 +1397,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x8_bits12,
vpx_highbd_sad4x8_avg_bits12,
vpx_highbd_12_variance4x8,
vp9_highbd_12_sub_pixel_variance4x8,
vp9_highbd_12_sub_pixel_avg_variance4x8,
vpx_highbd_12_sub_pixel_variance4x8,
vpx_highbd_12_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits12,
vpx_highbd_sad4x8x4d_bits12)
@@ -1407,8 +1407,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x4_bits12,
vpx_highbd_sad4x4_avg_bits12,
vpx_highbd_12_variance4x4,
vp9_highbd_12_sub_pixel_variance4x4,
vp9_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_12_sub_pixel_variance4x4,
vpx_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits12,
vpx_highbd_sad4x4x8_bits12,
vpx_highbd_sad4x4x4d_bits12)
@@ -1832,62 +1832,62 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
    cpi->fn_ptr[BT].sdx4df = SDX4DF;

    BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
        vpx_variance32x16, vp9_sub_pixel_variance32x16,
        vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
        vpx_variance32x16, vpx_sub_pixel_variance32x16,
        vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)

    BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
        vpx_variance16x32, vp9_sub_pixel_variance16x32,
        vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
        vpx_variance16x32, vpx_sub_pixel_variance16x32,
        vpx_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)

    BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
        vpx_variance64x32, vp9_sub_pixel_variance64x32,
        vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
        vpx_variance64x32, vpx_sub_pixel_variance64x32,
        vpx_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)

    BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
        vpx_variance32x64, vp9_sub_pixel_variance32x64,
        vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
        vpx_variance32x64, vpx_sub_pixel_variance32x64,
        vpx_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)

    BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
        vpx_variance32x32, vp9_sub_pixel_variance32x32,
        vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
        vpx_variance32x32, vpx_sub_pixel_variance32x32,
        vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
        vpx_sad32x32x4d)

    BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
        vpx_variance64x64, vp9_sub_pixel_variance64x64,
        vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
        vpx_variance64x64, vpx_sub_pixel_variance64x64,
        vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
        vpx_sad64x64x4d)

    BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
        vpx_variance16x16, vp9_sub_pixel_variance16x16,
        vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
        vpx_variance16x16, vpx_sub_pixel_variance16x16,
        vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
        vpx_sad16x16x4d)

    BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
        vpx_variance16x8, vp9_sub_pixel_variance16x8,
        vp9_sub_pixel_avg_variance16x8,
        vpx_variance16x8, vpx_sub_pixel_variance16x8,
        vpx_sub_pixel_avg_variance16x8,
        vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)

    BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
        vpx_variance8x16, vp9_sub_pixel_variance8x16,
        vp9_sub_pixel_avg_variance8x16,
        vpx_variance8x16, vpx_sub_pixel_variance8x16,
        vpx_sub_pixel_avg_variance8x16,
        vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)

    BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
        vpx_variance8x8, vp9_sub_pixel_variance8x8,
        vp9_sub_pixel_avg_variance8x8,
        vpx_variance8x8, vpx_sub_pixel_variance8x8,
        vpx_sub_pixel_avg_variance8x8,
        vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)

    BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
        vpx_variance8x4, vp9_sub_pixel_variance8x4,
        vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
        vpx_variance8x4, vpx_sub_pixel_variance8x4,
        vpx_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)

    BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
        vpx_variance4x8, vp9_sub_pixel_variance4x8,
        vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
        vpx_variance4x8, vpx_sub_pixel_variance4x8,
        vpx_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)

    BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
        vpx_variance4x4, vp9_sub_pixel_variance4x4,
        vp9_sub_pixel_avg_variance4x4,
        vpx_variance4x4, vpx_sub_pixel_variance4x4,
        vpx_sub_pixel_avg_variance4x4,
        vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)

#if CONFIG_VP9_HIGHBITDEPTH

@@ -40,7 +40,7 @@
#include "vp9/encoder/vp9_speed_features.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_dsp/variance.h"

#if CONFIG_VP9_TEMPORAL_DENOISING
#include "vp9/encoder/vp9_denoiser.h"

@@ -35,7 +35,7 @@
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_dsp/variance.h"

#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
@@ -298,7 +298,7 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
  }
}

static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
  switch (bsize) {
    case BLOCK_8X8:
      return vpx_mse8x8;
@@ -315,13 +315,13 @@ static unsigned int get_prediction_error(BLOCK_SIZE bsize,
                                         const struct buf_2d *src,
                                         const struct buf_2d *ref) {
  unsigned int sse;
  const vp9_variance_fn_t fn = get_block_variance_fn(bsize);
  const vpx_variance_fn_t fn = get_block_variance_fn(bsize);
  fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
  return sse;
}

#if CONFIG_VP9_HIGHBITDEPTH
static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
                                                      int bd) {
  switch (bd) {
    default:
@@ -368,7 +368,7 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
                                                const struct buf_2d *ref,
                                                int bd) {
  unsigned int sse;
  const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
  const vpx_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
  fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
  return sse;
}

@@ -13,7 +13,7 @@
#define VP9_ENCODER_VP9_MCOMP_H_

#include "vp9/encoder/vp9_block.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_dsp/variance.h"

#ifdef __cplusplus
extern "C" {

@@ -37,7 +37,6 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_variance.h"

#define RD_THRESH_POW 1.25
#define RD_MULT_EPB_RATIO 64

@@ -39,7 +39,6 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/encoder/vp9_aq_variance.h"

#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \

@ -1,380 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
|
||||
#include "vpx_ports/mem.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
#include "vp9/common/vp9_common.h"
|
||||
#include "vp9/common/vp9_filter.h"
|
||||
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
|
||||
static const uint8_t bilinear_filters[8][2] = {
|
||||
{ 128, 0, },
|
||||
{ 112, 16, },
|
||||
{ 96, 32, },
|
||||
{ 80, 48, },
|
||||
{ 64, 64, },
|
||||
{ 48, 80, },
|
||||
{ 32, 96, },
|
||||
{ 16, 112, },
|
||||
};
|
||||
|
||||
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
|
||||
// or vertical direction to produce the filtered output block. Used to implement
|
||||
// first-pass of 2-D separable filter.
|
||||
//
|
||||
// Produces int32_t output to retain precision for next pass. Two filter taps
|
||||
// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is
|
||||
// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). It
|
||||
// defines the offset required to move from one input to the next.
|
||||
static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
|
||||
uint16_t *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const uint8_t *vp9_filter) {
|
||||
unsigned int i, j;
|
||||
|
||||
for (i = 0; i < output_height; i++) {
|
||||
for (j = 0; j < output_width; j++) {
|
||||
output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
|
||||
(int)src_ptr[pixel_step] * vp9_filter[1],
|
||||
FILTER_BITS);
|
||||
|
||||
src_ptr++;
|
||||
}
|
||||
|
||||
// Next row...
|
||||
src_ptr += src_pixels_per_line - output_width;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
}
|
||||
|
||||
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
|
||||
// or vertical direction to produce the filtered output block. Used to implement
|
||||
// second-pass of 2-D separable filter.
|
||||
//
|
||||
// Requires 32-bit input as produced by filter_block2d_bil_first_pass. Two
|
||||
// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the
|
||||
// filter is applied horizontally (pixel_step=1) or vertically (pixel_step=
|
||||
// stride). It defines the offset required to move from one input to the next.
|
||||
static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
|
||||
uint8_t *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const uint8_t *vp9_filter) {
|
||||
unsigned int i, j;
|
||||
|
||||
for (i = 0; i < output_height; i++) {
|
||||
for (j = 0; j < output_width; j++) {
|
||||
output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
|
||||
(int)src_ptr[pixel_step] * vp9_filter[1],
|
||||
FILTER_BITS);
|
||||
src_ptr++;
|
||||
}
|
||||
|
||||
src_ptr += src_pixels_per_line - output_width;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
}
|
||||
|
||||
#define SUBPIX_VAR(W, H) \
|
||||
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint8_t temp2[H * W]; \
|
||||
\
|
||||
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
|
||||
bilinear_filters[xoffset]); \
|
||||
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
|
||||
}
|
||||
|
||||
#define SUBPIX_AVG_VAR(W, H) \
|
||||
unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
unsigned int *sse, \
|
||||
const uint8_t *second_pred) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint8_t temp2[H * W]; \
|
||||
DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
|
||||
\
|
||||
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
|
||||
bilinear_filters[xoffset]); \
|
||||
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
|
||||
\
|
||||
return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
|
||||
}
|
||||
|
||||
SUBPIX_VAR(4, 4)
|
||||
SUBPIX_AVG_VAR(4, 4)
|
||||
|
||||
SUBPIX_VAR(4, 8)
|
||||
SUBPIX_AVG_VAR(4, 8)
|
||||
|
||||
SUBPIX_VAR(8, 4)
|
||||
SUBPIX_AVG_VAR(8, 4)
|
||||
|
||||
SUBPIX_VAR(8, 8)
|
||||
SUBPIX_AVG_VAR(8, 8)
|
||||
|
||||
SUBPIX_VAR(8, 16)
|
||||
SUBPIX_AVG_VAR(8, 16)
|
||||
|
||||
SUBPIX_VAR(16, 8)
|
||||
SUBPIX_AVG_VAR(16, 8)
|
||||
|
||||
SUBPIX_VAR(16, 16)
|
||||
SUBPIX_AVG_VAR(16, 16)
|
||||
|
||||
SUBPIX_VAR(16, 32)
|
||||
SUBPIX_AVG_VAR(16, 32)
|
||||
|
||||
SUBPIX_VAR(32, 16)
|
||||
SUBPIX_AVG_VAR(32, 16)
|
||||
|
||||
SUBPIX_VAR(32, 32)
|
||||
SUBPIX_AVG_VAR(32, 32)
|
||||
|
||||
SUBPIX_VAR(32, 64)
|
||||
SUBPIX_AVG_VAR(32, 64)
|
||||
|
||||
SUBPIX_VAR(64, 32)
|
||||
SUBPIX_AVG_VAR(64, 32)
|
||||
|
||||
SUBPIX_VAR(64, 64)
|
||||
SUBPIX_AVG_VAR(64, 64)
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
static void highbd_var_filter_block2d_bil_first_pass(
|
||||
const uint8_t *src_ptr8,
|
||||
uint16_t *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const uint8_t *vp9_filter) {
|
||||
unsigned int i, j;
|
||||
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
|
||||
for (i = 0; i < output_height; i++) {
|
||||
for (j = 0; j < output_width; j++) {
|
||||
output_ptr[j] =
|
||||
ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
|
||||
(int)src_ptr[pixel_step] * vp9_filter[1],
|
||||
FILTER_BITS);
|
||||
|
||||
src_ptr++;
|
||||
}
|
||||
|
||||
// Next row...
|
||||
src_ptr += src_pixels_per_line - output_width;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
}
|
||||
|
||||
static void highbd_var_filter_block2d_bil_second_pass(
|
||||
const uint16_t *src_ptr,
|
||||
uint16_t *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const uint8_t *vp9_filter) {
|
||||
unsigned int i, j;
|
||||
|
||||
for (i = 0; i < output_height; i++) {
|
||||
for (j = 0; j < output_width; j++) {
|
||||
output_ptr[j] =
|
||||
ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
|
||||
(int)src_ptr[pixel_step] * vp9_filter[1],
|
||||
FILTER_BITS);
|
||||
src_ptr++;
|
||||
}
|
||||
|
||||
src_ptr += src_pixels_per_line - output_width;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
}
|
||||
|
||||
#define HIGHBD_SUBPIX_VAR(W, H) \
|
||||
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
|
||||
dst_stride, sse); \
|
||||
} \
|
||||
\
|
||||
unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
||||
W, dst, dst_stride, sse); \
|
||||
} \
|
||||
\
|
||||
unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
||||
W, dst, dst_stride, sse); \
|
||||
}
|
||||
|
||||
#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
|
||||
unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
unsigned int *sse, \
|
||||
const uint8_t *second_pred) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||
\
|
||||
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
|
||||
dst_stride, sse); \
|
||||
} \
|
||||
\
|
||||
unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
unsigned int *sse, \
|
||||
const uint8_t *second_pred) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||
\
|
||||
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
||||
W, dst, dst_stride, sse); \
|
||||
} \
|
||||
\
|
||||
unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
unsigned int *sse, \
|
||||
const uint8_t *second_pred) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||
\
|
||||
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
||||
W, dst, dst_stride, sse); \
|
||||
}
|
||||
|
||||
HIGHBD_SUBPIX_VAR(4, 4)
|
||||
HIGHBD_SUBPIX_AVG_VAR(4, 4)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(4, 8)
|
||||
HIGHBD_SUBPIX_AVG_VAR(4, 8)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(8, 4)
|
||||
HIGHBD_SUBPIX_AVG_VAR(8, 4)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(8, 8)
|
||||
HIGHBD_SUBPIX_AVG_VAR(8, 8)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(8, 16)
|
||||
HIGHBD_SUBPIX_AVG_VAR(8, 16)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(16, 8)
|
||||
HIGHBD_SUBPIX_AVG_VAR(16, 8)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(16, 16)
|
||||
HIGHBD_SUBPIX_AVG_VAR(16, 16)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(16, 32)
|
||||
HIGHBD_SUBPIX_AVG_VAR(16, 32)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(32, 16)
|
||||
HIGHBD_SUBPIX_AVG_VAR(32, 16)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(32, 32)
|
||||
HIGHBD_SUBPIX_AVG_VAR(32, 32)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(32, 64)
|
||||
HIGHBD_SUBPIX_AVG_VAR(32, 64)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(64, 32)
|
||||
HIGHBD_SUBPIX_AVG_VAR(64, 32)
|
||||
|
||||
HIGHBD_SUBPIX_VAR(64, 64)
|
||||
HIGHBD_SUBPIX_AVG_VAR(64, 64)
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
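Note on the *_avg_* wrappers above: before measuring variance they blend the bilinearly filtered prediction with a second predictor via vpx_highbd_comp_avg_pred. A minimal scalar sketch of that averaging step (illustrative only; the helper name and the plain uint16_t pointers are assumptions, the real call goes through CONVERT_TO_BYTEPTR pointers):

#include <stdint.h>

/* Sketch of the compound-average step used by the *_avg_* wrappers:
 * every pixel of the filtered prediction is averaged, with rounding,
 * against the matching pixel of the second predictor. */
static void highbd_comp_avg_sketch(uint16_t *comp_pred,
                                   const uint16_t *second_pred,
                                   int width, int height,
                                   const uint16_t *pred, int pred_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j)
      comp_pred[j] = (uint16_t)((pred[j] + second_pred[j] + 1) >> 1);
    comp_pred += width;
    second_pred += width;
    pred += pred_stride;
  }
}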
|
|
@ -1,81 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VP9_ENCODER_VP9_VARIANCE_H_
|
||||
#define VP9_ENCODER_VP9_VARIANCE_H_
|
||||
|
||||
#include "vpx/vpx_integer.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
|
||||
int source_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride);
|
||||
|
||||
typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
|
||||
int source_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride,
|
||||
const uint8_t *second_pred);
|
||||
|
||||
typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
|
||||
int source_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride,
|
||||
unsigned int *sad_array);
|
||||
|
||||
typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
|
||||
int source_stride,
|
||||
const uint8_t* const ref_ptr[],
|
||||
int ref_stride, unsigned int *sad_array);
|
||||
|
||||
typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,
|
||||
int source_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride,
|
||||
unsigned int *sse);
|
||||
|
||||
typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
|
||||
int source_stride,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const uint8_t *ref_ptr,
|
||||
int Refstride,
|
||||
unsigned int *sse);
|
||||
|
||||
typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
|
||||
int source_stride,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const uint8_t *ref_ptr,
|
||||
int Refstride,
|
||||
unsigned int *sse,
|
||||
const uint8_t *second_pred);
|
||||
|
||||
typedef struct vp9_variance_vtable {
|
||||
vp9_sad_fn_t sdf;
|
||||
vp9_sad_avg_fn_t sdaf;
|
||||
vp9_variance_fn_t vf;
|
||||
vp9_subpixvariance_fn_t svf;
|
||||
vp9_subp_avg_variance_fn_t svaf;
|
||||
vp9_sad_multi_fn_t sdx3f;
|
||||
vp9_sad_multi_fn_t sdx8f;
|
||||
vp9_sad_multi_d_fn_t sdx4df;
|
||||
} vp9_variance_fn_ptr_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP9_ENCODER_VP9_VARIANCE_H_
|
|
@ -1,349 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
#include "./vpx_config.h"
|
||||
#include "vp9/common/vp9_common.h"
|
||||
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
#define DECL(w, opt) \
|
||||
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
|
||||
ptrdiff_t src_stride, \
|
||||
int x_offset, int y_offset, \
|
||||
const uint16_t *dst, \
|
||||
ptrdiff_t dst_stride, \
|
||||
int height, unsigned int *sse);
|
||||
#define DECLS(opt1, opt2) \
|
||||
DECL(8, opt1); \
|
||||
DECL(16, opt1)
|
||||
|
||||
DECLS(sse2, sse);
|
||||
// DECLS(ssse3, ssse3);
|
||||
#undef DECLS
|
||||
#undef DECL
|
||||
|
||||
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
|
||||
uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
|
||||
int src_stride, \
|
||||
int x_offset, \
|
||||
int y_offset, \
|
||||
const uint8_t *dst8, \
|
||||
int dst_stride, \
|
||||
uint32_t *sse_ptr) { \
|
||||
uint32_t sse; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst, dst_stride, h, \
|
||||
&sse); \
|
||||
if (w > wf) { \
|
||||
unsigned int sse2; \
|
||||
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
|
||||
src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 16, \
|
||||
dst_stride, \
|
||||
h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 32, dst_stride, \
|
||||
h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
|
||||
src + 48, src_stride, x_offset, y_offset, \
|
||||
dst + 48, dst_stride, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
} \
|
||||
\
|
||||
uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
|
||||
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
|
||||
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
|
||||
uint32_t sse; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst, dst_stride, \
|
||||
h, &sse); \
|
||||
if (w > wf) { \
|
||||
uint32_t sse2; \
|
||||
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
|
||||
src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 16, \
|
||||
dst_stride, \
|
||||
h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 32, dst_stride, \
|
||||
h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 48, dst_stride, \
|
||||
h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
se = ROUND_POWER_OF_TWO(se, 2); \
|
||||
sse = ROUND_POWER_OF_TWO(sse, 4); \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
} \
|
||||
\
|
||||
uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
|
||||
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
|
||||
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
|
||||
int start_row; \
|
||||
uint32_t sse; \
|
||||
int se = 0; \
|
||||
uint64_t long_sse = 0; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
for (start_row = 0; start_row < h; start_row +=16) { \
|
||||
uint32_t sse2; \
|
||||
int height = h - start_row < 16 ? h - start_row : 16; \
|
||||
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
|
||||
src + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, dst + (start_row * dst_stride), \
|
||||
dst_stride, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
if (w > wf) { \
|
||||
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
|
||||
src + 16 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
|
||||
dst_stride, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
|
||||
src + 32 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
|
||||
dst_stride, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
|
||||
src + 48 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
|
||||
dst_stride, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
}\
|
||||
} \
|
||||
} \
|
||||
se = ROUND_POWER_OF_TWO(se, 4); \
|
||||
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
}
|
||||
|
||||
#define FNS(opt1, opt2) \
|
||||
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
|
||||
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
|
||||
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
|
||||
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
|
||||
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
|
||||
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
|
||||
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
|
||||
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
|
||||
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
|
||||
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
|
||||
FN(8, 4, 8, 3, 2, opt1, (int64_t));
|
||||
|
||||
|
||||
FNS(sse2, sse);
|
||||
|
||||
#undef FNS
|
||||
#undef FN
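All of these wrappers end with the same identity: for a W x H block, variance = SSE - sum^2 / (W*H), implemented as a shift because W*H is a power of two. The 10- and 12-bit variants first rescale the accumulated sum by 2^(bd-8) and the SSE by 2^(2*(bd-8)) via ROUND_POWER_OF_TWO so results stay on an 8-bit scale, and the 12-bit path works in 16-row strips and accumulates into a uint64_t because 4095^2 * 256 only just fits in 32 bits per kernel call. A small illustrative check with made-up numbers (not library code):

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: the variance identity plus the 12-bit rescaling used
 * by the *_12_* wrappers above.  All values are made up. */
int main(void) {
  const int wlog2 = 4, hlog2 = 4;      /* 16x16 block                        */
  int64_t sum = 1234;                  /* accumulated (src - dst)            */
  uint64_t sse = 987654;               /* accumulated squared error          */
  /* 12-bit: ROUND_POWER_OF_TWO(sum, 4) and ROUND_POWER_OF_TWO(sse, 8)       */
  int64_t sum8 = (sum + 8) >> 4;       /* scale sum by 2^(12-8)              */
  uint64_t sse8 = (sse + 128) >> 8;    /* scale sse by 2^(2*(12-8))          */
  uint64_t var = sse8 - (uint64_t)((sum8 * sum8) >> (wlog2 + hlog2));
  printf("variance = %llu\n", (unsigned long long)var);
  return 0;
}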
|
||||
|
||||
#define DECL(w, opt) \
|
||||
int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
|
||||
ptrdiff_t src_stride, \
|
||||
int x_offset, int y_offset, \
|
||||
const uint16_t *dst, \
|
||||
ptrdiff_t dst_stride, \
|
||||
const uint16_t *sec, \
|
||||
ptrdiff_t sec_stride, \
|
||||
int height, \
|
||||
unsigned int *sse);
|
||||
#define DECLS(opt1) \
|
||||
DECL(16, opt1) \
|
||||
DECL(8, opt1)
|
||||
|
||||
DECLS(sse2);
|
||||
#undef DECL
|
||||
#undef DECLS
|
||||
|
||||
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
|
||||
uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
|
||||
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
|
||||
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
|
||||
const uint8_t *sec8) { \
|
||||
uint32_t sse; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
|
||||
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src, src_stride, x_offset, \
|
||||
y_offset, dst, dst_stride, sec, w, h, &sse); \
|
||||
if (w > wf) { \
|
||||
uint32_t sse2; \
|
||||
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 16, src_stride, x_offset, y_offset, \
|
||||
dst + 16, dst_stride, sec + 16, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 32, src_stride, x_offset, y_offset, \
|
||||
dst + 32, dst_stride, sec + 32, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 48, src_stride, x_offset, y_offset, \
|
||||
dst + 48, dst_stride, sec + 48, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
} \
|
||||
\
|
||||
uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
|
||||
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
|
||||
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
|
||||
const uint8_t *sec8) { \
|
||||
uint32_t sse; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
|
||||
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src, src_stride, x_offset, \
|
||||
y_offset, dst, dst_stride, \
|
||||
sec, w, h, &sse); \
|
||||
if (w > wf) { \
|
||||
uint32_t sse2; \
|
||||
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 16, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 16, dst_stride, \
|
||||
sec + 16, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 32, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 32, dst_stride, \
|
||||
sec + 32, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 48, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 48, dst_stride, \
|
||||
sec + 48, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
se = ROUND_POWER_OF_TWO(se, 2); \
|
||||
sse = ROUND_POWER_OF_TWO(sse, 4); \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
} \
|
||||
\
|
||||
uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
|
||||
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
|
||||
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
|
||||
const uint8_t *sec8) { \
|
||||
int start_row; \
|
||||
uint32_t sse; \
|
||||
int se = 0; \
|
||||
uint64_t long_sse = 0; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
|
||||
for (start_row = 0; start_row < h; start_row +=16) { \
|
||||
uint32_t sse2; \
|
||||
int height = h - start_row < 16 ? h - start_row : 16; \
|
||||
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + (start_row * src_stride), src_stride, x_offset, \
|
||||
y_offset, dst + (start_row * dst_stride), dst_stride, \
|
||||
sec + (start_row * w), w, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
if (w > wf) { \
|
||||
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 16 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 16 + (start_row * dst_stride), dst_stride, \
|
||||
sec + 16 + (start_row * w), w, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 32 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 32 + (start_row * dst_stride), dst_stride, \
|
||||
sec + 32 + (start_row * w), w, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 48 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 48 + (start_row * dst_stride), dst_stride, \
|
||||
sec + 48 + (start_row * w), w, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
se = ROUND_POWER_OF_TWO(se, 4); \
|
||||
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
}
|
||||
|
||||
|
||||
#define FNS(opt1) \
|
||||
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
|
||||
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
|
||||
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
|
||||
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
|
||||
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
|
||||
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
|
||||
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
|
||||
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
|
||||
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
|
||||
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
|
||||
FN(8, 4, 8, 3, 2, opt1, (int64_t));
|
||||
|
||||
FNS(sse2);
|
||||
|
||||
#undef FNS
|
||||
#undef FN
|
|
@ -1,525 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <immintrin.h> // AVX2
|
||||
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
|
||||
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
|
||||
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
|
||||
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
|
||||
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
|
||||
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
|
||||
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
|
||||
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
|
||||
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
|
||||
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
|
||||
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
|
||||
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
|
||||
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
|
||||
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
|
||||
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
|
||||
};
|
||||
|
||||
#define FILTER_SRC(filter) \
|
||||
/* filter the source */ \
|
||||
exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
|
||||
exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
|
||||
\
|
||||
/* add 8 to source */ \
|
||||
exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
|
||||
exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
|
||||
\
|
||||
/* divide source by 16 */ \
|
||||
exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
|
||||
exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
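In scalar terms, FILTER_SRC applies a two-tap filter whose taps always sum to 16 (see bilinear_filters_avx2 above), then rounds and shifts: out = (a*f0 + b*f1 + 8) >> 4, kept at 16-bit precision until a later packus. A hedged scalar equivalent (the helper name is made up):

#include <stdint.h>

/* Scalar sketch of what FILTER_SRC computes per pixel pair.  f0 + f1 == 16,
 * so this is a weighted average kept at 4 fractional bits of precision. */
static int16_t bilinear_tap16(uint8_t a, uint8_t b, int f0, int f1) {
  return (int16_t)((a * f0 + b * f1 + 8) >> 4);
}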
|
||||
|
||||
#define MERGE_WITH_SRC(src_reg, reg) \
|
||||
exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
|
||||
exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
|
||||
|
||||
#define LOAD_SRC_DST \
|
||||
/* load source and destination */ \
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
|
||||
dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
|
||||
|
||||
#define AVG_NEXT_SRC(src_reg, size_stride) \
|
||||
src_next_reg = _mm256_loadu_si256((__m256i const *) \
|
||||
(src + size_stride)); \
|
||||
  /* average the current source with the one size_stride ahead */ \
|
||||
src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
|
||||
|
||||
#define MERGE_NEXT_SRC(src_reg, size_stride) \
|
||||
src_next_reg = _mm256_loadu_si256((__m256i const *) \
|
||||
(src + size_stride)); \
|
||||
MERGE_WITH_SRC(src_reg, src_next_reg)
|
||||
|
||||
#define CALC_SUM_SSE_INSIDE_LOOP \
|
||||
/* expand each byte to 2 bytes */ \
|
||||
exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
|
||||
exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
|
||||
/* source - dest */ \
|
||||
exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
|
||||
exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
|
||||
  /* calculate sum */                                       \
|
||||
sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
|
||||
exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
|
||||
sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
|
||||
exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
|
||||
/* calculate sse */ \
|
||||
sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
|
||||
sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
|
||||
|
||||
// final reduction of sum and sse into the scalar outputs
|
||||
#define CALC_SUM_AND_SSE \
|
||||
res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
|
||||
sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
|
||||
sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
|
||||
sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
|
||||
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
|
||||
sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
|
||||
\
|
||||
sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
|
||||
sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
|
||||
\
|
||||
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
|
||||
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
|
||||
*((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
|
||||
_mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
|
||||
sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
|
||||
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
|
||||
sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
|
||||
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
|
||||
|
||||
|
||||
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
int height,
|
||||
unsigned int *sse) {
|
||||
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
|
||||
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
|
||||
__m256i zero_reg;
|
||||
int i, sum;
|
||||
sum_reg = _mm256_set1_epi16(0);
|
||||
sse_reg = _mm256_set1_epi16(0);
|
||||
zero_reg = _mm256_set1_epi16(0);
|
||||
|
||||
// x_offset = 0 and y_offset = 0
|
||||
if (x_offset == 0) {
|
||||
if (y_offset == 0) {
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
        // expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 0 and y_offset = 8
|
||||
} else if (y_offset == 8) {
|
||||
__m256i src_next_reg;
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, src_stride)
|
||||
        // expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 0 and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i filter, pw8, src_next_reg;
|
||||
|
||||
y_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, src_stride)
|
||||
FILTER_SRC(filter)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
// x_offset = 8 and y_offset = 0
|
||||
} else if (x_offset == 8) {
|
||||
if (y_offset == 0) {
|
||||
__m256i src_next_reg;
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 8 and y_offset = 8
|
||||
} else if (y_offset == 8) {
|
||||
__m256i src_next_reg, src_avg;
|
||||
      // load the source and a second copy offset by one byte
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
for (i = 0; i < height ; i++) {
|
||||
src_avg = src_reg;
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
        // average the previous average with the current one
|
||||
src_avg = _mm256_avg_epu8(src_avg, src_reg);
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_avg, zero_reg)
|
||||
// save current source average
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 8 and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i filter, pw8, src_next_reg, src_avg;
|
||||
y_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
      // load the source and a second copy offset by one byte
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
for (i = 0; i < height ; i++) {
|
||||
// save current source average
|
||||
src_avg = src_reg;
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
MERGE_WITH_SRC(src_avg, src_reg)
|
||||
FILTER_SRC(filter)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = 0
|
||||
} else {
|
||||
if (y_offset == 0) {
|
||||
__m256i filter, pw8, src_next_reg;
|
||||
x_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = 8
|
||||
} else if (y_offset == 8) {
|
||||
__m256i filter, pw8, src_next_reg, src_pack;
|
||||
x_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
      // pack each 16-bit result back to 8 bits in the low and high lanes
|
||||
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
for (i = 0; i < height ; i++) {
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
        // average the previous packed row with the current one
|
||||
src_pack = _mm256_avg_epu8(src_pack, src_reg);
|
||||
MERGE_WITH_SRC(src_pack, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src_pack = src_reg;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
|
||||
x_offset <<= 5;
|
||||
xfilter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
y_offset <<= 5;
|
||||
yfilter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
      // load the source and a second copy offset by one byte
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
|
||||
FILTER_SRC(xfilter)
|
||||
      // pack each 16-bit result back to 8 bits in the low and high lanes
|
||||
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
for (i = 0; i < height ; i++) {
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(xfilter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
        // merge the previous packed row with the current packed source
|
||||
MERGE_WITH_SRC(src_pack, src_reg)
|
||||
// filter the source
|
||||
FILTER_SRC(yfilter)
|
||||
src_pack = src_reg;
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
}
|
||||
CALC_SUM_AND_SSE
|
||||
return sum;
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
const uint8_t *sec,
|
||||
int sec_stride,
|
||||
int height,
|
||||
unsigned int *sse) {
|
||||
__m256i sec_reg;
|
||||
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
|
||||
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
|
||||
__m256i zero_reg;
|
||||
int i, sum;
|
||||
sum_reg = _mm256_set1_epi16(0);
|
||||
sse_reg = _mm256_set1_epi16(0);
|
||||
zero_reg = _mm256_set1_epi16(0);
|
||||
|
||||
// x_offset = 0 and y_offset = 0
|
||||
if (x_offset == 0) {
|
||||
if (y_offset == 0) {
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
|
||||
sec+= sec_stride;
|
||||
        // expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
} else if (y_offset == 8) {
|
||||
__m256i src_next_reg;
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, src_stride)
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
|
||||
sec+= sec_stride;
|
||||
        // expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 0 and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i filter, pw8, src_next_reg;
|
||||
|
||||
y_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, src_stride)
|
||||
FILTER_SRC(filter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
|
||||
sec+= sec_stride;
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
// x_offset = 8 and y_offset = 0
|
||||
} else if (x_offset == 8) {
|
||||
if (y_offset == 0) {
|
||||
__m256i src_next_reg;
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
|
||||
sec+= sec_stride;
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 8 and y_offset = 8
|
||||
} else if (y_offset == 8) {
|
||||
__m256i src_next_reg, src_avg;
|
||||
      // load the source and a second copy offset by one byte
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
for (i = 0; i < height ; i++) {
|
||||
// save current source average
|
||||
src_avg = src_reg;
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
        // average the previous average with the current one
|
||||
src_avg = _mm256_avg_epu8(src_avg, src_reg);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
|
||||
sec+= sec_stride;
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_avg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 8 and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i filter, pw8, src_next_reg, src_avg;
|
||||
y_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
      // load the source and a second copy offset by one byte
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
for (i = 0; i < height ; i++) {
|
||||
// save current source average
|
||||
src_avg = src_reg;
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
MERGE_WITH_SRC(src_avg, src_reg)
|
||||
FILTER_SRC(filter)
|
||||
src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_avg, zero_reg)
|
||||
sec+= sec_stride;
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = 0
|
||||
} else {
|
||||
if (y_offset == 0) {
|
||||
__m256i filter, pw8, src_next_reg;
|
||||
x_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
sec+= sec_stride;
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = 8
|
||||
} else if (y_offset == 8) {
|
||||
__m256i filter, pw8, src_next_reg, src_pack;
|
||||
x_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
      // pack each 16-bit result back to 8 bits in the low and high lanes
|
||||
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
for (i = 0; i < height ; i++) {
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
        // average the previous packed row with the current one
|
||||
src_pack = _mm256_avg_epu8(src_pack, src_reg);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
|
||||
sec+= sec_stride;
|
||||
MERGE_WITH_SRC(src_pack, zero_reg)
|
||||
src_pack = src_reg;
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
|
||||
x_offset <<= 5;
|
||||
xfilter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
y_offset <<= 5;
|
||||
yfilter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
      // load the source and a second copy offset by one byte
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
|
||||
FILTER_SRC(xfilter)
|
||||
      // pack each 16-bit result back to 8 bits in the low and high lanes
|
||||
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
for (i = 0; i < height ; i++) {
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(xfilter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
        // merge the previous packed row with the current packed source
|
||||
MERGE_WITH_SRC(src_pack, src_reg)
|
||||
// filter the source
|
||||
FILTER_SRC(yfilter)
|
||||
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
|
||||
MERGE_WITH_SRC(src_pack, zero_reg)
|
||||
src_pack = src_reg;
|
||||
sec+= sec_stride;
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
}
|
||||
CALC_SUM_AND_SSE
|
||||
return sum;
|
||||
}
|
|
@ -1,104 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "./vpx_config.h"
|
||||
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
|
||||
int x_offset, int y_offset,
|
||||
const uint8_t *dst, int dst_stride,
|
||||
int height,
|
||||
unsigned int *sse);
|
||||
|
||||
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
const uint8_t *sec,
|
||||
int sec_stride,
|
||||
int height,
|
||||
unsigned int *sseptr);
|
||||
|
||||
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
unsigned int *sse) {
|
||||
unsigned int sse1;
|
||||
const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
|
||||
y_offset, dst, dst_stride,
|
||||
64, &sse1);
|
||||
unsigned int sse2;
|
||||
const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
|
||||
x_offset, y_offset,
|
||||
dst + 32, dst_stride,
|
||||
64, &sse2);
|
||||
const int se = se1 + se2;
|
||||
*sse = sse1 + sse2;
|
||||
return *sse - (((int64_t)se * se) >> 12);
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
unsigned int *sse) {
|
||||
const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
|
||||
y_offset, dst, dst_stride,
|
||||
32, sse);
|
||||
return *sse - (((int64_t)se * se) >> 10);
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
unsigned int *sse,
|
||||
const uint8_t *sec) {
|
||||
unsigned int sse1;
|
||||
const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
|
||||
y_offset, dst, dst_stride,
|
||||
sec, 64, 64, &sse1);
|
||||
unsigned int sse2;
|
||||
const int se2 =
|
||||
vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
|
||||
y_offset, dst + 32, dst_stride,
|
||||
sec + 32, 64, 64, &sse2);
|
||||
const int se = se1 + se2;
|
||||
|
||||
*sse = sse1 + sse2;
|
||||
|
||||
return *sse - (((int64_t)se * se) >> 12);
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
unsigned int *sse,
|
||||
const uint8_t *sec) {
|
||||
  // processing 32 elements in parallel
|
||||
const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
|
||||
y_offset, dst, dst_stride,
|
||||
sec, 32, 32, sse);
|
||||
return *sse - (((int64_t)se * se) >> 10);
|
||||
}
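The 64x64 wrappers above split the block into two 32-wide halves, add the partial sums and SSEs from the two 32xh calls, and only then apply variance = SSE - sum^2 / N, where N = 64*64 = 4096 gives the >> 12 (and 32*32 = 1024 gives the >> 10). A short sketch of that composition (the half-block inputs are stand-ins, not real results):

#include <stdint.h>

/* Sketch: combining two 32x64 half-block (sum, sse) pairs into one 64x64
 * variance, as the wrappers above do after their two 32xh calls. */
static unsigned int combine_64x64(int se1, unsigned int sse1,
                                  int se2, unsigned int sse2) {
  const int se = se1 + se2;
  const unsigned int sse = sse1 + sse2;
  return sse - (unsigned int)(((int64_t)se * se) >> 12);  /* 12 = log2(4096) */
}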
|
|
@ -1,182 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <emmintrin.h> // SSE2
|
||||
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "./vpx_config.h"
|
||||
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
// The 2 unused parameters are placeholders for the PIC enabled build.
|
||||
#define DECL(w, opt) \
|
||||
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
|
||||
ptrdiff_t src_stride, \
|
||||
int x_offset, int y_offset, \
|
||||
const uint8_t *dst, \
|
||||
ptrdiff_t dst_stride, \
|
||||
int height, unsigned int *sse, \
|
||||
void *unused0, void *unused)
|
||||
#define DECLS(opt1, opt2) \
|
||||
DECL(4, opt2); \
|
||||
DECL(8, opt1); \
|
||||
DECL(16, opt1)
|
||||
|
||||
DECLS(sse2, sse);
|
||||
DECLS(ssse3, ssse3);
|
||||
#undef DECLS
|
||||
#undef DECL
|
||||
|
||||
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
|
||||
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
|
||||
int src_stride, \
|
||||
int x_offset, \
|
||||
int y_offset, \
|
||||
const uint8_t *dst, \
|
||||
int dst_stride, \
|
||||
unsigned int *sse_ptr) { \
|
||||
unsigned int sse; \
|
||||
int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
|
||||
y_offset, dst, dst_stride, \
|
||||
h, &sse, NULL, NULL); \
|
||||
if (w > wf) { \
|
||||
unsigned int sse2; \
|
||||
int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 16, dst_stride, \
|
||||
h, &sse2, NULL, NULL); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 32, dst_stride, \
|
||||
h, &sse2, NULL, NULL); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 48, dst_stride, \
|
||||
h, &sse2, NULL, NULL); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
}
|
||||
|
||||
#define FNS(opt1, opt2) \
|
||||
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
|
||||
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
|
||||
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
|
||||
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
|
||||
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
|
||||
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
|
||||
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
|
||||
FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
|
||||
FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
|
||||
FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
|
||||
FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
|
||||
FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
|
||||
FN(4, 4, 4, 2, 2, opt2, (unsigned int))
|
||||
|
||||
FNS(sse2, sse);
|
||||
FNS(ssse3, ssse3);
|
||||
|
||||
#undef FNS
|
||||
#undef FN
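In the FNS lists above, 16x16 and smaller blocks use an (unsigned int) cast for the sum-squared term while 32x16 and larger use (int64_t). Assuming 8-bit input, |sum| is at most 255*N, so sum*sum fits in 32 bits only when N <= 256 pixels; the snippet below illustrates where that boundary falls (illustrative code, not part of the library):

#include <stdint.h>
#include <stdio.h>

/* Why (unsigned int) is enough up to 16x16 (N = 256) but not beyond:
 * the worst-case |sum| for 8-bit input is 255 * N. */
int main(void) {
  int n;
  for (n = 64; n <= 2048; n *= 2) {
    uint64_t max_sq = (uint64_t)(255 * n) * (uint64_t)(255 * n);
    printf("N=%4d  max sum^2=%20llu  %s\n", n, (unsigned long long)max_sq,
           max_sq < ((uint64_t)1 << 32) ? "fits in 32 bits" : "needs 64 bits");
  }
  return 0;
}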
|
||||
|
||||
// The 2 unused parameters are placeholders for the PIC enabled build.
|
||||
#define DECL(w, opt) \
|
||||
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
|
||||
ptrdiff_t src_stride, \
|
||||
int x_offset, int y_offset, \
|
||||
const uint8_t *dst, \
|
||||
ptrdiff_t dst_stride, \
|
||||
const uint8_t *sec, \
|
||||
ptrdiff_t sec_stride, \
|
||||
int height, unsigned int *sse, \
|
||||
void *unused0, void *unused)
|
||||
#define DECLS(opt1, opt2) \
|
||||
DECL(4, opt2); \
|
||||
DECL(8, opt1); \
|
||||
DECL(16, opt1)
|
||||
|
||||
DECLS(sse2, sse);
|
||||
DECLS(ssse3, ssse3);
|
||||
#undef DECL
|
||||
#undef DECLS
|
||||
|
||||
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
|
||||
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
|
||||
int src_stride, \
|
||||
int x_offset, \
|
||||
int y_offset, \
|
||||
const uint8_t *dst, \
|
||||
int dst_stride, \
|
||||
unsigned int *sseptr, \
|
||||
const uint8_t *sec) { \
|
||||
unsigned int sse; \
|
||||
int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
|
||||
y_offset, dst, dst_stride, \
|
||||
sec, w, h, &sse, NULL, \
|
||||
NULL); \
|
||||
if (w > wf) { \
|
||||
unsigned int sse2; \
|
||||
int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 16, dst_stride, \
|
||||
sec + 16, w, h, &sse2, \
|
||||
NULL, NULL); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 32, dst_stride, \
|
||||
sec + 32, w, h, &sse2, \
|
||||
NULL, NULL); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 48, dst_stride, \
|
||||
sec + 48, w, h, &sse2, \
|
||||
NULL, NULL); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
*sseptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
}
|
||||
|
||||
#define FNS(opt1, opt2) \
|
||||
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
|
||||
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
|
||||
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
|
||||
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
|
||||
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
|
||||
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
|
||||
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
|
||||
FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
|
||||
FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
|
||||
FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
|
||||
FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
|
||||
FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
|
||||
FN(4, 4, 4, 2, 2, opt2, (unsigned int))
|
||||
|
||||
FNS(sse2, sse);
|
||||
FNS(ssse3, ssse3);
|
||||
|
||||
#undef FNS
|
||||
#undef FN
|
|
@ -131,7 +131,6 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_d
|
|||
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
|
||||
|
||||
# common (msa)
|
||||
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
|
||||
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
|
||||
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c
|
||||
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c
|
||||
|
|
vp9/vp9cx.mk
|
@ -58,7 +58,6 @@ VP9_CX_SRCS-yes += encoder/vp9_pickmode.h
|
|||
VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h
|
||||
VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
|
||||
VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
|
||||
VP9_CX_SRCS-yes += encoder/vp9_variance.h
|
||||
VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_encoder.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
|
||||
|
@ -84,7 +83,6 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c
|
|||
|
||||
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_variance.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h
|
||||
VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
|
||||
|
@ -103,7 +101,6 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
|
|||
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
|
||||
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
|
||||
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
|
||||
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
|
||||
|
@ -114,12 +111,6 @@ endif
|
|||
ifeq ($(CONFIG_USE_X86INC),yes)
|
||||
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
|
||||
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH_X86_64),yes)
|
||||
|
@ -143,14 +134,12 @@ endif
|
|||
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2_impl.h
|
||||
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
|
||||
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
|
||||
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
|
||||
|
||||
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
|
||||
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
|
||||
endif
|
||||
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
|
||||
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
|
||||
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
|
||||
|
||||
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
|
||||
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
|
||||
|
@ -160,6 +149,5 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
|
|||
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
|
||||
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
|
||||
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
|
||||
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_variance_msa.c
|
||||
|
||||
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
|
||||
|
|
|
@ -0,0 +1,237 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vpx_filter_block2d_bil_first_pass_media|
|
||||
EXPORT |vpx_filter_block2d_bil_second_pass_media|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;-------------------------------------
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 unsigned short *dst_ptr,
|
||||
; r2 unsigned int src_pitch,
|
||||
; r3 unsigned int height,
|
||||
; stack unsigned int width,
|
||||
; stack const short *vpx_filter
|
||||
;-------------------------------------
|
||||
; The output is stored transposed in the output array to make second-pass filtering easy.
|
||||
|vpx_filter_block2d_bil_first_pass_media| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vpx_filter address
|
||||
ldr r4, [sp, #36] ; width
|
||||
|
||||
mov r12, r3 ; outer-loop counter
|
||||
|
||||
add r7, r2, r4 ; preload next row
|
||||
pld [r0, r7]
|
||||
|
||||
sub r2, r2, r4 ; src increment for height loop
|
||||
|
||||
ldr r5, [r11] ; load up filter coefficients
|
||||
|
||||
mov r3, r3, lsl #1 ; height*2
|
||||
add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
|
||||
|
||||
mov r11, r1 ; save dst_ptr for each row
|
||||
|
||||
cmp r5, #128 ; if filter coef = 128, then skip the filter
|
||||
beq bil_null_1st_filter
|
||||
|
||||
|bil_height_loop_1st_v6|
|
||||
ldrb r6, [r0] ; load source data
|
||||
ldrb r7, [r0, #1]
|
||||
ldrb r8, [r0, #2]
|
||||
    mov     lr, r4, lsr #2          ; 4-in-parallel loop counter
|
||||
|
||||
|bil_width_loop_1st_v6|
|
||||
ldrb r9, [r0, #3]
|
||||
ldrb r10, [r0, #4]
|
||||
|
||||
pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
|
||||
pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
|
||||
|
||||
smuad r6, r6, r5 ; apply the filter
|
||||
pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
|
||||
smuad r7, r7, r5
|
||||
pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
|
||||
|
||||
smuad r8, r8, r5
|
||||
smuad r9, r9, r5
|
||||
|
||||
add r0, r0, #4
|
||||
subs lr, lr, #1
|
||||
|
||||
add r6, r6, #0x40 ; round_shift_and_clamp
|
||||
add r7, r7, #0x40
|
||||
usat r6, #16, r6, asr #7
|
||||
usat r7, #16, r7, asr #7
|
||||
|
||||
strh r6, [r1], r3 ; result is transposed and stored
|
||||
|
||||
add r8, r8, #0x40 ; round_shift_and_clamp
|
||||
strh r7, [r1], r3
|
||||
add r9, r9, #0x40
|
||||
usat r8, #16, r8, asr #7
|
||||
usat r9, #16, r9, asr #7
|
||||
|
||||
strh r8, [r1], r3 ; result is transposed and stored
|
||||
|
||||
ldrneb r6, [r0] ; load source data
|
||||
strh r9, [r1], r3
|
||||
|
||||
ldrneb r7, [r0, #1]
|
||||
ldrneb r8, [r0, #2]
|
||||
|
||||
bne bil_width_loop_1st_v6
|
||||
|
||||
add r0, r0, r2 ; move to next input row
|
||||
subs r12, r12, #1
|
||||
|
||||
add r9, r2, r4, lsl #1 ; adding back block width
|
||||
pld [r0, r9] ; preload next row
|
||||
|
||||
add r11, r11, #2 ; move over to next column
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_1st_v6
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
|bil_null_1st_filter|
|
||||
|bil_height_loop_null_1st|
|
||||
mov lr, r4, lsr #2 ; loop counter
|
||||
|
||||
|bil_width_loop_null_1st|
|
||||
ldrb r6, [r0] ; load data
|
||||
ldrb r7, [r0, #1]
|
||||
ldrb r8, [r0, #2]
|
||||
ldrb r9, [r0, #3]
|
||||
|
||||
strh r6, [r1], r3 ; store it to immediate buffer
|
||||
add r0, r0, #4
|
||||
strh r7, [r1], r3
|
||||
subs lr, lr, #1
|
||||
strh r8, [r1], r3
|
||||
strh r9, [r1], r3
|
||||
|
||||
bne bil_width_loop_null_1st
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, r2 ; move to next input line
|
||||
add r11, r11, #2 ; move over to next column
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_null_1st
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP ; |vpx_filter_block2d_bil_first_pass_media|
|
||||
|
||||
|
||||
;---------------------------------
|
||||
; r0 unsigned short *src_ptr,
|
||||
; r1 unsigned char *dst_ptr,
|
||||
; r2 int dst_pitch,
|
||||
; r3 unsigned int height,
|
||||
; stack unsigned int width,
|
||||
; stack const short *vpx_filter
|
||||
;---------------------------------
|
||||
|vpx_filter_block2d_bil_second_pass_media| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vpx_filter address
|
||||
ldr r4, [sp, #36] ; width
|
||||
|
||||
ldr r5, [r11] ; load up filter coefficients
|
||||
mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
|
||||
mov r11, r1
|
||||
|
||||
cmp r5, #128 ; if filter coef = 128, then skip the filter
|
||||
beq bil_null_2nd_filter
|
||||
|
||||
|bil_height_loop_2nd|
|
||||
ldr r6, [r0] ; load the data
|
||||
ldr r8, [r0, #4]
|
||||
ldrh r10, [r0, #8]
|
||||
mov lr, r3, lsr #2 ; loop counter
|
||||
|
||||
|bil_width_loop_2nd|
|
||||
pkhtb r7, r6, r8 ; src[1] | src[2]
|
||||
pkhtb r9, r8, r10 ; src[3] | src[4]
|
||||
|
||||
smuad r6, r6, r5 ; apply filter
|
||||
smuad r8, r8, r5 ; apply filter
|
||||
|
||||
subs lr, lr, #1
|
||||
|
||||
smuadx r7, r7, r5 ; apply filter
|
||||
smuadx r9, r9, r5 ; apply filter
|
||||
|
||||
add r0, r0, #8
|
||||
|
||||
add r6, r6, #0x40 ; round_shift_and_clamp
|
||||
add r7, r7, #0x40
|
||||
usat r6, #8, r6, asr #7
|
||||
usat r7, #8, r7, asr #7
|
||||
strb r6, [r1], r2 ; the result is transposed back and stored
|
||||
|
||||
add r8, r8, #0x40 ; round_shift_and_clamp
|
||||
strb r7, [r1], r2
|
||||
add r9, r9, #0x40
|
||||
usat r8, #8, r8, asr #7
|
||||
usat r9, #8, r9, asr #7
|
||||
strb r8, [r1], r2 ; the result is transposed back and stored
|
||||
|
||||
ldrne r6, [r0] ; load data
|
||||
strb r9, [r1], r2
|
||||
ldrne r8, [r0, #4]
|
||||
ldrneh r10, [r0, #8]
|
||||
|
||||
bne bil_width_loop_2nd
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, #4 ; update src for next row
|
||||
add r11, r11, #1
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_2nd
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
|bil_null_2nd_filter|
|
||||
|bil_height_loop_null_2nd|
|
||||
mov lr, r3, lsr #2
|
||||
|
||||
|bil_width_loop_null_2nd|
|
||||
ldr r6, [r0], #4 ; load data
|
||||
subs lr, lr, #1
|
||||
ldr r8, [r0], #4
|
||||
|
||||
strb r6, [r1], r2 ; store data
|
||||
mov r7, r6, lsr #16
|
||||
strb r7, [r1], r2
|
||||
mov r9, r8, lsr #16
|
||||
strb r8, [r1], r2
|
||||
strb r9, [r1], r2
|
||||
|
||||
bne bil_width_loop_null_2nd
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, #4
|
||||
add r11, r11, #1
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_null_2nd
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
    ENDP  ; |vpx_filter_block2d_bil_second_pass_media|
|
||||
|
||||
END
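For reference, both passes above apply a two-tap filter whose coefficients (from the 128-sum bilinear table) are rounded with +0x40 and shifted right by 7; the first pass keeps 16-bit intermediates (usat #16) stored transposed so the second pass can read them sequentially, and the second pass saturates back to 8-bit pixels (usat #8). A scalar C sketch of the per-sample arithmetic (the helper name is made up; the transposition bookkeeping is omitted):

#include <stdint.h>

/* One output sample of either filter pass: taps f0 + f1 == 128, so the
 * result is rounded (+64) and shifted right by 7, matching the
 * "add #0x40 / usat / asr #7" sequence in the assembly above. */
static int bil_tap128(int a, int b, int f0, int f1) {
  return (a * f0 + b * f1 + 64) >> 7;
}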
|
|
@ -0,0 +1,105 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vpx_config.h"
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
#if HAVE_MEDIA
|
||||
static const int16_t bilinear_filters_media[8][2] = {
|
||||
{ 128, 0 },
|
||||
{ 112, 16 },
|
||||
{ 96, 32 },
|
||||
{ 80, 48 },
|
||||
{ 64, 64 },
|
||||
{ 48, 80 },
|
||||
{ 32, 96 },
|
||||
{ 16, 112 }
|
||||
};
|
||||
|
||||
extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr,
|
||||
uint16_t *dst_ptr,
|
||||
uint32_t src_pitch,
|
||||
uint32_t height,
|
||||
uint32_t width,
|
||||
const int16_t *filter);
|
||||
|
||||
extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr,
|
||||
uint8_t *dst_ptr,
|
||||
int32_t src_pitch,
|
||||
uint32_t height,
|
||||
uint32_t width,
|
||||
const int16_t *filter);
|
||||
|
||||
|
||||
unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset, int yoffset,
|
||||
const uint8_t *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse) {
|
||||
uint16_t first_pass[10*8];
|
||||
uint8_t second_pass[8*8];
|
||||
const int16_t *HFilter, *VFilter;
|
||||
|
||||
HFilter = bilinear_filters_media[xoffset];
|
||||
VFilter = bilinear_filters_media[yoffset];
|
||||
|
||||
vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
|
||||
src_pixels_per_line,
|
||||
9, 8, HFilter);
|
||||
vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
|
||||
8, 8, 8, VFilter);
|
||||
|
||||
return vpx_variance8x8_media(second_pass, 8, dst_ptr,
|
||||
dst_pixels_per_line, sse);
|
||||
}
|
||||
|
||||
unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
const uint8_t *dst_ptr,
|
||||
int dst_pixels_per_line,
|
||||
unsigned int *sse) {
|
||||
uint16_t first_pass[36*16];
|
||||
uint8_t second_pass[20*16];
|
||||
const int16_t *HFilter, *VFilter;
|
||||
unsigned int var;
|
||||
|
||||
if (xoffset == 4 && yoffset == 0) {
|
||||
var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line,
|
||||
sse);
|
||||
} else if (xoffset == 0 && yoffset == 4) {
|
||||
var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line,
|
||||
sse);
|
||||
} else if (xoffset == 4 && yoffset == 4) {
|
||||
var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line,
|
||||
dst_ptr, dst_pixels_per_line,
|
||||
sse);
|
||||
} else {
|
||||
HFilter = bilinear_filters_media[xoffset];
|
||||
VFilter = bilinear_filters_media[yoffset];
|
||||
|
||||
vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
|
||||
src_pixels_per_line,
|
||||
17, 16, HFilter);
|
||||
vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
|
||||
16, 16, 16, VFilter);
|
||||
|
||||
var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
|
||||
dst_pixels_per_line, sse);
|
||||
}
|
||||
return var;
|
||||
}
|
||||
#endif // HAVE_MEDIA
|
|
@ -9,14 +9,13 @@
|
|||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "./vpx_config.h"
|
||||
|
||||
#include "vpx_ports/mem.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
#include "vp9/common/vp9_filter.h"
|
||||
#include "vpx_dsp/variance.h"
|
||||
|
||||
static const uint8_t bilinear_filters[8][2] = {
|
||||
{ 128, 0, },
|
||||
|
@ -35,9 +34,9 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
|
|||
int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const uint8_t *vp9_filter) {
|
||||
const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
|
||||
const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
|
||||
const uint8_t *filter) {
|
||||
const uint8x8_t f0 = vmov_n_u8(filter[0]);
|
||||
const uint8x8_t f1 = vmov_n_u8(filter[1]);
|
||||
unsigned int i;
|
||||
for (i = 0; i < output_height; ++i) {
|
||||
const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
|
||||
|
@ -58,9 +57,9 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
|
|||
int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const uint8_t *vp9_filter) {
|
||||
const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
|
||||
const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
|
||||
const uint8_t *filter) {
|
||||
const uint8x8_t f0 = vmov_n_u8(filter[0]);
|
||||
const uint8x8_t f1 = vmov_n_u8(filter[1]);
|
||||
unsigned int i, j;
|
||||
for (i = 0; i < output_height; ++i) {
|
||||
for (j = 0; j < output_width; j += 16) {
|
||||
|
@ -80,7 +79,7 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
|
|||
}
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
|
||||
unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
|
||||
int src_stride,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
|
@ -98,7 +97,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
|
|||
return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
|
||||
unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
|
||||
int src_stride,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
|
@ -116,7 +115,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
|
|||
return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
|
||||
unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
|
||||
int src_stride,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
|
@ -134,7 +133,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
|
|||
return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
|
||||
unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
|
||||
int src_stride,
|
||||
int xoffset,
|
||||
int yoffset,
|
|
@ -9,7 +9,7 @@
|
|||
;
|
||||
|
||||
|
||||
EXPORT |vp8_variance_halfpixvar16x16_h_armv6|
|
||||
EXPORT |vpx_variance_halfpixvar16x16_h_media|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
|
@ -22,7 +22,7 @@
|
|||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp8_variance_halfpixvar16x16_h_armv6| PROC
|
||||
|vpx_variance_halfpixvar16x16_h_media| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
|
@ -9,7 +9,7 @@
|
|||
;
|
||||
|
||||
|
||||
EXPORT |vp8_variance_halfpixvar16x16_hv_armv6|
|
||||
EXPORT |vpx_variance_halfpixvar16x16_hv_media|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
|
@ -22,7 +22,7 @@
|
|||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp8_variance_halfpixvar16x16_hv_armv6| PROC
|
||||
|vpx_variance_halfpixvar16x16_hv_media| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
|
@ -9,7 +9,7 @@
|
|||
;
|
||||
|
||||
|
||||
EXPORT |vp8_variance_halfpixvar16x16_v_armv6|
|
||||
EXPORT |vpx_variance_halfpixvar16x16_v_media|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
|
@ -22,7 +22,7 @@
|
|||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp8_variance_halfpixvar16x16_v_armv6| PROC
|
||||
|vpx_variance_halfpixvar16x16_v_media| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
The diff for this file is not shown because of its large size.
|
@ -8,13 +8,12 @@
|
|||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
#include "vp9/common/vp9_filter.h"
|
||||
#include "vp9/common/mips/msa/vp9_macros_msa.h"
|
||||
#include "vpx_dsp/mips/macros_msa.h"
|
||||
#include "vpx_dsp/variance.h"
|
||||
|
||||
static const uint8_t bilinear_filters[8][2] = {
|
||||
static const uint8_t bilinear_filters_msa[8][2] = {
|
||||
{ 128, 0, },
|
||||
{ 112, 16, },
|
||||
{ 96, 32, },
|
||||
|
@ -707,8 +706,8 @@ static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
|
|||
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
|
||||
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
|
||||
|
||||
#define VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
|
||||
uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
|
||||
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
|
||||
uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
|
||||
int32_t src_stride, \
|
||||
int32_t xoffset, \
|
||||
int32_t yoffset, \
|
||||
|
@ -717,8 +716,8 @@ uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
|
|||
uint32_t *sse) { \
|
||||
int32_t diff; \
|
||||
uint32_t var; \
|
||||
const uint8_t *h_filter = bilinear_filters[xoffset]; \
|
||||
const uint8_t *v_filter = bilinear_filters[yoffset]; \
|
||||
const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
|
||||
const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
|
||||
\
|
||||
if (yoffset) { \
|
||||
if (xoffset) { \
|
||||
|
@ -749,20 +748,20 @@ uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
|
|||
return var; \
|
||||
}
|
||||
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
|
||||
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
|
||||
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
|
||||
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
|
||||
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
|
||||
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
|
||||
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
|
|
@ -14,13 +14,26 @@
|
|||
#include "vpx_ports/mem.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
|
||||
const unsigned char *b, int b_stride) {
|
||||
#include "vpx_dsp/variance.h"
|
||||
|
||||
static const uint8_t bilinear_filters[8][2] = {
|
||||
{ 128, 0 },
|
||||
{ 112, 16 },
|
||||
{ 96, 32 },
|
||||
{ 80, 48 },
|
||||
{ 64, 64 },
|
||||
{ 48, 80 },
|
||||
{ 32, 96 },
|
||||
{ 16, 112 },
|
||||
};
|
||||
|
||||
uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride,
|
||||
const uint8_t *b, int b_stride) {
|
||||
int distortion = 0;
|
||||
int r, c;
|
||||
|
||||
for (r = 0; r < 4; r++) {
|
||||
for (c = 0; c < 4; c++) {
|
||||
for (r = 0; r < 4; ++r) {
|
||||
for (c = 0; c < 4; ++c) {
|
||||
int diff = a[c] - b[c];
|
||||
distortion += diff * diff;
|
||||
}
|
||||
|
@ -32,7 +45,7 @@ unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
|
|||
return distortion;
|
||||
}
|
||||
|
||||
unsigned int vpx_get_mb_ss_c(const int16_t *a) {
|
||||
uint32_t vpx_get_mb_ss_c(const int16_t *a) {
|
||||
unsigned int i, sum = 0;
|
||||
|
||||
for (i = 0; i < 256; ++i) {
|
||||
|
@ -42,16 +55,38 @@ unsigned int vpx_get_mb_ss_c(const int16_t *a) {
|
|||
return sum;
|
||||
}
|
||||
|
||||
uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
|
||||
const uint8_t *b, int b_stride,
|
||||
uint32_t *sse) {
|
||||
return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0,
|
||||
b, b_stride, sse);
|
||||
}
|
||||
|
||||
|
||||
uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
|
||||
const uint8_t *b, int b_stride,
|
||||
uint32_t *sse) {
|
||||
return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4,
|
||||
b, b_stride, sse);
|
||||
}
|
||||
|
||||
uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
|
||||
const uint8_t *b, int b_stride,
|
||||
uint32_t *sse) {
|
||||
return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4,
|
||||
b, b_stride, sse);
|
||||
}
|
||||
|
||||
static void variance(const uint8_t *a, int a_stride,
|
||||
const uint8_t *b, int b_stride,
|
||||
int w, int h, unsigned int *sse, int *sum) {
|
||||
int w, int h, uint32_t *sse, int *sum) {
|
||||
int i, j;
|
||||
|
||||
*sum = 0;
|
||||
*sse = 0;
|
||||
|
||||
for (i = 0; i < h; i++) {
|
||||
for (j = 0; j < w; j++) {
|
||||
for (i = 0; i < h; ++i) {
|
||||
for (j = 0; j < w; ++j) {
|
||||
const int diff = a[j] - b[j];
|
||||
*sum += diff;
|
||||
*sse += diff * diff;
|
||||
|
@ -62,15 +97,113 @@ static void variance(const uint8_t *a, int a_stride,
|
|||
}
|
||||
}
|
||||
|
||||
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
|
||||
// or vertical direction to produce the filtered output block. Used to implement
|
||||
// the first-pass of 2-D separable filter.
|
||||
//
|
||||
// Produces int16_t output to retain precision for the next pass. Two filter
|
||||
// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
|
||||
// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
|
||||
// It defines the offset required to move from one input to the next.
|
||||
static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
|
||||
unsigned int src_pixels_per_line,
|
||||
int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const uint8_t *filter) {
|
||||
unsigned int i, j;
|
||||
|
||||
for (i = 0; i < output_height; ++i) {
|
||||
for (j = 0; j < output_width; ++j) {
|
||||
b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
|
||||
(int)a[pixel_step] * filter[1],
|
||||
FILTER_BITS);
|
||||
|
||||
++a;
|
||||
}
|
||||
|
||||
a += src_pixels_per_line - output_width;
|
||||
b += output_width;
|
||||
}
|
||||
}
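// Illustrative worked example, not part of the original change: assuming the
// bilinear_filters table above and FILTER_BITS == 7, an xoffset of 2 selects
// the taps { 96, 32 }, i.e. a quarter-pel offset. For two adjacent source
// pixels a[0] = 10 and a[pixel_step] = 20 the first pass yields
//   ROUND_POWER_OF_TWO(10 * 96 + 20 * 32, FILTER_BITS)
//     = (960 + 640 + 64) >> 7
//     = 13
// which is 10 interpolated one quarter of the way towards 20, kept as a
// 16-bit intermediate so no precision is lost before the second pass.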
|
||||
|
||||
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
|
||||
// or vertical direction to produce the filtered output block. Used to implement
|
||||
// the second-pass of 2-D separable filter.
|
||||
//
|
||||
// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
|
||||
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
|
||||
// filter is applied horizontally (pixel_step = 1) or vertically
|
||||
// (pixel_step = stride). It defines the offset required to move from one input
|
||||
// to the next. Output is 8-bit.
|
||||
static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const uint8_t *filter) {
|
||||
unsigned int i, j;
|
||||
|
||||
for (i = 0; i < output_height; ++i) {
|
||||
for (j = 0; j < output_width; ++j) {
|
||||
b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
|
||||
(int)a[pixel_step] * filter[1],
|
||||
FILTER_BITS);
|
||||
++a;
|
||||
}
|
||||
|
||||
a += src_pixels_per_line - output_width;
|
||||
b += output_width;
|
||||
}
|
||||
}
|
||||
|
||||
#define VAR(W, H) \
|
||||
unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
|
||||
const uint8_t *b, int b_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
|
||||
const uint8_t *b, int b_stride, \
|
||||
uint32_t *sse) { \
|
||||
int sum; \
|
||||
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||
}
|
||||
|
||||
#define SUBPIX_VAR(W, H) \
|
||||
uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *b, int b_stride, \
|
||||
uint32_t *sse) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint8_t temp2[H * W]; \
|
||||
\
|
||||
var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
|
||||
bilinear_filters[xoffset]); \
|
||||
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
|
||||
}
|
||||
|
||||
#define SUBPIX_AVG_VAR(W, H) \
|
||||
uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
|
||||
int a_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *b, \
|
||||
int b_stride, \
|
||||
uint32_t *sse, \
|
||||
const uint8_t *second_pred) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint8_t temp2[H * W]; \
|
||||
DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
|
||||
\
|
||||
var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
|
||||
bilinear_filters[xoffset]); \
|
||||
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
|
||||
\
|
||||
return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
|
||||
}
|
||||
|
||||
/* Identical to the variance call except it takes an additional parameter, sum,
|
||||
* and returns that value using pass-by-reference instead of returning
|
||||
* sse - sum^2 / w*h
|
||||
|
@ -78,7 +211,7 @@ unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
|
|||
#define GET_VAR(W, H) \
|
||||
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
|
||||
const uint8_t *b, int b_stride, \
|
||||
unsigned int *sse, int *sum) { \
|
||||
uint32_t *sse, int *sum) { \
|
||||
variance(a, a_stride, b, b_stride, W, H, sse, sum); \
|
||||
}
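/* Illustrative only, not part of the original change: a caller of
 * vpx_get16x16var_c() can recover the variance exactly as the VAR macro
 * above does. Assuming it returned sse = 3000 and sum = 128 for a 16x16
 * block:
 *
 *   variance = sse - ((int64_t)sum * sum) / (16 * 16)
 *            = 3000 - 16384 / 256
 *            = 2936
 */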
|
||||
|
||||
|
@ -87,27 +220,33 @@ void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
|
|||
* variable.
|
||||
*/
|
||||
#define MSE(W, H) \
|
||||
unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
|
||||
const uint8_t *b, int b_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
|
||||
const uint8_t *b, int b_stride, \
|
||||
uint32_t *sse) { \
|
||||
int sum; \
|
||||
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||
return *sse; \
|
||||
}
|
||||
|
||||
VAR(64, 64)
|
||||
VAR(64, 32)
|
||||
VAR(32, 64)
|
||||
VAR(32, 32)
|
||||
VAR(32, 16)
|
||||
VAR(16, 32)
|
||||
VAR(16, 16)
|
||||
VAR(16, 8)
|
||||
VAR(8, 16)
|
||||
VAR(8, 8)
|
||||
VAR(8, 4)
|
||||
VAR(4, 8)
|
||||
VAR(4, 4)
|
||||
/* All three forms of the variance are available in the same sizes. */
|
||||
#define VARIANCES(W, H) \
|
||||
VAR(W, H) \
|
||||
SUBPIX_VAR(W, H) \
|
||||
SUBPIX_AVG_VAR(W, H)
|
||||
|
||||
VARIANCES(64, 64)
|
||||
VARIANCES(64, 32)
|
||||
VARIANCES(32, 64)
|
||||
VARIANCES(32, 32)
|
||||
VARIANCES(32, 16)
|
||||
VARIANCES(16, 32)
|
||||
VARIANCES(16, 16)
|
||||
VARIANCES(16, 8)
|
||||
VARIANCES(8, 16)
|
||||
VARIANCES(8, 8)
|
||||
VARIANCES(8, 4)
|
||||
VARIANCES(4, 8)
|
||||
VARIANCES(4, 4)
|
||||
|
||||
GET_VAR(16, 16)
|
||||
GET_VAR(8, 8)
|
||||
|
@ -117,12 +256,13 @@ MSE(16, 8)
|
|||
MSE(8, 16)
|
||||
MSE(8, 8)
|
||||
|
||||
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
|
||||
int height, const uint8_t *ref, int ref_stride) {
|
||||
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
|
||||
int width, int height,
|
||||
const uint8_t *ref, int ref_stride) {
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < height; i++) {
|
||||
for (j = 0; j < width; j++) {
|
||||
for (i = 0; i < height; ++i) {
|
||||
for (j = 0; j < width; ++j) {
|
||||
const int tmp = pred[j] + ref[j];
|
||||
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
|
||||
}
|
||||
|
@ -143,8 +283,8 @@ static void highbd_variance64(const uint8_t *a8, int a_stride,
|
|||
*sum = 0;
|
||||
*sse = 0;
|
||||
|
||||
for (i = 0; i < h; i++) {
|
||||
for (j = 0; j < w; j++) {
|
||||
for (i = 0; i < h; ++i) {
|
||||
for (j = 0; j < w; ++j) {
|
||||
const int diff = a[j] - b[j];
|
||||
*sum += diff;
|
||||
*sse += diff * diff;
|
||||
|
@ -156,60 +296,60 @@ static void highbd_variance64(const uint8_t *a8, int a_stride,
|
|||
|
||||
static void highbd_8_variance(const uint8_t *a8, int a_stride,
|
||||
const uint8_t *b8, int b_stride,
|
||||
int w, int h, unsigned int *sse, int *sum) {
|
||||
int w, int h, uint32_t *sse, int *sum) {
|
||||
uint64_t sse_long = 0;
|
||||
uint64_t sum_long = 0;
|
||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||
*sse = (unsigned int)sse_long;
|
||||
*sse = (uint32_t)sse_long;
|
||||
*sum = (int)sum_long;
|
||||
}
|
||||
|
||||
static void highbd_10_variance(const uint8_t *a8, int a_stride,
|
||||
const uint8_t *b8, int b_stride,
|
||||
int w, int h, unsigned int *sse, int *sum) {
|
||||
int w, int h, uint32_t *sse, int *sum) {
|
||||
uint64_t sse_long = 0;
|
||||
uint64_t sum_long = 0;
|
||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
|
||||
*sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
|
||||
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
|
||||
}
|
||||
|
||||
static void highbd_12_variance(const uint8_t *a8, int a_stride,
|
||||
const uint8_t *b8, int b_stride,
|
||||
int w, int h, unsigned int *sse, int *sum) {
|
||||
int w, int h, uint32_t *sse, int *sum) {
|
||||
uint64_t sse_long = 0;
|
||||
uint64_t sum_long = 0;
|
||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
|
||||
*sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
|
||||
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
|
||||
}
|
||||
|
||||
#define HIGHBD_VAR(W, H) \
|
||||
unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
|
||||
int a_stride, \
|
||||
const uint8_t *b, \
|
||||
int b_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
|
||||
int a_stride, \
|
||||
const uint8_t *b, \
|
||||
int b_stride, \
|
||||
uint32_t *sse) { \
|
||||
int sum; \
|
||||
highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||
} \
|
||||
\
|
||||
unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
|
||||
int a_stride, \
|
||||
const uint8_t *b, \
|
||||
int b_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
|
||||
int a_stride, \
|
||||
const uint8_t *b, \
|
||||
int b_stride, \
|
||||
uint32_t *sse) { \
|
||||
int sum; \
|
||||
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||
} \
|
||||
\
|
||||
unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
|
||||
int a_stride, \
|
||||
const uint8_t *b, \
|
||||
int b_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
|
||||
int a_stride, \
|
||||
const uint8_t *b, \
|
||||
int b_stride, \
|
||||
uint32_t *sse) { \
|
||||
int sum; \
|
||||
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||
|
@ -217,54 +357,243 @@ unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
|
|||
|
||||
#define HIGHBD_GET_VAR(S) \
|
||||
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
||||
const uint8_t *ref, int ref_stride, \
|
||||
unsigned int *sse, int *sum) { \
|
||||
const uint8_t *ref, int ref_stride, \
|
||||
uint32_t *sse, int *sum) { \
|
||||
highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
||||
} \
|
||||
\
|
||||
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
||||
const uint8_t *ref, int ref_stride, \
|
||||
unsigned int *sse, int *sum) { \
|
||||
uint32_t *sse, int *sum) { \
|
||||
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
||||
} \
|
||||
\
|
||||
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
||||
const uint8_t *ref, int ref_stride, \
|
||||
unsigned int *sse, int *sum) { \
|
||||
uint32_t *sse, int *sum) { \
|
||||
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
||||
}
|
||||
|
||||
#define HIGHBD_MSE(W, H) \
|
||||
unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
|
||||
int src_stride, \
|
||||
const uint8_t *ref, \
|
||||
int ref_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
|
||||
int src_stride, \
|
||||
const uint8_t *ref, \
|
||||
int ref_stride, \
|
||||
uint32_t *sse) { \
|
||||
int sum; \
|
||||
highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
||||
return *sse; \
|
||||
} \
|
||||
\
|
||||
unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
|
||||
int src_stride, \
|
||||
const uint8_t *ref, \
|
||||
int ref_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
|
||||
int src_stride, \
|
||||
const uint8_t *ref, \
|
||||
int ref_stride, \
|
||||
uint32_t *sse) { \
|
||||
int sum; \
|
||||
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
||||
return *sse; \
|
||||
} \
|
||||
\
|
||||
unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
|
||||
int src_stride, \
|
||||
const uint8_t *ref, \
|
||||
int ref_stride, \
|
||||
unsigned int *sse) { \
|
||||
uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
|
||||
int src_stride, \
|
||||
const uint8_t *ref, \
|
||||
int ref_stride, \
|
||||
uint32_t *sse) { \
|
||||
int sum; \
|
||||
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
||||
return *sse; \
|
||||
}
|
||||
|
||||
static void highbd_var_filter_block2d_bil_first_pass(
|
||||
const uint8_t *src_ptr8,
|
||||
uint16_t *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const uint8_t *filter) {
|
||||
unsigned int i, j;
|
||||
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
|
||||
for (i = 0; i < output_height; ++i) {
|
||||
for (j = 0; j < output_width; ++j) {
|
||||
output_ptr[j] =
|
||||
ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
|
||||
(int)src_ptr[pixel_step] * filter[1],
|
||||
FILTER_BITS);
|
||||
|
||||
++src_ptr;
|
||||
}
|
||||
|
||||
// Next row...
|
||||
src_ptr += src_pixels_per_line - output_width;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
}
|
||||
|
||||
static void highbd_var_filter_block2d_bil_second_pass(
|
||||
const uint16_t *src_ptr,
|
||||
uint16_t *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int pixel_step,
|
||||
unsigned int output_height,
|
||||
unsigned int output_width,
|
||||
const uint8_t *filter) {
|
||||
unsigned int i, j;
|
||||
|
||||
for (i = 0; i < output_height; ++i) {
|
||||
for (j = 0; j < output_width; ++j) {
|
||||
output_ptr[j] =
|
||||
ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
|
||||
(int)src_ptr[pixel_step] * filter[1],
|
||||
FILTER_BITS);
|
||||
++src_ptr;
|
||||
}
|
||||
|
||||
src_ptr += src_pixels_per_line - output_width;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
}
|
||||
|
||||
#define HIGHBD_SUBPIX_VAR(W, H) \
|
||||
uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
uint32_t *sse) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
|
||||
dst_stride, sse); \
|
||||
} \
|
||||
\
|
||||
uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
uint32_t *sse) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
||||
W, dst, dst_stride, sse); \
|
||||
} \
|
||||
\
|
||||
uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
uint32_t *sse) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
||||
W, dst, dst_stride, sse); \
|
||||
}
|
||||
|
||||
#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
|
||||
uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
uint32_t *sse, \
|
||||
const uint8_t *second_pred) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||
\
|
||||
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
|
||||
dst_stride, sse); \
|
||||
} \
|
||||
\
|
||||
uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
uint32_t *sse, \
|
||||
const uint8_t *second_pred) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||
\
|
||||
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
||||
W, dst, dst_stride, sse); \
|
||||
} \
|
||||
\
|
||||
uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
|
||||
const uint8_t *src, int src_stride, \
|
||||
int xoffset, int yoffset, \
|
||||
const uint8_t *dst, int dst_stride, \
|
||||
uint32_t *sse, \
|
||||
const uint8_t *second_pred) { \
|
||||
uint16_t fdata3[(H + 1) * W]; \
|
||||
uint16_t temp2[H * W]; \
|
||||
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
|
||||
W, bilinear_filters[xoffset]); \
|
||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||
bilinear_filters[yoffset]); \
|
||||
\
|
||||
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||
\
|
||||
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
||||
W, dst, dst_stride, sse); \
|
||||
}
|
||||
|
||||
/* All three forms of the variance are available in the same sizes. */
|
||||
#define HIGHBD_VARIANCES(W, H) \
|
||||
HIGHBD_VAR(W, H) \
|
||||
HIGHBD_SUBPIX_VAR(W, H) \
|
||||
HIGHBD_SUBPIX_AVG_VAR(W, H)
|
||||
|
||||
HIGHBD_VARIANCES(64, 64)
|
||||
HIGHBD_VARIANCES(64, 32)
|
||||
HIGHBD_VARIANCES(32, 64)
|
||||
HIGHBD_VARIANCES(32, 32)
|
||||
HIGHBD_VARIANCES(32, 16)
|
||||
HIGHBD_VARIANCES(16, 32)
|
||||
HIGHBD_VARIANCES(16, 16)
|
||||
HIGHBD_VARIANCES(16, 8)
|
||||
HIGHBD_VARIANCES(8, 16)
|
||||
HIGHBD_VARIANCES(8, 8)
|
||||
HIGHBD_VARIANCES(8, 4)
|
||||
HIGHBD_VARIANCES(4, 8)
|
||||
HIGHBD_VARIANCES(4, 4)
|
||||
|
||||
HIGHBD_GET_VAR(8)
|
||||
HIGHBD_GET_VAR(16)
|
||||
|
||||
|
@ -273,28 +602,14 @@ HIGHBD_MSE(16, 8)
|
|||
HIGHBD_MSE(8, 16)
|
||||
HIGHBD_MSE(8, 8)
|
||||
|
||||
HIGHBD_VAR(64, 64)
|
||||
HIGHBD_VAR(64, 32)
|
||||
HIGHBD_VAR(32, 64)
|
||||
HIGHBD_VAR(32, 32)
|
||||
HIGHBD_VAR(32, 16)
|
||||
HIGHBD_VAR(16, 32)
|
||||
HIGHBD_VAR(16, 16)
|
||||
HIGHBD_VAR(16, 8)
|
||||
HIGHBD_VAR(8, 16)
|
||||
HIGHBD_VAR(8, 8)
|
||||
HIGHBD_VAR(8, 4)
|
||||
HIGHBD_VAR(4, 8)
|
||||
HIGHBD_VAR(4, 4)
|
||||
|
||||
void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
|
||||
int width, int height, const uint8_t *ref8,
|
||||
int ref_stride) {
|
||||
int i, j;
|
||||
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
|
||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
||||
for (i = 0; i < height; i++) {
|
||||
for (j = 0; j < width; j++) {
|
||||
for (i = 0; i < height; ++i) {
|
||||
for (j = 0; j < width; ++j) {
|
||||
const int tmp = pred[j] + ref[j];
|
||||
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,94 @@
|
|||
/*
|
||||
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VPX_DSP_VARIANCE_H_
|
||||
#define VPX_DSP_VARIANCE_H_
|
||||
|
||||
#include "./vpx_config.h"
|
||||
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define FILTER_BITS 7
|
||||
#define FILTER_WEIGHT 128
|
||||
|
||||
typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
|
||||
const uint8_t *b_ptr, int b_stride);
|
||||
|
||||
typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
|
||||
const uint8_t *b_ptr, int b_stride,
|
||||
const uint8_t *second_pred);
|
||||
|
||||
typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
|
||||
uint8_t *b, int b_stride, int n);
|
||||
|
||||
typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
|
||||
const uint8_t *b, int b_stride,
|
||||
unsigned int *sad_array);
|
||||
|
||||
typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
|
||||
const uint8_t *const b_array[],
|
||||
int b_stride,
|
||||
unsigned int *sad_array);
|
||||
|
||||
typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride,
|
||||
const uint8_t *b, int b_stride,
|
||||
unsigned int *sse);
|
||||
|
||||
typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
|
||||
int xoffset, int yoffset,
|
||||
const uint8_t *b, int b_stride,
|
||||
unsigned int *sse);
|
||||
|
||||
typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
|
||||
int a_stride,
|
||||
int xoffset, int yoffset,
|
||||
const uint8_t *b_ptr,
|
||||
int b_stride,
|
||||
unsigned int *sse,
|
||||
const uint8_t *second_pred);
|
||||
#if CONFIG_VP8
|
||||
typedef struct variance_vtable {
|
||||
vpx_sad_fn_t sdf;
|
||||
vpx_variance_fn_t vf;
|
||||
vpx_subpixvariance_fn_t svf;
|
||||
vpx_variance_fn_t svf_halfpix_h;
|
||||
vpx_variance_fn_t svf_halfpix_v;
|
||||
vpx_variance_fn_t svf_halfpix_hv;
|
||||
vpx_sad_multi_fn_t sdx3f;
|
||||
vpx_sad_multi_fn_t sdx8f;
|
||||
vpx_sad_multi_d_fn_t sdx4df;
|
||||
#if ARCH_X86 || ARCH_X86_64
|
||||
vp8_copy32xn_fn_t copymem;
|
||||
#endif
|
||||
} vp8_variance_fn_ptr_t;
|
||||
#endif // CONFIG_VP8
|
||||
|
||||
#if CONFIG_VP9
|
||||
typedef struct vp9_variance_vtable {
|
||||
vpx_sad_fn_t sdf;
|
||||
vpx_sad_avg_fn_t sdaf;
|
||||
vpx_variance_fn_t vf;
|
||||
vpx_subpixvariance_fn_t svf;
|
||||
vpx_subp_avg_variance_fn_t svaf;
|
||||
vpx_sad_multi_fn_t sdx3f;
|
||||
vpx_sad_multi_fn_t sdx8f;
|
||||
vpx_sad_multi_d_fn_t sdx4df;
|
||||
} vp9_variance_fn_ptr_t;
|
||||
#endif // CONFIG_VP9
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VPX_DSP_VARIANCE_H_
|
|
@ -10,6 +10,8 @@
|
|||
|
||||
DSP_SRCS-yes += vpx_dsp.mk
|
||||
|
||||
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
|
||||
|
||||
ifeq ($(CONFIG_ENCODERS),yes)
|
||||
DSP_SRCS-yes += sad.c
|
||||
DSP_SRCS-yes += subtract.c
|
||||
|
@ -19,7 +21,6 @@ DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
|
|||
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
|
||||
|
||||
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
|
||||
DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
|
||||
DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
|
||||
|
||||
|
@ -45,21 +46,36 @@ endif # CONFIG_ENCODERS
|
|||
|
||||
ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
|
||||
DSP_SRCS-yes += variance.c
|
||||
DSP_SRCS-yes += variance.h
|
||||
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
|
||||
|
||||
DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
|
||||
DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
|
||||
|
||||
DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
|
||||
DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
|
||||
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
|
||||
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
|
||||
|
||||
ifeq ($(CONFIG_USE_X86INC),yes)
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3
|
||||
endif # CONFIG_USE_X86INC
|
||||
|
||||
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
|
||||
ifeq ($(CONFIG_USE_X86INC),yes)
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
|
||||
endif # CONFIG_USE_X86INC
|
||||
endif # CONFIG_VP9_HIGHBITDEPTH
|
||||
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
|
||||
|
||||
|
|
|
@ -412,6 +412,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
|||
|
||||
if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
|
||||
|
||||
#
|
||||
# Variance
|
||||
#
|
||||
add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
|
||||
|
||||
|
@ -451,7 +454,9 @@ add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_
|
|||
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vpx_variance4x4 mmx sse2 msa/;
|
||||
|
||||
|
||||
#
|
||||
# Specialty Variance
|
||||
#
|
||||
add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
|
||||
|
||||
|
@ -478,6 +483,99 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int
|
|||
|
||||
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
|
||||
|
||||
#
|
||||
# Subpixel Variance
|
||||
#
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
|
||||
|
||||
#
|
||||
# Specialty Subpixel
|
||||
#
|
||||
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_variance_halfpixvar16x16_h mmx media/;
|
||||
|
||||
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_variance_halfpixvar16x16_v mmx media/;
|
||||
|
||||
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_variance_halfpixvar16x16_hv mmx media/;
|
||||
|
||||
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vpx_highbd_12_variance64x64 sse2/;
|
||||
|
@ -615,6 +713,226 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
|||
specialize qw/vpx_highbd_12_mse8x8 sse2/;
|
||||
|
||||
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
|
||||
|
||||
#
|
||||
# Subpixel Variance
|
||||
#
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
|
||||
|
||||
} # CONFIG_VP9_HIGHBITDEPTH
|
||||
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
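# The add_proto/specialize pairs above are consumed by the rtcd generator
# (build/make/rtcd.pl) when vpx_dsp_rtcd.h is produced; "$sse2_x86inc"
# expands to sse2 only in builds where the x86inc assembly is usable.
# As a rough, abbreviated sketch (from memory, not the literal generated
# file) of what the header ends up containing for one of these entries:
#
#   uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(
#       const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
#       const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#   uint32_t vpx_highbd_10_sub_pixel_variance64x64_sse2(
#       const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
#       const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#   RTCD_EXTERN uint32_t (*vpx_highbd_10_sub_pixel_variance64x64)(
#       const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
#       const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
#
#   /* and, in setup_rtcd_internal(), roughly: */
#   vpx_highbd_10_sub_pixel_variance64x64 =
#       vpx_highbd_10_sub_pixel_variance64x64_c;
#   if (flags & HAS_SSE2)
#     vpx_highbd_10_sub_pixel_variance64x64 =
#         vpx_highbd_10_sub_pixel_variance64x64_sse2;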
@ -8,6 +8,8 @@
|
|||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
%define program_name vpx
|
||||
|
||||
%include "third_party/x86inc/x86inc.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
@ -30,7 +32,7 @@ bilin_filter_m_sse2: times 8 dw 16
|
|||
|
||||
SECTION .text
|
||||
|
||||
; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
|
||||
; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
|
||||
; int x_offset, int y_offset,
|
||||
; const uint8_t *dst, ptrdiff_t dst_stride,
|
||||
; int height, unsigned int *sse);
|
|
@ -8,9 +8,7 @@
|
|||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
#include "./vpx_config.h"
|
||||
#include "vp9/common/vp9_common.h"
|
||||
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
|
||||
|
@ -243,3 +241,341 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
|
|||
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
|
||||
return *sse;
|
||||
}
|
||||
|
||||
#if CONFIG_USE_X86INC
|
||||
#define DECL(w, opt) \
|
||||
int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
|
||||
ptrdiff_t src_stride, \
|
||||
int x_offset, int y_offset, \
|
||||
const uint16_t *dst, \
|
||||
ptrdiff_t dst_stride, \
|
||||
int height, unsigned int *sse);
|
||||
#define DECLS(opt1, opt2) \
|
||||
DECL(8, opt1); \
|
||||
DECL(16, opt1)
|
||||
|
||||
DECLS(sse2, sse);
|
||||
// TODO(johannkoenig): enable the ssse3 or delete
|
||||
// DECLS(ssse3, ssse3);
|
||||
#undef DECLS
|
||||
#undef DECL
|
||||
|
||||
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
|
||||
uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
|
||||
int src_stride, \
|
||||
int x_offset, \
|
||||
int y_offset, \
|
||||
const uint8_t *dst8, \
|
||||
int dst_stride, \
|
||||
uint32_t *sse_ptr) { \
|
||||
uint32_t sse; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst, dst_stride, h, \
|
||||
&sse); \
|
||||
if (w > wf) { \
|
||||
unsigned int sse2; \
|
||||
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
|
||||
src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 16, \
|
||||
dst_stride, \
|
||||
h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 32, dst_stride, \
|
||||
h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
|
||||
src + 48, src_stride, x_offset, y_offset, \
|
||||
dst + 48, dst_stride, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
} \
|
||||
\
|
||||
uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
|
||||
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
|
||||
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
|
||||
uint32_t sse; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst, dst_stride, \
|
||||
h, &sse); \
|
||||
if (w > wf) { \
|
||||
uint32_t sse2; \
|
||||
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
|
||||
src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 16, \
|
||||
dst_stride, \
|
||||
h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 32, dst_stride, \
|
||||
h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 48, dst_stride, \
|
||||
h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
se = ROUND_POWER_OF_TWO(se, 2); \
|
||||
sse = ROUND_POWER_OF_TWO(sse, 4); \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
} \
|
||||
\
|
||||
uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
|
||||
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
|
||||
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
|
||||
int start_row; \
|
||||
uint32_t sse; \
|
||||
int se = 0; \
|
||||
uint64_t long_sse = 0; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
for (start_row = 0; start_row < h; start_row += 16) { \
|
||||
uint32_t sse2; \
|
||||
int height = h - start_row < 16 ? h - start_row : 16; \
|
||||
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
|
||||
src + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, dst + (start_row * dst_stride), \
|
||||
dst_stride, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
if (w > wf) { \
|
||||
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
|
||||
src + 16 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
|
||||
dst_stride, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
|
||||
src + 32 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
|
||||
dst_stride, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
|
||||
src + 48 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
|
||||
dst_stride, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
}\
|
||||
} \
|
||||
} \
|
||||
se = ROUND_POWER_OF_TWO(se, 4); \
|
||||
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
}
|
||||
|
||||
#define FNS(opt1, opt2) \
|
||||
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
|
||||
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
|
||||
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
|
||||
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
|
||||
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
|
||||
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
|
||||
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
|
||||
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
|
||||
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
|
||||
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
|
||||
FN(8, 4, 8, 3, 2, opt1, (int64_t));
|
||||
|
||||
|
||||
FNS(sse2, sse);
|
||||
|
||||
#undef FNS
|
||||
#undef FN
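The FN macro above never filters more than wf (8 or 16) pixels per call; wider blocks are covered by re-running the column kernel at byte offsets 16, 32 and 48 and summing the partial results. The return value is the usual variance identity, with the division by w*h done as a right shift by wlog2 + hlog2. A minimal scalar sketch of that final combination (helper name is hypothetical):

#include <stdint.h>

/* sum and sse are the accumulated column results for a w x h block,
 * with w = 1 << wlog2 and h = 1 << hlog2. */
static uint32_t block_variance(uint64_t sse, int64_t sum,
                               int wlog2, int hlog2) {
  return (uint32_t)(sse - (uint64_t)((sum * sum) >> (wlog2 + hlog2)));
}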
#define DECL(w, opt) \
|
||||
int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
|
||||
ptrdiff_t src_stride, \
|
||||
int x_offset, int y_offset, \
|
||||
const uint16_t *dst, \
|
||||
ptrdiff_t dst_stride, \
|
||||
const uint16_t *sec, \
|
||||
ptrdiff_t sec_stride, \
|
||||
int height, \
|
||||
unsigned int *sse);
|
||||
#define DECLS(opt1) \
|
||||
DECL(16, opt1) \
|
||||
DECL(8, opt1)
|
||||
|
||||
DECLS(sse2);
|
||||
#undef DECL
|
||||
#undef DECLS
|
||||
|
||||
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
|
||||
uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
|
||||
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
|
||||
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
|
||||
const uint8_t *sec8) { \
|
||||
uint32_t sse; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
|
||||
int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src, src_stride, x_offset, \
|
||||
y_offset, dst, dst_stride, sec, w, h, &sse); \
|
||||
if (w > wf) { \
|
||||
uint32_t sse2; \
|
||||
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 16, src_stride, x_offset, y_offset, \
|
||||
dst + 16, dst_stride, sec + 16, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 32, src_stride, x_offset, y_offset, \
|
||||
dst + 32, dst_stride, sec + 32, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 48, src_stride, x_offset, y_offset, \
|
||||
dst + 48, dst_stride, sec + 48, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
} \
|
||||
\
|
||||
uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
|
||||
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
|
||||
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
|
||||
const uint8_t *sec8) { \
|
||||
uint32_t sse; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
|
||||
int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src, src_stride, x_offset, \
|
||||
y_offset, dst, dst_stride, \
|
||||
sec, w, h, &sse); \
|
||||
if (w > wf) { \
|
||||
uint32_t sse2; \
|
||||
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 16, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 16, dst_stride, \
|
||||
sec + 16, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 32, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 32, dst_stride, \
|
||||
sec + 32, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 48, src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 48, dst_stride, \
|
||||
sec + 48, w, h, &sse2); \
|
||||
se += se2; \
|
||||
sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
se = ROUND_POWER_OF_TWO(se, 2); \
|
||||
sse = ROUND_POWER_OF_TWO(sse, 4); \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
} \
|
||||
\
|
||||
uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
|
||||
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
|
||||
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
|
||||
const uint8_t *sec8) { \
|
||||
int start_row; \
|
||||
uint32_t sse; \
|
||||
int se = 0; \
|
||||
uint64_t long_sse = 0; \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
|
||||
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
|
||||
for (start_row = 0; start_row < h; start_row += 16) { \
|
||||
uint32_t sse2; \
|
||||
int height = h - start_row < 16 ? h - start_row : 16; \
|
||||
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + (start_row * src_stride), src_stride, x_offset, \
|
||||
y_offset, dst + (start_row * dst_stride), dst_stride, \
|
||||
sec + (start_row * w), w, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
if (w > wf) { \
|
||||
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 16 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 16 + (start_row * dst_stride), dst_stride, \
|
||||
sec + 16 + (start_row * w), w, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
if (w > wf * 2) { \
|
||||
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 32 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 32 + (start_row * dst_stride), dst_stride, \
|
||||
sec + 32 + (start_row * w), w, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
|
||||
src + 48 + (start_row * src_stride), src_stride, \
|
||||
x_offset, y_offset, \
|
||||
dst + 48 + (start_row * dst_stride), dst_stride, \
|
||||
sec + 48 + (start_row * w), w, height, &sse2); \
|
||||
se += se2; \
|
||||
long_sse += sse2; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
se = ROUND_POWER_OF_TWO(se, 4); \
|
||||
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
|
||||
*sse_ptr = sse; \
|
||||
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
|
||||
}
|
||||
|
||||
|
||||
#define FNS(opt1) \
|
||||
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
|
||||
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
|
||||
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
|
||||
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
|
||||
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
|
||||
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
|
||||
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
|
||||
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
|
||||
FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
|
||||
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
|
||||
FN(8, 4, 8, 3, 2, opt1, (int64_t));
|
||||
|
||||
FNS(sse2);
|
||||
|
||||
#undef FNS
|
||||
#undef FN
|
||||
#endif // CONFIG_USE_X86INC
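The 10- and 12-bit variants above scale their accumulators back onto the 8-bit range before applying the variance formula: a 10-bit difference is up to 4x an 8-bit one, so the sum is shifted down by 2 and the squared sum by 4; for 12-bit the shifts are 4 and 8, with the squared sum kept in a 64-bit accumulator until then. ROUND_POWER_OF_TWO is the usual libvpx rounding right shift; assuming its standard definition, a small worked example:

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* e.g. a raw 12-bit-depth SSE of 1000000 is stored as
 *   ROUND_POWER_OF_TWO(1000000, 8) == (1000000 + 128) >> 8 == 3906 */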
@ -8,6 +8,8 @@
|
|||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
%define program_name vpx
|
||||
|
||||
%include "third_party/x86inc/x86inc.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
@ -39,7 +41,7 @@ bilin_filter_m_ssse3: times 8 db 16, 0
|
|||
|
||||
SECTION .text
|
||||
|
||||
; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
|
||||
; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
|
||||
; int x_offset, int y_offset,
|
||||
; const uint8_t *dst, ptrdiff_t dst_stride,
|
||||
; int height, unsigned int *sse);
|
|
@ -91,3 +91,93 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
|
|||
sse, &sum, vpx_get32x32var_avx2, 32);
|
||||
return *sse - (((int64_t)sum * sum) >> 11);
|
||||
}
|
||||
|
||||
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
|
||||
int x_offset, int y_offset,
|
||||
const uint8_t *dst, int dst_stride,
|
||||
int height,
|
||||
unsigned int *sse);
|
||||
|
||||
unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
const uint8_t *sec,
|
||||
int sec_stride,
|
||||
int height,
|
||||
unsigned int *sseptr);
|
||||
|
||||
unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
unsigned int *sse) {
|
||||
unsigned int sse1;
|
||||
const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
|
||||
y_offset, dst, dst_stride,
|
||||
64, &sse1);
|
||||
unsigned int sse2;
|
||||
const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride,
|
||||
x_offset, y_offset,
|
||||
dst + 32, dst_stride,
|
||||
64, &sse2);
|
||||
const int se = se1 + se2;
|
||||
*sse = sse1 + sse2;
|
||||
return *sse - (((int64_t)se * se) >> 12);
|
||||
}
|
||||
|
||||
unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
unsigned int *sse) {
|
||||
const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
|
||||
y_offset, dst, dst_stride,
|
||||
32, sse);
|
||||
return *sse - (((int64_t)se * se) >> 10);
|
||||
}
|
||||
|
||||
unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
unsigned int *sse,
|
||||
const uint8_t *sec) {
|
||||
unsigned int sse1;
|
||||
const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
|
||||
y_offset, dst, dst_stride,
|
||||
sec, 64, 64, &sse1);
|
||||
unsigned int sse2;
|
||||
const int se2 =
|
||||
vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
|
||||
y_offset, dst + 32, dst_stride,
|
||||
sec + 32, 64, 64, &sse2);
|
||||
const int se = se1 + se2;
|
||||
|
||||
*sse = sse1 + sse2;
|
||||
|
||||
return *sse - (((int64_t)se * se) >> 12);
|
||||
}
|
||||
|
||||
unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
unsigned int *sse,
|
||||
const uint8_t *sec) {
|
||||
// Process 32 elements in parallel.
|
||||
const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
|
||||
y_offset, dst, dst_stride,
|
||||
sec, 32, 32, sse);
|
||||
return *sse - (((int64_t)se * se) >> 10);
|
||||
}
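The 64-wide wrappers above run the 32-pixel-wide AVX2 kernel twice, once per half of the block, add the two signed sums and the two SSEs, and only then apply the variance shift (>> 12 for the 4096-pixel block, >> 10 for 1024). A hypothetical usage sketch, assuming valid 8-bit src/ref buffers with the given strides:

#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

/* Variance of a 64x64 block sampled at the half-pel position in both
 * directions; offset 8 is the half-pel case the kernel special-cases. */
static unsigned int half_pel_var64x64(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  return vpx_sub_pixel_variance64x64_avx2(src, src_stride, 8, 8,
                                          ref, ref_stride, &sse);
}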
@ -11,6 +11,27 @@
|
|||
#include <immintrin.h> // AVX2
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
|
||||
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
|
||||
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
|
||||
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
|
||||
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
|
||||
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
|
||||
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
|
||||
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
|
||||
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
|
||||
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
|
||||
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
|
||||
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
|
||||
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
|
||||
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
|
||||
};
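Each 32-byte row of this table is one two-tap bilinear filter pair, replicated sixteen times so that _mm256_maddubs_epi16 can apply it to sixteen interleaved byte pairs at once; the taps of every pair sum to 16, and the FILTER_SRC macro further down adds 8 and shifts right by 4. The scalar equivalent of that filter-round-shift step for a single pixel pair (helper name is illustrative):

#include <stdint.h>

/* a and b are two neighbouring source bytes, (f0, f1) is a tap pair taken
 * from one row of bilinear_filters_avx2, with f0 + f1 == 16. */
static uint8_t bilinear_tap(uint8_t a, uint8_t b, int f0, int f1) {
  return (uint8_t)((a * f0 + b * f1 + 8) >> 4);  /* rounded to nearest */
}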
void vpx_get16x16var_avx2(const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
|
@ -213,3 +234,494 @@ void vpx_get32x32var_avx2(const unsigned char *src_ptr,
|
|||
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
|
||||
}
|
||||
}
|
||||
|
||||
#define FILTER_SRC(filter) \
|
||||
/* filter the source */ \
|
||||
exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
|
||||
exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
|
||||
\
|
||||
/* add 8 to source */ \
|
||||
exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
|
||||
exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
|
||||
\
|
||||
/* divide source by 16 */ \
|
||||
exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
|
||||
exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
|
||||
|
||||
#define MERGE_WITH_SRC(src_reg, reg) \
|
||||
exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
|
||||
exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
|
||||
|
||||
#define LOAD_SRC_DST \
|
||||
/* load source and destination */ \
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
|
||||
dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
|
||||
|
||||
#define AVG_NEXT_SRC(src_reg, size_stride) \
|
||||
src_next_reg = _mm256_loadu_si256((__m256i const *) \
|
||||
(src + size_stride)); \
|
||||
/* average between current and next stride source */ \
|
||||
src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
|
||||
|
||||
#define MERGE_NEXT_SRC(src_reg, size_stride) \
|
||||
src_next_reg = _mm256_loadu_si256((__m256i const *) \
|
||||
(src + size_stride)); \
|
||||
MERGE_WITH_SRC(src_reg, src_next_reg)
|
||||
|
||||
#define CALC_SUM_SSE_INSIDE_LOOP \
|
||||
/* expand each byte to 2 bytes */ \
|
||||
exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
|
||||
exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
|
||||
/* source - dest */ \
|
||||
exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
|
||||
exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
|
||||
/* calculate sum */ \
|
||||
sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
|
||||
exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
|
||||
sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
|
||||
exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
|
||||
/* calculate sse */ \
|
||||
sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
|
||||
sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
|
||||
|
||||
// final reduction of sum and sse to scalars
|
||||
#define CALC_SUM_AND_SSE \
|
||||
res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
|
||||
sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
|
||||
sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
|
||||
sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
|
||||
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
|
||||
sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
|
||||
\
|
||||
sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
|
||||
sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
|
||||
\
|
||||
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
|
||||
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
|
||||
*((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
|
||||
_mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
|
||||
sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
|
||||
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
|
||||
sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
|
||||
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
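CALC_SUM_SSE_INSIDE_LOOP accumulates, for each row it is invoked on, the signed differences between the (filtered) source and the destination together with their squares; CALC_SUM_AND_SSE then sign-extends and horizontally reduces the two vector accumulators into the scalar *sse and the returned sum. The per-pixel operation is simply:

/* Scalar reference for one pixel of CALC_SUM_SSE_INSIDE_LOOP. */
static void accumulate_diff(int filtered_src, int dst_pixel,
                            int *sum, unsigned int *sse) {
  const int diff = filtered_src - dst_pixel;
  *sum += diff;
  *sse += (unsigned int)(diff * diff);
}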
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
int height,
|
||||
unsigned int *sse) {
|
||||
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
|
||||
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
|
||||
__m256i zero_reg;
|
||||
int i, sum;
|
||||
sum_reg = _mm256_set1_epi16(0);
|
||||
sse_reg = _mm256_set1_epi16(0);
|
||||
zero_reg = _mm256_set1_epi16(0);
|
||||
|
||||
// x_offset = 0 and y_offset = 0
|
||||
if (x_offset == 0) {
|
||||
if (y_offset == 0) {
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 0 and y_offset = 8
|
||||
} else if (y_offset == 8) {
|
||||
__m256i src_next_reg;
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, src_stride)
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 0 and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i filter, pw8, src_next_reg;
|
||||
|
||||
y_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, src_stride)
|
||||
FILTER_SRC(filter)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
// x_offset = 8 and y_offset = 0
|
||||
} else if (x_offset == 8) {
|
||||
if (y_offset == 0) {
|
||||
__m256i src_next_reg;
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 8 and y_offset = 8
|
||||
} else if (y_offset == 8) {
|
||||
__m256i src_next_reg, src_avg;
|
||||
// load the source and a second copy starting one byte further on
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
for (i = 0; i < height ; i++) {
|
||||
src_avg = src_reg;
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
// average the previous and current source averages
|
||||
src_avg = _mm256_avg_epu8(src_avg, src_reg);
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_avg, zero_reg)
|
||||
// save current source average
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 8 and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i filter, pw8, src_next_reg, src_avg;
|
||||
y_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
// load the source and a second copy starting one byte further on
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
for (i = 0; i < height ; i++) {
|
||||
// save current source average
|
||||
src_avg = src_reg;
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
MERGE_WITH_SRC(src_avg, src_reg)
|
||||
FILTER_SRC(filter)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = 0
|
||||
} else {
|
||||
if (y_offset == 0) {
|
||||
__m256i filter, pw8, src_next_reg;
|
||||
x_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = 8
|
||||
} else if (y_offset == 8) {
|
||||
__m256i filter, pw8, src_next_reg, src_pack;
|
||||
x_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
// pack the 16-bit results back to 8 bits in the low and high lanes
|
||||
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
for (i = 0; i < height ; i++) {
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
// average the previous and current packed rows
|
||||
src_pack = _mm256_avg_epu8(src_pack, src_reg);
|
||||
MERGE_WITH_SRC(src_pack, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src_pack = src_reg;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
|
||||
x_offset <<= 5;
|
||||
xfilter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
y_offset <<= 5;
|
||||
yfilter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
// load the source and a second copy starting one byte further on
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
|
||||
FILTER_SRC(xfilter)
|
||||
// pack the 16-bit results back to 8 bits in the low and high lanes
|
||||
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
for (i = 0; i < height ; i++) {
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(xfilter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
// merge the previous and current packed rows
|
||||
MERGE_WITH_SRC(src_pack, src_reg)
|
||||
// filter the source
|
||||
FILTER_SRC(yfilter)
|
||||
src_pack = src_reg;
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
}
|
||||
CALC_SUM_AND_SSE
|
||||
return sum;
|
||||
}
|
||||
|
||||
unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
int y_offset,
|
||||
const uint8_t *dst,
|
||||
int dst_stride,
|
||||
const uint8_t *sec,
|
||||
int sec_stride,
|
||||
int height,
|
||||
unsigned int *sse) {
|
||||
__m256i sec_reg;
|
||||
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
|
||||
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
|
||||
__m256i zero_reg;
|
||||
int i, sum;
|
||||
sum_reg = _mm256_set1_epi16(0);
|
||||
sse_reg = _mm256_set1_epi16(0);
|
||||
zero_reg = _mm256_set1_epi16(0);
|
||||
|
||||
// x_offset = 0 and y_offset = 0
|
||||
if (x_offset == 0) {
|
||||
if (y_offset == 0) {
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
|
||||
sec+= sec_stride;
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
} else if (y_offset == 8) {
|
||||
__m256i src_next_reg;
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, src_stride)
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
|
||||
sec+= sec_stride;
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 0 and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i filter, pw8, src_next_reg;
|
||||
|
||||
y_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, src_stride)
|
||||
FILTER_SRC(filter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
|
||||
sec+= sec_stride;
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
// x_offset = 8 and y_offset = 0
|
||||
} else if (x_offset == 8) {
|
||||
if (y_offset == 0) {
|
||||
__m256i src_next_reg;
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
|
||||
sec+= sec_stride;
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 8 and y_offset = 8
|
||||
} else if (y_offset == 8) {
|
||||
__m256i src_next_reg, src_avg;
|
||||
// load the source and a second copy starting one byte further on
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
for (i = 0; i < height ; i++) {
|
||||
// save current source average
|
||||
src_avg = src_reg;
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
// average the previous and current source averages
|
||||
src_avg = _mm256_avg_epu8(src_avg, src_reg);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
|
||||
sec+= sec_stride;
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_avg, zero_reg)
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = 8 and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i filter, pw8, src_next_reg, src_avg;
|
||||
y_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
// load the source and a second copy starting one byte further on
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
for (i = 0; i < height ; i++) {
|
||||
// save current source average
|
||||
src_avg = src_reg;
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
AVG_NEXT_SRC(src_reg, 1)
|
||||
MERGE_WITH_SRC(src_avg, src_reg)
|
||||
FILTER_SRC(filter)
|
||||
src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
|
||||
// expand each byte to 2 bytes
|
||||
MERGE_WITH_SRC(src_avg, zero_reg)
|
||||
sec+= sec_stride;
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = 0
|
||||
} else {
|
||||
if (y_offset == 0) {
|
||||
__m256i filter, pw8, src_next_reg;
|
||||
x_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
for (i = 0; i < height ; i++) {
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
|
||||
MERGE_WITH_SRC(src_reg, zero_reg)
|
||||
sec+= sec_stride;
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
src+= src_stride;
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = 8
|
||||
} else if (y_offset == 8) {
|
||||
__m256i filter, pw8, src_next_reg, src_pack;
|
||||
x_offset <<= 5;
|
||||
filter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
// pack the 16-bit results back to 8 bits in the low and high lanes
|
||||
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
for (i = 0; i < height ; i++) {
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(filter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
// average the previous and current packed rows
|
||||
src_pack = _mm256_avg_epu8(src_pack, src_reg);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
|
||||
sec+= sec_stride;
|
||||
MERGE_WITH_SRC(src_pack, zero_reg)
|
||||
src_pack = src_reg;
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
// x_offset = bilin interpolation and y_offset = bilin interpolation
|
||||
} else {
|
||||
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
|
||||
x_offset <<= 5;
|
||||
xfilter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + x_offset));
|
||||
y_offset <<= 5;
|
||||
yfilter = _mm256_load_si256((__m256i const *)
|
||||
(bilinear_filters_avx2 + y_offset));
|
||||
pw8 = _mm256_set1_epi16(8);
|
||||
// load the source and a second copy starting one byte further on
|
||||
src_reg = _mm256_loadu_si256((__m256i const *) (src));
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
|
||||
FILTER_SRC(xfilter)
|
||||
// pack the 16-bit results back to 8 bits in the low and high lanes
|
||||
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
for (i = 0; i < height ; i++) {
|
||||
src+= src_stride;
|
||||
LOAD_SRC_DST
|
||||
MERGE_NEXT_SRC(src_reg, 1)
|
||||
FILTER_SRC(xfilter)
|
||||
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
// merge the previous and current packed rows
|
||||
MERGE_WITH_SRC(src_pack, src_reg)
|
||||
// filter the source
|
||||
FILTER_SRC(yfilter)
|
||||
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
|
||||
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
|
||||
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
|
||||
MERGE_WITH_SRC(src_pack, zero_reg)
|
||||
src_pack = src_reg;
|
||||
sec+= sec_stride;
|
||||
CALC_SUM_SSE_INSIDE_LOOP
|
||||
dst+= dst_stride;
|
||||
}
|
||||
}
|
||||
}
|
||||
CALC_SUM_AND_SSE
|
||||
return sum;
|
||||
}
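The main difference between this avg kernel and the plain one above is that the bilinearly filtered source is first combined with the second predictor via _mm256_avg_epu8, i.e. a rounding byte average, before the difference against dst is accumulated. In scalar terms:

#include <stdint.h>

/* What _mm256_avg_epu8 computes per byte when folding in second_pred. */
static uint8_t average_with_second_pred(uint8_t filtered,
                                        uint8_t second_pred) {
  return (uint8_t)((filtered + second_pred + 1) >> 1);
}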
@ -11,6 +11,8 @@
|
|||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
%define mmx_filter_shift 7
|
||||
|
||||
;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
|
||||
global sym(vpx_get_mb_ss_mmx) PRIVATE
|
||||
sym(vpx_get_mb_ss_mmx):
|
||||
|
@ -52,7 +54,6 @@ sym(vpx_get_mb_ss_mmx):
|
|||
movsxd rcx, dword ptr [rsp+4]
|
||||
add rax, rcx
|
||||
|
||||
|
||||
; begin epilog
|
||||
add rsp, 8
|
||||
pop rdi
|
||||
|
@ -62,7 +63,6 @@ sym(vpx_get_mb_ss_mmx):
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void vpx_get8x8var_mmx
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
|
@ -83,7 +83,6 @@ sym(vpx_get8x8var_mmx):
|
|||
sub rsp, 16
|
||||
; end prolog
|
||||
|
||||
|
||||
pxor mm5, mm5 ; Blank mmx6
|
||||
pxor mm6, mm6 ; Blank mmx7
|
||||
pxor mm7, mm7 ; Blank mmx7
|
||||
|
@ -117,7 +116,6 @@ sym(vpx_get8x8var_mmx):
|
|||
paddd mm7, mm0 ; accumulate in mm7
|
||||
paddd mm7, mm2 ; accumulate in mm7
|
||||
|
||||
|
||||
; Row 2
|
||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
||||
movq mm2, mm0 ; Take copies
|
||||
|
@ -298,7 +296,6 @@ sym(vpx_get8x8var_mmx):
|
|||
mov dword ptr [rdi], edx
|
||||
xor rax, rax ; return 0
|
||||
|
||||
|
||||
; begin epilog
|
||||
add rsp, 16
|
||||
pop rbx
|
||||
|
@ -308,8 +305,6 @@ sym(vpx_get8x8var_mmx):
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
|
||||
;void
|
||||
;vpx_get4x4var_mmx
|
||||
;(
|
||||
|
@ -331,7 +326,6 @@ sym(vpx_get4x4var_mmx):
|
|||
sub rsp, 16
|
||||
; end prolog
|
||||
|
||||
|
||||
pxor mm5, mm5 ; Blank mmx6
|
||||
pxor mm6, mm6 ; Blank mmx7
|
||||
pxor mm7, mm7 ; Blank mmx7
|
||||
|
@ -354,7 +348,6 @@ sym(vpx_get4x4var_mmx):
|
|||
movd mm1, [rbx] ; Copy four bytes to mm1
|
||||
paddd mm7, mm0 ; accumulate in mm7
|
||||
|
||||
|
||||
; Row 2
|
||||
movd mm0, [rax] ; Copy four bytes to mm0
|
||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
||||
|
@ -393,7 +386,6 @@ sym(vpx_get4x4var_mmx):
|
|||
pmaddwd mm0, mm0 ; square and accumulate
|
||||
paddd mm7, mm0 ; accumulate in mm7
|
||||
|
||||
|
||||
; Now accumulate the final results.
|
||||
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
|
||||
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
|
||||
|
@ -413,7 +405,6 @@ sym(vpx_get4x4var_mmx):
|
|||
mov dword ptr [rdi], edx
|
||||
xor rax, rax ; return 0
|
||||
|
||||
|
||||
; begin epilog
|
||||
add rsp, 16
|
||||
pop rbx
|
||||
|
@ -422,3 +413,332 @@ sym(vpx_get4x4var_mmx):
|
|||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vpx_filter_block2d_bil4x4_var_mmx
;(
;  unsigned char *ref_ptr,
;  int ref_pixels_per_line,
;  unsigned char *src_ptr,
;  int src_pixels_per_line,
;  unsigned short *HFilter,
;  unsigned short *VFilter,
;  int *sum,
;  unsigned int *sumsquared
;)
global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil4x4_var_mmx):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT rbx
    push rsi
    push rdi
    sub rsp, 16
    ; end prolog

    pxor mm6, mm6
    pxor mm7, mm7

    mov rax, arg(4) ;HFilter
    mov rdx, arg(5) ;VFilter

    mov rsi, arg(0) ;ref_ptr
    mov rdi, arg(2) ;src_ptr

    mov rcx, 4
    pxor mm0, mm0

    movd mm1, [rsi]
    movd mm3, [rsi+1]

    punpcklbw mm1, mm0
    pmullw mm1, [rax]

    punpcklbw mm3, mm0
    pmullw mm3, [rax+8]

    paddw mm1, mm3
    paddw mm1, [GLOBAL(mmx_bi_rd)]

    psraw mm1, mmx_filter_shift
    movq mm5, mm1

%if ABI_IS_32BIT
    add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
    movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
    add rsi, r8
%endif

.filter_block2d_bil4x4_var_mmx_loop:
    movd mm1, [rsi]
    movd mm3, [rsi+1]

    punpcklbw mm1, mm0
    pmullw mm1, [rax]

    punpcklbw mm3, mm0
    pmullw mm3, [rax+8]

    paddw mm1, mm3
    paddw mm1, [GLOBAL(mmx_bi_rd)]

    psraw mm1, mmx_filter_shift
    movq mm3, mm5

    movq mm5, mm1
    pmullw mm3, [rdx]

    pmullw mm1, [rdx+8]
    paddw mm1, mm3

    paddw mm1, [GLOBAL(mmx_bi_rd)]
    psraw mm1, mmx_filter_shift

    movd mm3, [rdi]
    punpcklbw mm3, mm0

    psubw mm1, mm3
    paddw mm6, mm1

    pmaddwd mm1, mm1
    paddd mm7, mm1

%if ABI_IS_32BIT
    add rsi, dword ptr arg(1) ;ref_pixels_per_line
    add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
    movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd r9, dword ptr arg(3) ;src_pixels_per_line
    add rsi, r8
    add rdi, r9
%endif
    sub rcx, 1
    jnz .filter_block2d_bil4x4_var_mmx_loop

    pxor mm3, mm3
    pxor mm2, mm2

    punpcklwd mm2, mm6
    punpckhwd mm3, mm6

    paddd mm2, mm3
    movq mm6, mm2

    psrlq mm6, 32
    paddd mm2, mm6

    psrad mm2, 16
    movq mm4, mm7

    psrlq mm4, 32
    paddd mm4, mm7

    mov rdi, arg(6) ;sum
    mov rsi, arg(7) ;sumsquared

    movd dword ptr [rdi], mm2
    movd dword ptr [rsi], mm4

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret
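
; Note: the 4x4 routine above applies the horizontal bilinear taps ([rax], [rax+8])
; to each row, blends the previous filtered row (held in mm5) with the current one
; using the vertical taps ([rdx], [rdx+8]), and accumulates the filtered-reference
; minus source difference as a running sum in mm6 and a sum of squares in mm7
; before storing them to *sum and *sumsquared.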

;void vpx_filter_block2d_bil_var_mmx
;(
;  unsigned char *ref_ptr,
;  int ref_pixels_per_line,
;  unsigned char *src_ptr,
;  int src_pixels_per_line,
;  unsigned int Height,
;  unsigned short *HFilter,
;  unsigned short *VFilter,
;  int *sum,
;  unsigned int *sumsquared
;)
global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil_var_mmx):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT rbx
    push rsi
    push rdi
    sub rsp, 16
    ; end prolog

    pxor mm6, mm6
    pxor mm7, mm7
    mov rax, arg(5) ;HFilter

    mov rdx, arg(6) ;VFilter
    mov rsi, arg(0) ;ref_ptr

    mov rdi, arg(2) ;src_ptr
    movsxd rcx, dword ptr arg(4) ;Height

    pxor mm0, mm0
    movq mm1, [rsi]

    movq mm3, [rsi+1]
    movq mm2, mm1

    movq mm4, mm3
    punpcklbw mm1, mm0

    punpckhbw mm2, mm0
    pmullw mm1, [rax]

    pmullw mm2, [rax]
    punpcklbw mm3, mm0

    punpckhbw mm4, mm0
    pmullw mm3, [rax+8]

    pmullw mm4, [rax+8]
    paddw mm1, mm3

    paddw mm2, mm4
    paddw mm1, [GLOBAL(mmx_bi_rd)]

    psraw mm1, mmx_filter_shift
    paddw mm2, [GLOBAL(mmx_bi_rd)]

    psraw mm2, mmx_filter_shift
    movq mm5, mm1

    packuswb mm5, mm2
%if ABI_IS_32BIT
    add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
    movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
    add rsi, r8
%endif

.filter_block2d_bil_var_mmx_loop:
    movq mm1, [rsi]
    movq mm3, [rsi+1]

    movq mm2, mm1
    movq mm4, mm3

    punpcklbw mm1, mm0
    punpckhbw mm2, mm0

    pmullw mm1, [rax]
    pmullw mm2, [rax]

    punpcklbw mm3, mm0
    punpckhbw mm4, mm0

    pmullw mm3, [rax+8]
    pmullw mm4, [rax+8]

    paddw mm1, mm3
    paddw mm2, mm4

    paddw mm1, [GLOBAL(mmx_bi_rd)]
    psraw mm1, mmx_filter_shift

    paddw mm2, [GLOBAL(mmx_bi_rd)]
    psraw mm2, mmx_filter_shift

    movq mm3, mm5
    movq mm4, mm5

    punpcklbw mm3, mm0
    punpckhbw mm4, mm0

    movq mm5, mm1
    packuswb mm5, mm2

    pmullw mm3, [rdx]
    pmullw mm4, [rdx]

    pmullw mm1, [rdx+8]
    pmullw mm2, [rdx+8]

    paddw mm1, mm3
    paddw mm2, mm4

    paddw mm1, [GLOBAL(mmx_bi_rd)]
    paddw mm2, [GLOBAL(mmx_bi_rd)]

    psraw mm1, mmx_filter_shift
    psraw mm2, mmx_filter_shift

    movq mm3, [rdi]
    movq mm4, mm3

    punpcklbw mm3, mm0
    punpckhbw mm4, mm0

    psubw mm1, mm3
    psubw mm2, mm4

    paddw mm6, mm1
    pmaddwd mm1, mm1

    paddw mm6, mm2
    pmaddwd mm2, mm2

    paddd mm7, mm1
    paddd mm7, mm2

%if ABI_IS_32BIT
    add rsi, dword ptr arg(1) ;ref_pixels_per_line
    add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
    movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd r9, dword ptr arg(3) ;src_pixels_per_line
    add rsi, r8
    add rdi, r9
%endif
    sub rcx, 1
    jnz .filter_block2d_bil_var_mmx_loop

    pxor mm3, mm3
    pxor mm2, mm2

    punpcklwd mm2, mm6
    punpckhwd mm3, mm6

    paddd mm2, mm3
    movq mm6, mm2

    psrlq mm6, 32
    paddd mm2, mm6

    psrad mm2, 16
    movq mm4, mm7

    psrlq mm4, 32
    paddd mm4, mm7

    mov rdi, arg(7) ;sum
    mov rsi, arg(8) ;sumsquared

    movd dword ptr [rdi], mm2
    movd dword ptr [rsi], mm4

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret

SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
    times 4 dw 64
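
; Note: the bilinear taps used with these routines always sum to 128 (see the
; bilinear_filters_mmx table in the C wrapper below), so each filtering pass is
; rounded by adding mmx_bi_rd (64) and shifted right by mmx_filter_shift (7,
; defined earlier in this file).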

@@ -10,12 +10,45 @@
#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"

DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
  { 128, 128, 128, 128,   0,   0,   0,   0 },
  { 112, 112, 112, 112,  16,  16,  16,  16 },
  {  96,  96,  96,  96,  32,  32,  32,  32 },
  {  80,  80,  80,  80,  48,  48,  48,  48 },
  {  64,  64,  64,  64,  64,  64,  64,  64 },
  {  48,  48,  48,  48,  80,  80,  80,  80 },
  {  32,  32,  32,  32,  96,  96,  96,  96 },
  {  16,  16,  16,  16, 112, 112, 112, 112 }
};
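
/* Each row of bilinear_filters_mmx holds one horizontal tap and one vertical tap
 * (each replicated four times) for a given 1/8-pel offset; the two taps of a row
 * always sum to 128. A scalar sketch of the first (horizontal) filtering pass that
 * the MMX assembly implements could look like the following -- illustrative only,
 * not part of the library:
 *
 *   static void bilinear_first_pass_c(const uint8_t *ref, uint16_t *out,
 *                                     int stride, int height, int width,
 *                                     const int16_t *filter) {
 *     int r, c;
 *     for (r = 0; r < height; ++r) {
 *       for (c = 0; c < width; ++c)
 *         out[c] = (ref[c] * filter[0] + ref[c + 1] * filter[4] + 64) >> 7;
 *       ref += stride;
 *       out += width;
 *     }
 *   }
 */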

extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
                              unsigned int *sse, int *sum);

extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const int16_t *HFilter,
                                              const int16_t *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);

extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           const int16_t *HFilter,
                                           const int16_t *VFilter,
                                           int *sum,
                                           unsigned int *sumsquared);

unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
                                 const unsigned char *b, int b_stride,
                                 unsigned int *sse) {
  unsigned int var;
  int avg;

@@ -25,8 +58,8 @@ unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
  return (var - (((unsigned int)avg * avg) >> 4));
}

unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
                                 const unsigned char *b, int b_stride,
                                 unsigned int *sse) {
  unsigned int var;
  int avg;

@@ -37,8 +70,8 @@ unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
  return (var - (((unsigned int)avg * avg) >> 6));
}

unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
                              const unsigned char *b, int b_stride,
                              unsigned int *sse) {
  unsigned int sse0, sse1, sse2, sse3, var;
  int sum0, sum1, sum2, sum3;

@@ -55,8 +88,8 @@ unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
  return var;
}

unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
                                   const unsigned char *b, int b_stride,
                                   unsigned int *sse) {
  unsigned int sse0, sse1, sse2, sse3, var;
  int sum0, sum1, sum2, sum3, avg;

@@ -74,8 +107,8 @@ unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
  return (var - (((unsigned int)avg * avg) >> 8));
}

unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
                                  const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
  unsigned int sse0, sse1, var;
  int sum0, sum1, avg;

@@ -89,8 +122,8 @@ unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
  return (var - (((unsigned int)avg * avg) >> 7));
}

unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
                                  const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
  unsigned int sse0, sse1, var;
  int sum0, sum1, avg;

@@ -105,3 +138,112 @@ unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
  return (var - (((unsigned int)avg * avg) >> 7));
}

uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
                                       int xoffset, int yoffset,
                                       const uint8_t *b, int b_stride,
                                       uint32_t *sse) {
  int xsum;
  unsigned int xxsum;
  vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
                                    bilinear_filters_mmx[xoffset],
                                    bilinear_filters_mmx[yoffset],
                                    &xsum, &xxsum);
  *sse = xxsum;
  return (xxsum - (((unsigned int)xsum * xsum) >> 4));
}
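
/* As in the plain variance functions above, the return value uses the identity
 * variance = SSE - sum^2 / N: for a 4x4 block N is 16, hence ">> 4"; the 8x8,
 * 16x8/8x16 and 16x16 versions shift by 6, 7 and 8 respectively. */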

uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
                                       int xoffset, int yoffset,
                                       const uint8_t *b, int b_stride,
                                       uint32_t *sse) {
  int xsum;
  uint32_t xxsum;
  vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
                                 bilinear_filters_mmx[xoffset],
                                 bilinear_filters_mmx[yoffset],
                                 &xsum, &xxsum);
  *sse = xxsum;
  return (xxsum - (((uint32_t)xsum * xsum) >> 6));
}

uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
                                         int xoffset, int yoffset,
                                         const uint8_t *b, int b_stride,
                                         uint32_t *sse) {
  int xsum0, xsum1;
  unsigned int xxsum0, xxsum1;

  vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
                                 bilinear_filters_mmx[xoffset],
                                 bilinear_filters_mmx[yoffset],
                                 &xsum0, &xxsum0);

  vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
                                 bilinear_filters_mmx[xoffset],
                                 bilinear_filters_mmx[yoffset],
                                 &xsum1, &xxsum1);

  xsum0 += xsum1;
  xxsum0 += xxsum1;

  *sse = xxsum0;
  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
}
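
/* The MMX helper only filters eight columns per call, so 16-wide blocks are
 * handled as two side-by-side 8-column passes (source and reference offset by
 * +8) whose sums and SSEs are added before the variance is formed. */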

uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
                                        int xoffset, int yoffset,
                                        const uint8_t *b, int b_stride,
                                        uint32_t *sse) {
  int xsum0, xsum1;
  unsigned int xxsum0, xxsum1;

  vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
                                 bilinear_filters_mmx[xoffset],
                                 bilinear_filters_mmx[yoffset],
                                 &xsum0, &xxsum0);

  vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
                                 bilinear_filters_mmx[xoffset],
                                 bilinear_filters_mmx[yoffset],
                                 &xsum1, &xxsum1);

  xsum0 += xsum1;
  xxsum0 += xxsum1;

  *sse = xxsum0;
  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
}

uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
                                        int xoffset, int yoffset,
                                        const uint8_t *b, int b_stride,
                                        uint32_t *sse) {
  int xsum;
  unsigned int xxsum;
  vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
                                 bilinear_filters_mmx[xoffset],
                                 bilinear_filters_mmx[yoffset],
                                 &xsum, &xxsum);
  *sse = xxsum;
  return (xxsum - (((uint32_t)xsum * xsum) >> 7));
}

uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
                                            const uint8_t *b, int b_stride,
                                            uint32_t *sse) {
  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
}

uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
                                            const uint8_t *b, int b_stride,
                                            uint32_t *sse) {
  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
}

uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
                                             const uint8_t *b, int b_stride,
                                             uint32_t *sse) {
  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
}
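
/* Offset 4 selects the { 64, 64 } row of bilinear_filters_mmx, i.e. the exact
 * half-pixel position, so the three halfpixvar wrappers are simply the (4,0),
 * (0,4) and (4,4) special cases of the 16x16 sub-pixel variance. */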

@@ -307,3 +307,171 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

#if CONFIG_USE_X86INC
// The 2 unused parameters are placeholders for the PIC enabled build.
// These definitions are for functions defined in subpel_variance.asm
#define DECL(w, opt) \
  int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                          ptrdiff_t src_stride, \
                                          int x_offset, int y_offset, \
                                          const uint8_t *dst, \
                                          ptrdiff_t dst_stride, \
                                          int height, unsigned int *sse, \
                                          void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt2); \
  DECL(8, opt1); \
  DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
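
// Each DECLS() invocation thus declares the 4-, 8- and 16-pixel-wide column
// helpers implemented in subpel_variance.asm; the 4-wide helper uses the plain
// opt2 suffix (sse for the sse2 build), and every helper returns the signed sum
// of differences while writing the SSE through *sse.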

#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse; \
  int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                y_offset, dst, dst_stride, \
                                                h, &sse, NULL, NULL); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
                                                   x_offset, y_offset, \
                                                   dst + 16, dst_stride, \
                                                   h, &sse2, NULL, NULL); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 32, dst_stride, \
                                                 h, &sse2, NULL, NULL); \
      se += se2; \
      sse += sse2; \
      se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 48, dst_stride, \
                                                 h, &sse2, NULL, NULL); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
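
// For blocks wider than the helper width wf, the macro stitches together up to
// four column calls (offsets +16, +32, +48) and then applies
// variance = SSE - sum^2 / (w * h) via the (wlog2 + hlog2) shift. As an
// illustrative reading (not extra generated code), FN(32, 16, 16, 5, 4, sse2,
// (uint32_t)) expands to a vpx_sub_pixel_variance32x16_sse2() that adds the
// results of two 16-wide, 16-row column passes before forming the variance.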

#define FNS(opt1, opt2) \
  FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
  FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
  FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
  FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
  FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
  FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
  FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
  FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
  FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
  FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
  FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
  FN(4, 4, 4, 2, 2, opt2, (uint32_t))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN

// The 2 unused parameters are placeholders for the PIC enabled build.
#define DECL(w, opt) \
  int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                              ptrdiff_t src_stride, \
                                              int x_offset, int y_offset, \
                                              const uint8_t *dst, \
                                              ptrdiff_t dst_stride, \
                                              const uint8_t *sec, \
                                              ptrdiff_t sec_stride, \
                                              int height, unsigned int *sse, \
                                              void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt2); \
  DECL(8, opt1); \
  DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS

#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                         int src_stride, \
                                                         int x_offset, \
                                                         int y_offset, \
                                                         const uint8_t *dst, \
                                                         int dst_stride, \
                                                         unsigned int *sseptr, \
                                                         const uint8_t *sec) { \
  unsigned int sse; \
  int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                    y_offset, dst, dst_stride, \
                                                    sec, w, h, &sse, NULL, \
                                                    NULL); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
                                                       x_offset, y_offset, \
                                                       dst + 16, dst_stride, \
                                                       sec + 16, w, h, &sse2, \
                                                       NULL, NULL); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 32, dst_stride, \
                                                     sec + 32, w, h, &sse2, \
                                                     NULL, NULL); \
      se += se2; \
      sse += sse2; \
      se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 48, dst_stride, \
                                                     sec + 48, w, h, &sse2, \
                                                     NULL, NULL); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sseptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
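
// The avg variants follow the same column-splitting scheme as FN above, but the
// underlying asm helpers additionally average the filtered prediction with the
// second predictor 'sec' (passed here with stride w) before computing the
// differences that feed the sum and SSE.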

#define FNS(opt1, opt2) \
  FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
  FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
  FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
  FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
  FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
  FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
  FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
  FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
  FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
  FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
  FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
  FN(4, 4, 4, 2, 2, opt2, (uint32_t))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN
#endif  // CONFIG_USE_X86INC