Move sub pixel variance to vpx_dsp

Change-Id: I66bf6720c396c89aa2d1fd26d5d52bf5d5e3dff1
Johann 2015-06-05 09:54:19 -07:00
Parent 155b9416b3
Commit 6a82f0d7fb
67 changed files with 5171 additions and 9177 deletions

File diff not shown because it is too large.

File diff not shown because it is too large.

View File

@@ -1,137 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/variance.h"
#include "vp8/common/filter.h"
// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
#if CONFIG_VP8_ENCODER
#if HAVE_MEDIA
#include "vp8/common/arm/bilinearfilter_arm.h"
unsigned int vp8_sub_pixel_variance8x8_armv6
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short first_pass[10*8];
unsigned char second_pass[8*8];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
src_pixels_per_line,
9, 8, HFilter);
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
8, 8, 8, VFilter);
return vpx_variance8x8_media(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
}
unsigned int vp8_sub_pixel_variance16x16_armv6
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short first_pass[36*16];
unsigned char second_pass[20*16];
const short *HFilter, *VFilter;
unsigned int var;
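/* Offsets are in eighth-pel units, so an offset of 4 is exactly half a pixel;
 * those cases are routed to the dedicated half-pixel averaging kernels
 * instead of the general bilinear filter. */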
if (xoffset == 4 && yoffset == 0)
{
var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, sse);
}
else if (xoffset == 0 && yoffset == 4)
{
var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, sse);
}
else if (xoffset == 4 && yoffset == 4)
{
var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, sse);
}
else
{
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
src_pixels_per_line,
17, 16, HFilter);
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
16, 16, 16, VFilter);
var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
}
return var;
}
#endif // HAVE_MEDIA
#if HAVE_NEON
extern unsigned int vp8_sub_pixel_variance16x16_neon_func
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
);
unsigned int vp8_sub_pixel_variance16x16_neon
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
if (xoffset == 4 && yoffset == 0)
return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
else if (xoffset == 0 && yoffset == 4)
return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
else if (xoffset == 4 && yoffset == 4)
return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
else
return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
}
#endif // HAVE_NEON
#endif // CONFIG_VP8_ENCODER

View File

@@ -20,7 +20,7 @@
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/postproc.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"

View File

@@ -237,47 +237,6 @@ add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch,
specialize qw/vp8_bilinear_predict4x4 mmx media/;
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
#
# Sub-pixel Variance
#
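# add_proto declares the RTCD prototype, specialize lists the per-ISA
# implementations, and assignments such as
# $vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt map the
# generated name onto a differently named symbol (the SSE2 versions keep the
# historical _wmt suffix).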
add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/;
$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt;
add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media/;
$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt;
$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6;
add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/;
$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt;
add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/;
$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt;
add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon_asm/;
$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt;
$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6;
$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;
add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
#
# Encoder functions below this point.
#

View File

@@ -1,92 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_VARIANCE_H_
#define VP8_COMMON_VARIANCE_H_
#include "vpx_config.h"
#include "vpx/vpx_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef unsigned int(*vpx_sad_fn_t)(
const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride);
typedef void (*vp8_copy32xn_fn_t)(
const unsigned char *src_ptr,
int source_stride,
unsigned char *ref_ptr,
int ref_stride,
int n);
typedef void (*vpx_sad_multi_fn_t)(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_array,
int ref_stride,
unsigned int *sad_array);
typedef void (*vpx_sad_multi_d_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char * const ref_array[],
int ref_stride,
unsigned int *sad_array
);
typedef unsigned int (*vpx_variance_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned int *sse
);
typedef unsigned int (*vp8_subpixvariance_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
int xoffset,
int yoffset,
const unsigned char *ref_ptr,
int Refstride,
unsigned int *sse
);
typedef struct variance_vtable
{
vpx_sad_fn_t sdf;
vpx_variance_fn_t vf;
vp8_subpixvariance_fn_t svf;
vpx_variance_fn_t svf_halfpix_h;
vpx_variance_fn_t svf_halfpix_v;
vpx_variance_fn_t svf_halfpix_hv;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
#if ARCH_X86 || ARCH_X86_64
vp8_copy32xn_fn_t copymem;
#endif
} vp8_variance_fn_ptr_t;
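/* One vp8_variance_fn_ptr_t is filled in per block size and handed to the
 * encoder's motion search, which calls the SAD, variance and sub-pixel
 * variance kernels through these pointers. */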
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_VARIANCE_H_

View File

@@ -1,337 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "filter.h"
#include "variance.h"
/* This is a bad idea.
* ctz = count trailing zeros */
static int ctz(int a) {
int b = 0;
while (a != 1) {
a >>= 1;
b++;
}
return b;
}
static unsigned int variance(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
int w,
int h,
unsigned int *sse)
{
int i, j;
int diff, sum;
sum = 0;
*sse = 0;
for (i = 0; i < h; i++)
{
for (j = 0; j < w; j++)
{
diff = src_ptr[j] - ref_ptr[j];
sum += diff;
*sse += diff * diff;
}
src_ptr += source_stride;
ref_ptr += recon_stride;
}
return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
}
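/* The return value is the textbook identity
 *   variance = sse - sum^2 / (w * h)
 * with the division done as a shift: for the power-of-two block sizes used
 * here, ctz(w) + ctz(h) == log2(w * h), e.g. a shift by 8 for a 16x16 block. */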
/****************************************************************************
*
* ROUTINE : filter_block2d_bil_first_pass
*
* INPUTS : UINT8 *src_ptr : Pointer to source block.
* UINT32 src_pixels_per_line : Stride of input block.
* UINT32 pixel_step : Offset between filter input samples (see notes).
* UINT32 output_height : Input block height.
* UINT32 output_width : Input block width.
* INT32 *vp8_filter : Array of 2 bi-linear filter taps.
*
* OUTPUTS : INT32 *output_ptr : Pointer to filtered block.
*
* RETURNS : void
*
* FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
* either horizontal or vertical direction to produce the
* filtered output block. Used to implement first-pass
* of 2-D separable filter.
*
* SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
* Two filter taps should sum to VP8_FILTER_WEIGHT.
* pixel_step defines whether the filter is applied
* horizontally (pixel_step=1) or vertically (pixel_step=stride).
* It defines the offset required to move from one input
* to the next.
*
****************************************************************************/
static void var_filter_block2d_bil_first_pass
(
const unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
)
{
unsigned int i, j;
for (i = 0; i < output_height; i++)
{
for (j = 0; j < output_width; j++)
{
/* Apply bilinear filter */
output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[pixel_step] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
src_ptr++;
}
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
/****************************************************************************
*
* ROUTINE : filter_block2d_bil_second_pass
*
* INPUTS : INT32 *src_ptr : Pointer to source block.
* UINT32 src_pixels_per_line : Stride of input block.
* UINT32 pixel_step : Offset between filter input samples (see notes).
* UINT32 output_height : Input block height.
* UINT32 output_width : Input block width.
* INT32 *vp8_filter : Array of 2 bi-linear filter taps.
*
* OUTPUTS : UINT16 *output_ptr : Pointer to filtered block.
*
* RETURNS : void
*
* FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
* either horizontal or vertical direction to produce the
* filtered output block. Used to implement second-pass
* of 2-D separable filter.
*
* SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
* Two filter taps should sum to VP8_FILTER_WEIGHT.
* pixel_step defines whether the filter is applied
* horizontally (pixel_step=1) or vertically (pixel_step=stride).
* It defines the offset required to move from one input
* to the next.
*
****************************************************************************/
static void var_filter_block2d_bil_second_pass
(
const unsigned short *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const short *vp8_filter
)
{
unsigned int i, j;
int Temp;
for (i = 0; i < output_height; i++)
{
for (j = 0; j < output_width; j++)
{
/* Apply filter */
Temp = ((int)src_ptr[0] * vp8_filter[0]) +
((int)src_ptr[pixel_step] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2);
output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
src_ptr++;
}
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
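/* Together the two passes form a separable 2-tap bilinear interpolator: the
 * horizontal pass is run on (height + 1) rows so that the vertical pass,
 * which reads samples pixel_step apart, has the extra row it needs (for
 * example a 9x8 intermediate for the 8x8 functions below). */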
unsigned int vp8_sub_pixel_variance4x4_c
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned char temp2[20*16];
const short *HFilter, *VFilter;
unsigned short FData3[5*4]; /* Temp data buffer used in filtering */
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
/* First filter 1d Horizontal */
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
/* Now filter Vertically */
var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
}
unsigned int vp8_sub_pixel_variance8x8_c
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short FData3[9*8]; /* Temp data buffer used in filtering */
unsigned char temp2[20*16];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
}
unsigned int vp8_sub_pixel_variance16x16_c
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short FData3[17*16]; /* Temp data buffer used in filtering */
unsigned char temp2[20*16];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
}
unsigned int vp8_variance_halfpixvar16x16_h_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
ref_ptr, recon_stride, sse);
}
unsigned int vp8_variance_halfpixvar16x16_v_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
ref_ptr, recon_stride, sse);
}
unsigned int vp8_variance_halfpixvar16x16_hv_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
ref_ptr, recon_stride, sse);
}
unsigned int vp8_sub_pixel_variance16x8_c
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short FData3[16*9]; /* Temp data buffer used in filtering */
unsigned char temp2[20*16];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
}
unsigned int vp8_sub_pixel_variance8x16_c
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
unsigned short FData3[9*16]; /* Temp data buffer used in filtering */
unsigned char temp2[20*16];
const short *HFilter, *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
}

View File

@@ -1,972 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp8_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
;)
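;The routine dispatches on the offsets: xoffset == 0 jumps to the
;second-pass-only path, yoffset == 0 to the first-pass-only path, and both
;zero to a plain full-pixel difference, so the bilinear taps are applied only
;when they are actually needed.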
global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
sym(vp8_filter_block2d_bil_var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
push rbx
; end prolog
pxor xmm6, xmm6 ;
pxor xmm7, xmm7 ;
lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
movdqa xmm4, XMMWORD PTR [rsi]
lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je filter_block2d_bil_var_sse2_sp_only
shl rax, 5 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je filter_block2d_bil_var_sse2_fp_only
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
lea rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
filter_block2d_bil_var_sse2_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm3, xmm5 ;
movdqa xmm5, xmm1 ;
pmullw xmm3, [rdx] ;
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1 ;
jnz filter_block2d_bil_var_sse2_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
je filter_block2d_bil_var_sse2_full_pixel
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + rax]
filter_block2d_bil_sp_only_loop:
movq xmm3, QWORD PTR [rsi] ;
punpcklbw xmm3, xmm0 ;
movdqa xmm5, xmm3
pmullw xmm1, [rdx] ;
pmullw xmm3, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
movdqa xmm1, xmm5 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_sp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movq xmm2, QWORD PTR [rdi] ;
punpcklbw xmm2, xmm0 ;
psubw xmm1, xmm2 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_full_pixel_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
filter_block2d_bil_fp_only_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rdx]
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_fp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_variance:
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(7) ; sum
mov rdi, arg(8) ; sumsquared
movd [rsi], mm2 ; xsum
movd [rdi], mm4 ; xxsum
; begin epilog
pop rbx
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_horiz_vert_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
%if ABI_IS_32BIT=0
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
%else
add rsi, r8
%endif
vp8_half_horiz_vert_variance8x_h_1:
movq xmm1, QWORD PTR [rsi] ;
movq xmm2, QWORD PTR [rsi+1] ;
pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
pavgb xmm5, xmm1 ; xmm = vertical average of the above
punpcklbw xmm5, xmm0 ; xmm5 = words of above
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
movdqa xmm5, xmm1 ; save xmm1 for use on the next row
%if ABI_IS_32BIT
add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
%else
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz vp8_half_horiz_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(5) ; sum
mov rdi, arg(6) ; sumsquared
movd [rsi], mm2 ;
movd [rdi], mm4 ;
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_horiz_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
movdqu xmm5, XMMWORD PTR [rsi]
movdqu xmm3, XMMWORD PTR [rsi+1]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
lea rsi, [rsi + rax]
vp8_half_horiz_vert_variance16x_h_1:
movdqu xmm1, XMMWORD PTR [rsi] ;
movdqu xmm2, XMMWORD PTR [rsi+1] ;
pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
pavgb xmm5, xmm1 ; xmm = vertical average of the above
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = words of above
punpckhbw xmm4, xmm0
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
movq xmm3, QWORD PTR [rdi+8]
punpcklbw xmm3, xmm0
psubw xmm4, xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm1 ; save xmm1 for use on the next row
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz vp8_half_horiz_vert_variance16x_h_1 ;
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_vert_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
%if ABI_IS_32BIT=0
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
vp8_half_vert_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
punpcklbw xmm5, xmm0 ; xmm5 = words of above
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
%if ABI_IS_32BIT
add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
%else
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz vp8_half_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(5) ; sum
mov rdi, arg(6) ; sumsquared
movd [rsi], mm2 ;
movd [rdi], mm4 ;
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
movdqu xmm5, XMMWORD PTR [rsi]
lea rsi, [rsi + rax ]
pxor xmm0, xmm0
vp8_half_vert_variance16x_h_1:
movdqu xmm3, XMMWORD PTR [rsi]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0
punpckhbw xmm4, xmm0
movq xmm2, QWORD PTR [rdi]
punpcklbw xmm2, xmm0
psubw xmm5, xmm2
movq xmm2, QWORD PTR [rdi+8]
punpcklbw xmm2, xmm0
psubw xmm4, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm3
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1
jnz vp8_half_vert_variance16x_h_1
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_horiz_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
%if ABI_IS_32BIT=0
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse eaccumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
pxor xmm0, xmm0 ;
vp8_half_horiz_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
punpcklbw xmm5, xmm0 ; xmm5 = words of above
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
%if ABI_IS_32BIT
add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
%else
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz vp8_half_horiz_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(5) ; sum
mov rdi, arg(6) ; sumsquared
movd [rsi], mm2 ;
movd [rdi], mm4 ;
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_horiz_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error accumulator
pxor xmm7, xmm7 ; sse accumulator
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
vp8_half_horiz_variance16x_h_1:
movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
movdqa xmm1, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = words of above
punpckhbw xmm1, xmm0
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
movq xmm2, QWORD PTR [rdi+8]
punpcklbw xmm2, xmm0
psubw xmm5, xmm3 ; xmm5 -= xmm3
psubw xmm1, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm1
pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
pmaddwd xmm1, xmm1
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm1
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz vp8_half_horiz_variance16x_h_1 ;
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
times 8 dw 64
align 16
vp8_bilinear_filters_sse2:
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
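;Each row of vp8_bilinear_filters_sse2 stores the first tap in its first eight
;words and the second tap in the next eight; the taps sum to 128
;(VP8_FILTER_WEIGHT), so pmullw followed by the xmm_bi_rd rounding constant
;and a shift by 7 produces the bilinear blend.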

View File

@@ -1,364 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp8_filter_block2d_bil_var_ssse3
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
;)
;Note: The filter coefficient at offset=0 is 128. Since the second register
;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp8_filter_block2d_bil_var_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6
pxor xmm7, xmm7
lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je .filter_block2d_bil_var_ssse3_sp_only
shl rax, 4 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je .filter_block2d_bil_var_ssse3_fp_only
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi+1]
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1
punpckhbw xmm2, xmm1
pmaddubsw xmm0, [rax]
pmaddubsw xmm2, [rax]
paddw xmm0, [GLOBAL(xmm_bi_rd)]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
psraw xmm0, xmm_filter_shift
psraw xmm2, xmm_filter_shift
packuswb xmm0, xmm2
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + r8]
%endif
.filter_block2d_bil_var_ssse3_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
punpcklbw xmm1, xmm2
punpckhbw xmm3, xmm2
pmaddubsw xmm1, [rax]
pmaddubsw xmm3, [rax]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm3, xmm_filter_shift
packuswb xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm0, xmm1
movdqa xmm3, xmm2
punpcklbw xmm2, xmm1
punpckhbw xmm3, xmm1
pmaddubsw xmm2, [rdx]
pmaddubsw xmm3, [rdx]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm2, xmm_filter_shift
psraw xmm3, xmm_filter_shift
movq xmm1, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm1, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm2, xmm1
psubw xmm3, xmm5
paddw xmm6, xmm2
paddw xmm6, xmm3
pmaddwd xmm2, xmm2
pmaddwd xmm3, xmm3
paddd xmm7, xmm2
paddd xmm7, xmm3
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rsi, [rsi + r8]
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_var_ssse3_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; Both xoffset =0 and yoffset=0
je .filter_block2d_bil_var_ssse3_full_pixel
shl rdx, 4
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movdqu xmm1, XMMWORD PTR [rsi]
movdqa xmm0, xmm1
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
lea rsi, [rsi + rax]
.filter_block2d_bil_sp_only_loop:
movdqu xmm3, XMMWORD PTR [rsi]
movdqa xmm2, xmm1
movdqa xmm0, xmm3
punpcklbw xmm1, xmm3
punpckhbw xmm2, xmm3
pmaddubsw xmm1, [rdx]
pmaddubsw xmm2, [rdx]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm2, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm2, xmm_filter_shift
movq xmm3, QWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm3, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm1, xmm3
psubw xmm2, xmm5
paddw xmm6, xmm1
paddw xmm6, xmm2
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm7, xmm1
paddd xmm7, xmm2
movdqa xmm1, xmm0
lea rsi, [rsi + rax] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_sp_only_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0
.filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi]
punpcklbw xmm1, xmm0
movq xmm2, QWORD PTR [rsi+8]
punpcklbw xmm2, xmm0
movq xmm3, QWORD PTR [rdi]
punpcklbw xmm3, xmm0
movq xmm4, QWORD PTR [rdi+8]
punpcklbw xmm4, xmm0
psubw xmm1, xmm3
psubw xmm2, xmm4
paddw xmm6, xmm1
paddw xmm6, xmm2
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm7, xmm1
paddd xmm7, xmm2
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rdx] ;src_pixels_per_line
sub rcx, 1
jnz .filter_block2d_bil_full_pixel_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_var_ssse3_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
.filter_block2d_bil_fp_only_loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rsi+1]
movdqa xmm3, xmm1
punpcklbw xmm1, xmm2
punpckhbw xmm3, xmm2
pmaddubsw xmm1, [rax]
pmaddubsw xmm3, [rax]
paddw xmm1, [GLOBAL(xmm_bi_rd)]
paddw xmm3, [GLOBAL(xmm_bi_rd)]
psraw xmm1, xmm_filter_shift
psraw xmm3, xmm_filter_shift
movq xmm2, XMMWORD PTR [rdi]
pxor xmm4, xmm4
punpcklbw xmm2, xmm4
movq xmm5, QWORD PTR [rdi+8]
punpcklbw xmm5, xmm4
psubw xmm1, xmm2
psubw xmm3, xmm5
paddw xmm6, xmm1
paddw xmm6, xmm3
pmaddwd xmm1, xmm1
pmaddwd xmm3, xmm3
paddd xmm7, xmm1
paddd xmm7, xmm3
lea rsi, [rsi + rdx]
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1
jnz .filter_block2d_bil_fp_only_loop
jmp .filter_block2d_bil_variance
.filter_block2d_bil_variance:
pxor xmm0, xmm0
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(7) ;[Sum]
mov rdi, arg(8) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
xmm_bi_rd:
times 8 dw 64
align 16
vp8_bilinear_filters_ssse3:
times 8 db 128, 0
times 8 db 112, 16
times 8 db 96, 32
times 8 db 80, 48
times 8 db 64, 64
times 8 db 48, 80
times 8 db 32, 96
times 8 db 16, 112
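;Here each row interleaves eight (tap0, tap1) byte pairs for pmaddubsw; the
;offset-0 row {128, 0} is never multiplied because 128 does not fit in a
;signed byte, which is why the zero-offset cases above take separate paths.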

View File

@@ -1,157 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "vpx_config.h"
#include "vp8/common/variance.h"
#include "vpx_ports/mem.h"
extern void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp8_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp8_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
extern void vp8_filter_block2d_bil_var_ssse3
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
unsigned int vp8_sub_pixel_variance16x16_ssse3
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0;
unsigned int xxsum0;
/* note we could avoid these if statements if the calling function
* just called the appropriate functions inside.
*/
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else
{
vp8_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_variance16x8_ssse3
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0;
unsigned int xxsum0;
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else
{
vp8_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}

View File

@@ -1,353 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define mmx_filter_shift 7
;void vp8_filter_block2d_bil4x4_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil4x4_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(4) ;HFilter ;
mov rdx, arg(5) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
mov rcx, 4 ;
pxor mm0, mm0 ;
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm5, mm1
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
add rsi, r8
%endif
.filter_block2d_bil4x4_var_mmx_loop:
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm5, mm1 ;
pmullw mm3, [rdx] ;
pmullw mm1, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movd mm3, [rdi] ;
punpcklbw mm3, mm0 ;
psubw mm1, mm3 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddd mm7, mm1 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil4x4_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(6) ;sum
mov rsi, arg(7) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_filter_block2d_bil_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(5) ;HFilter ;
mov rdx, arg(6) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
pxor mm0, mm0 ;
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm5, mm1
packuswb mm5, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
add rsi, r8
%endif
.filter_block2d_bil_var_mmx_loop:
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm4, mm5 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
movq mm5, mm1 ;
packuswb mm5, mm2 ;
pmullw mm3, [rdx] ;
pmullw mm4, [rdx] ;
pmullw mm1, [rdx+8] ;
pmullw mm2, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
psraw mm2, mmx_filter_shift ;
movq mm3, [rdi] ;
movq mm4, mm3 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
psubw mm1, mm3 ;
psubw mm2, mm4 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddw mm6, mm2 ;
pmaddwd mm2, mm2 ;
paddd mm7, mm1 ;
paddd mm7, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(7) ;sum
mov rsi, arg(8) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
times 4 dw 64

View File

@@ -1,244 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "vpx_config.h"
#include "vp8/common/variance.h"
#include "vpx_ports/mem.h"
#include "vp8/common/x86/filter_x86.h"
extern void filter_block1d_h6_mmx
(
const unsigned char *src_ptr,
unsigned short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
short *filter
);
extern void filter_block1d_v6_mmx
(
const short *src_ptr,
unsigned char *output_ptr,
unsigned int pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
short *filter
);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
extern void vp8_filter_block2d_bil_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
unsigned int vp8_sub_pixel_variance4x4_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum;
unsigned int xxsum;
vp8_filter_block2d_bil4x4_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
}
unsigned int vp8_sub_pixel_variance8x8_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
}
unsigned int vp8_sub_pixel_variance16x16_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum0, &xxsum0
);
vp8_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_variance16x8_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum0, &xxsum0
);
vp8_filter_block2d_bil_var_mmx(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}
unsigned int vp8_sub_pixel_variance8x16_mmx
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
vp8_filter_block2d_bil_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
}
unsigned int vp8_variance_halfpixvar16x16_h_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
ref_ptr, recon_stride, sse);
}
unsigned int vp8_variance_halfpixvar16x16_v_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
ref_ptr, recon_stride, sse);
}
unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
ref_ptr, recon_stride, sse);
}
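
Every kernel in this file reduces to the same identity, variance = SSE - sum^2 / N with N the block's pixel count, which is why the final shift is log2(N): 4 for 4x4, 6 for 8x8, 7 for 16x8 and 8x16, and 8 for 16x16. A minimal scalar sketch of that reduction (names are illustrative, not libvpx API):

#include <stdint.h>
/* Scalar reference for the reduction used above: variance = SSE - sum^2 / N.
 * The MMX code performs the division as a right shift because every
 * supported block size has a power-of-two pixel count. */
static unsigned int block_variance_ref(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int w, int h, unsigned int *sse)
{
    int64_t sum = 0;
    uint64_t sum_sq = 0;
    int r, c;
    for (r = 0; r < h; ++r)
    {
        for (c = 0; c < w; ++c)
        {
            const int diff = src[c] - ref[c];
            sum += diff;
            sum_sq += (uint64_t)(diff * diff);
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = (unsigned int)sum_sq;
    /* Equivalent to the >> 4 / >> 6 / >> 7 / >> 8 shifts used above. */
    return (unsigned int)(sum_sq - (uint64_t)((sum * sum) / (w * h)));
}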

View file

@ -1,403 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "vpx_config.h"
#include "vp8/common/variance.h"
#include "vpx_ports/mem.h"
#include "vp8/common/x86/filter_x86.h"
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
const short *HFilter,
const short *VFilter,
int *sum,
unsigned int *sumsquared
);
void vp8_filter_block2d_bil_var_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
unsigned int vp8_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
vp8_filter_block2d_bil4x4_var_mmx(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
&xsum, &xxsum
);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
}
unsigned int vp8_sub_pixel_variance8x8_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum, &xxsum);
}
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 6));
}
unsigned int vp8_sub_pixel_variance16x16_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
/* note we could avoid these if statements if the calling function
* just called the appropriate functions inside.
*/
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum0, &xxsum0
);
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum1, &xxsum1
);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_variance16x8_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum0, &xxsum0);
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
}
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
}
unsigned int vp8_sub_pixel_variance8x16_wmt
(
const unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum, &xxsum);
}
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 7));
}
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr,
int src_pixels_per_line,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum0;
unsigned int xxsum0;
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
const unsigned char *src_ptr,
int src_pixels_per_line,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum0;
unsigned int xxsum0;
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
const unsigned char *src_ptr,
int src_pixels_per_line,
const unsigned char *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse)
{
int xsum0;
unsigned int xxsum0;
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
*sse = xxsum0;
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
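
The xoffset/yoffset == 4 branches above pay off because the bilinear filter table has eight phases and phase 4 weights both taps equally, so the interpolation collapses into a rounded average of two neighbouring pixels; the dedicated half-pixel kernels exploit exactly that. A scalar sketch of the horizontal half-pel step (illustrative names, not libvpx API):

/* Horizontal half-pixel prediction: each output pixel is the rounded mean of
 * two horizontally adjacent source pixels. The vertical and diagonal half-pel
 * kernels average vertically / over four neighbours in the same way. */
static void half_pel_horiz_ref(const unsigned char *src, int src_stride,
                               unsigned char *dst, int dst_stride,
                               int w, int h)
{
    int r, c;
    for (r = 0; r < h; ++r)
    {
        for (c = 0; c < w; ++c)
            dst[c] = (unsigned char)((src[c] + src[c + 1] + 1) >> 1);
        src += src_stride;
        dst += dst_stride;
    }
}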

View file

@ -16,7 +16,7 @@
#include "./vpx_scale_rtcd.h"
#include "block.h"
#include "onyx_int.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "encodeintra.h"
#include "vp8/common/setupintrarecon.h"
#include "vp8/common/systemdependent.h"

View file

@ -13,7 +13,7 @@
#define VP8_ENCODER_MCOMP_H_
#include "block.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#ifdef __cplusplus
extern "C" {

View file

@ -2132,17 +2132,17 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vpx_variance_halfpixvar16x16_h;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vpx_variance_halfpixvar16x16_v;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
@ -2152,7 +2152,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
@ -2162,7 +2162,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
@ -2172,7 +2172,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
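
Only the table entries change in this hunk; the motion search keeps calling through the same fn_ptr slots, so the switch from vp8_ to vpx_ sub-pixel variance is transparent at the call sites. A hedged sketch of how an svf slot is consumed (the typedef and helper below are illustrative, not the actual libvpx declarations):

/* Illustrative function-pointer shape for the svf slot and one call through
 * the table; the real typedef lives in the variance header. */
typedef unsigned int (*subpix_var_fn)(const unsigned char *src, int src_stride,
                                      int xoffset, int yoffset,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse);

static unsigned int subpel_block_error(subpix_var_fn svf,
                                       const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int xoffset, int yoffset)
{
    unsigned int sse;
    /* After this commit svf would point at e.g. vpx_sub_pixel_variance16x16. */
    return svf(src, src_stride, xoffset, yoffset, ref, ref_stride, &sse);
}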

View file

@ -18,7 +18,7 @@
#include "treewriter.h"
#include "tokenize.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "encodemb.h"
#include "quantize.h"
#include "vp8/common/entropy.h"

View file

@ -22,7 +22,7 @@
#include "encodemb.h"
#include "vp8/common/reconinter.h"
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"

View file

@ -29,7 +29,7 @@
#include "vp8/common/quant_common.h"
#include "encodemb.h"
#include "quantize.h"
#include "vp8/common/variance.h"
#include "vpx_dsp/variance.h"
#include "mcomp.h"
#include "rdopt.h"
#include "vpx_mem/vpx_mem.h"
@ -500,9 +500,9 @@ int VP8_UVSSE(MACROBLOCK *x)
if ((mv_row | mv_col) & 7)
{
vp8_sub_pixel_variance8x8(uptr, pre_stride,
vpx_sub_pixel_variance8x8(uptr, pre_stride,
mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
vp8_sub_pixel_variance8x8(vptr, pre_stride,
vpx_sub_pixel_variance8x8(vptr, pre_stride,
mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
sse2 += sse1;
}
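
The (mv_row | mv_col) & 7 test asks whether either chroma component carries a fractional part: the chroma motion vectors here have three fractional (eighth-pel) bits, and those low bits select the sub-pixel filter phase passed to vpx_sub_pixel_variance8x8. A tiny illustrative decomposition (not a libvpx helper):

/* Split a chroma motion-vector component into its integer-pel offset and
 * eighth-pel phase (3 fractional bits). */
static void split_chroma_mv(int mv_comp, int *int_pel, int *subpel_phase)
{
    *int_pel = mv_comp >> 3;
    *subpel_phase = mv_comp & 7;
}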

View file

@ -63,8 +63,6 @@ VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
VP8_COMMON_SRCS-yes += common/variance_c.c
VP8_COMMON_SRCS-yes += common/variance.h
VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
@ -86,8 +84,6 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_mmx.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_impl_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
@ -96,12 +92,8 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp8_variance_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
@ -129,7 +121,6 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c
# common (media)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c
@ -149,9 +140,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
# common (neon intrinsics)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c
@ -170,6 +158,5 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))

View file

@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, int32_t height) {

View file

@ -9,7 +9,7 @@
*/
#include <string.h>
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, int32_t height) {

View file

@ -12,7 +12,7 @@
#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
#include "vp9/common/vp9_filter.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
extern const uint8_t mc_filt_mask_arr[16 * 3];

View file

@ -13,7 +13,7 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
v8i16 k0_m = __msa_fill_h(cnst0); \

View file

@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \
out0 = __msa_subs_u_h(out0, in0); \

View file

@ -11,7 +11,7 @@
#ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
#define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
p1_out, p0_out, q0_out, q1_out) { \

File diff not shown because of its large size. Load diff

View file

@ -10,7 +10,7 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
uint8_t *dst_ptr, int32_t dst_stride,

View file

@ -802,88 +802,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
# variance
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance16x16 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance8x8 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon msa/;
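
Each add_proto line removed above declared a run-time-dispatched prototype, and the paired specialize line listed the SIMD flavours allowed to back it; equivalent entries now live with vpx_dsp. In C terms the generated dispatch amounts to a function pointer that setup code repoints to the best available kernel, roughly as in this self-contained sketch (stub kernels and names are illustrative, not generated code):

typedef unsigned int (*subpel_var16x16_fn)(const unsigned char *, int, int, int,
                                           const unsigned char *, int,
                                           unsigned int *);

static unsigned int var16x16_c(const unsigned char *s, int ss, int xo, int yo,
                               const unsigned char *r, int rs, unsigned int *sse)
{
    (void)s; (void)ss; (void)xo; (void)yo; (void)r; (void)rs;
    *sse = 0;
    return 0; /* stand-in for the portable C kernel */
}

static unsigned int var16x16_sse2(const unsigned char *s, int ss, int xo, int yo,
                                  const unsigned char *r, int rs, unsigned int *sse)
{
    (void)s; (void)ss; (void)xo; (void)yo; (void)r; (void)rs;
    *sse = 0;
    return 0; /* stand-in for the SSE2 kernel */
}

/* One such pointer is emitted per prototype; rtcd setup picks the flavour. */
static subpel_var16x16_fn sub_pixel_variance16x16 = var16x16_c;

static void rtcd_setup_sketch(int cpu_has_sse2)
{
    if (cpu_has_sse2) sub_pixel_variance16x16 = var16x16_sse2;
}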
@ -1085,241 +1003,6 @@ specialize qw/vp9_temporal_filter_apply sse2 msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x8/;
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance4x8/;
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x4/;
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x8/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x8/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
# ENCODEMB INVOKE
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";

View file

@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
uint32_t sum_out;

View file

@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr, \

View file

@ -13,7 +13,7 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
v8i16 k0_m = __msa_fill_h(cnst0); \

View file

@ -9,7 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
uint32_t stride,

View file

@ -1023,8 +1023,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x16_bits8,
vpx_highbd_sad32x16_avg_bits8,
vpx_highbd_8_variance32x16,
vp9_highbd_sub_pixel_variance32x16,
vp9_highbd_sub_pixel_avg_variance32x16,
vpx_highbd_8_sub_pixel_variance32x16,
vpx_highbd_8_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits8)
@ -1033,8 +1033,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x32_bits8,
vpx_highbd_sad16x32_avg_bits8,
vpx_highbd_8_variance16x32,
vp9_highbd_sub_pixel_variance16x32,
vp9_highbd_sub_pixel_avg_variance16x32,
vpx_highbd_8_sub_pixel_variance16x32,
vpx_highbd_8_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits8)
@ -1043,8 +1043,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x32_bits8,
vpx_highbd_sad64x32_avg_bits8,
vpx_highbd_8_variance64x32,
vp9_highbd_sub_pixel_variance64x32,
vp9_highbd_sub_pixel_avg_variance64x32,
vpx_highbd_8_sub_pixel_variance64x32,
vpx_highbd_8_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits8)
@ -1053,8 +1053,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x64_bits8,
vpx_highbd_sad32x64_avg_bits8,
vpx_highbd_8_variance32x64,
vp9_highbd_sub_pixel_variance32x64,
vp9_highbd_sub_pixel_avg_variance32x64,
vpx_highbd_8_sub_pixel_variance32x64,
vpx_highbd_8_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits8)
@ -1063,8 +1063,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x32_bits8,
vpx_highbd_sad32x32_avg_bits8,
vpx_highbd_8_variance32x32,
vp9_highbd_sub_pixel_variance32x32,
vp9_highbd_sub_pixel_avg_variance32x32,
vpx_highbd_8_sub_pixel_variance32x32,
vpx_highbd_8_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits8,
vpx_highbd_sad32x32x8_bits8,
vpx_highbd_sad32x32x4d_bits8)
@ -1073,8 +1073,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x64_bits8,
vpx_highbd_sad64x64_avg_bits8,
vpx_highbd_8_variance64x64,
vp9_highbd_sub_pixel_variance64x64,
vp9_highbd_sub_pixel_avg_variance64x64,
vpx_highbd_8_sub_pixel_variance64x64,
vpx_highbd_8_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits8,
vpx_highbd_sad64x64x8_bits8,
vpx_highbd_sad64x64x4d_bits8)
@ -1083,8 +1083,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x16_bits8,
vpx_highbd_sad16x16_avg_bits8,
vpx_highbd_8_variance16x16,
vp9_highbd_sub_pixel_variance16x16,
vp9_highbd_sub_pixel_avg_variance16x16,
vpx_highbd_8_sub_pixel_variance16x16,
vpx_highbd_8_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits8,
vpx_highbd_sad16x16x8_bits8,
vpx_highbd_sad16x16x4d_bits8)
@ -1093,8 +1093,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x8_bits8,
vpx_highbd_sad16x8_avg_bits8,
vpx_highbd_8_variance16x8,
vp9_highbd_sub_pixel_variance16x8,
vp9_highbd_sub_pixel_avg_variance16x8,
vpx_highbd_8_sub_pixel_variance16x8,
vpx_highbd_8_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits8,
vpx_highbd_sad16x8x8_bits8,
vpx_highbd_sad16x8x4d_bits8)
@ -1103,8 +1103,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x16_bits8,
vpx_highbd_sad8x16_avg_bits8,
vpx_highbd_8_variance8x16,
vp9_highbd_sub_pixel_variance8x16,
vp9_highbd_sub_pixel_avg_variance8x16,
vpx_highbd_8_sub_pixel_variance8x16,
vpx_highbd_8_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits8,
vpx_highbd_sad8x16x8_bits8,
vpx_highbd_sad8x16x4d_bits8)
@ -1113,8 +1113,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x8_bits8,
vpx_highbd_sad8x8_avg_bits8,
vpx_highbd_8_variance8x8,
vp9_highbd_sub_pixel_variance8x8,
vp9_highbd_sub_pixel_avg_variance8x8,
vpx_highbd_8_sub_pixel_variance8x8,
vpx_highbd_8_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits8,
vpx_highbd_sad8x8x8_bits8,
vpx_highbd_sad8x8x4d_bits8)
@ -1123,8 +1123,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x4_bits8,
vpx_highbd_sad8x4_avg_bits8,
vpx_highbd_8_variance8x4,
vp9_highbd_sub_pixel_variance8x4,
vp9_highbd_sub_pixel_avg_variance8x4,
vpx_highbd_8_sub_pixel_variance8x4,
vpx_highbd_8_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits8,
vpx_highbd_sad8x4x4d_bits8)
@ -1133,8 +1133,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x8_bits8,
vpx_highbd_sad4x8_avg_bits8,
vpx_highbd_8_variance4x8,
vp9_highbd_sub_pixel_variance4x8,
vp9_highbd_sub_pixel_avg_variance4x8,
vpx_highbd_8_sub_pixel_variance4x8,
vpx_highbd_8_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits8,
vpx_highbd_sad4x8x4d_bits8)
@ -1143,8 +1143,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x4_bits8,
vpx_highbd_sad4x4_avg_bits8,
vpx_highbd_8_variance4x4,
vp9_highbd_sub_pixel_variance4x4,
vp9_highbd_sub_pixel_avg_variance4x4,
vpx_highbd_8_sub_pixel_variance4x4,
vpx_highbd_8_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits8,
vpx_highbd_sad4x4x8_bits8,
vpx_highbd_sad4x4x4d_bits8)
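
The vpx_highbd_8_/_10_/_12_ replacements encode the bit depth in the name; the deeper variants normalize their accumulated sums back toward an 8-bit scale so the encoder's thresholds stay comparable across bit depths. A hedged sketch of that normalization for the 10-bit case, with the shift amounts treated as assumptions rather than read from vpx_dsp:

#include <stdint.h>
/* Assumed normalization for a 10-bit block: scale SSE down by 16 and the
 * signed sum by 4 before forming variance, mapping the result onto an
 * 8-bit-equivalent range. The shifts are illustrative assumptions. */
static unsigned int highbd_10_variance_sketch(uint64_t sse_long, int64_t sum_long,
                                              int num_pixels, unsigned int *sse)
{
    const uint64_t sse8 = (sse_long + 8) >> 4;
    const int64_t sum8 = sum_long >> 2;
    *sse = (unsigned int)sse8;
    return (unsigned int)(sse8 - (uint64_t)((sum8 * sum8) / num_pixels));
}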
@ -1155,8 +1155,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x16_bits10,
vpx_highbd_sad32x16_avg_bits10,
vpx_highbd_10_variance32x16,
vp9_highbd_10_sub_pixel_variance32x16,
vp9_highbd_10_sub_pixel_avg_variance32x16,
vpx_highbd_10_sub_pixel_variance32x16,
vpx_highbd_10_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits10)
@ -1165,8 +1165,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x32_bits10,
vpx_highbd_sad16x32_avg_bits10,
vpx_highbd_10_variance16x32,
vp9_highbd_10_sub_pixel_variance16x32,
vp9_highbd_10_sub_pixel_avg_variance16x32,
vpx_highbd_10_sub_pixel_variance16x32,
vpx_highbd_10_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits10)
@ -1175,8 +1175,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x32_bits10,
vpx_highbd_sad64x32_avg_bits10,
vpx_highbd_10_variance64x32,
vp9_highbd_10_sub_pixel_variance64x32,
vp9_highbd_10_sub_pixel_avg_variance64x32,
vpx_highbd_10_sub_pixel_variance64x32,
vpx_highbd_10_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits10)
@ -1185,8 +1185,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x64_bits10,
vpx_highbd_sad32x64_avg_bits10,
vpx_highbd_10_variance32x64,
vp9_highbd_10_sub_pixel_variance32x64,
vp9_highbd_10_sub_pixel_avg_variance32x64,
vpx_highbd_10_sub_pixel_variance32x64,
vpx_highbd_10_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits10)
@ -1195,8 +1195,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x32_bits10,
vpx_highbd_sad32x32_avg_bits10,
vpx_highbd_10_variance32x32,
vp9_highbd_10_sub_pixel_variance32x32,
vp9_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_10_sub_pixel_variance32x32,
vpx_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits10,
vpx_highbd_sad32x32x8_bits10,
vpx_highbd_sad32x32x4d_bits10)
@ -1205,8 +1205,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x64_bits10,
vpx_highbd_sad64x64_avg_bits10,
vpx_highbd_10_variance64x64,
vp9_highbd_10_sub_pixel_variance64x64,
vp9_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_10_sub_pixel_variance64x64,
vpx_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits10,
vpx_highbd_sad64x64x8_bits10,
vpx_highbd_sad64x64x4d_bits10)
@ -1215,8 +1215,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x16_bits10,
vpx_highbd_sad16x16_avg_bits10,
vpx_highbd_10_variance16x16,
vp9_highbd_10_sub_pixel_variance16x16,
vp9_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_10_sub_pixel_variance16x16,
vpx_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits10,
vpx_highbd_sad16x16x8_bits10,
vpx_highbd_sad16x16x4d_bits10)
@ -1225,8 +1225,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x8_bits10,
vpx_highbd_sad16x8_avg_bits10,
vpx_highbd_10_variance16x8,
vp9_highbd_10_sub_pixel_variance16x8,
vp9_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_10_sub_pixel_variance16x8,
vpx_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits10,
vpx_highbd_sad16x8x8_bits10,
vpx_highbd_sad16x8x4d_bits10)
@ -1235,8 +1235,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x16_bits10,
vpx_highbd_sad8x16_avg_bits10,
vpx_highbd_10_variance8x16,
vp9_highbd_10_sub_pixel_variance8x16,
vp9_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_10_sub_pixel_variance8x16,
vpx_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits10,
vpx_highbd_sad8x16x8_bits10,
vpx_highbd_sad8x16x4d_bits10)
@ -1245,8 +1245,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x8_bits10,
vpx_highbd_sad8x8_avg_bits10,
vpx_highbd_10_variance8x8,
vp9_highbd_10_sub_pixel_variance8x8,
vp9_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_10_sub_pixel_variance8x8,
vpx_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits10,
vpx_highbd_sad8x8x8_bits10,
vpx_highbd_sad8x8x4d_bits10)
@ -1255,8 +1255,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x4_bits10,
vpx_highbd_sad8x4_avg_bits10,
vpx_highbd_10_variance8x4,
vp9_highbd_10_sub_pixel_variance8x4,
vp9_highbd_10_sub_pixel_avg_variance8x4,
vpx_highbd_10_sub_pixel_variance8x4,
vpx_highbd_10_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits10,
vpx_highbd_sad8x4x4d_bits10)
@ -1265,8 +1265,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x8_bits10,
vpx_highbd_sad4x8_avg_bits10,
vpx_highbd_10_variance4x8,
vp9_highbd_10_sub_pixel_variance4x8,
vp9_highbd_10_sub_pixel_avg_variance4x8,
vpx_highbd_10_sub_pixel_variance4x8,
vpx_highbd_10_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits10,
vpx_highbd_sad4x8x4d_bits10)
@ -1275,8 +1275,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x4_bits10,
vpx_highbd_sad4x4_avg_bits10,
vpx_highbd_10_variance4x4,
vp9_highbd_10_sub_pixel_variance4x4,
vp9_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_10_sub_pixel_variance4x4,
vpx_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits10,
vpx_highbd_sad4x4x8_bits10,
vpx_highbd_sad4x4x4d_bits10)
@ -1287,8 +1287,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12,
vpx_highbd_12_variance32x16,
vp9_highbd_12_sub_pixel_variance32x16,
vp9_highbd_12_sub_pixel_avg_variance32x16,
vpx_highbd_12_sub_pixel_variance32x16,
vpx_highbd_12_sub_pixel_avg_variance32x16,
NULL,
NULL,
vpx_highbd_sad32x16x4d_bits12)
@ -1297,8 +1297,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x32_bits12,
vpx_highbd_sad16x32_avg_bits12,
vpx_highbd_12_variance16x32,
vp9_highbd_12_sub_pixel_variance16x32,
vp9_highbd_12_sub_pixel_avg_variance16x32,
vpx_highbd_12_sub_pixel_variance16x32,
vpx_highbd_12_sub_pixel_avg_variance16x32,
NULL,
NULL,
vpx_highbd_sad16x32x4d_bits12)
@ -1307,8 +1307,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x32_bits12,
vpx_highbd_sad64x32_avg_bits12,
vpx_highbd_12_variance64x32,
vp9_highbd_12_sub_pixel_variance64x32,
vp9_highbd_12_sub_pixel_avg_variance64x32,
vpx_highbd_12_sub_pixel_variance64x32,
vpx_highbd_12_sub_pixel_avg_variance64x32,
NULL,
NULL,
vpx_highbd_sad64x32x4d_bits12)
@ -1317,8 +1317,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x64_bits12,
vpx_highbd_sad32x64_avg_bits12,
vpx_highbd_12_variance32x64,
vp9_highbd_12_sub_pixel_variance32x64,
vp9_highbd_12_sub_pixel_avg_variance32x64,
vpx_highbd_12_sub_pixel_variance32x64,
vpx_highbd_12_sub_pixel_avg_variance32x64,
NULL,
NULL,
vpx_highbd_sad32x64x4d_bits12)
@ -1327,8 +1327,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad32x32_bits12,
vpx_highbd_sad32x32_avg_bits12,
vpx_highbd_12_variance32x32,
vp9_highbd_12_sub_pixel_variance32x32,
vp9_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_12_sub_pixel_variance32x32,
vpx_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits12,
vpx_highbd_sad32x32x8_bits12,
vpx_highbd_sad32x32x4d_bits12)
@ -1337,8 +1337,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad64x64_bits12,
vpx_highbd_sad64x64_avg_bits12,
vpx_highbd_12_variance64x64,
vp9_highbd_12_sub_pixel_variance64x64,
vp9_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_12_sub_pixel_variance64x64,
vpx_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits12,
vpx_highbd_sad64x64x8_bits12,
vpx_highbd_sad64x64x4d_bits12)
@ -1347,8 +1347,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x16_bits12,
vpx_highbd_sad16x16_avg_bits12,
vpx_highbd_12_variance16x16,
vp9_highbd_12_sub_pixel_variance16x16,
vp9_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_12_sub_pixel_variance16x16,
vpx_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits12,
vpx_highbd_sad16x16x8_bits12,
vpx_highbd_sad16x16x4d_bits12)
@ -1357,8 +1357,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad16x8_bits12,
vpx_highbd_sad16x8_avg_bits12,
vpx_highbd_12_variance16x8,
vp9_highbd_12_sub_pixel_variance16x8,
vp9_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_12_sub_pixel_variance16x8,
vpx_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits12,
vpx_highbd_sad16x8x8_bits12,
vpx_highbd_sad16x8x4d_bits12)
@ -1367,8 +1367,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x16_bits12,
vpx_highbd_sad8x16_avg_bits12,
vpx_highbd_12_variance8x16,
vp9_highbd_12_sub_pixel_variance8x16,
vp9_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_12_sub_pixel_variance8x16,
vpx_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits12,
vpx_highbd_sad8x16x8_bits12,
vpx_highbd_sad8x16x4d_bits12)
@ -1377,8 +1377,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x8_bits12,
vpx_highbd_sad8x8_avg_bits12,
vpx_highbd_12_variance8x8,
vp9_highbd_12_sub_pixel_variance8x8,
vp9_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_12_sub_pixel_variance8x8,
vpx_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits12,
vpx_highbd_sad8x8x8_bits12,
vpx_highbd_sad8x8x4d_bits12)
@ -1387,8 +1387,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad8x4_bits12,
vpx_highbd_sad8x4_avg_bits12,
vpx_highbd_12_variance8x4,
vp9_highbd_12_sub_pixel_variance8x4,
vp9_highbd_12_sub_pixel_avg_variance8x4,
vpx_highbd_12_sub_pixel_variance8x4,
vpx_highbd_12_sub_pixel_avg_variance8x4,
NULL,
vpx_highbd_sad8x4x8_bits12,
vpx_highbd_sad8x4x4d_bits12)
@ -1397,8 +1397,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x8_bits12,
vpx_highbd_sad4x8_avg_bits12,
vpx_highbd_12_variance4x8,
vp9_highbd_12_sub_pixel_variance4x8,
vp9_highbd_12_sub_pixel_avg_variance4x8,
vpx_highbd_12_sub_pixel_variance4x8,
vpx_highbd_12_sub_pixel_avg_variance4x8,
NULL,
vpx_highbd_sad4x8x8_bits12,
vpx_highbd_sad4x8x4d_bits12)
@ -1407,8 +1407,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x4_bits12,
vpx_highbd_sad4x4_avg_bits12,
vpx_highbd_12_variance4x4,
vp9_highbd_12_sub_pixel_variance4x4,
vp9_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_12_sub_pixel_variance4x4,
vpx_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits12,
vpx_highbd_sad4x4x8_bits12,
vpx_highbd_sad4x4x4d_bits12)
@ -1832,62 +1832,62 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
vpx_variance32x16, vp9_sub_pixel_variance32x16,
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
vpx_variance32x16, vpx_sub_pixel_variance32x16,
vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
vpx_variance16x32, vp9_sub_pixel_variance16x32,
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
vpx_variance16x32, vpx_sub_pixel_variance16x32,
vpx_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
vpx_variance64x32, vp9_sub_pixel_variance64x32,
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
vpx_variance64x32, vpx_sub_pixel_variance64x32,
vpx_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
vpx_variance32x64, vp9_sub_pixel_variance32x64,
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
vpx_variance32x64, vpx_sub_pixel_variance32x64,
vpx_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
vpx_variance32x32, vp9_sub_pixel_variance32x32,
vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
vpx_variance32x32, vpx_sub_pixel_variance32x32,
vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
vpx_sad32x32x4d)
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
vpx_variance64x64, vp9_sub_pixel_variance64x64,
vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
vpx_variance64x64, vpx_sub_pixel_variance64x64,
vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
vpx_sad64x64x4d)
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
vpx_variance16x16, vp9_sub_pixel_variance16x16,
vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
vpx_variance16x16, vpx_sub_pixel_variance16x16,
vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
vpx_sad16x16x4d)
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
vpx_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8,
vpx_variance16x8, vpx_sub_pixel_variance16x8,
vpx_sub_pixel_avg_variance16x8,
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
vpx_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16,
vpx_variance8x16, vpx_sub_pixel_variance8x16,
vpx_sub_pixel_avg_variance8x16,
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
vpx_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8,
vpx_variance8x8, vpx_sub_pixel_variance8x8,
vpx_sub_pixel_avg_variance8x8,
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
vpx_variance8x4, vp9_sub_pixel_variance8x4,
vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
vpx_variance8x4, vpx_sub_pixel_variance8x4,
vpx_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
vpx_variance4x8, vp9_sub_pixel_variance4x8,
vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
vpx_variance4x8, vpx_sub_pixel_variance4x8,
vpx_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
vpx_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4,
vpx_variance4x4, vpx_sub_pixel_variance4x4,
vpx_sub_pixel_avg_variance4x4,
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
#if CONFIG_VP9_HIGHBITDEPTH

View file

@ -40,7 +40,7 @@
#include "vp9/encoder/vp9_speed_features.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_dsp/variance.h"
#if CONFIG_VP9_TEMPORAL_DENOISING
#include "vp9/encoder/vp9_denoiser.h"

View file

@ -35,7 +35,7 @@
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_dsp/variance.h"
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
@ -298,7 +298,7 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
}
}
static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
switch (bsize) {
case BLOCK_8X8:
return vpx_mse8x8;
@ -315,13 +315,13 @@ static unsigned int get_prediction_error(BLOCK_SIZE bsize,
const struct buf_2d *src,
const struct buf_2d *ref) {
unsigned int sse;
const vp9_variance_fn_t fn = get_block_variance_fn(bsize);
const vpx_variance_fn_t fn = get_block_variance_fn(bsize);
fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
return sse;
}
#if CONFIG_VP9_HIGHBITDEPTH
static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
int bd) {
switch (bd) {
default:
@ -368,7 +368,7 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
const struct buf_2d *ref,
int bd) {
unsigned int sse;
const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
const vpx_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
return sse;
}

View file

@ -13,7 +13,7 @@
#define VP9_ENCODER_VP9_MCOMP_H_
#include "vp9/encoder/vp9_block.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_dsp/variance.h"
#ifdef __cplusplus
extern "C" {

View file

@ -37,7 +37,6 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_variance.h"
#define RD_THRESH_POW 1.25
#define RD_MULT_EPB_RATIO 64

View file

@ -39,7 +39,6 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/encoder/vp9_aq_variance.h"
#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \

View file

@ -1,380 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/encoder/vp9_variance.h"
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0, },
{ 112, 16, },
{ 96, 32, },
{ 80, 48, },
{ 64, 64, },
{ 48, 80, },
{ 32, 96, },
{ 16, 112, },
};
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// the first pass of a 2-D separable filter.
//
// Produces uint16_t output to retain precision for the next pass. Two filter taps
// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is
// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). It
// defines the offset required to move from one input to the next.
static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
unsigned int i, j;
for (i = 0; i < output_height; i++) {
for (j = 0; j < output_width; j++) {
output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
(int)src_ptr[pixel_step] * vp9_filter[1],
FILTER_BITS);
src_ptr++;
}
// Next row...
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
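// Example: with the half-pel offset the taps are {64, 64}, so each output is
// ROUND_POWER_OF_TWO(64 * a + 64 * b, FILTER_BITS) == (a + b + 1) >> 1, i.e. a
// rounded average of two neighbouring input pixels.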
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// the second pass of a 2-D separable filter.
//
// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass. Two
// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step=1) or vertically (pixel_step=
// stride). It defines the offset required to move from one input to the next.
static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
uint8_t *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
unsigned int i, j;
for (i = 0; i < output_height; i++) {
for (j = 0; j < output_width; j++) {
output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
(int)src_ptr[pixel_step] * vp9_filter[1],
FILTER_BITS);
src_ptr++;
}
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
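// SUBPIX_VAR/SUBPIX_AVG_VAR generate the per-block-size C functions: filter
// (H + 1) rows horizontally, then H rows vertically, and pass the result to the
// matching vpx_variance##W##x##H##_c (after averaging with the second predictor
// in the _avg_ case).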
#define SUBPIX_VAR(W, H) \
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
}
#define SUBPIX_AVG_VAR(W, H) \
unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
}
SUBPIX_VAR(4, 4)
SUBPIX_AVG_VAR(4, 4)
SUBPIX_VAR(4, 8)
SUBPIX_AVG_VAR(4, 8)
SUBPIX_VAR(8, 4)
SUBPIX_AVG_VAR(8, 4)
SUBPIX_VAR(8, 8)
SUBPIX_AVG_VAR(8, 8)
SUBPIX_VAR(8, 16)
SUBPIX_AVG_VAR(8, 16)
SUBPIX_VAR(16, 8)
SUBPIX_AVG_VAR(16, 8)
SUBPIX_VAR(16, 16)
SUBPIX_AVG_VAR(16, 16)
SUBPIX_VAR(16, 32)
SUBPIX_AVG_VAR(16, 32)
SUBPIX_VAR(32, 16)
SUBPIX_AVG_VAR(32, 16)
SUBPIX_VAR(32, 32)
SUBPIX_AVG_VAR(32, 32)
SUBPIX_VAR(32, 64)
SUBPIX_AVG_VAR(32, 64)
SUBPIX_VAR(64, 32)
SUBPIX_AVG_VAR(64, 32)
SUBPIX_VAR(64, 64)
SUBPIX_AVG_VAR(64, 64)
#if CONFIG_VP9_HIGHBITDEPTH
static void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
unsigned int i, j;
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
for (i = 0; i < output_height; i++) {
for (j = 0; j < output_width; j++) {
output_ptr[j] =
ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
(int)src_ptr[pixel_step] * vp9_filter[1],
FILTER_BITS);
src_ptr++;
}
// Next row...
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
static void highbd_var_filter_block2d_bil_second_pass(
const uint16_t *src_ptr,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
unsigned int i, j;
for (i = 0; i < output_height; i++) {
for (j = 0; j < output_width; j++) {
output_ptr[j] =
ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
(int)src_ptr[pixel_step] * vp9_filter[1],
FILTER_BITS);
src_ptr++;
}
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
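// The high-bitdepth variants below mirror the 8-bit macros but keep 16-bit
// pixels throughout and dispatch to the 8-, 10- and 12-bit vpx_highbd variance
// kernels.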
#define HIGHBD_SUBPIX_VAR(W, H) \
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
dst_stride, sse); \
} \
\
unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
} \
\
unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
}
#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
dst_stride, sse); \
} \
\
unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
} \
\
unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
unsigned int *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
}
HIGHBD_SUBPIX_VAR(4, 4)
HIGHBD_SUBPIX_AVG_VAR(4, 4)
HIGHBD_SUBPIX_VAR(4, 8)
HIGHBD_SUBPIX_AVG_VAR(4, 8)
HIGHBD_SUBPIX_VAR(8, 4)
HIGHBD_SUBPIX_AVG_VAR(8, 4)
HIGHBD_SUBPIX_VAR(8, 8)
HIGHBD_SUBPIX_AVG_VAR(8, 8)
HIGHBD_SUBPIX_VAR(8, 16)
HIGHBD_SUBPIX_AVG_VAR(8, 16)
HIGHBD_SUBPIX_VAR(16, 8)
HIGHBD_SUBPIX_AVG_VAR(16, 8)
HIGHBD_SUBPIX_VAR(16, 16)
HIGHBD_SUBPIX_AVG_VAR(16, 16)
HIGHBD_SUBPIX_VAR(16, 32)
HIGHBD_SUBPIX_AVG_VAR(16, 32)
HIGHBD_SUBPIX_VAR(32, 16)
HIGHBD_SUBPIX_AVG_VAR(32, 16)
HIGHBD_SUBPIX_VAR(32, 32)
HIGHBD_SUBPIX_AVG_VAR(32, 32)
HIGHBD_SUBPIX_VAR(32, 64)
HIGHBD_SUBPIX_AVG_VAR(32, 64)
HIGHBD_SUBPIX_VAR(64, 32)
HIGHBD_SUBPIX_AVG_VAR(64, 32)
HIGHBD_SUBPIX_VAR(64, 64)
HIGHBD_SUBPIX_AVG_VAR(64, 64)
#endif // CONFIG_VP9_HIGHBITDEPTH

View file

@ -1,81 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_VP9_VARIANCE_H_
#define VP9_ENCODER_VP9_VARIANCE_H_
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride);
typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
const uint8_t *second_pred);
typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int *sad_array);
typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t* const ref_ptr[],
int ref_stride, unsigned int *sad_array);
typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int *sse);
typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
int source_stride,
int xoffset,
int yoffset,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int *sse);
typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
int source_stride,
int xoffset,
int yoffset,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int *sse,
const uint8_t *second_pred);
typedef struct vp9_variance_vtable {
vp9_sad_fn_t sdf;
vp9_sad_avg_fn_t sdaf;
vp9_variance_fn_t vf;
vp9_subpixvariance_fn_t svf;
vp9_subp_avg_variance_fn_t svaf;
vp9_sad_multi_fn_t sdx3f;
vp9_sad_multi_fn_t sdx8f;
vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_ENCODER_VP9_VARIANCE_H_

View file

@ -1,349 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse);
#define DECLS(opt1, opt2) \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
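// Each FN() expansion computes variance = sse - sum^2 / (w * h); the
// (wlog2 + hlog2) shift implements the division by the block area. The 10- and
// 12-bit versions first scale sum and sse back towards the 8-bit range
// (>> 2 / >> 4 and >> 4 / >> 8 respectively).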
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
uint32_t *sse_ptr) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, h, \
&sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
dst + 48, dst_stride, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
int start_row; \
uint32_t sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
for (start_row = 0; start_row < h; start_row +=16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
}\
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2, sse);
#undef FNS
#undef FN
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
const uint16_t *sec, \
ptrdiff_t sec_stride, \
int height, \
unsigned int *sse);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)
DECLS(sse2);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, sec, w, h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, x_offset, y_offset, \
dst + 16, dst_stride, sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, x_offset, y_offset, \
dst + 32, dst_stride, sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
dst + 48, dst_stride, sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
int start_row; \
uint32_t sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
for (start_row = 0; start_row < h; start_row +=16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, x_offset, \
y_offset, dst + (start_row * dst_stride), dst_stride, \
sec + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 16 + (start_row * dst_stride), dst_stride, \
sec + 16 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 32 + (start_row * dst_stride), dst_stride, \
sec + 32 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 48 + (start_row * dst_stride), dst_stride, \
sec + 48 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
} \
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2);
#undef FNS
#undef FN

View file

@ -1,525 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <immintrin.h> // AVX2
#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
#define FILTER_SRC(filter) \
/* filter the source */ \
exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
\
/* add 8 to source */ \
exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
\
/* divide source by 16 */ \
exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
#define MERGE_WITH_SRC(src_reg, reg) \
exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
#define LOAD_SRC_DST \
/* load source and destination */ \
src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
#define AVG_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
(src + size_stride)); \
/* average the current source with the source size_stride bytes ahead */ \
src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
#define MERGE_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
(src + size_stride)); \
MERGE_WITH_SRC(src_reg, src_next_reg)
#define CALC_SUM_SSE_INSIDE_LOOP \
/* expand each byte to 2 bytes */ \
exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
/* source - dest */ \
exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
/* calculate sum */ \
sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
/* calculate sse */ \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
// final reduction of the sum and sse accumulators
#define CALC_SUM_AND_SSE \
res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
\
sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
\
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
*((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
int height,
unsigned int *sse) {
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
sum_reg = _mm256_set1_epi16(0);
sse_reg = _mm256_set1_epi16(0);
zero_reg = _mm256_set1_epi16(0);
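// The nested if/else below specializes three cases per axis: zero offset
// (copy), offset 8 (a plain byte average of adjacent samples) and the general
// bilinear offsets (2-tap filter), to avoid unnecessary filtering work.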
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
}
// x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// average the previous and current horizontal averages
src_avg = _mm256_avg_epu8(src_avg, src_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
// save current source average
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
// x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average the previous and current packed rows
src_pack = _mm256_avg_epu8(src_pack, src_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src_pack = src_reg;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
xfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
yfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// merge previous pack to current pack source
MERGE_WITH_SRC(src_pack, src_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
}
CALC_SUM_AND_SSE
return sum;
}
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
const uint8_t *sec,
int sec_stride,
int height,
unsigned int *sse) {
__m256i sec_reg;
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
sum_reg = _mm256_set1_epi16(0);
sse_reg = _mm256_set1_epi16(0);
zero_reg = _mm256_set1_epi16(0);
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
} else if (y_offset == 8) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
}
// x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// average the previous and current horizontal averages
src_avg = _mm256_avg_epu8(src_avg, src_reg);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
// x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
MERGE_WITH_SRC(src_reg, zero_reg)
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average the previous and current packed rows
src_pack = _mm256_avg_epu8(src_pack, src_reg);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
xfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
yfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source and a second source offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// merge previous pack to current pack source
MERGE_WITH_SRC(src_pack, src_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
}
CALC_SUM_AND_SSE
return sum;
}

View file

@ -1,104 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
int x_offset, int y_offset,
const uint8_t *dst, int dst_stride,
int height,
unsigned int *sse);
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
const uint8_t *sec,
int sec_stride,
int height,
unsigned int *sseptr);
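// The 64x64 and 32x32 wrappers below run the 32-wide kernel over the block
// (twice side-by-side for 64-wide blocks), then form
// variance = sse - sum^2 / (w * h); the >> 12 and >> 10 shifts divide by
// 4096 and 1024 respectively.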
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
unsigned int sse1;
const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
64, &sse1);
unsigned int sse2;
const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
x_offset, y_offset,
dst + 32, dst_stride,
64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
32, sse);
return *sse - (((int64_t)se * se) >> 10);
}
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse,
const uint8_t *sec) {
unsigned int sse1;
const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
sec, 64, 64, &sse1);
unsigned int sse2;
const int se2 =
vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
y_offset, dst + 32, dst_stride,
sec + 32, 64, 64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse,
const uint8_t *sec) {
// processing 32 elements in parallel
const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
sec, 32, 32, sse);
return *sse - (((int64_t)se * se) >> 10);
}

View file

@ -1,182 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h> // SSE2
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
// The 2 unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
h, &sse, NULL, NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
FN(4, 4, 4, 2, 2, opt2, (unsigned int))
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN
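The FN macro above builds each w x h function from the wf-wide column kernels declared by DECLS: when w exceeds wf, the block is covered by adjacent 16-pixel columns at x offsets 0, 16, 32 and 48, the partial se/sse values are accumulated, and the variance identity is applied with log2(N) = wlog2 + hlog2. As a rough, loop-form sketch (illustration only; the real macro unrolls the columns and this wrapper name is made up), FN(64, 64, 16, 6, 6, sse2, (int64_t)) amounts to:
unsigned int sketch_sub_pixel_variance64x64_sse2(const uint8_t *src, int src_stride,
                                                 int x_offset, int y_offset,
                                                 const uint8_t *dst, int dst_stride,
                                                 unsigned int *sse_ptr) {
  unsigned int sse = 0, sse_part;
  int se = 0, col;
  for (col = 0; col < 64; col += 16) {  /* four adjacent 16-wide columns */
    se += vp9_sub_pixel_variance16xh_sse2(src + col, src_stride, x_offset, y_offset,
                                          dst + col, dst_stride, 64, &sse_part,
                                          NULL, NULL);
    sse += sse_part;
  }
  *sse_ptr = sse;
  return sse - (unsigned int)(((int64_t)se * se) >> (6 + 6));  /* N = 64 * 64 */
}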
// The 2 unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *sec, \
ptrdiff_t sec_stride, \
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sseptr, \
const uint8_t *sec) { \
unsigned int sse; \
int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse, NULL, \
NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
} \
} \
*sseptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
FN(4, 4, 4, 2, 2, opt2, (unsigned int))
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN

View file

@ -131,7 +131,6 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_d
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
# common (msa)
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c

View file

@ -58,7 +58,6 @@ VP9_CX_SRCS-yes += encoder/vp9_pickmode.h
VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h
VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
VP9_CX_SRCS-yes += encoder/vp9_variance.h
VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
VP9_CX_SRCS-yes += encoder/vp9_encoder.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
@ -84,7 +83,6 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
VP9_CX_SRCS-yes += encoder/vp9_variance.c
VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c
VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h
VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
@ -103,7 +101,6 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@ -114,12 +111,6 @@ endif
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
endif
ifeq ($(ARCH_X86_64),yes)
@ -143,14 +134,12 @@ endif
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2_impl.h
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
endif
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
@ -160,6 +149,5 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_variance_msa.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))

View file

@ -0,0 +1,237 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vpx_filter_block2d_bil_first_pass_media|
EXPORT |vpx_filter_block2d_bil_second_pass_media|
AREA |.text|, CODE, READONLY ; name this block of code
;-------------------------------------
; r0 unsigned char *src_ptr,
; r1 unsigned short *dst_ptr,
; r2 unsigned int src_pitch,
; r3 unsigned int height,
; stack unsigned int width,
; stack const short *vpx_filter
;-------------------------------------
; The output is transposed and stored in the output array to make it easy for second pass filtering.
|vpx_filter_block2d_bil_first_pass_media| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vpx_filter address
ldr r4, [sp, #36] ; width
mov r12, r3 ; outer-loop counter
add r7, r2, r4 ; preload next row
pld [r0, r7]
sub r2, r2, r4 ; src increment for height loop
ldr r5, [r11] ; load up filter coefficients
mov r3, r3, lsl #1 ; height*2
add r3, r3, #2 ; plus 2 so the transposed column stride (2*height bytes) is 4-byte aligned
mov r11, r1 ; save dst_ptr for each row
cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_1st_filter
|bil_height_loop_1st_v6|
ldrb r6, [r0] ; load source data
ldrb r7, [r0, #1]
ldrb r8, [r0, #2]
mov lr, r4, lsr #2 ; 4-in-parallel loop counter
|bil_width_loop_1st_v6|
ldrb r9, [r0, #3]
ldrb r10, [r0, #4]
pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
smuad r6, r6, r5 ; apply the filter
pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
smuad r7, r7, r5
pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
smuad r8, r8, r5
smuad r9, r9, r5
add r0, r0, #4
subs lr, lr, #1
add r6, r6, #0x40 ; round_shift_and_clamp
add r7, r7, #0x40
usat r6, #16, r6, asr #7
usat r7, #16, r7, asr #7
strh r6, [r1], r3 ; result is transposed and stored
add r8, r8, #0x40 ; round_shift_and_clamp
strh r7, [r1], r3
add r9, r9, #0x40
usat r8, #16, r8, asr #7
usat r9, #16, r9, asr #7
strh r8, [r1], r3 ; result is transposed and stored
ldrneb r6, [r0] ; load source data
strh r9, [r1], r3
ldrneb r7, [r0, #1]
ldrneb r8, [r0, #2]
bne bil_width_loop_1st_v6
add r0, r0, r2 ; move to next input row
subs r12, r12, #1
add r9, r2, r4, lsl #1 ; adding back block width
pld [r0, r9] ; preload next row
add r11, r11, #2 ; move over to next column
mov r1, r11
bne bil_height_loop_1st_v6
ldmia sp!, {r4 - r11, pc}
|bil_null_1st_filter|
|bil_height_loop_null_1st|
mov lr, r4, lsr #2 ; loop counter
|bil_width_loop_null_1st|
ldrb r6, [r0] ; load data
ldrb r7, [r0, #1]
ldrb r8, [r0, #2]
ldrb r9, [r0, #3]
strh r6, [r1], r3 ; store it to immediate buffer
add r0, r0, #4
strh r7, [r1], r3
subs lr, lr, #1
strh r8, [r1], r3
strh r9, [r1], r3
bne bil_width_loop_null_1st
subs r12, r12, #1
add r0, r0, r2 ; move to next input line
add r11, r11, #2 ; move over to next column
mov r1, r11
bne bil_height_loop_null_1st
ldmia sp!, {r4 - r11, pc}
ENDP ; |vpx_filter_block2d_bil_first_pass_media|
;---------------------------------
; r0 unsigned short *src_ptr,
; r1 unsigned char *dst_ptr,
; r2 int dst_pitch,
; r3 unsigned int height,
; stack unsigned int width,
; stack const short *vpx_filter
;---------------------------------
|vpx_filter_block2d_bil_second_pass_media| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vpx_filter address
ldr r4, [sp, #36] ; width
ldr r5, [r11] ; load up filter coefficients
mov r12, r4 ; outer-loop counter = width, since we work on a transposed data matrix
mov r11, r1
cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_2nd_filter
|bil_height_loop_2nd|
ldr r6, [r0] ; load the data
ldr r8, [r0, #4]
ldrh r10, [r0, #8]
mov lr, r3, lsr #2 ; loop counter
|bil_width_loop_2nd|
pkhtb r7, r6, r8 ; src[1] | src[2]
pkhtb r9, r8, r10 ; src[3] | src[4]
smuad r6, r6, r5 ; apply filter
smuad r8, r8, r5 ; apply filter
subs lr, lr, #1
smuadx r7, r7, r5 ; apply filter
smuadx r9, r9, r5 ; apply filter
add r0, r0, #8
add r6, r6, #0x40 ; round_shift_and_clamp
add r7, r7, #0x40
usat r6, #8, r6, asr #7
usat r7, #8, r7, asr #7
strb r6, [r1], r2 ; the result is transposed back and stored
add r8, r8, #0x40 ; round_shift_and_clamp
strb r7, [r1], r2
add r9, r9, #0x40
usat r8, #8, r8, asr #7
usat r9, #8, r9, asr #7
strb r8, [r1], r2 ; the result is transposed back and stored
ldrne r6, [r0] ; load data
strb r9, [r1], r2
ldrne r8, [r0, #4]
ldrneh r10, [r0, #8]
bne bil_width_loop_2nd
subs r12, r12, #1
add r0, r0, #4 ; update src for next row
add r11, r11, #1
mov r1, r11
bne bil_height_loop_2nd
ldmia sp!, {r4 - r11, pc}
|bil_null_2nd_filter|
|bil_height_loop_null_2nd|
mov lr, r3, lsr #2
|bil_width_loop_null_2nd|
ldr r6, [r0], #4 ; load data
subs lr, lr, #1
ldr r8, [r0], #4
strb r6, [r1], r2 ; store data
mov r7, r6, lsr #16
strb r7, [r1], r2
mov r9, r8, lsr #16
strb r8, [r1], r2
strb r9, [r1], r2
bne bil_width_loop_null_2nd
subs r12, r12, #1
add r0, r0, #4
add r11, r11, #1
mov r1, r11
bne bil_height_loop_null_2nd
ldmia sp!, {r4 - r11, pc}
ENDP ; |vpx_filter_block2d_bil_second_pass_media|
END

View file

@ -0,0 +1,105 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#if HAVE_MEDIA
static const int16_t bilinear_filters_media[8][2] = {
{ 128, 0 },
{ 112, 16 },
{ 96, 32 },
{ 80, 48 },
{ 64, 64 },
{ 48, 80 },
{ 32, 96 },
{ 16, 112 }
};
extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr,
uint16_t *dst_ptr,
uint32_t src_pitch,
uint32_t height,
uint32_t width,
const int16_t *filter);
extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr,
uint8_t *dst_ptr,
int32_t src_pitch,
uint32_t height,
uint32_t width,
const int16_t *filter);
unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset, int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
uint16_t first_pass[10*8];
uint8_t second_pass[8*8];
const int16_t *HFilter, *VFilter;
HFilter = bilinear_filters_media[xoffset];
VFilter = bilinear_filters_media[yoffset];
vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
src_pixels_per_line,
9, 8, HFilter);
vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
8, 8, 8, VFilter);
return vpx_variance8x8_media(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
}
unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
const uint8_t *dst_ptr,
int dst_pixels_per_line,
unsigned int *sse) {
uint16_t first_pass[36*16];
uint8_t second_pass[20*16];
const int16_t *HFilter, *VFilter;
unsigned int var;
if (xoffset == 4 && yoffset == 0) {
var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
sse);
} else if (xoffset == 0 && yoffset == 4) {
var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
sse);
} else if (xoffset == 4 && yoffset == 4) {
var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line,
sse);
} else {
HFilter = bilinear_filters_media[xoffset];
VFilter = bilinear_filters_media[yoffset];
vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
src_pixels_per_line,
17, 16, HFilter);
vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
16, 16, 16, VFilter);
var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
}
return var;
}
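The three early-out branches above exist because an offset of 4 selects the {64, 64} filter pair, i.e. a plain half-pixel average, for which the dedicated halfpixvar kernels are cheaper than the general two-pass path. The C fallbacks added later in this commit make the equivalence explicit; a small wrapper showing it (illustration only; the helper name is made up):
static unsigned int halfpel_h_equiv(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  /* Same result as vpx_variance_halfpixvar16x16_h_c(): xoffset 4, yoffset 0. */
  return vpx_sub_pixel_variance16x16_c(src, src_stride, 4, 0, ref, ref_stride, sse);
}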
#endif // HAVE_MEDIA

View file

@ -9,14 +9,13 @@
*/
#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_filter.h"
#include "vpx_dsp/variance.h"
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0, },
@ -35,9 +34,9 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
const uint8_t *filter) {
const uint8x8_t f0 = vmov_n_u8(filter[0]);
const uint8x8_t f1 = vmov_n_u8(filter[1]);
unsigned int i;
for (i = 0; i < output_height; ++i) {
const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
@ -58,9 +57,9 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *vp9_filter) {
const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
const uint8_t *filter) {
const uint8x8_t f0 = vmov_n_u8(filter[0]);
const uint8x8_t f1 = vmov_n_u8(filter[1]);
unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 16) {
@ -80,7 +79,7 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
}
}
unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
int src_stride,
int xoffset,
int yoffset,
@ -98,7 +97,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
int src_stride,
int xoffset,
int yoffset,
@ -116,7 +115,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
int src_stride,
int xoffset,
int yoffset,
@ -134,7 +133,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
int src_stride,
int xoffset,
int yoffset,

View file

@ -9,7 +9,7 @@
;
EXPORT |vp8_variance_halfpixvar16x16_h_armv6|
EXPORT |vpx_variance_halfpixvar16x16_h_media|
ARM
REQUIRE8
@ -22,7 +22,7 @@
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance_halfpixvar16x16_h_armv6| PROC
|vpx_variance_halfpixvar16x16_h_media| PROC
stmfd sp!, {r4-r12, lr}

View file

@ -9,7 +9,7 @@
;
EXPORT |vp8_variance_halfpixvar16x16_hv_armv6|
EXPORT |vpx_variance_halfpixvar16x16_hv_media|
ARM
REQUIRE8
@ -22,7 +22,7 @@
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance_halfpixvar16x16_hv_armv6| PROC
|vpx_variance_halfpixvar16x16_hv_media| PROC
stmfd sp!, {r4-r12, lr}

View file

@ -9,7 +9,7 @@
;
EXPORT |vp8_variance_halfpixvar16x16_v_armv6|
EXPORT |vpx_variance_halfpixvar16x16_v_media|
ARM
REQUIRE8
@ -22,7 +22,7 @@
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance_halfpixvar16x16_v_armv6| PROC
|vpx_variance_halfpixvar16x16_v_media| PROC
stmfd sp!, {r4-r12, lr}

File diff not shown because of its large size

View file

@ -8,13 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/mips/msa/vp9_macros_msa.h"
#include "vpx_dsp/mips/macros_msa.h"
#include "vpx_dsp/variance.h"
static const uint8_t bilinear_filters[8][2] = {
static const uint8_t bilinear_filters_msa[8][2] = {
{ 128, 0, },
{ 112, 16, },
{ 96, 32, },
@ -707,8 +706,8 @@ static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
#define VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
int32_t src_stride, \
int32_t xoffset, \
int32_t yoffset, \
@ -717,8 +716,8 @@ uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
uint32_t *sse) { \
int32_t diff; \
uint32_t var; \
const uint8_t *h_filter = bilinear_filters[xoffset]; \
const uint8_t *v_filter = bilinear_filters[yoffset]; \
const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \
const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \
\
if (yoffset) { \
if (xoffset) { \
@ -749,20 +748,20 @@ uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \
return var; \
}
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);

View file

@ -14,13 +14,26 @@
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride) {
#include "vpx_dsp/variance.h"
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0 },
{ 112, 16 },
{ 96, 32 },
{ 80, 48 },
{ 64, 64 },
{ 48, 80 },
{ 32, 96 },
{ 16, 112 },
};
uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride) {
int distortion = 0;
int r, c;
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
for (r = 0; r < 4; ++r) {
for (c = 0; c < 4; ++c) {
int diff = a[c] - b[c];
distortion += diff * diff;
}
@ -32,7 +45,7 @@ unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
return distortion;
}
unsigned int vpx_get_mb_ss_c(const int16_t *a) {
uint32_t vpx_get_mb_ss_c(const int16_t *a) {
unsigned int i, sum = 0;
for (i = 0; i < 256; ++i) {
@ -42,16 +55,38 @@ unsigned int vpx_get_mb_ss_c(const int16_t *a) {
return sum;
}
uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0,
b, b_stride, sse);
}
uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4,
b, b_stride, sse);
}
uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4,
b, b_stride, sse);
}
static void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int w, int h, uint32_t *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
@ -62,15 +97,113 @@ static void variance(const uint8_t *a, int a_stride,
}
}
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// the first pass of a 2-D separable filter.
//
// Produces int16_t output to retain precision for the next pass. Two filter
// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
// It defines the offset required to move from one input to the next.
static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
(int)a[pixel_step] * filter[1],
FILTER_BITS);
++a;
}
a += src_pixels_per_line - output_width;
b += output_width;
}
}
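For a concrete feel of the filter above: the two taps always sum to FILTER_WEIGHT (128) and FILTER_BITS is 7, so with xoffset = 4 the taps are {64, 64} and two neighbouring pixels of 100 and 104 produce the rounded half-pel average:
/*   ROUND_POWER_OF_TWO(100 * 64 + 104 * 64, 7)
 *     = (6400 + 6656 + 64) >> 7
 *     = 13120 >> 7
 *     = 102
 */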
// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// the second pass of a 2-D separable filter.
//
// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one input
// to the next. Output is 8-bit.
static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] +
(int)a[pixel_step] * filter[1],
FILTER_BITS);
++a;
}
a += src_pixels_per_line - output_width;
b += output_width;
}
}
#define VAR(W, H) \
unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
uint32_t *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define SUBPIX_VAR(W, H) \
uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
int xoffset, int yoffset, \
const uint8_t *b, int b_stride, \
uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
}
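SUBPIX_VAR strings the two passes together: the first pass produces an (H + 1) x W block of 16-bit intermediates because the vertical second pass reads one row below each output row (pixel_step = W), and the second pass reduces that to an H x W 8-bit block whose variance against the reference is then measured. A usage sketch for the generated 8x8 function (illustration only; the wrapper name is made up):
unsigned int example_halfpel_variance8x8(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  /* Offsets index the 8-entry bilinear_filters table; 4 is the half-pel case. */
  return vpx_sub_pixel_variance8x8_c(src, src_stride, 4, 4, ref, ref_stride, &sse);
}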
#define SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
int xoffset, int yoffset, \
const uint8_t *b, \
int b_stride, \
uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
}
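SUBPIX_AVG_VAR differs only in that the filtered block is first blended with second_pred through vpx_comp_avg_pred before the variance is taken, which is what is needed when scoring an averaged (compound) prediction. A usage sketch (illustration only; the wrapper name and offsets are made up):
unsigned int example_avg_variance16x16(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       const uint8_t *second_pred /* 16x16 */) {
  unsigned int sse;
  return vpx_sub_pixel_avg_variance16x16_c(src, src_stride, 2, 6,
                                           ref, ref_stride, &sse, second_pred);
}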
/* Identical to the variance call except it takes an additional parameter, sum,
* and returns that value using pass-by-reference instead of returning
* sse - sum^2 / w*h
@ -78,7 +211,7 @@ unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
#define GET_VAR(W, H) \
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse, int *sum) { \
uint32_t *sse, int *sum) { \
variance(a, a_stride, b, b_stride, W, H, sse, sum); \
}
@ -87,27 +220,33 @@ void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
* variable.
*/
#define MSE(W, H) \
unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
uint32_t *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse; \
}
VAR(64, 64)
VAR(64, 32)
VAR(32, 64)
VAR(32, 32)
VAR(32, 16)
VAR(16, 32)
VAR(16, 16)
VAR(16, 8)
VAR(8, 16)
VAR(8, 8)
VAR(8, 4)
VAR(4, 8)
VAR(4, 4)
/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
VAR(W, H) \
SUBPIX_VAR(W, H) \
SUBPIX_AVG_VAR(W, H)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
GET_VAR(16, 16)
GET_VAR(8, 8)
@ -117,12 +256,13 @@ MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
int width, int height,
const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
@ -143,8 +283,8 @@ static void highbd_variance64(const uint8_t *a8, int a_stride,
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
@ -156,60 +296,60 @@ static void highbd_variance64(const uint8_t *a8, int a_stride,
static void highbd_8_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)sse_long;
*sse = (uint32_t)sse_long;
*sum = (int)sum_long;
}
static void highbd_10_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
*sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}
static void highbd_12_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int w, int h, uint32_t *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
*sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
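The three wrappers above scale the accumulated totals back toward the 8-bit range so the 32-bit outputs do not overflow and results stay comparable across bit depths: 10-bit samples are 4x larger than 8-bit ones, so sum is rounded down by 2 bits and the quadratically growing sse by 4; 12-bit samples are 16x larger, giving shifts of 4 and 8. In table form (a summary of the code above):
/*  bit depth | sample scale vs 8-bit | sum shift | sse shift
 *      8     |          1x           |     0     |     0
 *     10     |          4x           |     2     |     4
 *     12     |         16x           |     4     |     8
 */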
#define HIGHBD_VAR(W, H) \
unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
uint32_t *sse) { \
int sum; \
highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
uint32_t *sse) { \
int sum; \
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
uint32_t *sse) { \
int sum; \
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
@ -217,54 +357,243 @@ unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
#define HIGHBD_GET_VAR(S) \
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
const uint8_t *ref, int ref_stride, \
uint32_t *sse, int *sum) { \
highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
uint32_t *sse, int *sum) { \
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
uint32_t *sse, int *sum) { \
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}
#define HIGHBD_MSE(W, H) \
unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
uint32_t *sse) { \
int sum; \
highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
uint32_t *sse) { \
int sum; \
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
uint32_t *sse) { \
int sum; \
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
}
static void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
unsigned int i, j;
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
output_ptr[j] =
ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
(int)src_ptr[pixel_step] * filter[1],
FILTER_BITS);
++src_ptr;
}
// Next row...
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
static void highbd_var_filter_block2d_bil_second_pass(
const uint16_t *src_ptr,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; ++j) {
output_ptr[j] =
ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
(int)src_ptr[pixel_step] * filter[1],
FILTER_BITS);
++src_ptr;
}
src_ptr += src_pixels_per_line - output_width;
output_ptr += output_width;
}
}
#define HIGHBD_SUBPIX_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
dst_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
}
#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
dst_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
}
/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
HIGHBD_VAR(W, H) \
HIGHBD_SUBPIX_VAR(W, H) \
HIGHBD_SUBPIX_AVG_VAR(W, H)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)
@ -273,28 +602,14 @@ HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
HIGHBD_VAR(64, 64)
HIGHBD_VAR(64, 32)
HIGHBD_VAR(32, 64)
HIGHBD_VAR(32, 32)
HIGHBD_VAR(32, 16)
HIGHBD_VAR(16, 32)
HIGHBD_VAR(16, 16)
HIGHBD_VAR(16, 8)
HIGHBD_VAR(8, 16)
HIGHBD_VAR(8, 8)
HIGHBD_VAR(8, 4)
HIGHBD_VAR(4, 8)
HIGHBD_VAR(4, 4)
void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}

vpx_dsp/variance.h (new file, 94 lines)
View file

@ -0,0 +1,94 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_VARIANCE_H_
#define VPX_DSP_VARIANCE_H_
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
#define FILTER_BITS 7
#define FILTER_WEIGHT 128
typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b_ptr, int b_stride);
typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride,
const uint8_t *second_pred);
typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
uint8_t *b, int b_stride, int n);
typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sad_array);
typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *const b_array[],
int b_stride,
unsigned int *sad_array);
typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse);
typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
unsigned int *sse);
typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
int a_stride,
int xoffset, int yoffset,
const uint8_t *b_ptr,
int b_stride,
unsigned int *sse,
const uint8_t *second_pred);
#if CONFIG_VP8
typedef struct variance_vtable {
vpx_sad_fn_t sdf;
vpx_variance_fn_t vf;
vpx_subpixvariance_fn_t svf;
vpx_variance_fn_t svf_halfpix_h;
vpx_variance_fn_t svf_halfpix_v;
vpx_variance_fn_t svf_halfpix_hv;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
#if ARCH_X86 || ARCH_X86_64
vp8_copy32xn_fn_t copymem;
#endif
} vp8_variance_fn_ptr_t;
#endif // CONFIG_VP8
#if CONFIG_VP9
typedef struct vp9_variance_vtable {
vpx_sad_fn_t sdf;
vpx_sad_avg_fn_t sdaf;
vpx_variance_fn_t vf;
vpx_subpixvariance_fn_t svf;
vpx_subp_avg_variance_fn_t svaf;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
#endif // CONFIG_VP9
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VPX_DSP_VARIANCE_H_
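These vtables collect the SAD and variance kernels a codec uses for one block size. A sketch of filling the vp9 table with the plain C versions defined elsewhere in this change (illustration only, not part of this commit; assumes the prototypes generated into ./vpx_dsp_rtcd.h and leaves the SAD slots empty):
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/variance.h"

static const vp9_variance_fn_ptr_t example_fn_ptr_16x16 = {
  NULL,                               /* sdf */
  NULL,                               /* sdaf */
  vpx_variance16x16_c,                /* vf */
  vpx_sub_pixel_variance16x16_c,      /* svf */
  vpx_sub_pixel_avg_variance16x16_c,  /* svaf */
  NULL,                               /* sdx3f */
  NULL,                               /* sdx8f */
  NULL                                /* sdx4df */
};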

View file

@ -10,6 +10,8 @@
DSP_SRCS-yes += vpx_dsp.mk
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
@ -19,7 +21,6 @@ DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
@ -45,21 +46,36 @@ endif # CONFIG_ENCODERS
ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += variance.c
DSP_SRCS-yes += variance.h
DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3
endif # CONFIG_USE_X86INC
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
endif # CONFIG_USE_X86INC
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC

View file

@ -412,6 +412,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
#
# Variance
#
add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
@ -451,7 +454,9 @@ add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x4 mmx sse2 msa/;
#
# Specialty Variance
#
add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
@ -478,6 +483,99 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
#
# Subpixel Variance
#
add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
#
# Specialty Subpixel
#
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_variance_halfpixvar16x16_h mmx media/;
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_variance_halfpixvar16x16_v mmx media/;
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_variance_halfpixvar16x16_hv mmx media/;
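Each add_proto line above declares one function signature in the generated vpx_dsp_rtcd.h, and the matching specialize line lists the optimised versions the run-time dispatcher may select over the C fallback when the CPU supports them. Roughly, and only as a simplified sketch of the generated header rather than its literal contents, the first subpixel entry expands to:
uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride,
                                       int xoffset, int yoffset,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       uint32_t *sse);
uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride,
                                          int xoffset, int yoffset,
                                          const uint8_t *ref_ptr, int ref_stride,
                                          uint32_t *sse);
RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr,
                                                    int source_stride,
                                                    int xoffset, int yoffset,
                                                    const uint8_t *ref_ptr,
                                                    int ref_stride, uint32_t *sse);
/* ...and inside the rtcd setup function the pointer is switched to the best
 * version available, e.g.
 *   vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_c;
 *   if (flags & HAS_AVX2)
 *     vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_avx2;
 */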
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x64 sse2/;
@ -615,6 +713,226 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_12_mse8x8 sse2/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
#
# Subpixel Variance
#
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC

View file

@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%define program_name vpx
%include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
@ -30,7 +32,7 @@ bilin_filter_m_sse2: times 8 dw 16
SECTION .text
; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
; int x_offset, int y_offset,
; const uint8_t *dst, ptrdiff_t dst_stride,
; int height, unsigned int *sse);

Просмотреть файл

@ -8,9 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
@ -243,3 +241,341 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
#if CONFIG_USE_X86INC
#define DECL(w, opt) \
int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse);
#define DECLS(opt1, opt2) \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
// TODO(johannkoenig): enable the ssse3 or delete
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
uint32_t *sse_ptr) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, h, \
&sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
dst + 48, dst_stride, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
int start_row; \
uint32_t sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
for (start_row = 0; start_row < h; start_row +=16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
}\
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2, sse);
#undef FNS
#undef FN
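The FN macro above stitches the exported functions together from narrow SSE2 column kernels: the assembly only processes an 8- or 16-pixel-wide strip, so wider blocks call it again at column offsets +16, +32 and +48 and add up the partial se/sse values. The 10- and 12-bit variants then scale the sums down with ROUND_POWER_OF_TWO so the final subtraction stays within 32 bits, and the 12-bit path additionally walks the block in 16-row slices with a 64-bit sse accumulator to avoid overflow. The combine step at the end of every variant reduces to the sketch below (illustrative only):

#include <stdint.h>

/* Fold two partial (sum, sse) results into a variance; wlog2 + hlog2 is
 * log2(width * height), e.g. 5 + 5 = 10 for a 32x32 block. */
static uint32_t combine_partial_variance(int se1, uint32_t sse1,
                                         int se2, uint32_t sse2,
                                         int wlog2, int hlog2,
                                         uint32_t *sse_out) {
  const int se = se1 + se2;
  const uint32_t sse = sse1 + sse2;
  *sse_out = sse;
  return sse - (uint32_t)(((int64_t)se * se) >> (wlog2 + hlog2));
}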
#define DECL(w, opt) \
int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
const uint16_t *sec, \
ptrdiff_t sec_stride, \
int height, \
unsigned int *sse);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)
DECLS(sse2);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, sec, w, h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, x_offset, y_offset, \
dst + 16, dst_stride, sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, x_offset, y_offset, \
dst + 32, dst_stride, sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, x_offset, y_offset, \
dst + 48, dst_stride, sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
uint32_t sse2; \
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
const uint8_t *sec8) { \
int start_row; \
uint32_t sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
for (start_row = 0; start_row < h; start_row +=16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, x_offset, \
y_offset, dst + (start_row * dst_stride), dst_stride, \
sec + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 16 + (start_row * dst_stride), dst_stride, \
sec + 16 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 32 + (start_row * dst_stride), dst_stride, \
sec + 32 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 48 + (start_row * dst_stride), dst_stride, \
sec + 48 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
} \
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2);
#undef FNS
#undef FN
#endif // CONFIG_USE_X86INC

View file

@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%define program_name vpx
%include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
@ -39,7 +41,7 @@ bilin_filter_m_ssse3: times 8 db 16, 0
SECTION .text
; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
; int x_offset, int y_offset,
; const uint8_t *dst, ptrdiff_t dst_stride,
; int height, unsigned int *sse);

Просмотреть файл

@ -91,3 +91,93 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
int x_offset, int y_offset,
const uint8_t *dst, int dst_stride,
int height,
unsigned int *sse);
unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
const uint8_t *sec,
int sec_stride,
int height,
unsigned int *sseptr);
unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
unsigned int sse1;
const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
64, &sse1);
unsigned int sse2;
const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride,
x_offset, y_offset,
dst + 32, dst_stride,
64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
32, sse);
return *sse - (((int64_t)se * se) >> 10);
}
unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse,
const uint8_t *sec) {
unsigned int sse1;
const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
sec, 64, 64, &sse1);
unsigned int sse2;
const int se2 =
vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
y_offset, dst + 32, dst_stride,
sec + 32, 64, 64, &sse2);
const int se = se1 + se2;
*sse = sse1 + sse2;
return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
unsigned int *sse,
const uint8_t *sec) {
// Process 32 elements in parallel.
const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
y_offset, dst, dst_stride,
sec, 32, 32, sse);
return *sse - (((int64_t)se * se) >> 10);
}
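In all of these wrappers the final shift is simply log2 of the pixel count: a 64x64 block subtracts (sum * sum) >> 12 because 64 * 64 = 4096, and a 32x32 block uses >> 10. The common tail can be written as the small helper below (illustrative only):

#include <stdint.h>

/* variance = SSE - sum^2 / N, with N = width * height a power of two. */
static unsigned int variance_from_sums(int64_t sum, unsigned int sse, int log2_n) {
  return sse - (unsigned int)((sum * sum) >> log2_n);
}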

View file

@ -11,6 +11,27 @@
#include <immintrin.h> // AVX2
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
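Each row of this table holds one pair of bilinear weights replicated across 32 bytes so that a single aligned load fills a ymm register; the functions below select a row with offset << 5. The two weights of every pair sum to 16, which matches the + 8 / >> 4 rounding applied in FILTER_SRC. A scalar sketch of one filtered pixel (illustrative only, w0/w1 being the selected pair):

/* One bilinear tap as performed by FILTER_SRC: w0 + w1 == 16. */
static unsigned char bilinear_tap(unsigned char p0, unsigned char p1,
                                  int w0, int w1) {
  return (unsigned char)((w0 * p0 + w1 * p1 + 8) >> 4);
}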
void vpx_get16x16var_avx2(const unsigned char *src_ptr,
int source_stride,
@ -213,3 +234,494 @@ void vpx_get32x32var_avx2(const unsigned char *src_ptr,
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
}
}
#define FILTER_SRC(filter) \
/* filter the source */ \
exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
\
/* add 8 to source */ \
exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
\
/* divide source by 16 */ \
exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
#define MERGE_WITH_SRC(src_reg, reg) \
exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
#define LOAD_SRC_DST \
/* load source and destination */ \
src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
#define AVG_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
(src + size_stride)); \
/* average between current and next stride source */ \
src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
#define MERGE_NEXT_SRC(src_reg, size_stride) \
src_next_reg = _mm256_loadu_si256((__m256i const *) \
(src + size_stride)); \
MERGE_WITH_SRC(src_reg, src_next_reg)
#define CALC_SUM_SSE_INSIDE_LOOP \
/* expand each byte to 2 bytes */ \
exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
/* source - dest */ \
exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
/* calculate sum */ \
sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
/* calculate sse */ \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
// final calculation to sum and sse
#define CALC_SUM_AND_SSE \
res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
\
sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
\
sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
*((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
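Taken together, these macros implement one step of the accumulation: form a predicted source pixel (raw, averaged, or bilinearly filtered), subtract the destination pixel, add the signed difference to the running sum and its square to the running SSE; CALC_SUM_AND_SSE then folds the 256-bit accumulators down to the two scalars the caller needs. Per pixel, the arithmetic is just (illustrative only):

#include <stdint.h>

/* Scalar equivalent of CALC_SUM_SSE_INSIDE_LOOP for a single pixel. */
static void accumulate_pixel(uint8_t pred, uint8_t dst, int *sum, uint32_t *sse) {
  const int diff = (int)pred - (int)dst;  /* source prediction minus destination */
  *sum += diff;                           /* signed sum, used for the mean */
  *sse += (uint32_t)(diff * diff);        /* sum of squared errors */
}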
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
int height,
unsigned int *sse) {
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
sum_reg = _mm256_set1_epi16(0);
sse_reg = _mm256_set1_epi16(0);
zero_reg = _mm256_set1_epi16(0);
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
}
// x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// average the previous row's half-pel result with the current one
src_avg = _mm256_avg_epu8(src_avg, src_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
// x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average the previous packed row with the current one
src_pack = _mm256_avg_epu8(src_pack, src_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src_pack = src_reg;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
xfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
yfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// interleave the previous packed row with the current one
MERGE_WITH_SRC(src_pack, src_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
}
CALC_SUM_AND_SSE
return sum;
}
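The branching above is the same in every direction: an offset of 0 needs no interpolation, the half-pel case (the == 8 comparison) is handled with the cheaper rounded byte average _mm256_avg_epu8, and any other offset goes through the full bilinear filter; for vertical filtering the previously filtered row (src_pack/src_avg) is reused so each source row is loaded and filtered only once. The per-direction prediction choice reduces to (illustrative only):

#include <stdint.h>

/* Per-direction prediction as selected by the offset tests above;
 * w0 + w1 == 16 is the bilinear pair chosen from the filter table. */
static uint8_t predict_1d(uint8_t p0, uint8_t p1, int offset, int w0, int w1) {
  if (offset == 0) return p0;                             /* no interpolation */
  if (offset == 8) return (uint8_t)((p0 + p1 + 1) >> 1);  /* half-pel: rounded average */
  return (uint8_t)((w0 * p0 + w1 * p1 + 8) >> 4);         /* general bilinear tap */
}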
unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int src_stride,
int x_offset,
int y_offset,
const uint8_t *dst,
int dst_stride,
const uint8_t *sec,
int sec_stride,
int height,
unsigned int *sse) {
__m256i sec_reg;
__m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
__m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
__m256i zero_reg;
int i, sum;
sum_reg = _mm256_set1_epi16(0);
sse_reg = _mm256_set1_epi16(0);
zero_reg = _mm256_set1_epi16(0);
// x_offset = 0 and y_offset = 0
if (x_offset == 0) {
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
} else if (y_offset == 8) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 0 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
}
// x_offset = 8 and y_offset = 0
} else if (x_offset == 8) {
if (y_offset == 0) {
__m256i src_next_reg;
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_reg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = 8 and y_offset = 8
} else if (y_offset == 8) {
__m256i src_next_reg, src_avg;
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
// average the previous row's half-pel result with the current one
src_avg = _mm256_avg_epu8(src_avg, src_reg);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = 8 and y_offset = bilin interpolation
} else {
__m256i filter, pw8, src_next_reg, src_avg;
y_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
AVG_NEXT_SRC(src_reg, 1)
for (i = 0; i < height ; i++) {
// save current source average
src_avg = src_reg;
src+= src_stride;
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
// x_offset = bilin interpolation and y_offset = 0
} else {
if (y_offset == 0) {
__m256i filter, pw8, src_next_reg;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
MERGE_WITH_SRC(src_reg, zero_reg)
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
src+= src_stride;
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = 8
} else if (y_offset == 8) {
__m256i filter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
filter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
pw8 = _mm256_set1_epi16(8);
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average the previous packed row with the current one
src_pack = _mm256_avg_epu8(src_pack, src_reg);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
// x_offset = bilin interpolation and y_offset = bilin interpolation
} else {
__m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
x_offset <<= 5;
xfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + x_offset));
y_offset <<= 5;
yfilter = _mm256_load_si256((__m256i const *)
(bilinear_filters_avx2 + y_offset));
pw8 = _mm256_set1_epi16(8);
// load the source row and the same row offset by one byte
src_reg = _mm256_loadu_si256((__m256i const *) (src));
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
// pack the 16-bit results back to 8 bits within each 128-bit lane
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
for (i = 0; i < height ; i++) {
src+= src_stride;
LOAD_SRC_DST
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(xfilter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// interleave the previous packed row with the current one
MERGE_WITH_SRC(src_pack, src_reg)
// filter the source
FILTER_SRC(yfilter)
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
sec+= sec_stride;
CALC_SUM_SSE_INSIDE_LOOP
dst+= dst_stride;
}
}
}
CALC_SUM_AND_SSE
return sum;
}
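The _avg_ variant differs from the plain function only in one extra step: after the sub-pixel prediction is formed, each 32-byte row is averaged with the matching row of the second predictor sec before the difference against dst is accumulated. Per byte, _mm256_avg_epu8 computes a rounding average, i.e. (illustrative only):

#include <stdint.h>

/* The compound-prediction step added by the _avg_ variant. */
static uint8_t compound_pred(uint8_t subpel_pred, uint8_t second_pred) {
  return (uint8_t)((subpel_pred + second_pred + 1) >> 1);  /* rounding average */
}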

View file

@ -11,6 +11,8 @@
%include "vpx_ports/x86_abi_support.asm"
%define mmx_filter_shift 7
;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
@ -52,7 +54,6 @@ sym(vpx_get_mb_ss_mmx):
movsxd rcx, dword ptr [rsp+4]
add rax, rcx
; begin epilog
add rsp, 8
pop rdi
@ -62,7 +63,6 @@ sym(vpx_get_mb_ss_mmx):
pop rbp
ret
;void vpx_get8x8var_mmx
;(
; unsigned char *src_ptr,
@ -83,7 +83,6 @@ sym(vpx_get8x8var_mmx):
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mm5
pxor mm6, mm6 ; Blank mm6
pxor mm7, mm7 ; Blank mm7
@ -117,7 +116,6 @@ sym(vpx_get8x8var_mmx):
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 2
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
@ -298,7 +296,6 @@ sym(vpx_get8x8var_mmx):
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
@ -308,8 +305,6 @@ sym(vpx_get8x8var_mmx):
pop rbp
ret
;void
;vpx_get4x4var_mmx
;(
@ -331,7 +326,6 @@ sym(vpx_get4x4var_mmx):
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mm5
pxor mm6, mm6 ; Blank mm6
pxor mm7, mm7 ; Blank mm7
@ -354,7 +348,6 @@ sym(vpx_get4x4var_mmx):
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
@ -393,7 +386,6 @@ sym(vpx_get4x4var_mmx):
pmaddwd mm0, mm0 ; square and accumulate
paddd mm7, mm0 ; accumulate in mm7
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
@ -413,7 +405,6 @@ sym(vpx_get4x4var_mmx):
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
@ -422,3 +413,332 @@ sym(vpx_get4x4var_mmx):
UNSHADOW_ARGS
pop rbp
ret
;void vpx_filter_block2d_bil4x4_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil4x4_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(4) ;HFilter ;
mov rdx, arg(5) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
mov rcx, 4 ;
pxor mm0, mm0 ;
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm5, mm1
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
add rsi, r8
%endif
.filter_block2d_bil4x4_var_mmx_loop:
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm5, mm1 ;
pmullw mm3, [rdx] ;
pmullw mm1, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movd mm3, [rdi] ;
punpcklbw mm3, mm0 ;
psubw mm1, mm3 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddd mm7, mm1 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil4x4_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(6) ;sum
mov rsi, arg(7) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vpx_filter_block2d_bil_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(5) ;HFilter ;
mov rdx, arg(6) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
pxor mm0, mm0 ;
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm5, mm1
packuswb mm5, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
add rsi, r8
%endif
.filter_block2d_bil_var_mmx_loop:
movq mm1, [rsi] ;
movq mm3, [rsi+1] ;
movq mm2, mm1 ;
movq mm4, mm3 ;
punpcklbw mm1, mm0 ;
punpckhbw mm2, mm0 ;
pmullw mm1, [rax] ;
pmullw mm2, [rax] ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
pmullw mm3, [rax+8] ;
pmullw mm4, [rax+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm2, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm4, mm5 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
movq mm5, mm1 ;
packuswb mm5, mm2 ;
pmullw mm3, [rdx] ;
pmullw mm4, [rdx] ;
pmullw mm1, [rdx+8] ;
pmullw mm2, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm2, mm4 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
paddw mm2, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
psraw mm2, mmx_filter_shift ;
movq mm3, [rdi] ;
movq mm4, mm3 ;
punpcklbw mm3, mm0 ;
punpckhbw mm4, mm0 ;
psubw mm1, mm3 ;
psubw mm2, mm4 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddw mm6, mm2 ;
pmaddwd mm2, mm2 ;
paddd mm7, mm1 ;
paddd mm7, mm2 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(7) ;sum
mov rsi, arg(8) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
times 4 dw 64

View file

@ -10,12 +10,45 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
{ 128, 128, 128, 128, 0, 0, 0, 0 },
{ 112, 112, 112, 112, 16, 16, 16, 16 },
{ 96, 96, 96, 96, 32, 32, 32, 32 },
{ 80, 80, 80, 80, 48, 48, 48, 48 },
{ 64, 64, 64, 64, 64, 64, 64, 64 },
{ 48, 48, 48, 48, 80, 80, 80, 80 },
{ 32, 32, 32, 32, 96, 96, 96, 96 },
{ 16, 16, 16, 16, 112, 112, 112, 112 }
};
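Unlike the 16-weight AVX2 table, these MMX filter pairs sum to 128: the assembly multiplies with pmullw, adds the rounding constant mmx_bi_rd (64) and shifts right by mmx_filter_shift (7). Each row stores the first weight four times followed by the second weight four times, matching the [rax] / [rax+8] loads in the assembly. One tap therefore works out to (illustrative only):

#include <stdint.h>

/* One bilinear tap as computed by the MMX passes: filter[0..3] is the first
 * weight, filter[4..7] the second, and the weights sum to 128. */
static int16_t mmx_bilinear_tap(uint8_t p0, uint8_t p1, const int16_t *filter) {
  return (int16_t)((filter[0] * p0 + filter[4] * p1 + 64) >> 7);
}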
extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse, int *sum);
unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
const int16_t *HFilter,
const int16_t *VFilter,
int *sum,
unsigned int *sumsquared);
extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const int16_t *HFilter,
const int16_t *VFilter,
int *sum,
unsigned int *sumsquared);
unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
@@ -25,8 +58,8 @@ unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
return (var - (((unsigned int)avg * avg) >> 4));
}
unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
@@ -37,8 +70,8 @@ unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3;
@@ -55,8 +88,8 @@ unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
return var;
}
unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
@@ -74,8 +107,8 @@ unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
return (var - (((unsigned int)avg * avg) >> 8));
}
unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
@@ -89,8 +122,8 @@ unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
@@ -105,3 +138,112 @@ unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
return (var - (((unsigned int)avg * avg) >> 7));
}
uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int xsum;
unsigned int xxsum;
vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum, &xxsum);
*sse = xxsum;
return (xxsum - (((unsigned int)xsum * xsum) >> 4));
}
uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int xsum;
uint32_t xxsum;
vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum, &xxsum);
*sse = xxsum;
return (xxsum - (((uint32_t)xsum * xsum) >> 6));
}
uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum0, &xxsum0);
vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
}
uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum0, &xxsum0);
vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
}
uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int xsum;
unsigned int xxsum;
vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
bilinear_filters_mmx[xoffset],
bilinear_filters_mmx[yoffset],
&xsum, &xxsum);
*sse = xxsum;
return (xxsum - (((uint32_t)xsum * xsum) >> 7));
}
uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
}
uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
}
uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
}
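A minimal, hypothetical usage sketch of the 16x16 MMX wrapper follows (buffer sizes and fill values are made up; the reference buffer is padded by one row and one column because the bilinear taps read one pixel beyond the block in each direction):

#include <stdint.h>
#include <string.h>

uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
                                         int xoffset, int yoffset,
                                         const uint8_t *b, int b_stride,
                                         uint32_t *sse);

static uint32_t variance_example(void) {
  uint8_t ref[17 * 17];  /* reference, padded for the sub-pixel filter */
  uint8_t src[16 * 16];  /* source block */
  uint32_t sse;

  memset(ref, 128, sizeof(ref));
  memset(src, 130, sizeof(src));

  /* Half-pel in both directions (xoffset = yoffset = 4); the return value is
   * sse - (sum * sum >> 8) for the 256-pixel block. */
  return vpx_sub_pixel_variance16x16_mmx(ref, 17, 4, 4, src, 16, &sse);
}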

View file

@@ -307,3 +307,171 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
#if CONFIG_USE_X86INC
// The 2 unused parameters are placeholders for the PIC-enabled build.
// These definitions are for functions defined in subpel_variance.asm
#define DECL(w, opt) \
int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
h, &sse, NULL, NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
FN(4, 4, 4, 2, 2, opt2, (uint32_t))
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN
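As an illustration of the macro machinery, this is roughly what FN(32, 32, 16, 5, 5, sse2, (int64_t)) produces (a hand-written sketch, not a literal preprocessor dump): blocks wider than the 16-pixel asm helper are split into 16-wide columns, and the se * se correction term is computed in 64 bits because it can exceed 32 bits for blocks of 32x32 and larger.

#include <stddef.h>
#include <stdint.h>

int vpx_sub_pixel_variance16xh_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                    int x_offset, int y_offset,
                                    const uint8_t *dst, ptrdiff_t dst_stride,
                                    int height, unsigned int *sse,
                                    void *unused0, void *unused);

unsigned int vpx_sub_pixel_variance32x32_sse2(const uint8_t *src, int src_stride,
                                              int x_offset, int y_offset,
                                              const uint8_t *dst, int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse, sse2;
  int se = vpx_sub_pixel_variance16xh_sse2(src, src_stride, x_offset, y_offset,
                                           dst, dst_stride, 32, &sse, NULL, NULL);
  int se2 = vpx_sub_pixel_variance16xh_sse2(src + 16, src_stride, x_offset,
                                            y_offset, dst + 16, dst_stride, 32,
                                            &sse2, NULL, NULL);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  return sse - (unsigned int)(((int64_t)se * se) >> (5 + 5));
}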
// The 2 unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *sec, \
ptrdiff_t sec_stride, \
int height, unsigned int *sse, \
void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sseptr, \
const uint8_t *sec) { \
unsigned int sse; \
int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse, NULL, \
NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2, \
NULL, NULL); \
se += se2; \
sse += sse2; \
} \
} \
*sseptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \
FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \
FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \
FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \
FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \
FN(4, 4, 4, 2, 2, opt2, (uint32_t))
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN
#endif // CONFIG_USE_X86INC
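The avg variants take an extra second predictor (sec); conceptually the asm blends the sub-pixel prediction with it by a rounded average before the difference against dst is taken. A minimal sketch of that per-pixel step (avg_pred_pixel is a hypothetical helper, shown only to explain the sec argument):

#include <stdint.h>

static uint8_t avg_pred_pixel(uint8_t filtered_pred, uint8_t second_pred) {
  /* Rounded average of the bilinear prediction and the second predictor. */
  return (uint8_t)((filtered_pred + second_pred + 1) >> 1);
}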