Move shared SAD code to vpx_dsp

Create a new component, vpx_dsp, for code that can be shared
between codecs. Move the SAD code into the component.

This reduces the size of vpxenc/dec by 36k on x86_64 builds.

Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a
Johann 2015-04-17 16:11:38 -04:00
Parent c77b1f5acd
Commit d5d9289800
44 changed files with 2132 additions and 4349 deletions
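The heart of the change is the SAD interface itself: the vp8-specific functions (removed below) took a max_sad early-out argument, while the shared vpx_dsp versions drop it and take const uint8_t pointers. A minimal sketch of the shared scalar kernel, under a hypothetical name (the new vpx_dsp sources themselves are not shown in this view):

#include <stdlib.h>
#include "vpx/vpx_integer.h"

/* Sketch only: no max_sad parameter, unlike the removed vp8_sad*_c below. */
static unsigned int sad_sketch(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride,
                               int width, int height) {
  int r, c;
  unsigned int sad = 0;
  for (r = 0; r < height; r++) {
    for (c = 0; c < width; c++) sad += abs(src_ptr[c] - ref_ptr[c]);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
  return sad;
}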

View file

@@ -54,6 +54,9 @@ CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk
CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))
include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk
CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
VP8_PREFIX=vp8/
include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk

File diff suppressed because it is too large.

View file

@@ -19,6 +19,7 @@ extern void vp8_rtcd();
#if CONFIG_VP9
extern void vp9_rtcd();
#endif // CONFIG_VP9
extern void vpx_dsp_rtcd();
extern void vpx_scale_rtcd();
}
#include "third_party/googletest/src/include/gtest/gtest.h"
@@ -64,6 +65,7 @@ int main(int argc, char **argv) {
#if CONFIG_VP9
vp9_rtcd();
#endif // CONFIG_VP9
vpx_dsp_rtcd();
vpx_scale_rtcd();
#endif // !CONFIG_SHARED

View file

@@ -1,184 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
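/* These vp8_sad*_neon kernels (removed in favor of shared vpx_dsp versions)
 * all follow one pattern: vabdl_u8 seeds a widening absolute-difference
 * accumulator from the first row, vabal_u8 folds in each remaining row, and
 * vpaddl/vadd reduce the lanes horizontally to a single 32-bit SAD. */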
unsigned int vp8_sad8x8_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 7; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad8x16_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 15; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad4x4_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x2_t d1;
uint64x1_t d3;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 3; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
d1 = vpaddl_u16(vget_low_u16(q12));
d3 = vpaddl_u32(d1);
return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}
unsigned int vp8_sad16x16_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x16_t q0, q4;
uint16x8_t q12, q13;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
for (i = 0; i < 15; i++) {
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
}
q12 = vaddq_u16(q12, q13);
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad16x8_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x16_t q0, q4;
uint16x8_t q12, q13;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
for (i = 0; i < 7; i++) {
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
}
q12 = vaddq_u16(q12, q13);
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}

vp8/common/copy_c.c (new file, 71 lines)
View file

@@ -0,0 +1,71 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vpx/vpx_integer.h"
/* Copy 2 macroblocks to a buffer */
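/* With CONFIG_FAST_UNALIGNED the 32 bytes per row are copied as eight
 * 32-bit words, which assumes the platform tolerates unaligned word
 * access; otherwise each byte is copied individually. */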
void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
unsigned char *dst_ptr, int dst_stride,
int height)
{
int r;
for (r = 0; r < height; r++)
{
#if !(CONFIG_FAST_UNALIGNED)
dst_ptr[0] = src_ptr[0];
dst_ptr[1] = src_ptr[1];
dst_ptr[2] = src_ptr[2];
dst_ptr[3] = src_ptr[3];
dst_ptr[4] = src_ptr[4];
dst_ptr[5] = src_ptr[5];
dst_ptr[6] = src_ptr[6];
dst_ptr[7] = src_ptr[7];
dst_ptr[8] = src_ptr[8];
dst_ptr[9] = src_ptr[9];
dst_ptr[10] = src_ptr[10];
dst_ptr[11] = src_ptr[11];
dst_ptr[12] = src_ptr[12];
dst_ptr[13] = src_ptr[13];
dst_ptr[14] = src_ptr[14];
dst_ptr[15] = src_ptr[15];
dst_ptr[16] = src_ptr[16];
dst_ptr[17] = src_ptr[17];
dst_ptr[18] = src_ptr[18];
dst_ptr[19] = src_ptr[19];
dst_ptr[20] = src_ptr[20];
dst_ptr[21] = src_ptr[21];
dst_ptr[22] = src_ptr[22];
dst_ptr[23] = src_ptr[23];
dst_ptr[24] = src_ptr[24];
dst_ptr[25] = src_ptr[25];
dst_ptr[26] = src_ptr[26];
dst_ptr[27] = src_ptr[27];
dst_ptr[28] = src_ptr[28];
dst_ptr[29] = src_ptr[29];
dst_ptr[30] = src_ptr[30];
dst_ptr[31] = src_ptr[31];
#else
((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ;
((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ;
((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ;
((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ;
((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ;
((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ;
((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ;
((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ;
#endif
src_ptr += src_stride;
dst_ptr += dst_stride;
}
}

View file

@@ -17,10 +17,11 @@
* higher quality.
*/
#include "postproc.h"
#include "variance.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/postproc.h"
#include "vp8/common/variance.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8_rtcd.h"
#include "vpx_scale/yv12config.h"
#include <limits.h>
@@ -160,9 +161,9 @@ static void multiframe_quality_enhance_block
vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 32)>>6;
#else
sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
usad = (vp8_sad8x8(u, uv_stride, ud, uvd_stride, UINT_MAX) + 32) >> 6;
vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, UINT_MAX)+ 32) >> 6;
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6;
vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride)+ 32) >> 6;
#endif
}
else /* if (blksize == 8) */
@@ -177,9 +178,9 @@ static void multiframe_quality_enhance_block
vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 8)>>4;
#else
sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6;
usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, UINT_MAX) + 8) >> 4;
vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, UINT_MAX) + 8) >> 4;
sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4;
vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4;
#endif
}
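The shift expressions above are rounded per-pixel averages of the SAD: for an N-pixel block the code computes (SAD + N/2) >> log2(N). A sketch of the identity, assuming N is a power of two:

/* Per-pixel mean absolute difference with round-to-nearest (illustrative). */
#define MEAN_SAD(sad, log2_n) (((sad) + (1u << ((log2_n) - 1))) >> (log2_n))
/* 16x16 block: 256 pixels -> MEAN_SAD(sad, 8) == (sad + 128) >> 8
 *  8x8 block :  64 pixels -> MEAN_SAD(sad, 6) == (sad +  32) >> 6 */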

View file

@@ -303,88 +303,6 @@ specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
#
# Single block SAD
#
add_proto qw/unsigned int vp8_sad4x4/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
specialize qw/vp8_sad4x4 mmx sse2 neon/;
$vp8_sad4x4_sse2=vp8_sad4x4_wmt;
add_proto qw/unsigned int vp8_sad8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
specialize qw/vp8_sad8x8 mmx sse2 neon/;
$vp8_sad8x8_sse2=vp8_sad8x8_wmt;
add_proto qw/unsigned int vp8_sad8x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
specialize qw/vp8_sad8x16 mmx sse2 neon/;
$vp8_sad8x16_sse2=vp8_sad8x16_wmt;
add_proto qw/unsigned int vp8_sad16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
specialize qw/vp8_sad16x8 mmx sse2 neon/;
$vp8_sad16x8_sse2=vp8_sad16x8_wmt;
add_proto qw/unsigned int vp8_sad16x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
specialize qw/vp8_sad16x16 mmx sse2 sse3 media neon/;
$vp8_sad16x16_sse2=vp8_sad16x16_wmt;
$vp8_sad16x16_media=vp8_sad16x16_armv6;
#
# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
#
add_proto qw/void vp8_sad4x4x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp8_sad4x4x3 sse3/;
add_proto qw/void vp8_sad8x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp8_sad8x8x3 sse3/;
add_proto qw/void vp8_sad8x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp8_sad8x16x3 sse3/;
add_proto qw/void vp8_sad16x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp8_sad16x8x3 sse3 ssse3/;
add_proto qw/void vp8_sad16x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp8_sad16x16x3 sse3 ssse3/;
# Note the only difference in the following prototypes is that they return into
# an array of short
add_proto qw/void vp8_sad4x4x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
specialize qw/vp8_sad4x4x8 sse4_1/;
$vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4;
add_proto qw/void vp8_sad8x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
specialize qw/vp8_sad8x8x8 sse4_1/;
$vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4;
add_proto qw/void vp8_sad8x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
specialize qw/vp8_sad8x16x8 sse4_1/;
$vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4;
add_proto qw/void vp8_sad16x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
specialize qw/vp8_sad16x8x8 sse4_1/;
$vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4;
add_proto qw/void vp8_sad16x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
specialize qw/vp8_sad16x16x8 sse4_1/;
$vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4;
#
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vp8_sad4x4x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp8_sad4x4x4d sse3/;
add_proto qw/void vp8_sad8x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp8_sad8x8x4d sse3/;
add_proto qw/void vp8_sad8x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp8_sad8x16x4d sse3/;
add_proto qw/void vp8_sad16x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp8_sad16x8x4d sse3/;
add_proto qw/void vp8_sad16x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp8_sad16x16x4d sse3/;
#
# Encoder functions below this point.
#
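The removed prototypes cover three multi-block SAD flavors. Their semantics, sketched against the sad_sketch kernel above (function names illustrative, not the vpx_dsp API):

/* x3 (and x8, which differs only in the result type): candidate blocks
 * one pixel apart horizontally. */
void sad16x16x3_sketch(const unsigned char *src, int src_stride,
                       const unsigned char *ref, int ref_stride,
                       unsigned int *out) {
  int i;
  for (i = 0; i < 3; i++)
    out[i] = sad_sketch(src, src_stride, ref + i, ref_stride, 16, 16);
}

/* x4d: four independent candidate blocks. */
void sad16x16x4d_sketch(const unsigned char *src, int src_stride,
                        const unsigned char *const ref[4], int ref_stride,
                        unsigned int *out) {
  int i;
  for (i = 0; i < 4; i++)
    out[i] = sad_sketch(src, src_stride, ref[i], ref_stride, 16, 16);
}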

View file

@@ -1,302 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <limits.h>
#include <stdlib.h>
#include "vpx_config.h"
#include "vpx/vpx_integer.h"
static unsigned int sad_mx_n_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int max_sad, int m, int n)
{
int r, c;
unsigned int sad = 0;
for (r = 0; r < n; r++)
{
for (c = 0; c < m; c++)
{
sad += abs(src_ptr[c] - ref_ptr[c]);
}
if (sad > max_sad)
break;
src_ptr += src_stride;
ref_ptr += ref_stride;
}
return sad;
}
/* max_sad is provided as an optional optimization point. Alternative
* implementations of these functions are not required to check it.
*/
unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int max_sad)
{
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 16);
}
unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int max_sad)
{
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 8);
}
unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int max_sad)
{
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 8);
}
unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int max_sad)
{
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 16);
}
unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int max_sad)
{
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 4, 4);
}
void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int *sad_array)
{
sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
}
void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned short *sad_array)
{
sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
}
void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int *sad_array)
{
sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
}
void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned short *sad_array)
{
sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
}
void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int *sad_array)
{
sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
}
void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned short *sad_array)
{
sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
}
void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int *sad_array)
{
sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
}
void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned short *sad_array)
{
sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
}
void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned int *sad_array)
{
sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
}
void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
unsigned short *sad_array)
{
sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
}
void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride,
const unsigned char * const ref_ptr[], int ref_stride,
unsigned int *sad_array)
{
sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
}
void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride,
const unsigned char * const ref_ptr[], int ref_stride,
unsigned int *sad_array)
{
sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
}
void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride,
const unsigned char * const ref_ptr[], int ref_stride,
unsigned int *sad_array)
{
sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
}
void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride,
const unsigned char * const ref_ptr[], int ref_stride,
unsigned int *sad_array)
{
sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
}
void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride,
const unsigned char * const ref_ptr[], int ref_stride,
unsigned int *sad_array)
{
sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
}
/* Copy 2 macroblocks to a buffer */
void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
unsigned char *dst_ptr, int dst_stride,
int height)
{
int r;
for (r = 0; r < height; r++)
{
#if !(CONFIG_FAST_UNALIGNED)
dst_ptr[0] = src_ptr[0];
dst_ptr[1] = src_ptr[1];
dst_ptr[2] = src_ptr[2];
dst_ptr[3] = src_ptr[3];
dst_ptr[4] = src_ptr[4];
dst_ptr[5] = src_ptr[5];
dst_ptr[6] = src_ptr[6];
dst_ptr[7] = src_ptr[7];
dst_ptr[8] = src_ptr[8];
dst_ptr[9] = src_ptr[9];
dst_ptr[10] = src_ptr[10];
dst_ptr[11] = src_ptr[11];
dst_ptr[12] = src_ptr[12];
dst_ptr[13] = src_ptr[13];
dst_ptr[14] = src_ptr[14];
dst_ptr[15] = src_ptr[15];
dst_ptr[16] = src_ptr[16];
dst_ptr[17] = src_ptr[17];
dst_ptr[18] = src_ptr[18];
dst_ptr[19] = src_ptr[19];
dst_ptr[20] = src_ptr[20];
dst_ptr[21] = src_ptr[21];
dst_ptr[22] = src_ptr[22];
dst_ptr[23] = src_ptr[23];
dst_ptr[24] = src_ptr[24];
dst_ptr[25] = src_ptr[25];
dst_ptr[26] = src_ptr[26];
dst_ptr[27] = src_ptr[27];
dst_ptr[28] = src_ptr[28];
dst_ptr[29] = src_ptr[29];
dst_ptr[30] = src_ptr[30];
dst_ptr[31] = src_ptr[31];
#else
((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ;
((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ;
((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ;
((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ;
((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ;
((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ;
((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ;
((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ;
#endif
src_ptr += src_stride;
dst_ptr += dst_stride;
}
}

View file

@@ -14,16 +14,17 @@
#include "vpx_config.h"
#include "vpx/vpx_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef unsigned int(*vp8_sad_fn_t)(
const unsigned char *src_ptr,
typedef unsigned int(*vpx_sad_fn_t)(
const uint8_t *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned int max_sad);
const uint8_t *ref_ptr,
int ref_stride);
typedef void (*vp8_copy32xn_fn_t)(
const unsigned char *src_ptr,
@@ -32,27 +33,17 @@ typedef void (*vp8_copy32xn_fn_t)(
int ref_stride,
int n);
typedef void (*vp8_sad_multi_fn_t)(
typedef void (*vpx_sad_multi_fn_t)(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
const unsigned char *ref_array,
int ref_stride,
unsigned int *sad_array);
typedef void (*vp8_sad_multi1_fn_t)
typedef void (*vpx_sad_multi_d_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
);
typedef void (*vp8_sad_multi_d_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char * const ref_ptr[],
const unsigned char * const ref_array[],
int ref_stride,
unsigned int *sad_array
);
@@ -102,15 +93,15 @@ typedef unsigned int (*vp8_get16x16prederror_fn_t)
typedef struct variance_vtable
{
vp8_sad_fn_t sdf;
vpx_sad_fn_t sdf;
vp8_variance_fn_t vf;
vp8_subpixvariance_fn_t svf;
vp8_variance_fn_t svf_halfpix_h;
vp8_variance_fn_t svf_halfpix_v;
vp8_variance_fn_t svf_halfpix_hv;
vp8_sad_multi_fn_t sdx3f;
vp8_sad_multi1_fn_t sdx8f;
vp8_sad_multi_d_fn_t sdx4df;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
#if ARCH_X86 || ARCH_X86_64
vp8_copy32xn_fn_t copymem;
#endif
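A hedged sketch of how an encoder populates the renamed fields, assuming vpx_dsp provides vpx_sad16x16x3/x8/x4d counterparts (only vpx_sad16x16 itself appears elsewhere in this diff):

struct variance_vtable v16x16;
v16x16.sdf = vpx_sad16x16;       /* vpx_sad_fn_t: no max_sad argument */
v16x16.sdx3f = vpx_sad16x16x3;   /* vpx_sad_multi_fn_t */
v16x16.sdx8f = vpx_sad16x16x8;   /* sdx8f is now also a vpx_sad_multi_fn_t */
v16x16.sdx4df = vpx_sad16x16x4d; /* vpx_sad_multi_d_fn_t */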

View file

@@ -0,0 +1,93 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_copy32xn_sse2(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *dst_ptr,
; int dst_stride,
; int height);
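; Loads below use movdqu (src_ptr may be unaligned) while stores use
; movdqa, so dst_ptr must be 16-byte aligned. Rows are copied four at a
; time, with a single-row tail loop for heights not a multiple of four.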
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;dst_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;dst_stride
movsxd rcx, dword ptr arg(4) ;height
.block_copy_sse2_loopx4:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
movdqu xmm2, XMMWORD PTR [rsi + rax]
movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
lea rsi, [rsi+rax*2]
movdqu xmm4, XMMWORD PTR [rsi]
movdqu xmm5, XMMWORD PTR [rsi + 16]
movdqu xmm6, XMMWORD PTR [rsi + rax]
movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
lea rsi, [rsi+rax*2]
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm1
movdqa XMMWORD PTR [rdi + rdx], xmm2
movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
lea rdi, [rdi+rdx*2]
movdqa XMMWORD PTR [rdi], xmm4
movdqa XMMWORD PTR [rdi + 16], xmm5
movdqa XMMWORD PTR [rdi + rdx], xmm6
movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
lea rdi, [rdi+rdx*2]
sub rcx, 4
cmp rcx, 4
jge .block_copy_sse2_loopx4
cmp rcx, 0
je .copy_is_done
.block_copy_sse2_loop:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
lea rsi, [rsi+rax]
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm1
lea rdi, [rdi+rdx]
sub rcx, 1
jne .block_copy_sse2_loop
.copy_is_done:
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

View file

@@ -0,0 +1,146 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
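; STACK_FRAME_CREATE_X3 abstracts the three calling conventions: 32-bit
; (arguments on the stack, pointers loaded into rsi/rdi, strides into
; rax/rdx), Win64 (rcx/rdx/r8/r9) and SysV x86-64 (rdi/rsi/rdx/rcx).
; result_ptr, max_sad and height all alias the fifth argument slot;
; each routine uses exactly one of them.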
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
%define src_ptr rsi
%define src_stride rax
%define ref_ptr rdi
%define ref_stride rdx
%define end_ptr rcx
%define ret_var rbx
%define result_ptr arg(4)
%define max_sad arg(4)
%define height dword ptr arg(4)
push rbp
mov rbp, rsp
push rsi
push rdi
push rbx
mov rsi, arg(0) ; src_ptr
mov rdi, arg(2) ; ref_ptr
movsxd rax, dword ptr arg(1) ; src_stride
movsxd rdx, dword ptr arg(3) ; ref_stride
%else
%if LIBVPX_YASM_WIN64
SAVE_XMM 7, u
%define src_ptr rcx
%define src_stride rdx
%define ref_ptr r8
%define ref_stride r9
%define end_ptr r10
%define ret_var r11
%define result_ptr [rsp+xmm_stack_space+8+4*8]
%define max_sad [rsp+xmm_stack_space+8+4*8]
%define height dword ptr [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
%define src_stride rsi
%define ref_ptr rdx
%define ref_stride rcx
%define end_ptr r9
%define ret_var r10
%define result_ptr r8
%define max_sad r8
%define height r8
%endif
%endif
%endmacro
%macro STACK_FRAME_DESTROY_X3 0
%define src_ptr
%define src_stride
%define ref_ptr
%define ref_stride
%define end_ptr
%define ret_var
%define result_ptr
%define max_sad
%define height
%if ABI_IS_32BIT
pop rbx
pop rdi
pop rsi
pop rbp
%else
%if LIBVPX_YASM_WIN64
RESTORE_XMM
%endif
%endif
ret
%endmacro
;void vp8_copy32xn_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *dst_ptr,
; int dst_stride,
; int height);
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):
STACK_FRAME_CREATE_X3
.block_copy_sse3_loopx4:
lea end_ptr, [src_ptr+src_stride*2]
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
movdqu xmm4, XMMWORD PTR [end_ptr]
movdqu xmm5, XMMWORD PTR [end_ptr + 16]
movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
lea src_ptr, [src_ptr+src_stride*4]
lea end_ptr, [ref_ptr+ref_stride*2]
movdqa XMMWORD PTR [ref_ptr], xmm0
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
movdqa XMMWORD PTR [end_ptr], xmm4
movdqa XMMWORD PTR [end_ptr + 16], xmm5
movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
lea ref_ptr, [ref_ptr+ref_stride*4]
sub height, 4
cmp height, 4
jge .block_copy_sse3_loopx4
;Check to see if there are more rows that need to be copied.
cmp height, 0
je .copy_is_done
.block_copy_sse3_loop:
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
lea src_ptr, [src_ptr+src_stride]
movdqa XMMWORD PTR [ref_ptr], xmm0
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
lea ref_ptr, [ref_ptr+ref_stride]
sub height, 1
jne .block_copy_sse3_loop
.copy_is_done:
STACK_FRAME_DESTROY_X3

View file

@@ -1,410 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vp8_sad16x16_wmt(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
global sym(vp8_sad16x16_wmt) PRIVATE
sym(vp8_sad16x16_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
SAVE_XMM 6
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
lea rcx, [rsi+rax*8]
lea rcx, [rcx+rax*8]
pxor xmm6, xmm6
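; Each iteration handles two rows. A 16-byte row is assembled from two
; qword loads with punpcklbw; src and ref are interleaved identically,
; so psadbw still yields the row SAD, accumulated per-half in xmm6.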
.x16x16sad_wmt_loop:
movq xmm0, QWORD PTR [rsi]
movq xmm2, QWORD PTR [rsi+8]
movq xmm1, QWORD PTR [rdi]
movq xmm3, QWORD PTR [rdi+8]
movq xmm4, QWORD PTR [rsi+rax]
movq xmm5, QWORD PTR [rdi+rdx]
punpcklbw xmm0, xmm2
punpcklbw xmm1, xmm3
psadbw xmm0, xmm1
movq xmm2, QWORD PTR [rsi+rax+8]
movq xmm3, QWORD PTR [rdi+rdx+8]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
punpcklbw xmm4, xmm2
punpcklbw xmm5, xmm3
psadbw xmm4, xmm5
paddw xmm6, xmm0
paddw xmm6, xmm4
cmp rsi, rcx
jne .x16x16sad_wmt_loop
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
movq rax, xmm0
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_sad8x16_wmt(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int max_sad)
global sym(vp8_sad8x16_wmt) PRIVATE
sym(vp8_sad8x16_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rbx, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
lea rcx, [rsi+rbx*8]
lea rcx, [rcx+rbx*8]
pxor mm7, mm7
.x8x16sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
ja .x8x16sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
movq mm2, QWORD PTR [rsi+rbx]
movq mm3, QWORD PTR [rdi+rdx]
psadbw mm0, mm1
psadbw mm2, mm3
lea rsi, [rsi+rbx*2]
lea rdi, [rdi+rdx*2]
paddw mm7, mm0
paddw mm7, mm2
cmp rsi, rcx
jne .x8x16sad_wmt_loop
movq rax, mm7
.x8x16sad_wmt_early_exit:
; begin epilog
pop rdi
pop rsi
pop rbx
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_sad8x8_wmt(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int max_sad)
global sym(vp8_sad8x8_wmt) PRIVATE
sym(vp8_sad8x8_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rbx, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
lea rcx, [rsi+rbx*8]
pxor mm7, mm7
.x8x8sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
ja .x8x8sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
psadbw mm0, mm1
lea rsi, [rsi+rbx]
add rdi, rdx
paddw mm7, mm0
cmp rsi, rcx
jne .x8x8sad_wmt_loop
movq rax, mm7
.x8x8sad_wmt_early_exit:
; begin epilog
pop rdi
pop rsi
pop rbx
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_sad4x4_wmt(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
global sym(vp8_sad4x4_wmt) PRIVATE
sym(vp8_sad4x4_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movd mm0, DWORD PTR [rsi]
movd mm1, DWORD PTR [rdi]
movd mm2, DWORD PTR [rsi+rax]
movd mm3, DWORD PTR [rdi+rdx]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
psadbw mm0, mm1
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
movd mm4, DWORD PTR [rsi]
movd mm5, DWORD PTR [rdi]
movd mm6, DWORD PTR [rsi+rax]
movd mm7, DWORD PTR [rdi+rdx]
punpcklbw mm4, mm6
punpcklbw mm5, mm7
psadbw mm4, mm5
paddw mm0, mm4
movq rax, mm0
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_sad16x8_wmt(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int max_sad)
global sym(vp8_sad16x8_wmt) PRIVATE
sym(vp8_sad16x8_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rbx, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
lea rcx, [rsi+rbx*8]
pxor mm7, mm7
.x16x8sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
ja .x16x8sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm2, QWORD PTR [rsi+8]
movq mm1, QWORD PTR [rdi]
movq mm3, QWORD PTR [rdi+8]
movq mm4, QWORD PTR [rsi+rbx]
movq mm5, QWORD PTR [rdi+rdx]
psadbw mm0, mm1
psadbw mm2, mm3
movq mm1, QWORD PTR [rsi+rbx+8]
movq mm3, QWORD PTR [rdi+rdx+8]
psadbw mm4, mm5
psadbw mm1, mm3
lea rsi, [rsi+rbx*2]
lea rdi, [rdi+rdx*2]
paddw mm0, mm2
paddw mm4, mm1
paddw mm7, mm0
paddw mm7, mm4
cmp rsi, rcx
jne .x16x8sad_wmt_loop
movq rax, mm7
.x16x8sad_wmt_early_exit:
; begin epilog
pop rdi
pop rsi
pop rbx
UNSHADOW_ARGS
pop rbp
ret
;void vp8_copy32xn_sse2(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *dst_ptr,
; int dst_stride,
; int height);
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;dst_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;dst_stride
movsxd rcx, dword ptr arg(4) ;height
.block_copy_sse2_loopx4:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
movdqu xmm2, XMMWORD PTR [rsi + rax]
movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
lea rsi, [rsi+rax*2]
movdqu xmm4, XMMWORD PTR [rsi]
movdqu xmm5, XMMWORD PTR [rsi + 16]
movdqu xmm6, XMMWORD PTR [rsi + rax]
movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
lea rsi, [rsi+rax*2]
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm1
movdqa XMMWORD PTR [rdi + rdx], xmm2
movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
lea rdi, [rdi+rdx*2]
movdqa XMMWORD PTR [rdi], xmm4
movdqa XMMWORD PTR [rdi + 16], xmm5
movdqa XMMWORD PTR [rdi + rdx], xmm6
movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
lea rdi, [rdi+rdx*2]
sub rcx, 4
cmp rcx, 4
jge .block_copy_sse2_loopx4
cmp rcx, 0
je .copy_is_done
.block_copy_sse2_loop:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
lea rsi, [rsi+rax]
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm1
lea rdi, [rdi+rdx]
sub rcx, 1
jne .block_copy_sse2_loop
.copy_is_done:
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

View file

@@ -1,960 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
%define src_ptr rsi
%define src_stride rax
%define ref_ptr rdi
%define ref_stride rdx
%define end_ptr rcx
%define ret_var rbx
%define result_ptr arg(4)
%define max_sad arg(4)
%define height dword ptr arg(4)
push rbp
mov rbp, rsp
push rsi
push rdi
push rbx
mov rsi, arg(0) ; src_ptr
mov rdi, arg(2) ; ref_ptr
movsxd rax, dword ptr arg(1) ; src_stride
movsxd rdx, dword ptr arg(3) ; ref_stride
%else
%if LIBVPX_YASM_WIN64
SAVE_XMM 7, u
%define src_ptr rcx
%define src_stride rdx
%define ref_ptr r8
%define ref_stride r9
%define end_ptr r10
%define ret_var r11
%define result_ptr [rsp+xmm_stack_space+8+4*8]
%define max_sad [rsp+xmm_stack_space+8+4*8]
%define height dword ptr [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
%define src_stride rsi
%define ref_ptr rdx
%define ref_stride rcx
%define end_ptr r9
%define ret_var r10
%define result_ptr r8
%define max_sad r8
%define height r8
%endif
%endif
%endmacro
%macro STACK_FRAME_DESTROY_X3 0
%define src_ptr
%define src_stride
%define ref_ptr
%define ref_stride
%define end_ptr
%define ret_var
%define result_ptr
%define max_sad
%define height
%if ABI_IS_32BIT
pop rbx
pop rdi
pop rsi
pop rbp
%else
%if LIBVPX_YASM_WIN64
RESTORE_XMM
%endif
%endif
ret
%endmacro
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
%define src_ptr rsi
%define src_stride rax
%define r0_ptr rcx
%define r1_ptr rdx
%define r2_ptr rbx
%define r3_ptr rdi
%define ref_stride rbp
%define result_ptr arg(4)
push rbp
mov rbp, rsp
push rsi
push rdi
push rbx
push rbp
mov rdi, arg(2) ; ref_ptr_base
LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
mov rsi, arg(0) ; src_ptr
movsxd rbx, dword ptr arg(1) ; src_stride
movsxd rbp, dword ptr arg(3) ; ref_stride
xchg rbx, rax
%else
%if LIBVPX_YASM_WIN64
SAVE_XMM 7, u
%define src_ptr rcx
%define src_stride rdx
%define r0_ptr rsi
%define r1_ptr r10
%define r2_ptr r11
%define r3_ptr r8
%define ref_stride r9
%define result_ptr [rsp+xmm_stack_space+16+4*8]
push rsi
LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
%else
%define src_ptr rdi
%define src_stride rsi
%define r0_ptr r9
%define r1_ptr r10
%define r2_ptr r11
%define r3_ptr rdx
%define ref_stride rcx
%define result_ptr r8
LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
%endif
%endif
%endmacro
%macro STACK_FRAME_DESTROY_X4 0
%define src_ptr
%define src_stride
%define r0_ptr
%define r1_ptr
%define r2_ptr
%define r3_ptr
%define ref_stride
%define result_ptr
%if ABI_IS_32BIT
pop rbx
pop rdi
pop rsi
pop rbp
%else
%if LIBVPX_YASM_WIN64
pop rsi
RESTORE_XMM
%endif
%endif
ret
%endmacro
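; PROCESS_16X2X3 consumes two rows per invocation, comparing the source
; against the reference at byte offsets +0, +1 and +2 (lddqu tolerates
; the unaligned loads). The three running SADs live in xmm5/xmm6/xmm7;
; %1 selects the first (0), middle (1) or final (2) step, the final one
; leaving the pointers in place.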
%macro PROCESS_16X2X3 5
%if %1==0
movdqa xmm0, XMMWORD PTR [%2]
lddqu xmm5, XMMWORD PTR [%3]
lddqu xmm6, XMMWORD PTR [%3+1]
lddqu xmm7, XMMWORD PTR [%3+2]
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
movdqa xmm0, XMMWORD PTR [%2]
lddqu xmm1, XMMWORD PTR [%3]
lddqu xmm2, XMMWORD PTR [%3+1]
lddqu xmm3, XMMWORD PTR [%3+2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm5, xmm1
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
movdqa xmm0, XMMWORD PTR [%2+%4]
lddqu xmm1, XMMWORD PTR [%3+%5]
lddqu xmm2, XMMWORD PTR [%3+%5+1]
lddqu xmm3, XMMWORD PTR [%3+%5+2]
%if %1==0 || %1==1
lea %2, [%2+%4*2]
lea %3, [%3+%5*2]
%endif
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm5, xmm1
paddw xmm6, xmm2
paddw xmm7, xmm3
%endmacro
%macro PROCESS_8X2X3 5
%if %1==0
movq mm0, QWORD PTR [%2]
movq mm5, QWORD PTR [%3]
movq mm6, QWORD PTR [%3+1]
movq mm7, QWORD PTR [%3+2]
psadbw mm5, mm0
psadbw mm6, mm0
psadbw mm7, mm0
%else
movq mm0, QWORD PTR [%2]
movq mm1, QWORD PTR [%3]
movq mm2, QWORD PTR [%3+1]
movq mm3, QWORD PTR [%3+2]
psadbw mm1, mm0
psadbw mm2, mm0
psadbw mm3, mm0
paddw mm5, mm1
paddw mm6, mm2
paddw mm7, mm3
%endif
movq mm0, QWORD PTR [%2+%4]
movq mm1, QWORD PTR [%3+%5]
movq mm2, QWORD PTR [%3+%5+1]
movq mm3, QWORD PTR [%3+%5+2]
%if %1==0 || %1==1
lea %2, [%2+%4*2]
lea %3, [%3+%5*2]
%endif
psadbw mm1, mm0
psadbw mm2, mm0
psadbw mm3, mm0
paddw mm5, mm1
paddw mm6, mm2
paddw mm7, mm3
%endmacro
%macro LOAD_X4_ADDRESSES 5
mov %2, [%1+REG_SZ_BYTES*0]
mov %3, [%1+REG_SZ_BYTES*1]
mov %4, [%1+REG_SZ_BYTES*2]
mov %5, [%1+REG_SZ_BYTES*3]
%endmacro
%macro PROCESS_16X2X4 8
%if %1==0
movdqa xmm0, XMMWORD PTR [%2]
lddqu xmm4, XMMWORD PTR [%3]
lddqu xmm5, XMMWORD PTR [%4]
lddqu xmm6, XMMWORD PTR [%5]
lddqu xmm7, XMMWORD PTR [%6]
psadbw xmm4, xmm0
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
movdqa xmm0, XMMWORD PTR [%2]
lddqu xmm1, XMMWORD PTR [%3]
lddqu xmm2, XMMWORD PTR [%4]
lddqu xmm3, XMMWORD PTR [%5]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm4, xmm1
lddqu xmm1, XMMWORD PTR [%6]
paddw xmm5, xmm2
paddw xmm6, xmm3
psadbw xmm1, xmm0
paddw xmm7, xmm1
%endif
movdqa xmm0, XMMWORD PTR [%2+%7]
lddqu xmm1, XMMWORD PTR [%3+%8]
lddqu xmm2, XMMWORD PTR [%4+%8]
lddqu xmm3, XMMWORD PTR [%5+%8]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm4, xmm1
lddqu xmm1, XMMWORD PTR [%6+%8]
paddw xmm5, xmm2
paddw xmm6, xmm3
%if %1==0 || %1==1
lea %2, [%2+%7*2]
lea %3, [%3+%8*2]
lea %4, [%4+%8*2]
lea %5, [%5+%8*2]
lea %6, [%6+%8*2]
%endif
psadbw xmm1, xmm0
paddw xmm7, xmm1
%endmacro
%macro PROCESS_8X2X4 8
%if %1==0
movq mm0, QWORD PTR [%2]
movq mm4, QWORD PTR [%3]
movq mm5, QWORD PTR [%4]
movq mm6, QWORD PTR [%5]
movq mm7, QWORD PTR [%6]
psadbw mm4, mm0
psadbw mm5, mm0
psadbw mm6, mm0
psadbw mm7, mm0
%else
movq mm0, QWORD PTR [%2]
movq mm1, QWORD PTR [%3]
movq mm2, QWORD PTR [%4]
movq mm3, QWORD PTR [%5]
psadbw mm1, mm0
psadbw mm2, mm0
psadbw mm3, mm0
paddw mm4, mm1
movq mm1, QWORD PTR [%6]
paddw mm5, mm2
paddw mm6, mm3
psadbw mm1, mm0
paddw mm7, mm1
%endif
movq mm0, QWORD PTR [%2+%7]
movq mm1, QWORD PTR [%3+%8]
movq mm2, QWORD PTR [%4+%8]
movq mm3, QWORD PTR [%5+%8]
psadbw mm1, mm0
psadbw mm2, mm0
psadbw mm3, mm0
paddw mm4, mm1
movq mm1, QWORD PTR [%6+%8]
paddw mm5, mm2
paddw mm6, mm3
%if %1==0 || %1==1
lea %2, [%2+%7*2]
lea %3, [%3+%8*2]
lea %4, [%4+%8*2]
lea %5, [%5+%8*2]
lea %6, [%6+%8*2]
%endif
psadbw mm1, mm0
paddw mm7, mm1
%endmacro
;void vp8_sad16x16x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp8_sad16x16x3_sse3) PRIVATE
sym(vp8_sad16x16x3_sse3):
STACK_FRAME_CREATE_X3
PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
mov rcx, result_ptr
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
movd [rcx], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
movd [rcx+4], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
movd [rcx+8], xmm0
STACK_FRAME_DESTROY_X3
;void vp8_sad16x8x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp8_sad16x8x3_sse3) PRIVATE
sym(vp8_sad16x8x3_sse3):
STACK_FRAME_CREATE_X3
PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
mov rcx, result_ptr
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
movd [rcx], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
movd [rcx+4], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
movd [rcx+8], xmm0
STACK_FRAME_DESTROY_X3
;void vp8_sad8x16x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp8_sad8x16x3_sse3) PRIVATE
sym(vp8_sad8x16x3_sse3):
STACK_FRAME_CREATE_X3
PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
mov rcx, result_ptr
punpckldq mm5, mm6
movq [rcx], mm5
movd [rcx+8], mm7
STACK_FRAME_DESTROY_X3
;void vp8_sad8x8x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp8_sad8x8x3_sse3) PRIVATE
sym(vp8_sad8x8x3_sse3):
STACK_FRAME_CREATE_X3
PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
mov rcx, result_ptr
punpckldq mm5, mm6
movq [rcx], mm5
movd [rcx+8], mm7
STACK_FRAME_DESTROY_X3
;void vp8_sad4x4x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp8_sad4x4x3_sse3) PRIVATE
sym(vp8_sad4x4x3_sse3):
STACK_FRAME_CREATE_X3
movd mm0, DWORD PTR [src_ptr]
movd mm1, DWORD PTR [ref_ptr]
movd mm2, DWORD PTR [src_ptr+src_stride]
movd mm3, DWORD PTR [ref_ptr+ref_stride]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
movd mm4, DWORD PTR [ref_ptr+1]
movd mm5, DWORD PTR [ref_ptr+2]
movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
psadbw mm1, mm0
punpcklbw mm4, mm2
punpcklbw mm5, mm3
psadbw mm4, mm0
psadbw mm5, mm0
lea src_ptr, [src_ptr+src_stride*2]
lea ref_ptr, [ref_ptr+ref_stride*2]
movd mm0, DWORD PTR [src_ptr]
movd mm2, DWORD PTR [ref_ptr]
movd mm3, DWORD PTR [src_ptr+src_stride]
movd mm6, DWORD PTR [ref_ptr+ref_stride]
punpcklbw mm0, mm3
punpcklbw mm2, mm6
movd mm3, DWORD PTR [ref_ptr+1]
movd mm7, DWORD PTR [ref_ptr+2]
psadbw mm2, mm0
paddw mm1, mm2
movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
punpcklbw mm3, mm2
punpcklbw mm7, mm6
psadbw mm3, mm0
psadbw mm7, mm0
paddw mm3, mm4
paddw mm7, mm5
mov rcx, result_ptr
punpckldq mm1, mm3
movq [rcx], mm1
movd [rcx+8], mm7
STACK_FRAME_DESTROY_X3
;unsigned int vp8_sad16x16_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int max_sad)
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3) PRIVATE
sym(vp8_sad16x16_sse3):
STACK_FRAME_CREATE_X3
mov end_ptr, 4
pxor xmm7, xmm7
.vp8_sad16x16_sse3_loop:
movdqa xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [ref_ptr]
movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
lea src_ptr, [src_ptr+src_stride*2]
lea ref_ptr, [ref_ptr+ref_stride*2]
movdqa xmm4, XMMWORD PTR [src_ptr]
movdqu xmm5, XMMWORD PTR [ref_ptr]
movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
psadbw xmm0, xmm1
movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
psadbw xmm2, xmm3
psadbw xmm4, xmm5
psadbw xmm6, xmm1
lea src_ptr, [src_ptr+src_stride*2]
lea ref_ptr, [ref_ptr+ref_stride*2]
paddw xmm7, xmm0
paddw xmm7, xmm2
paddw xmm7, xmm4
paddw xmm7, xmm6
sub end_ptr, 1
jne .vp8_sad16x16_sse3_loop
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
movq rax, xmm0
STACK_FRAME_DESTROY_X3
;void vp8_copy32xn_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *dst_ptr,
; int dst_stride,
; int height);
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):
STACK_FRAME_CREATE_X3
.block_copy_sse3_loopx4:
lea end_ptr, [src_ptr+src_stride*2]
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
movdqu xmm4, XMMWORD PTR [end_ptr]
movdqu xmm5, XMMWORD PTR [end_ptr + 16]
movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
lea src_ptr, [src_ptr+src_stride*4]
lea end_ptr, [ref_ptr+ref_stride*2]
movdqa XMMWORD PTR [ref_ptr], xmm0
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
movdqa XMMWORD PTR [end_ptr], xmm4
movdqa XMMWORD PTR [end_ptr + 16], xmm5
movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
lea ref_ptr, [ref_ptr+ref_stride*4]
sub height, 4
cmp height, 4
jge .block_copy_sse3_loopx4
;Check to see if there are more rows that need to be copied.
cmp height, 0
je .copy_is_done
.block_copy_sse3_loop:
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
lea src_ptr, [src_ptr+src_stride]
movdqa XMMWORD PTR [ref_ptr], xmm0
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
lea ref_ptr, [ref_ptr+ref_stride]
sub height, 1
jne .block_copy_sse3_loop
.copy_is_done:
STACK_FRAME_DESTROY_X3
;void vp8_sad16x16x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr_base,
; int ref_stride,
; int *results)
global sym(vp8_sad16x16x4d_sse3) PRIVATE
sym(vp8_sad16x16x4d_sse3):
STACK_FRAME_CREATE_X4
PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%if ABI_IS_32BIT
pop rbp
%endif
mov rcx, result_ptr
movq xmm0, xmm4
psrldq xmm4, 8
paddw xmm0, xmm4
movd [rcx], xmm0
;-
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
movd [rcx+4], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
movd [rcx+8], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
movd [rcx+12], xmm0
STACK_FRAME_DESTROY_X4
;void vp8_sad16x8x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr_base,
; int ref_stride,
; int *results)
global sym(vp8_sad16x8x4d_sse3) PRIVATE
sym(vp8_sad16x8x4d_sse3):
STACK_FRAME_CREATE_X4
PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%if ABI_IS_32BIT
pop rbp
%endif
mov rcx, result_ptr
movq xmm0, xmm4
psrldq xmm4, 8
paddw xmm0, xmm4
movd [rcx], xmm0
;-
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
movd [rcx+4], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
movd [rcx+8], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
movd [rcx+12], xmm0
STACK_FRAME_DESTROY_X4
;void vp8_sad8x16x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp8_sad8x16x4d_sse3) PRIVATE
sym(vp8_sad8x16x4d_sse3):
STACK_FRAME_CREATE_X4
PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%if ABI_IS_32BIT
pop rbp
%endif
mov rcx, result_ptr
punpckldq mm4, mm5
punpckldq mm6, mm7
movq [rcx], mm4
movq [rcx+8], mm6
STACK_FRAME_DESTROY_X4
;void vp8_sad8x8x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp8_sad8x8x4d_sse3) PRIVATE
sym(vp8_sad8x8x4d_sse3):
STACK_FRAME_CREATE_X4
PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%if ABI_IS_32BIT
pop rbp
%endif
mov rcx, result_ptr
punpckldq mm4, mm5
punpckldq mm6, mm7
movq [rcx], mm4
movq [rcx+8], mm6
STACK_FRAME_DESTROY_X4
;void vp8_sad4x4x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp8_sad4x4x4d_sse3) PRIVATE
sym(vp8_sad4x4x4d_sse3):
STACK_FRAME_CREATE_X4
movd mm0, DWORD PTR [src_ptr]
movd mm1, DWORD PTR [r0_ptr]
movd mm2, DWORD PTR [src_ptr+src_stride]
movd mm3, DWORD PTR [r0_ptr+ref_stride]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
movd mm4, DWORD PTR [r1_ptr]
movd mm5, DWORD PTR [r2_ptr]
movd mm6, DWORD PTR [r3_ptr]
movd mm2, DWORD PTR [r1_ptr+ref_stride]
movd mm3, DWORD PTR [r2_ptr+ref_stride]
movd mm7, DWORD PTR [r3_ptr+ref_stride]
psadbw mm1, mm0
punpcklbw mm4, mm2
punpcklbw mm5, mm3
punpcklbw mm6, mm7
psadbw mm4, mm0
psadbw mm5, mm0
psadbw mm6, mm0
lea src_ptr, [src_ptr+src_stride*2]
lea r0_ptr, [r0_ptr+ref_stride*2]
lea r1_ptr, [r1_ptr+ref_stride*2]
lea r2_ptr, [r2_ptr+ref_stride*2]
lea r3_ptr, [r3_ptr+ref_stride*2]
movd mm0, DWORD PTR [src_ptr]
movd mm2, DWORD PTR [r0_ptr]
movd mm3, DWORD PTR [src_ptr+src_stride]
movd mm7, DWORD PTR [r0_ptr+ref_stride]
punpcklbw mm0, mm3
punpcklbw mm2, mm7
movd mm3, DWORD PTR [r1_ptr]
movd mm7, DWORD PTR [r2_ptr]
psadbw mm2, mm0
%if ABI_IS_32BIT
mov rax, rbp
pop rbp
%define ref_stride rax
%endif
mov rsi, result_ptr
paddw mm1, mm2
movd [rsi], mm1
movd mm2, DWORD PTR [r1_ptr+ref_stride]
movd mm1, DWORD PTR [r2_ptr+ref_stride]
punpcklbw mm3, mm2
punpcklbw mm7, mm1
psadbw mm3, mm0
psadbw mm7, mm0
movd mm2, DWORD PTR [r3_ptr]
movd mm1, DWORD PTR [r3_ptr+ref_stride]
paddw mm3, mm4
paddw mm7, mm5
movd [rsi+4], mm3
punpcklbw mm2, mm1
movd [rsi+8], mm7
psadbw mm2, mm0
paddw mm2, mm6
movd [rsi+12], mm2
STACK_FRAME_DESTROY_X4
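The x4d kernels amortize the source loads: one source block is compared against four reference candidates in a single pass, writing four SADs into the results array. A plain C reference model of that contract (an illustrative sketch with invented names, not the shared vpx_dsp implementation):

#include <stdlib.h>

/* SAD of one 4x4 source block against four reference candidates at once,
 * mirroring what vp8_sad4x4x4d_sse3 computes with psadbw. */
static void sad4x4x4d_ref(const unsigned char *src, int src_stride,
                          const unsigned char *const ref[4], int ref_stride,
                          unsigned int results[4]) {
  int k, r, c;
  for (k = 0; k < 4; ++k) {
    unsigned int sad = 0;
    for (r = 0; r < 4; ++r)
      for (c = 0; c < 4; ++c)
        sad += abs(src[r * src_stride + c] - ref[k][r * ref_stride + c]);
    results[k] = sad;
  }
}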


@ -1,353 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%macro PROCESS_16X2X8 1
%if %1
movdqa xmm0, XMMWORD PTR [rsi]
movq xmm1, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
movq xmm2, MMWORD PTR [rdi+16]
punpcklqdq xmm1, xmm3
punpcklqdq xmm3, xmm2
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
psrldq xmm0, 8
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 0x0
mpsadbw xmm4, xmm0, 0x5
paddw xmm1, xmm2
paddw xmm1, xmm3
paddw xmm1, xmm4
%else
movdqa xmm0, XMMWORD PTR [rsi]
movq xmm5, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
movq xmm2, MMWORD PTR [rdi+16]
punpcklqdq xmm5, xmm3
punpcklqdq xmm3, xmm2
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
psrldq xmm0, 8
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 0x0
mpsadbw xmm4, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm5, xmm3
paddw xmm5, xmm4
paddw xmm1, xmm5
%endif
movdqa xmm0, XMMWORD PTR [rsi + rax]
movq xmm5, MMWORD PTR [rdi+ rdx]
movq xmm3, MMWORD PTR [rdi+ rdx+8]
movq xmm2, MMWORD PTR [rdi+ rdx+16]
punpcklqdq xmm5, xmm3
punpcklqdq xmm3, xmm2
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
psrldq xmm0, 8
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 0x0
mpsadbw xmm4, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm5, xmm3
paddw xmm5, xmm4
paddw xmm1, xmm5
%endmacro
%macro PROCESS_8X2X8 1
%if %1
movq xmm0, MMWORD PTR [rsi]
movq xmm1, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm1, xmm3
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
paddw xmm1, xmm2
%else
movq xmm0, MMWORD PTR [rsi]
movq xmm5, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm5, xmm3
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm1, xmm5
%endif
movq xmm0, MMWORD PTR [rsi + rax]
movq xmm5, MMWORD PTR [rdi+ rdx]
movq xmm3, MMWORD PTR [rdi+ rdx+8]
punpcklqdq xmm5, xmm3
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm1, xmm5
%endmacro
%macro PROCESS_4X2X8 1
%if %1
movd xmm0, [rsi]
movq xmm1, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm1, xmm3
mpsadbw xmm1, xmm0, 0x0
%else
movd xmm0, [rsi]
movq xmm5, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm5, xmm3
mpsadbw xmm5, xmm0, 0x0
paddw xmm1, xmm5
%endif
movd xmm0, [rsi + rax]
movq xmm5, MMWORD PTR [rdi+ rdx]
movq xmm3, MMWORD PTR [rdi+ rdx+8]
punpcklqdq xmm5, xmm3
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
mpsadbw xmm5, xmm0, 0x0
paddw xmm1, xmm5
%endmacro
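Each mpsadbw computes eight 4-byte-group SADs against eight consecutive reference offsets; the macros above stitch two or four of those together per row so that xmm1 ends up holding eight candidate SADs for the whole block. The C-level contract of the x8 entry points, as a sketch (names invented; note the reference row needs width + 7 valid pixels):

#include <stdlib.h>

/* Eight SADs of a width x height source block against the reference block
 * shifted right by 0..7 pixels -- the layout the mpsadbw macros accumulate. */
static void sad_wxh_x8_ref(const unsigned char *src, int src_stride,
                           const unsigned char *ref, int ref_stride,
                           int width, int height, unsigned short sads[8]) {
  int off, r, c;
  for (off = 0; off < 8; ++off) {
    unsigned int sad = 0;
    for (r = 0; r < height; ++r)
      for (c = 0; c < width; ++c)
        sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c + off]);
    sads[off] = (unsigned short)sad;  /* fits: 16 * 16 * 255 < 65536 */
  }
}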
;void vp8_sad16x16x8_sse4(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array);
global sym(vp8_sad16x16x8_sse4) PRIVATE
sym(vp8_sad16x16x8_sse4):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad16x8x8_sse4(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp8_sad16x8x8_sse4) PRIVATE
sym(vp8_sad16x8x8_sse4):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad8x8x8_sse4(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp8_sad8x8x8_sse4) PRIVATE
sym(vp8_sad8x8x8_sse4):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad8x16x8_sse4(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp8_sad8x16x8_sse4) PRIVATE
sym(vp8_sad8x16x8_sse4):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad4x4x8_sse4(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp8_sad4x4x8_sse4) PRIVATE
sym(vp8_sad4x4x8_sse4):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_4X2X8 1
PROCESS_4X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret


@ -9,6 +9,8 @@
*/
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "onyx_int.h"
#include "mcomp.h"
#include "vpx_mem/vpx_mem.h"
@ -900,7 +902,7 @@ int vp8_hex_search
this_offset = base_offset + (br * (pre_stride)) + bc;
this_mv.as_mv.row = br;
this_mv.as_mv.col = bc;
bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, UINT_MAX)
bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride)
+ mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
#if CONFIG_MULTI_RES_ENCODING
@ -927,7 +929,7 @@ int vp8_hex_search
this_mv.as_mv.row = br + hex[i].row;
this_mv.as_mv.col = bc + hex[i].col;
this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}else
@ -938,7 +940,7 @@ int vp8_hex_search
this_mv.as_mv.col = bc + hex[i].col;
CHECK_POINT
this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}
@ -964,7 +966,7 @@ int vp8_hex_search
this_mv.as_mv.row = br + next_chkpts[k][i].row;
this_mv.as_mv.col = bc + next_chkpts[k][i].col;
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}else
@ -975,7 +977,7 @@ int vp8_hex_search
this_mv.as_mv.col = bc + next_chkpts[k][i].col;
CHECK_POINT
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}
@ -1006,7 +1008,7 @@ cal_neighbors:
this_mv.as_mv.row = br + neighbors[i].row;
this_mv.as_mv.col = bc + neighbors[i].col;
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}else
@ -1017,7 +1019,7 @@ cal_neighbors:
this_mv.as_mv.col = bc + neighbors[i].col;
CHECK_POINT
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}
@ -1101,7 +1103,7 @@ int vp8_diamond_search_sad_c
best_address = in_what;
/* Check the starting position */
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* search_param determines the length of the initial step and hence
@ -1126,7 +1128,7 @@ int vp8_diamond_search_sad_c
{
check_here = ss[i].offset + best_address;
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad)
{
@ -1225,7 +1227,7 @@ int vp8_diamond_search_sadx4
best_address = in_what;
/* Check the starting position */
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* search_param determines the length of the initial step and hence the
@ -1293,7 +1295,7 @@ int vp8_diamond_search_sadx4
(this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
{
check_here = ss[i].offset + best_address;
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad)
{
@ -1376,8 +1378,7 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
best_mv->as_mv.col = ref_col;
/* Baseline value at the centre */
bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
in_what_stride, UINT_MAX)
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* Apply further limits to prevent us looking using vectors that
@ -1402,7 +1403,7 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
for (c = col_min; c < col_max; c++)
{
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
this_mv.as_mv.col = c;
thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
@ -1474,8 +1475,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
best_mv->as_mv.col = ref_col;
/* Baseline value at the centre */
bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
in_what_stride, UINT_MAX)
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* Apply further limits to prevent us looking using vectors that stretch
@ -1531,7 +1531,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
while (c < col_max)
{
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad)
{
@ -1590,7 +1590,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
int col_min = ref_col - distance;
int col_max = ref_col + distance;
DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
// TODO(johannkoenig): check if this alignment is necessary.
DECLARE_ALIGNED_ARRAY(16, unsigned int, sad_array8, 8);
unsigned int sad_array[3];
int *mvsadcost[2];
@ -1609,8 +1610,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
best_mv->as_mv.col = ref_col;
/* Baseline value at the centre */
bestsad = fn_ptr->sdf(what, what_stride,
bestaddress, in_what_stride, UINT_MAX)
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* Apply further limits to prevent us looking using vectors that stretch
@ -1696,7 +1696,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
while (c < col_max)
{
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
if (thissad < bestsad)
{
@ -1754,8 +1754,7 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
bestsad = fn_ptr->sdf(what, what_stride, best_address,
in_what_stride, UINT_MAX)
bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride)
+ mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
for (i=0; i<search_range; i++)
@ -1771,7 +1770,7 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv
(this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
{
check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
if (thissad < bestsad)
{
@ -1834,8 +1833,7 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
bestsad = fn_ptr->sdf(what, what_stride, best_address,
in_what_stride, UINT_MAX)
bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride)
+ mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
for (i=0; i<search_range; i++)
@ -1886,7 +1884,7 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
(this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
{
check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
if (thissad < bestsad)
{
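Every hunk in this file makes the same mechanical change: the fifth argument (the best-so-far SAD that the old vp8 functions accepted as an early-termination bound) disappears from the sdf calls, matching the simpler prototype of the shared vpx_dsp functions. Expressed as function-pointer types (the typedef names are illustrative, not libvpx's):

/* Old vp8 shape: an early-exit bound was passed in. */
typedef unsigned int (*sad_fn_old)(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int max_sad);

/* Shared vpx_dsp shape: no bound; callers compare against bestsad after. */
typedef unsigned int (*sad_fn_new)(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride);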


@ -11,6 +11,7 @@
#include "vpx_config.h"
#include "./vpx_scale_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vp8_rtcd.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/blockd.h"
@ -2126,55 +2127,55 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
}
#endif
cpi->fn_ptr[BLOCK_16X16].sdf = vp8_sad16x16;
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
cpi->fn_ptr[BLOCK_16X16].sdx3f = vp8_sad16x16x3;
cpi->fn_ptr[BLOCK_16X16].sdx8f = vp8_sad16x16x8;
cpi->fn_ptr[BLOCK_16X16].sdx4df = vp8_sad16x16x4d;
cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
cpi->fn_ptr[BLOCK_16X8].sdf = vp8_sad16x8;
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_16X8].sdx3f = vp8_sad16x8x3;
cpi->fn_ptr[BLOCK_16X8].sdx8f = vp8_sad16x8x8;
cpi->fn_ptr[BLOCK_16X8].sdx4df = vp8_sad16x8x4d;
cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3;
cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8;
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
cpi->fn_ptr[BLOCK_8X16].sdf = vp8_sad8x16;
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X16].sdx3f = vp8_sad8x16x3;
cpi->fn_ptr[BLOCK_8X16].sdx8f = vp8_sad8x16x8;
cpi->fn_ptr[BLOCK_8X16].sdx4df = vp8_sad8x16x4d;
cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3;
cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8;
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
cpi->fn_ptr[BLOCK_8X8].sdf = vp8_sad8x8;
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X8].sdx3f = vp8_sad8x8x3;
cpi->fn_ptr[BLOCK_8X8].sdx8f = vp8_sad8x8x8;
cpi->fn_ptr[BLOCK_8X8].sdx4df = vp8_sad8x8x4d;
cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3;
cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8;
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
cpi->fn_ptr[BLOCK_4X4].sdf = vp8_sad4x4;
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_4X4].sdx3f = vp8_sad4x4x3;
cpi->fn_ptr[BLOCK_4X4].sdx8f = vp8_sad4x4x8;
cpi->fn_ptr[BLOCK_4X4].sdx4df = vp8_sad4x4x4d;
cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3;
cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8;
cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d;
#if ARCH_X86 || ARCH_X86_64
cpi->fn_ptr[BLOCK_16X16].copymem = vp8_copy32xn;
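These assignments rebind the per-block-size function table from the codec-private vp8_sad* symbols to the shared vpx_sad* ones; the motion-search code is untouched because it only ever calls through the table. A self-contained sketch of the pattern (the struct and stub are simplified stand-ins, not libvpx's types):

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int (*sad_fn)(const unsigned char *, int,
                               const unsigned char *, int);

struct variance_fns { sad_fn sdf; };  /* one slot shown; libvpx has several */

static unsigned int sad16x16_stub(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c)
      sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
  return sad;
}

int main(void) {
  static unsigned char src[16 * 16], ref[16 * 16];
  struct variance_fns fns = { sad16x16_stub };  /* the rebinding step */
  printf("%u\n", fns.sdf(src, 16, ref, 16));
  return 0;
}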


@ -1690,16 +1690,16 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
}else if(xd->mb_to_top_edge==0)
{ /* only has left MB for sad calculation. */
near_sad[0] = near_sad[2] = INT_MAX;
near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride);
}else if(xd->mb_to_left_edge ==0)
{ /* only has above MB for sad calculation. */
near_sad[1] = near_sad[2] = INT_MAX;
near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride);
}else
{
near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, UINT_MAX);
near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride);
near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride);
near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride);
}
if(cpi->common.last_frame_type != KEY_FRAME)
@ -1714,14 +1714,14 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
if(near_sad[4] != INT_MAX)
near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, UINT_MAX);
near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride);
if(near_sad[5] != INT_MAX)
near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride, UINT_MAX);
near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride, UINT_MAX);
near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride);
near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride);
if(near_sad[6] != INT_MAX)
near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride, UINT_MAX);
near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride);
if(near_sad[7] != INT_MAX)
near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, UINT_MAX);
near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride);
}
if(cpi->common.last_frame_type != KEY_FRAME)
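The reference addresses fed to sdf above come from pointer arithmetic alone: step 16 pixels left, 16 rows up, or both from the current macroblock's reconstruction pointer. Spelled out (helper and variable names invented):

/* Base addresses of the neighbouring 16x16 macroblocks used as SAD probes. */
static void mb_neighbour_ptrs(const unsigned char *dst, int dst_stride,
                              const unsigned char **above,
                              const unsigned char **left,
                              const unsigned char **above_left) {
  *above      = dst - 16 * dst_stride;      /* macroblock above */
  *left       = dst - 16;                   /* macroblock to the left */
  *above_left = dst - 16 * dst_stride - 16; /* diagonal neighbour */
}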


@ -15,6 +15,7 @@ VP8_COMMON_SRCS-yes += common/onyxd.h
VP8_COMMON_SRCS-yes += common/alloccommon.c
VP8_COMMON_SRCS-yes += common/blockd.c
VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
VP8_COMMON_SRCS-yes += common/copy_c.c
VP8_COMMON_SRCS-yes += common/debugmodes.c
VP8_COMMON_SRCS-yes += common/default_coef_probs.h
VP8_COMMON_SRCS-yes += common/dequantize.c
@ -60,7 +61,6 @@ VP8_COMMON_SRCS-yes += common/quant_common.c
VP8_COMMON_SRCS-yes += common/reconinter.c
VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
VP8_COMMON_SRCS-yes += common/sad_c.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
VP8_COMMON_SRCS-yes += common/variance_c.c
@ -85,26 +85,23 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/sad_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_mmx.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_impl_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sad_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/sad_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/sad_ssse3.asm
VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
@ -148,7 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_sad16x16_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
@ -170,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c


@ -11,6 +11,7 @@
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx/vpx_codec.h"
#include "vpx/internal/vpx_codec_internal.h"
@ -650,6 +651,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
vp8_rtcd();
vpx_dsp_rtcd();
vpx_scale_rtcd();
if (!ctx->priv)


@ -12,6 +12,7 @@
#include <stdlib.h>
#include <string.h>
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"
@ -107,6 +108,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
(void) data;
vp8_rtcd();
vpx_dsp_rtcd();
vpx_scale_rtcd();
/* This function only allocates space for the vpx_codec_alg_priv_t


@ -9,8 +9,9 @@
*/
#include "./vpx_config.h"
#include "./vpx_scale_rtcd.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_postproc.h"
@ -171,13 +172,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
if (bs == BLOCK_16X16) {
vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
sad = (vp9_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
} else if (bs == BLOCK_32X32) {
vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
sad = (vp9_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
} else /* if (bs == BLOCK_64X64) */ {
vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
sad = (vp9_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
}
// vdiff > sad * 3 means vdiff should not be too small, otherwise,
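Both metrics are normalized to a per-pixel scale before the comparison: a 16x16 block has 2^8 pixels, so (sum + 128) >> 8 is a rounded divide by 256, and likewise 2^10 for 32x32 and 2^12 for 64x64. As a generic helper (illustrative):

/* round(sum / 2^log2_pixels), the normalization applied to vdiff and sad. */
static unsigned int per_pixel_round(unsigned int sum, int log2_pixels) {
  return (sum + (1u << (log2_pixels - 1))) >> log2_pixels;
}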


@ -922,177 +922,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad64x64 neon avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad32x64 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad64x32 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad32x16 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad32x32 neon avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad16x16 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad8x8 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad4x8/, "$sse_x86inc";
add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad4x4/, "$sse_x86inc";
add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad64x64_avg avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad32x64_avg avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad64x32_avg avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad32x16_avg avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad32x32_avg avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad4x8_avg/, "$sse_x86inc";
add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad4x4_avg/, "$sse_x86inc";
add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad64x64x3/;
add_proto qw/void vp9_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad32x32x3/;
add_proto qw/void vp9_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad16x16x3 sse3 ssse3/;
add_proto qw/void vp9_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad16x8x3 sse3 ssse3/;
add_proto qw/void vp9_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad8x16x3 sse3/;
add_proto qw/void vp9_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad8x8x3 sse3/;
add_proto qw/void vp9_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad4x4x3 sse3/;
add_proto qw/void vp9_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad64x64x8/;
add_proto qw/void vp9_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad32x32x8/;
add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad16x16x8 sse4_1/;
$vp9_sad16x16x8_sse4_1=vp9_sad16x16x8_sse4;
add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad16x8x8 sse4_1/;
$vp9_sad16x8x8_sse4_1=vp9_sad16x8x8_sse4;
add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad8x16x8 sse4_1/;
$vp9_sad8x16x8_sse4_1=vp9_sad8x16x8_sse4;
add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad8x8x8 sse4_1/;
$vp9_sad8x8x8_sse4_1=vp9_sad8x8x8_sse4;
add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad8x4x8/;
add_proto qw/void vp9_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad4x8x8/;
add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad4x4x8 sse4_1/;
$vp9_sad4x4x8_sse4_1=vp9_sad4x4x8_sse4;
add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad64x64x4d sse2 avx2 neon/;
add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad32x64x4d sse2/;
add_proto qw/void vp9_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad64x32x4d sse2/;
add_proto qw/void vp9_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad32x16x4d sse2/;
add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad16x32x4d sse2/;
add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad32x32x4d sse2 avx2 neon/;
add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad16x16x4d sse2 neon/;
add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad16x8x4d sse2/;
add_proto qw/void vp9_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad8x16x4d sse2/;
add_proto qw/void vp9_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad8x8x4d sse2/;
# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
add_proto qw/void vp9_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad8x4x4d sse2/;
add_proto qw/void vp9_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad4x8x4d sse/;
add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad4x4x4d sse/;
add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
@ -1682,171 +1511,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad4x8/;
add_proto qw/unsigned int vp9_highbd_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad4x4/;
add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad64x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad64x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x4_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad4x8_avg/;
add_proto qw/unsigned int vp9_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad4x4_avg/;
add_proto qw/void vp9_highbd_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x64x3/;
add_proto qw/void vp9_highbd_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x32x3/;
add_proto qw/void vp9_highbd_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x16x3/;
add_proto qw/void vp9_highbd_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x8x3/;
add_proto qw/void vp9_highbd_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x16x3/;
add_proto qw/void vp9_highbd_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x8x3/;
add_proto qw/void vp9_highbd_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x4x3/;
add_proto qw/void vp9_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_highbd_sad64x64x8/;
add_proto qw/void vp9_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_highbd_sad32x32x8/;
add_proto qw/void vp9_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_highbd_sad16x16x8/;
add_proto qw/void vp9_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_highbd_sad16x8x8/;
add_proto qw/void vp9_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_highbd_sad8x16x8/;
add_proto qw/void vp9_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_highbd_sad8x8x8/;
add_proto qw/void vp9_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_highbd_sad8x4x8/;
add_proto qw/void vp9_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_highbd_sad4x8x8/;
add_proto qw/void vp9_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_highbd_sad4x4x8/;
add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x64x4d sse2/;
add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x64x4d sse2/;
add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x32x4d sse2/;
add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x16x4d sse2/;
add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x32x4d sse2/;
add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x32x4d sse2/;
add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x16x4d sse2/;
add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x8x4d sse2/;
add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x16x4d sse2/;
add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x8x4d sse2/;
add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x4x4d sse2/;
add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x8x4d sse2/;
add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x4x4d sse2/;
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
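All of the prototypes deleted here move to the vpx_dsp component's rtcd definitions under the vpx_ prefix. Each add_proto/specialize pair ultimately yields a function pointer plus an init routine (vpx_dsp_rtcd()) that repoints it to the fastest variant the running CPU supports. A minimal C sketch of that pattern (names and the feature flag are illustrative):

typedef unsigned int (*sad_fn)(const unsigned char *, int,
                               const unsigned char *, int);

static unsigned int sad16x16_c(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sad += (unsigned int)(d < 0 ? -d : d);
    }
  return sad;
}

sad_fn sad16x16 = sad16x16_c;  /* generic C default */

void dsp_rtcd_sketch(int have_sse2) {
  (void)have_sse2;
  /* if (have_sse2) sad16x16 = sad16x16_sse2;  -- runtime selection */
}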


@ -13,6 +13,7 @@
#include <stdio.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_mem/vpx_mem.h"
@ -40,6 +41,7 @@ static void initialize_dec(void) {
if (!init_done) {
vp9_rtcd();
vpx_dsp_rtcd();
vpx_scale_rtcd();
vp9_init_intra_predictors();
init_done = 1;


@ -14,6 +14,7 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx/internal/vpx_psnr.h"
#include "vpx_ports/vpx_timer.h"
@ -318,6 +319,7 @@ void vp9_initialize_enc(void) {
if (!init_done) {
vp9_rtcd();
vpx_dsp_rtcd();
vpx_scale_rtcd();
vp9_init_intra_predictors();
vp9_init_me_luts();
@ -929,61 +931,61 @@ static void fnname##_bits12(const uint8_t *src_ptr, \
sad_array[i] >>= 4; \
}
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x16)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x16_avg)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x16x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x32)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x32_avg)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x32x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x32)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x32_avg)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x32x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x64)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x64_avg)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x64x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x32)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x32_avg)
MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad32x32x3)
MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad32x32x8)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x32x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x64)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x64_avg)
MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad64x64x3)
MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad64x64x8)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x64x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x16)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x16_avg)
MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x16x3)
MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x16x8)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x16x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x8)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x8_avg)
MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x8x3)
MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x8x8)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x8x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x16)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x16_avg)
MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x16x3)
MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x16x8)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x16x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x8)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x8_avg)
MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x8x3)
MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x8x8)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x8x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x4)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x4_avg)
MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x4x8)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x4x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x8)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x8_avg)
MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x8x8)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x8x4d)
MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x4)
MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x4_avg)
MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad4x4x3)
MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x4x8)
MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x4x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3)
MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3)
MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3)
MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3)
MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3)
MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3)
MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3)
MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8)
MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
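Each MAKE_BFP_*_WRAPPER invocation expands to _bits8/_bits10/_bits12 shims around the renamed vpx_highbd_sad* functions: the 8-bit shim forwards unchanged, while the 10- and 12-bit ones scale the SAD back to an 8-bit range with >> 2 and >> 4 (the sad_array[i] >>= 4 line above is the 4d flavour of the same scaling). A simplified expansion, as a sketch rather than the exact libvpx macro:

#define MAKE_SAD_WRAPPER_SKETCH(fnname)                                   \
  static unsigned int fnname##_bits8(const unsigned char *src, int ss,    \
                                     const unsigned char *ref, int rs) {  \
    return fnname(src, ss, ref, rs);                                      \
  }                                                                       \
  static unsigned int fnname##_bits10(const unsigned char *src, int ss,   \
                                      const unsigned char *ref, int rs) { \
    return fnname(src, ss, ref, rs) >> 2; /* 10-bit -> 8-bit scale */     \
  }                                                                       \
  static unsigned int fnname##_bits12(const unsigned char *src, int ss,   \
                                      const unsigned char *ref, int rs) { \
    return fnname(src, ss, ref, rs) >> 4; /* 12-bit -> 8-bit scale */     \
  }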
static void highbd_set_var_fns(VP9_COMP *const cpi) {
VP9_COMMON *const cm = &cpi->common;
@ -991,398 +993,398 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
switch (cm->bit_depth) {
case VPX_BITS_8:
HIGHBD_BFP(BLOCK_32X16,
vp9_highbd_sad32x16_bits8,
vp9_highbd_sad32x16_avg_bits8,
vpx_highbd_sad32x16_bits8,
vpx_highbd_sad32x16_avg_bits8,
vp9_highbd_variance32x16,
vp9_highbd_sub_pixel_variance32x16,
vp9_highbd_sub_pixel_avg_variance32x16,
NULL,
NULL,
vp9_highbd_sad32x16x4d_bits8)
vpx_highbd_sad32x16x4d_bits8)
HIGHBD_BFP(BLOCK_16X32,
vp9_highbd_sad16x32_bits8,
vp9_highbd_sad16x32_avg_bits8,
vpx_highbd_sad16x32_bits8,
vpx_highbd_sad16x32_avg_bits8,
vp9_highbd_variance16x32,
vp9_highbd_sub_pixel_variance16x32,
vp9_highbd_sub_pixel_avg_variance16x32,
NULL,
NULL,
vp9_highbd_sad16x32x4d_bits8)
vpx_highbd_sad16x32x4d_bits8)
HIGHBD_BFP(BLOCK_64X32,
vp9_highbd_sad64x32_bits8,
vp9_highbd_sad64x32_avg_bits8,
vpx_highbd_sad64x32_bits8,
vpx_highbd_sad64x32_avg_bits8,
vp9_highbd_variance64x32,
vp9_highbd_sub_pixel_variance64x32,
vp9_highbd_sub_pixel_avg_variance64x32,
NULL,
NULL,
vp9_highbd_sad64x32x4d_bits8)
vpx_highbd_sad64x32x4d_bits8)
HIGHBD_BFP(BLOCK_32X64,
vp9_highbd_sad32x64_bits8,
vp9_highbd_sad32x64_avg_bits8,
vpx_highbd_sad32x64_bits8,
vpx_highbd_sad32x64_avg_bits8,
vp9_highbd_variance32x64,
vp9_highbd_sub_pixel_variance32x64,
vp9_highbd_sub_pixel_avg_variance32x64,
NULL,
NULL,
vp9_highbd_sad32x64x4d_bits8)
vpx_highbd_sad32x64x4d_bits8)
HIGHBD_BFP(BLOCK_32X32,
vp9_highbd_sad32x32_bits8,
vp9_highbd_sad32x32_avg_bits8,
vpx_highbd_sad32x32_bits8,
vpx_highbd_sad32x32_avg_bits8,
vp9_highbd_variance32x32,
vp9_highbd_sub_pixel_variance32x32,
vp9_highbd_sub_pixel_avg_variance32x32,
vp9_highbd_sad32x32x3_bits8,
vp9_highbd_sad32x32x8_bits8,
vp9_highbd_sad32x32x4d_bits8)
vpx_highbd_sad32x32x3_bits8,
vpx_highbd_sad32x32x8_bits8,
vpx_highbd_sad32x32x4d_bits8)
HIGHBD_BFP(BLOCK_64X64,
vp9_highbd_sad64x64_bits8,
vp9_highbd_sad64x64_avg_bits8,
vpx_highbd_sad64x64_bits8,
vpx_highbd_sad64x64_avg_bits8,
vp9_highbd_variance64x64,
vp9_highbd_sub_pixel_variance64x64,
vp9_highbd_sub_pixel_avg_variance64x64,
vp9_highbd_sad64x64x3_bits8,
vp9_highbd_sad64x64x8_bits8,
vp9_highbd_sad64x64x4d_bits8)
vpx_highbd_sad64x64x3_bits8,
vpx_highbd_sad64x64x8_bits8,
vpx_highbd_sad64x64x4d_bits8)
HIGHBD_BFP(BLOCK_16X16,
vp9_highbd_sad16x16_bits8,
vp9_highbd_sad16x16_avg_bits8,
vpx_highbd_sad16x16_bits8,
vpx_highbd_sad16x16_avg_bits8,
vp9_highbd_variance16x16,
vp9_highbd_sub_pixel_variance16x16,
vp9_highbd_sub_pixel_avg_variance16x16,
vp9_highbd_sad16x16x3_bits8,
vp9_highbd_sad16x16x8_bits8,
vp9_highbd_sad16x16x4d_bits8)
vpx_highbd_sad16x16x3_bits8,
vpx_highbd_sad16x16x8_bits8,
vpx_highbd_sad16x16x4d_bits8)
HIGHBD_BFP(BLOCK_16X8,
vp9_highbd_sad16x8_bits8,
vp9_highbd_sad16x8_avg_bits8,
vpx_highbd_sad16x8_bits8,
vpx_highbd_sad16x8_avg_bits8,
vp9_highbd_variance16x8,
vp9_highbd_sub_pixel_variance16x8,
vp9_highbd_sub_pixel_avg_variance16x8,
vp9_highbd_sad16x8x3_bits8,
vp9_highbd_sad16x8x8_bits8,
vp9_highbd_sad16x8x4d_bits8)
vpx_highbd_sad16x8x3_bits8,
vpx_highbd_sad16x8x8_bits8,
vpx_highbd_sad16x8x4d_bits8)
HIGHBD_BFP(BLOCK_8X16,
vp9_highbd_sad8x16_bits8,
vp9_highbd_sad8x16_avg_bits8,
vpx_highbd_sad8x16_bits8,
vpx_highbd_sad8x16_avg_bits8,
vp9_highbd_variance8x16,
vp9_highbd_sub_pixel_variance8x16,
vp9_highbd_sub_pixel_avg_variance8x16,
vp9_highbd_sad8x16x3_bits8,
vp9_highbd_sad8x16x8_bits8,
vp9_highbd_sad8x16x4d_bits8)
vpx_highbd_sad8x16x3_bits8,
vpx_highbd_sad8x16x8_bits8,
vpx_highbd_sad8x16x4d_bits8)
HIGHBD_BFP(BLOCK_8X8,
vp9_highbd_sad8x8_bits8,
vp9_highbd_sad8x8_avg_bits8,
vpx_highbd_sad8x8_bits8,
vpx_highbd_sad8x8_avg_bits8,
vp9_highbd_variance8x8,
vp9_highbd_sub_pixel_variance8x8,
vp9_highbd_sub_pixel_avg_variance8x8,
vp9_highbd_sad8x8x3_bits8,
vp9_highbd_sad8x8x8_bits8,
vp9_highbd_sad8x8x4d_bits8)
vpx_highbd_sad8x8x3_bits8,
vpx_highbd_sad8x8x8_bits8,
vpx_highbd_sad8x8x4d_bits8)
HIGHBD_BFP(BLOCK_8X4,
vp9_highbd_sad8x4_bits8,
vp9_highbd_sad8x4_avg_bits8,
vpx_highbd_sad8x4_bits8,
vpx_highbd_sad8x4_avg_bits8,
vp9_highbd_variance8x4,
vp9_highbd_sub_pixel_variance8x4,
vp9_highbd_sub_pixel_avg_variance8x4,
NULL,
vp9_highbd_sad8x4x8_bits8,
vp9_highbd_sad8x4x4d_bits8)
vpx_highbd_sad8x4x8_bits8,
vpx_highbd_sad8x4x4d_bits8)
HIGHBD_BFP(BLOCK_4X8,
vp9_highbd_sad4x8_bits8,
vp9_highbd_sad4x8_avg_bits8,
vpx_highbd_sad4x8_bits8,
vpx_highbd_sad4x8_avg_bits8,
vp9_highbd_variance4x8,
vp9_highbd_sub_pixel_variance4x8,
vp9_highbd_sub_pixel_avg_variance4x8,
NULL,
vp9_highbd_sad4x8x8_bits8,
vp9_highbd_sad4x8x4d_bits8)
vpx_highbd_sad4x8x8_bits8,
vpx_highbd_sad4x8x4d_bits8)
HIGHBD_BFP(BLOCK_4X4,
vp9_highbd_sad4x4_bits8,
vp9_highbd_sad4x4_avg_bits8,
vpx_highbd_sad4x4_bits8,
vpx_highbd_sad4x4_avg_bits8,
vp9_highbd_variance4x4,
vp9_highbd_sub_pixel_variance4x4,
vp9_highbd_sub_pixel_avg_variance4x4,
vp9_highbd_sad4x4x3_bits8,
vp9_highbd_sad4x4x8_bits8,
vp9_highbd_sad4x4x4d_bits8)
vpx_highbd_sad4x4x3_bits8,
vpx_highbd_sad4x4x8_bits8,
vpx_highbd_sad4x4x4d_bits8)
break;
case VPX_BITS_10:
HIGHBD_BFP(BLOCK_32X16,
vp9_highbd_sad32x16_bits10,
vp9_highbd_sad32x16_avg_bits10,
vpx_highbd_sad32x16_bits10,
vpx_highbd_sad32x16_avg_bits10,
vp9_highbd_10_variance32x16,
vp9_highbd_10_sub_pixel_variance32x16,
vp9_highbd_10_sub_pixel_avg_variance32x16,
NULL,
NULL,
vp9_highbd_sad32x16x4d_bits10)
vpx_highbd_sad32x16x4d_bits10)
HIGHBD_BFP(BLOCK_16X32,
vp9_highbd_sad16x32_bits10,
vp9_highbd_sad16x32_avg_bits10,
vpx_highbd_sad16x32_bits10,
vpx_highbd_sad16x32_avg_bits10,
vp9_highbd_10_variance16x32,
vp9_highbd_10_sub_pixel_variance16x32,
vp9_highbd_10_sub_pixel_avg_variance16x32,
NULL,
NULL,
vp9_highbd_sad16x32x4d_bits10)
vpx_highbd_sad16x32x4d_bits10)
HIGHBD_BFP(BLOCK_64X32,
vp9_highbd_sad64x32_bits10,
vp9_highbd_sad64x32_avg_bits10,
vpx_highbd_sad64x32_bits10,
vpx_highbd_sad64x32_avg_bits10,
vp9_highbd_10_variance64x32,
vp9_highbd_10_sub_pixel_variance64x32,
vp9_highbd_10_sub_pixel_avg_variance64x32,
NULL,
NULL,
vp9_highbd_sad64x32x4d_bits10)
vpx_highbd_sad64x32x4d_bits10)
HIGHBD_BFP(BLOCK_32X64,
vp9_highbd_sad32x64_bits10,
vp9_highbd_sad32x64_avg_bits10,
vpx_highbd_sad32x64_bits10,
vpx_highbd_sad32x64_avg_bits10,
vp9_highbd_10_variance32x64,
vp9_highbd_10_sub_pixel_variance32x64,
vp9_highbd_10_sub_pixel_avg_variance32x64,
NULL,
NULL,
vp9_highbd_sad32x64x4d_bits10)
vpx_highbd_sad32x64x4d_bits10)
HIGHBD_BFP(BLOCK_32X32,
vp9_highbd_sad32x32_bits10,
vp9_highbd_sad32x32_avg_bits10,
vpx_highbd_sad32x32_bits10,
vpx_highbd_sad32x32_avg_bits10,
vp9_highbd_10_variance32x32,
vp9_highbd_10_sub_pixel_variance32x32,
vp9_highbd_10_sub_pixel_avg_variance32x32,
vp9_highbd_sad32x32x3_bits10,
vp9_highbd_sad32x32x8_bits10,
vp9_highbd_sad32x32x4d_bits10)
vpx_highbd_sad32x32x3_bits10,
vpx_highbd_sad32x32x8_bits10,
vpx_highbd_sad32x32x4d_bits10)
HIGHBD_BFP(BLOCK_64X64,
vp9_highbd_sad64x64_bits10,
vp9_highbd_sad64x64_avg_bits10,
vpx_highbd_sad64x64_bits10,
vpx_highbd_sad64x64_avg_bits10,
vp9_highbd_10_variance64x64,
vp9_highbd_10_sub_pixel_variance64x64,
vp9_highbd_10_sub_pixel_avg_variance64x64,
vp9_highbd_sad64x64x3_bits10,
vp9_highbd_sad64x64x8_bits10,
vp9_highbd_sad64x64x4d_bits10)
vpx_highbd_sad64x64x3_bits10,
vpx_highbd_sad64x64x8_bits10,
vpx_highbd_sad64x64x4d_bits10)
HIGHBD_BFP(BLOCK_16X16,
vp9_highbd_sad16x16_bits10,
vp9_highbd_sad16x16_avg_bits10,
vpx_highbd_sad16x16_bits10,
vpx_highbd_sad16x16_avg_bits10,
vp9_highbd_10_variance16x16,
vp9_highbd_10_sub_pixel_variance16x16,
vp9_highbd_10_sub_pixel_avg_variance16x16,
vp9_highbd_sad16x16x3_bits10,
vp9_highbd_sad16x16x8_bits10,
vp9_highbd_sad16x16x4d_bits10)
vpx_highbd_sad16x16x3_bits10,
vpx_highbd_sad16x16x8_bits10,
vpx_highbd_sad16x16x4d_bits10)
HIGHBD_BFP(BLOCK_16X8,
vp9_highbd_sad16x8_bits10,
vp9_highbd_sad16x8_avg_bits10,
vpx_highbd_sad16x8_bits10,
vpx_highbd_sad16x8_avg_bits10,
vp9_highbd_10_variance16x8,
vp9_highbd_10_sub_pixel_variance16x8,
vp9_highbd_10_sub_pixel_avg_variance16x8,
vp9_highbd_sad16x8x3_bits10,
vp9_highbd_sad16x8x8_bits10,
vp9_highbd_sad16x8x4d_bits10)
vpx_highbd_sad16x8x3_bits10,
vpx_highbd_sad16x8x8_bits10,
vpx_highbd_sad16x8x4d_bits10)
HIGHBD_BFP(BLOCK_8X16,
vp9_highbd_sad8x16_bits10,
vp9_highbd_sad8x16_avg_bits10,
vpx_highbd_sad8x16_bits10,
vpx_highbd_sad8x16_avg_bits10,
vp9_highbd_10_variance8x16,
vp9_highbd_10_sub_pixel_variance8x16,
vp9_highbd_10_sub_pixel_avg_variance8x16,
vp9_highbd_sad8x16x3_bits10,
vp9_highbd_sad8x16x8_bits10,
vp9_highbd_sad8x16x4d_bits10)
vpx_highbd_sad8x16x3_bits10,
vpx_highbd_sad8x16x8_bits10,
vpx_highbd_sad8x16x4d_bits10)
HIGHBD_BFP(BLOCK_8X8,
vp9_highbd_sad8x8_bits10,
vp9_highbd_sad8x8_avg_bits10,
vpx_highbd_sad8x8_bits10,
vpx_highbd_sad8x8_avg_bits10,
vp9_highbd_10_variance8x8,
vp9_highbd_10_sub_pixel_variance8x8,
vp9_highbd_10_sub_pixel_avg_variance8x8,
vp9_highbd_sad8x8x3_bits10,
vp9_highbd_sad8x8x8_bits10,
vp9_highbd_sad8x8x4d_bits10)
vpx_highbd_sad8x8x3_bits10,
vpx_highbd_sad8x8x8_bits10,
vpx_highbd_sad8x8x4d_bits10)
HIGHBD_BFP(BLOCK_8X4,
vp9_highbd_sad8x4_bits10,
vp9_highbd_sad8x4_avg_bits10,
vpx_highbd_sad8x4_bits10,
vpx_highbd_sad8x4_avg_bits10,
vp9_highbd_10_variance8x4,
vp9_highbd_10_sub_pixel_variance8x4,
vp9_highbd_10_sub_pixel_avg_variance8x4,
NULL,
vp9_highbd_sad8x4x8_bits10,
vp9_highbd_sad8x4x4d_bits10)
vpx_highbd_sad8x4x8_bits10,
vpx_highbd_sad8x4x4d_bits10)
HIGHBD_BFP(BLOCK_4X8,
vp9_highbd_sad4x8_bits10,
vp9_highbd_sad4x8_avg_bits10,
vpx_highbd_sad4x8_bits10,
vpx_highbd_sad4x8_avg_bits10,
vp9_highbd_10_variance4x8,
vp9_highbd_10_sub_pixel_variance4x8,
vp9_highbd_10_sub_pixel_avg_variance4x8,
NULL,
vp9_highbd_sad4x8x8_bits10,
vp9_highbd_sad4x8x4d_bits10)
vpx_highbd_sad4x8x8_bits10,
vpx_highbd_sad4x8x4d_bits10)
HIGHBD_BFP(BLOCK_4X4,
vp9_highbd_sad4x4_bits10,
vp9_highbd_sad4x4_avg_bits10,
vpx_highbd_sad4x4_bits10,
vpx_highbd_sad4x4_avg_bits10,
vp9_highbd_10_variance4x4,
vp9_highbd_10_sub_pixel_variance4x4,
vp9_highbd_10_sub_pixel_avg_variance4x4,
vp9_highbd_sad4x4x3_bits10,
vp9_highbd_sad4x4x8_bits10,
vp9_highbd_sad4x4x4d_bits10)
vpx_highbd_sad4x4x3_bits10,
vpx_highbd_sad4x4x8_bits10,
vpx_highbd_sad4x4x4d_bits10)
break;
case VPX_BITS_12:
HIGHBD_BFP(BLOCK_32X16,
vp9_highbd_sad32x16_bits12,
vp9_highbd_sad32x16_avg_bits12,
vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12,
vp9_highbd_12_variance32x16,
vp9_highbd_12_sub_pixel_variance32x16,
vp9_highbd_12_sub_pixel_avg_variance32x16,
NULL,
NULL,
vp9_highbd_sad32x16x4d_bits12)
vpx_highbd_sad32x16x4d_bits12)
HIGHBD_BFP(BLOCK_16X32,
vp9_highbd_sad16x32_bits12,
vp9_highbd_sad16x32_avg_bits12,
vpx_highbd_sad16x32_bits12,
vpx_highbd_sad16x32_avg_bits12,
vp9_highbd_12_variance16x32,
vp9_highbd_12_sub_pixel_variance16x32,
vp9_highbd_12_sub_pixel_avg_variance16x32,
NULL,
NULL,
vp9_highbd_sad16x32x4d_bits12)
vpx_highbd_sad16x32x4d_bits12)
HIGHBD_BFP(BLOCK_64X32,
vp9_highbd_sad64x32_bits12,
vp9_highbd_sad64x32_avg_bits12,
vpx_highbd_sad64x32_bits12,
vpx_highbd_sad64x32_avg_bits12,
vp9_highbd_12_variance64x32,
vp9_highbd_12_sub_pixel_variance64x32,
vp9_highbd_12_sub_pixel_avg_variance64x32,
NULL,
NULL,
vp9_highbd_sad64x32x4d_bits12)
vpx_highbd_sad64x32x4d_bits12)
HIGHBD_BFP(BLOCK_32X64,
vp9_highbd_sad32x64_bits12,
vp9_highbd_sad32x64_avg_bits12,
vpx_highbd_sad32x64_bits12,
vpx_highbd_sad32x64_avg_bits12,
vp9_highbd_12_variance32x64,
vp9_highbd_12_sub_pixel_variance32x64,
vp9_highbd_12_sub_pixel_avg_variance32x64,
NULL,
NULL,
vp9_highbd_sad32x64x4d_bits12)
vpx_highbd_sad32x64x4d_bits12)
HIGHBD_BFP(BLOCK_32X32,
vp9_highbd_sad32x32_bits12,
vp9_highbd_sad32x32_avg_bits12,
vpx_highbd_sad32x32_bits12,
vpx_highbd_sad32x32_avg_bits12,
vp9_highbd_12_variance32x32,
vp9_highbd_12_sub_pixel_variance32x32,
vp9_highbd_12_sub_pixel_avg_variance32x32,
vp9_highbd_sad32x32x3_bits12,
vp9_highbd_sad32x32x8_bits12,
vp9_highbd_sad32x32x4d_bits12)
vpx_highbd_sad32x32x3_bits12,
vpx_highbd_sad32x32x8_bits12,
vpx_highbd_sad32x32x4d_bits12)
HIGHBD_BFP(BLOCK_64X64,
vp9_highbd_sad64x64_bits12,
vp9_highbd_sad64x64_avg_bits12,
vpx_highbd_sad64x64_bits12,
vpx_highbd_sad64x64_avg_bits12,
vp9_highbd_12_variance64x64,
vp9_highbd_12_sub_pixel_variance64x64,
vp9_highbd_12_sub_pixel_avg_variance64x64,
vp9_highbd_sad64x64x3_bits12,
vp9_highbd_sad64x64x8_bits12,
vp9_highbd_sad64x64x4d_bits12)
vpx_highbd_sad64x64x3_bits12,
vpx_highbd_sad64x64x8_bits12,
vpx_highbd_sad64x64x4d_bits12)
HIGHBD_BFP(BLOCK_16X16,
vp9_highbd_sad16x16_bits12,
vp9_highbd_sad16x16_avg_bits12,
vpx_highbd_sad16x16_bits12,
vpx_highbd_sad16x16_avg_bits12,
vp9_highbd_12_variance16x16,
vp9_highbd_12_sub_pixel_variance16x16,
vp9_highbd_12_sub_pixel_avg_variance16x16,
vp9_highbd_sad16x16x3_bits12,
vp9_highbd_sad16x16x8_bits12,
vp9_highbd_sad16x16x4d_bits12)
vpx_highbd_sad16x16x3_bits12,
vpx_highbd_sad16x16x8_bits12,
vpx_highbd_sad16x16x4d_bits12)
HIGHBD_BFP(BLOCK_16X8,
vp9_highbd_sad16x8_bits12,
vp9_highbd_sad16x8_avg_bits12,
vpx_highbd_sad16x8_bits12,
vpx_highbd_sad16x8_avg_bits12,
vp9_highbd_12_variance16x8,
vp9_highbd_12_sub_pixel_variance16x8,
vp9_highbd_12_sub_pixel_avg_variance16x8,
vp9_highbd_sad16x8x3_bits12,
vp9_highbd_sad16x8x8_bits12,
vp9_highbd_sad16x8x4d_bits12)
vpx_highbd_sad16x8x3_bits12,
vpx_highbd_sad16x8x8_bits12,
vpx_highbd_sad16x8x4d_bits12)
HIGHBD_BFP(BLOCK_8X16,
vp9_highbd_sad8x16_bits12,
vp9_highbd_sad8x16_avg_bits12,
vpx_highbd_sad8x16_bits12,
vpx_highbd_sad8x16_avg_bits12,
vp9_highbd_12_variance8x16,
vp9_highbd_12_sub_pixel_variance8x16,
vp9_highbd_12_sub_pixel_avg_variance8x16,
vp9_highbd_sad8x16x3_bits12,
vp9_highbd_sad8x16x8_bits12,
vp9_highbd_sad8x16x4d_bits12)
vpx_highbd_sad8x16x3_bits12,
vpx_highbd_sad8x16x8_bits12,
vpx_highbd_sad8x16x4d_bits12)
HIGHBD_BFP(BLOCK_8X8,
vp9_highbd_sad8x8_bits12,
vp9_highbd_sad8x8_avg_bits12,
vpx_highbd_sad8x8_bits12,
vpx_highbd_sad8x8_avg_bits12,
vp9_highbd_12_variance8x8,
vp9_highbd_12_sub_pixel_variance8x8,
vp9_highbd_12_sub_pixel_avg_variance8x8,
vp9_highbd_sad8x8x3_bits12,
vp9_highbd_sad8x8x8_bits12,
vp9_highbd_sad8x8x4d_bits12)
vpx_highbd_sad8x8x3_bits12,
vpx_highbd_sad8x8x8_bits12,
vpx_highbd_sad8x8x4d_bits12)
HIGHBD_BFP(BLOCK_8X4,
vp9_highbd_sad8x4_bits12,
vp9_highbd_sad8x4_avg_bits12,
vpx_highbd_sad8x4_bits12,
vpx_highbd_sad8x4_avg_bits12,
vp9_highbd_12_variance8x4,
vp9_highbd_12_sub_pixel_variance8x4,
vp9_highbd_12_sub_pixel_avg_variance8x4,
NULL,
vp9_highbd_sad8x4x8_bits12,
vp9_highbd_sad8x4x4d_bits12)
vpx_highbd_sad8x4x8_bits12,
vpx_highbd_sad8x4x4d_bits12)
HIGHBD_BFP(BLOCK_4X8,
vp9_highbd_sad4x8_bits12,
vp9_highbd_sad4x8_avg_bits12,
vpx_highbd_sad4x8_bits12,
vpx_highbd_sad4x8_avg_bits12,
vp9_highbd_12_variance4x8,
vp9_highbd_12_sub_pixel_variance4x8,
vp9_highbd_12_sub_pixel_avg_variance4x8,
NULL,
vp9_highbd_sad4x8x8_bits12,
vp9_highbd_sad4x8x4d_bits12)
vpx_highbd_sad4x8x8_bits12,
vpx_highbd_sad4x8x4d_bits12)
HIGHBD_BFP(BLOCK_4X4,
vp9_highbd_sad4x4_bits12,
vp9_highbd_sad4x4_avg_bits12,
vpx_highbd_sad4x4_bits12,
vpx_highbd_sad4x4_avg_bits12,
vp9_highbd_12_variance4x4,
vp9_highbd_12_sub_pixel_variance4x4,
vp9_highbd_12_sub_pixel_avg_variance4x4,
vp9_highbd_sad4x4x3_bits12,
vp9_highbd_sad4x4x8_bits12,
vp9_highbd_sad4x4x4d_bits12)
vpx_highbd_sad4x4x3_bits12,
vpx_highbd_sad4x4x8_bits12,
vpx_highbd_sad4x4x4d_bits12)
break;
default:
@ -1799,64 +1801,64 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
cpi->fn_ptr[BT].sdx8f = SDX8F; \
cpi->fn_ptr[BT].sdx4df = SDX4DF;
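Each BFP/HIGHBD_BFP invocation below fills one per-block-size slot in the encoder's function table. A sketch of that table's shape: the sdx8f and sdx4df names match the assignments above, while the remaining fields and typedef names are assumptions about vp9_variance.h rather than a verbatim copy:
typedef struct vp9_variance_vtable {
  vp9_sad_fn_t sdf;                  /* plain SAD */
  vp9_sad_avg_fn_t sdaf;             /* SAD vs. an averaged second predictor */
  vp9_variance_fn_t vf;              /* variance */
  vp9_subpixvariance_fn_t svf;       /* sub-pixel variance */
  vp9_subp_avg_variance_fn_t svaf;   /* sub-pixel avg variance */
  vp9_sad_multi_fn_t sdx3f;          /* 3 SADs, refs 1 pixel apart */
  vp9_sad_multi_fn_t sdx8f;          /* 8 SADs, refs 1 pixel apart */
  vp9_sad_multi_d_fn_t sdx4df;       /* 4 SADs against independent refs */
} vp9_variance_fn_ptr_t;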
BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
vp9_variance32x16, vp9_sub_pixel_variance32x16,
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d)
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
vp9_variance16x32, vp9_sub_pixel_variance16x32,
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d)
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
vp9_variance64x32, vp9_sub_pixel_variance64x32,
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d)
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
vp9_variance32x64, vp9_sub_pixel_variance32x64,
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d)
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
vp9_variance32x32, vp9_sub_pixel_variance32x32,
vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8,
vp9_sad32x32x4d)
vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
vpx_sad32x32x4d)
BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
vp9_variance64x64, vp9_sub_pixel_variance64x64,
vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8,
vp9_sad64x64x4d)
vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
vpx_sad64x64x4d)
BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
vp9_variance16x16, vp9_sub_pixel_variance16x16,
vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8,
vp9_sad16x16x4d)
vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
vpx_sad16x16x4d)
BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
vp9_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8,
vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
vp9_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16,
vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
vp9_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8,
vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
vp9_variance8x4, vp9_sub_pixel_variance8x4,
vp9_sub_pixel_avg_variance8x4, NULL, vp9_sad8x4x8, vp9_sad8x4x4d)
vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
vp9_variance4x8, vp9_sub_pixel_variance4x8,
vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d)
vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
vp9_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4,
vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);

View file

@ -10,6 +10,9 @@
#include <limits.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_mcomp.h"
@ -74,8 +77,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
x->mv_row_min = tmp_row_min;
x->mv_row_max = tmp_row_max;
return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride);
return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride);
}
static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
@ -87,7 +90,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
// Try zero MV first
// FIXME should really use something like near/nearest MV and/or MV prediction
err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
dst_mv->as_int = 0;
@ -123,7 +126,7 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
// Try zero MV first
// FIXME should really use something like near/nearest MV and/or MV prediction
err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
dst_mv->as_int = 0;
@ -146,7 +149,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride,
0, 0, 0);
err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride);
// find best

View file

@ -1,370 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%macro PROCESS_16X2X3 1
%if %1
movdqa xmm0, XMMWORD PTR [rsi]
lddqu xmm5, XMMWORD PTR [rdi]
lddqu xmm6, XMMWORD PTR [rdi+1]
lddqu xmm7, XMMWORD PTR [rdi+2]
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
movdqa xmm0, XMMWORD PTR [rsi]
lddqu xmm1, XMMWORD PTR [rdi]
lddqu xmm2, XMMWORD PTR [rdi+1]
lddqu xmm3, XMMWORD PTR [rdi+2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm5, xmm1
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
movdqa xmm0, XMMWORD PTR [rsi+rax]
lddqu xmm1, XMMWORD PTR [rdi+rdx]
lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm5, xmm1
paddw xmm6, xmm2
paddw xmm7, xmm3
%endmacro
%macro PROCESS_16X2X3_OFFSET 2
%if %1
movdqa xmm0, XMMWORD PTR [rsi]
movdqa xmm4, XMMWORD PTR [rdi]
movdqa xmm7, XMMWORD PTR [rdi+16]
movdqa xmm5, xmm7
palignr xmm5, xmm4, %2
movdqa xmm6, xmm7
palignr xmm6, xmm4, (%2+1)
palignr xmm7, xmm4, (%2+2)
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
movdqa xmm0, XMMWORD PTR [rsi]
movdqa xmm4, XMMWORD PTR [rdi]
movdqa xmm3, XMMWORD PTR [rdi+16]
movdqa xmm1, xmm3
palignr xmm1, xmm4, %2
movdqa xmm2, xmm3
palignr xmm2, xmm4, (%2+1)
palignr xmm3, xmm4, (%2+2)
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm5, xmm1
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
movdqa xmm0, XMMWORD PTR [rsi+rax]
movdqa xmm4, XMMWORD PTR [rdi+rdx]
movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
movdqa xmm1, xmm3
palignr xmm1, xmm4, %2
movdqa xmm2, xmm3
palignr xmm2, xmm4, (%2+1)
palignr xmm3, xmm4, (%2+2)
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm5, xmm1
paddw xmm6, xmm2
paddw xmm7, xmm3
%endmacro
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:
sub rdi, %1
PROCESS_16X2X3_OFFSET 1, %1
PROCESS_16X2X3_OFFSET 0, %1
PROCESS_16X2X3_OFFSET 0, %1
PROCESS_16X2X3_OFFSET 0, %1
PROCESS_16X2X3_OFFSET 0, %1
PROCESS_16X2X3_OFFSET 0, %1
PROCESS_16X2X3_OFFSET 0, %1
PROCESS_16X2X3_OFFSET 0, %1
jmp %2_store_off
%endmacro
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:
sub rdi, %1
PROCESS_16X2X3_OFFSET 1, %1
PROCESS_16X2X3_OFFSET 0, %1
PROCESS_16X2X3_OFFSET 0, %1
PROCESS_16X2X3_OFFSET 0, %1
jmp %2_store_off
%endmacro
;void vp9_sad16x16x3_ssse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp9_sad16x16x3_ssse3) PRIVATE
sym(vp9_sad16x16x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
push rsi
push rdi
push rcx
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rdx, 0xf
and rdx, rdi
jmp .vp9_sad16x16x3_ssse3_skiptable
.vp9_sad16x16x3_ssse3_jumptable:
dd .vp9_sad16x16x3_ssse3_aligned_by_0 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_1 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_2 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_3 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_4 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_5 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_6 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_7 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_8 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_9 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
.vp9_sad16x16x3_ssse3_skiptable:
call .vp9_sad16x16x3_ssse3_do_jump
.vp9_sad16x16x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
mov rax, .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
add rcx, rax
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
jmp rcx
PROCESS_16X16X3_OFFSET 0, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 1, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 2, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 3, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 4, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 5, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 6, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 7, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 8, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 9, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
.vp9_sad16x16x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
.vp9_sad16x16x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
movd [rdi], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
movd [rdi+4], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
movd [rdi+8], xmm0
; begin epilog
pop rcx
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
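The routine above (deleted here, re-landed as vpx_sad16x16x3_ssse3 in vpx_dsp) is organized around a position-independent jump table: the call/pop pair recovers the current instruction pointer, ref_ptr & 0xf selects one of sixteen bodies specialized for that misalignment, and each body uses aligned movdqa loads plus palignr instead of unaligned reads. Stripped of that machinery, the contract is just three SADs at consecutive reference offsets; a scalar sketch with illustrative names:
/* Scalar stand-in for one 16x16 SAD (the asm computes all three at once
 * with psadbw). */
static unsigned int sad16x16(const unsigned char *src, int src_stride,
                             const unsigned char *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int d = src[c] - ref[c];
      sad += d < 0 ? -d : d;
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
/* The jump table only changes *how* each SAD is computed; what lands in
 * results[0..2] is simply this: */
static void sad16x16x3_sketch(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              unsigned int *results) {
  results[0] = sad16x16(src, src_stride, ref, ref_stride);
  results[1] = sad16x16(src, src_stride, ref + 1, ref_stride);
  results[2] = sad16x16(src, src_stride, ref + 2, ref_stride);
}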
;void vp9_sad16x8x3_ssse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp9_sad16x8x3_ssse3) PRIVATE
sym(vp9_sad16x8x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
push rsi
push rdi
push rcx
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rdx, 0xf
and rdx, rdi
jmp .vp9_sad16x8x3_ssse3_skiptable
.vp9_sad16x8x3_ssse3_jumptable:
dd .vp9_sad16x8x3_ssse3_aligned_by_0 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_1 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_2 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_3 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_4 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_5 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_6 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_7 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_8 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_9 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
.vp9_sad16x8x3_ssse3_skiptable:
call .vp9_sad16x8x3_ssse3_do_jump
.vp9_sad16x8x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
mov rax, .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
add rcx, rax
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
jmp rcx
PROCESS_16X8X3_OFFSET 0, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 1, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 2, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 3, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 4, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 5, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 6, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 7, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 8, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 9, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
.vp9_sad16x8x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
.vp9_sad16x8x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
movd [rdi], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
movd [rdi+4], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
movd [rdi+8], xmm0
; begin epilog
pop rcx
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

View file

@ -69,7 +69,6 @@ VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
VP9_CX_SRCS-yes += encoder/vp9_rd.c
VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
VP9_CX_SRCS-yes += encoder/vp9_pickmode.c
VP9_CX_SRCS-yes += encoder/vp9_sad.c
VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
VP9_CX_SRCS-yes += encoder/vp9_speed_features.c
@ -104,15 +103,11 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@ -121,12 +116,10 @@ endif
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
@ -136,9 +129,6 @@ ifeq ($(ARCH_X86_64),yes)
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
endif
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad_intrin_avx2.c
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
@ -161,8 +151,6 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
endif
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c

View file

@ -9,9 +9,9 @@
*/
#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
@ -80,9 +80,9 @@ static void sad_neon_32(const uint8x16_t vec_src_00,
vget_high_u8(vec_ref_16));
}
void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride,
void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
const uint8_t* const ref[4], int ref_stride,
unsigned int *res) {
uint32_t *res) {
int i;
uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
@ -126,9 +126,9 @@ void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride,
res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride,
void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
const uint8_t* const ref[4], int ref_stride,
unsigned int *res) {
uint32_t *res) {
int i;
uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
@ -170,9 +170,9 @@ void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride,
res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
void vp9_sad16x16x4d_neon(const uint8_t *src, int src_stride,
void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
const uint8_t* const ref[4], int ref_stride,
unsigned int *res) {
uint32_t *res) {
int i;
uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);

View file

@ -9,7 +9,7 @@
;
EXPORT |vp8_sad16x16_armv6|
EXPORT |vpx_sad16x16_media|
ARM
REQUIRE8
@ -21,8 +21,7 @@
; r1 int src_stride
; r2 const unsigned char *ref_ptr
; r3 int ref_stride
; stack max_sad (not used)
|vp8_sad16x16_armv6| PROC
|vpx_sad16x16_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]

View file

@ -9,11 +9,113 @@
*/
#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
unsigned int vpx_sad8x16_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 15; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vpx_sad4x4_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x2_t d1;
uint64x1_t d3;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 3; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
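/* Each vld1_u8 above loads eight bytes per row, but only the low four u16
 * lanes of q12 (columns 0..3) are folded into the sum below, so the extra
 * columns never reach the 4x4 result. */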
d1 = vpaddl_u16(vget_low_u16(q12));
d3 = vpaddl_u32(d1);
return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}
unsigned int vpx_sad16x8_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x16_t q0, q4;
uint16x8_t q12, q13;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
for (i = 0; i < 7; i++) {
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
}
q12 = vaddq_u16(q12, q13);
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
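The reduction at the end of each kernel is the same three-step funnel: vpaddlq_u16 pairs the eight u16 accumulator lanes into four u32 values, vpaddlq_u32 pairs those into two u64 values, and the final vadd_u32/vget_lane_u32 adds the two halves (safe because even a 64x64 SAD of saturated samples fits comfortably in 32 bits). The same reduction in scalar form, as a sketch for cross-checking:
/* Scalar mirror of the vpaddlq_u16 -> vpaddlq_u32 -> vadd_u32 funnel,
 * assuming lanes[] holds the eight u16 lanes of the accumulator. */
static unsigned int horizontal_add_u16x8_sketch(const unsigned short lanes[8]) {
  unsigned int u32[4];
  unsigned long long u64[2];
  int i;
  for (i = 0; i < 4; ++i)  /* vpaddlq_u16: widen and pair */
    u32[i] = (unsigned int)lanes[2 * i] + lanes[2 * i + 1];
  for (i = 0; i < 2; ++i)  /* vpaddlq_u32: widen and pair again */
    u64[i] = (unsigned long long)u32[2 * i] + u32[2 * i + 1];
  /* vadd_u32 + vget_lane_u32: add the two halves */
  return (unsigned int)(u64[0] + u64[1]);
}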
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
@ -34,7 +136,7 @@ static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
return vget_lane_u32(c, 0);
}
unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@ -70,7 +172,7 @@ unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}
unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@ -95,7 +197,7 @@ unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@ -114,7 +216,7 @@ unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,
unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum = vdupq_n_u16(0);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -10,15 +10,19 @@
#include <stdlib.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#if CONFIG_VP9_HIGHBITDEPTH
#include "vp9/common/vp9_common.h"
#endif
#include "vp9/encoder/vp9_variance.h"
#endif // CONFIG_VP9_HIGHBITDEPTH
// Temporary ...
#define ROUND_POWER_OF_TWO(value, n) \
(((value) + (1 << ((n) - 1))) >> (n))
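/* For example, ROUND_POWER_OF_TWO(5, 1) = (5 + 1) >> 1 = 3 and
 * ROUND_POWER_OF_TWO(6, 2) = (6 + 2) >> 2 = 2: half of 2^n is added
 * before the shift so the truncation rounds to nearest. */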
/* Sum the difference between every corresponding element of the buffers. */
static INLINE unsigned int sad(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int width, int height) {
@ -35,35 +39,78 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
return sad;
}
/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
* The function averages every corresponding element of the buffers and stores
* the value in a third buffer, comp_pred.
* pred and comp_pred are assumed to have stride = width
* In the usage below comp_pred is a local array.
*/
static INLINE void avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE void highbd_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#define sadMxN(m, n) \
unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride) { \
return sad(src, src_stride, ref, ref_stride, m, n); \
} \
unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
uint8_t comp_pred[m * n]; \
vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
return sad(src, src_stride, comp_pred, m, m, n); \
}
// depending on call sites, pass **ref_array to avoid & in subsequent call and
// de-dup with 4D below.
#define sadMxNxK(m, n, k) \
void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sads) { \
void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref_array, int ref_stride, \
uint32_t *sad_array) { \
int i; \
for (i = 0; i < k; ++i) \
sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
}
// This appears to be equivalent to the above when k == 4 and refs is const
#define sadMxNx4D(m, n) \
void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
const uint8_t *const refs[], int ref_stride, \
unsigned int *sads) { \
void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
const uint8_t *const ref_array[], int ref_stride, \
uint32_t *sad_array) { \
int i; \
for (i = 0; i < 4; ++i) \
sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
}
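Instantiated, the macros above produce functions such as vpx_sad16x16x4d_c, which scores one source block against four independent candidate references in a single call. A self-contained usage sketch; the buffer contents are arbitrary and only the prototype is taken from the macro above:
#include <stdint.h>
#include <string.h>

void vpx_sad16x16x4d_c(const uint8_t *src, int src_stride,
                       const uint8_t *const ref_array[], int ref_stride,
                       uint32_t *sad_array);

static uint32_t best_of_four_sketch(void) {
  uint8_t src[16 * 16], refs[4][16 * 16];
  const uint8_t *ref_array[4] = { refs[0], refs[1], refs[2], refs[3] };
  uint32_t sad_array[4], best;
  int i;
  memset(src, 128, sizeof(src));
  memset(refs, 127, sizeof(refs));
  vpx_sad16x16x4d_c(src, 16, ref_array, 16, sad_array);
  best = sad_array[0];  /* every candidate scores 16 * 16 * 1 = 256 here */
  for (i = 1; i < 4; ++i)
    if (sad_array[i] < best) best = sad_array[i];
  return best;
}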
// 64x64
@ -169,40 +216,40 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
}
#define highbd_sadMxN(m, n) \
unsigned int vp9_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride) { \
return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
} \
unsigned int vp9_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
const uint8_t *second_pred) { \
uint16_t comp_pred[m * n]; \
vp9_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
highbd_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
}
#define highbd_sadMxNxK(m, n, k) \
void vp9_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sads) { \
void vpx_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref_array, int ref_stride, \
uint32_t *sad_array) { \
int i; \
for (i = 0; i < k; ++i) { \
sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, &ref[i], \
ref_stride); \
sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, &ref_array[i], \
ref_stride); \
} \
}
#define highbd_sadMxNx4D(m, n) \
void vp9_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
const uint8_t *const refs[], \
int ref_stride, unsigned int *sads) { \
void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
const uint8_t *const ref_array[], \
int ref_stride, uint32_t *sad_array) { \
int i; \
for (i = 0; i < 4; ++i) { \
sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, refs[i], \
ref_stride); \
} \
sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, ref_array[i], \
ref_stride); \
} \
}
// 64x64

vpx_dsp/vpx_dsp.mk (new file, 40 lines)
View file

@ -0,0 +1,40 @@
##
## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
##
## Use of this source code is governed by a BSD-style license
## that can be found in the LICENSE file in the root of the source
## tree. An additional intellectual property rights grant can be found
## in the file PATENTS. All contributing project authors may
## be found in the AUTHORS file in the root of the source tree.
##
DSP_SRCS-yes += vpx_dsp.mk
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm
DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
DSP_SRCS-yes += vpx_dsp_rtcd.c
DSP_SRCS-yes += vpx_dsp_rtcd_defs.pl
$(eval $(call rtcd_h_template,vpx_dsp_rtcd,vpx_dsp/vpx_dsp_rtcd_defs.pl))

vpx_dsp/vpx_dsp_rtcd.c (new file, 17 lines)
View file

@ -0,0 +1,17 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#define RTCD_C
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/vpx_once.h"
void vpx_dsp_rtcd() {
once(setup_rtcd_internal);
}
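setup_rtcd_internal comes from the generated vpx_dsp_rtcd.h (included above with RTCD_C defined): it probes CPU features once, via vpx_once, and binds each vpx_dsp symbol to the best available specialization. A hedged sketch of its shape for a single function; HAS_SSE2 and x86_simd_caps are the vpx_ports names on x86 targets, and the rest is assumed from the other rtcd headers rather than copied from the generated file:
#include <stdint.h>

unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *ref_ptr, int ref_stride);
#if HAVE_SSE2
unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride);
#endif

unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride);

static void setup_rtcd_internal(void) {
  const int flags = x86_simd_caps();  /* vpx_ports/x86.h, x86 targets only */
  vpx_sad16x16 = vpx_sad16x16_c;      /* always-available fallback */
#if HAVE_SSE2
  if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2;
#endif
}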

View file

@ -0,0 +1,395 @@
sub vpx_dsp_forward_decls() {
print <<EOF
/*
* DSP
*/
#include "vpx/vpx_integer.h"
EOF
}
forward_decls qw/vpx_dsp_forward_decls/;
# Functions which use x86inc.asm instead of x86_abi_support.asm
if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
$mmx_x86inc = 'mmx';
$sse_x86inc = 'sse';
$sse2_x86inc = 'sse2';
$ssse3_x86inc = 'ssse3';
$avx_x86inc = 'avx';
$avx2_x86inc = 'avx2';
} else {
$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc =
$avx_x86inc = $avx2_x86inc = '';
}
# Functions which are 64 bit only.
if ($opts{arch} eq "x86_64") {
$mmx_x86_64 = 'mmx';
$sse2_x86_64 = 'sse2';
$ssse3_x86_64 = 'ssse3';
$avx_x86_64 = 'avx';
$avx2_x86_64 = 'avx2';
} else {
$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 =
$avx_x86_64 = $avx2_x86_64 = '';
}
if (vpx_config("CONFIG_ENCODERS") eq "yes") {
#
# Single block SAD
#
add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad64x64 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad64x32 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad32x64 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad32x32 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad32x16 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad16x16 mmx media neon/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad16x8 mmx neon/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad8x16 mmx neon/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad8x8 mmx neon/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad4x8/, "$sse_x86inc";
add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad4x4 mmx neon/, "$sse_x86inc";
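Each add_proto line declares the portable prototype; each specialize line names the per-ISA suffixes the run-time dispatcher may bind. Quoted entries such as "$sse2_x86inc" collapse to sse2 only when CONFIG_USE_X86INC is set (see the variable block at the top of this file). For the vpx_sad16x16 line above, the generator therefore assumes this family of symbols exists; a sketch, one prototype per suffix plus the _c fallback:
#include <stdint.h>

unsigned int vpx_sad16x16_c(const uint8_t *, int, const uint8_t *, int);
unsigned int vpx_sad16x16_mmx(const uint8_t *, int, const uint8_t *, int);
unsigned int vpx_sad16x16_media(const uint8_t *, int, const uint8_t *, int);  /* ARMv6 */
unsigned int vpx_sad16x16_neon(const uint8_t *, int, const uint8_t *, int);
unsigned int vpx_sad16x16_sse2(const uint8_t *, int, const uint8_t *, int);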
#
# Avg
#
add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad64x64_avg avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad64x32_avg avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad32x64_avg avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad32x32_avg avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad32x16_avg avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad16x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad16x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad16x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad8x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad8x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad8x4_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad4x8_avg/, "$sse_x86inc";
add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_sad4x4_avg/, "$sse_x86inc";
#
# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
#
# Blocks of 3
add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x16x3 sse3 ssse3/;
add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x8x3 sse3 ssse3/;
add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad8x16x3 sse3/;
add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad8x8x3 sse3/;
add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad4x4x3 sse3/;
# Blocks of 8
add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x16x8 sse4_1/;
add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x8x8 sse4_1/;
add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad8x16x8 sse4_1/;
add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad8x8x8 sse4_1/;
add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad4x4x8 sse4_1/;
#
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad64x64x4d avx2 neon/, "$sse2_x86inc";
add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad64x32x4d/, "$sse2_x86inc";
add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad32x64x4d/, "$sse2_x86inc";
add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad32x32x4d avx2 neon/, "$sse2_x86inc";
add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad32x16x4d/, "$sse2_x86inc";
add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x32x4d/, "$sse2_x86inc";
add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x16x4d neon/, "$sse2_x86inc";
add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x8x4d/, "$sse2_x86inc";
add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad8x16x4d/, "$sse2_x86inc";
add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad8x8x4d/, "$sse2_x86inc";
add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad8x4x4d/, "$sse2_x86inc";
add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad4x8x4d/, "$sse_x86inc";
add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad4x4x4d/, "$sse_x86inc";
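The x4d prototypes above differ from x8 in that the four reference blocks are independent, each addressed through its own pointer. A matching sketch under the same illustrative naming:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative reference for the vpx_sadMxNx4d contract: one SAD per
 * independent reference pointer. */
static void sad_MxNx4d_ref(const uint8_t *src, int src_stride,
                           const uint8_t *const ref[4], int ref_stride,
                           uint32_t *sad_array, int width, int height) {
  int i, x, y;
  for (i = 0; i < 4; ++i) {
    unsigned int sad = 0;
    for (y = 0; y < height; ++y)
      for (x = 0; x < width; ++x)
        sad += abs(src[y * src_stride + x] - ref[i][y * ref_stride + x]);
    sad_array[i] = sad;
  }
}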
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Single block SAD
#
add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad4x8/;
add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_highbd_sad4x4/;
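Note that the high-bitdepth prototypes above still take uint8_t pointers: by libvpx convention the caller passes a disguised 16-bit buffer that implementations recover with CONVERT_TO_SHORTPTR from vpx_ports/mem.h. A hedged sketch of a single-block high-bitdepth SAD under that convention (the macro is reproduced here only to keep the sketch self-contained):

#include <stdint.h>
#include <stdlib.h>

/* libvpx convention: high-bitdepth buffers travel as uint8_t * with a
 * pre-shifted address; CONVERT_TO_SHORTPTR undoes the disguise. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))

static unsigned int highbd_sad_ref(const uint8_t *src8, int src_stride,
                                   const uint8_t *ref8, int ref_stride,
                                   int width, int height) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < height; ++y)
    for (x = 0; x < width; ++x)
      sad += abs(src[y * src_stride + x] - ref[y * ref_stride + x]);
  return sad;
}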
#
# Avg
#
add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad64x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad32x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad32x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad32x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad16x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad16x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad16x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad8x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad8x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad8x4_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad4x8_avg/;
add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad4x4_avg/;
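The _avg variants compare src against the rounded average of ref and second_pred, the compound-prediction case. A minimal sketch (illustrative; second_pred is assumed to be a contiguous width-by-height block, as in the C reference code):

#include <stdint.h>
#include <stdlib.h>

/* Illustrative reference for the _avg contract: average ref and
 * second_pred with rounding, then take the SAD against src. */
static unsigned int sad_avg_ref(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                const uint8_t *second_pred,
                                int width, int height) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < height; ++y)
    for (x = 0; x < width; ++x) {
      const int avg =
          (ref[y * ref_stride + x] + second_pred[y * width + x] + 1) >> 1;
      sad += abs(src[y * src_stride + x] - avg);
    }
  return sad;
}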
#
# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
#
# Blocks of 3
add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad64x64x3/;
add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad32x32x3/;
add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad16x16x3/;
add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad16x8x3/;
add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x16x3/;
add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x8x3/;
add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad4x4x3/;
# Blocks of 8
add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad64x64x8/;
add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad32x32x8/;
add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad16x16x8/;
add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad16x8x8/;
add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x16x8/;
add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x8x8/;
add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x4x8/;
add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad4x8x8/;
add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad4x4x8/;
#
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad64x64x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad64x32x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad32x64x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad32x32x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad32x16x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad16x32x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad16x16x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad16x8x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x16x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x8x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x4x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad4x8x4d/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad4x4x4d/, "$sse2_x86inc";
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS
1;
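For context on what this file drives: the build turns each add_proto/specialize pair into an entry in the generated vpx_dsp_rtcd.h, where the symbol becomes a function pointer that setup code aims at the best specialization the running CPU supports. A rough, abbreviated sketch of that generated shape (not the tool's verbatim output; the flag value is a placeholder):

#include <stdint.h>

#define HAS_SSE2 0x04 /* placeholder; the real flags live in vpx_ports/x86.h */

unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride);

/* One function pointer per prototype, defaulted to the C version. */
unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride);

static void setup_rtcd_internal(int flags) {
  vpx_sad16x16 = vpx_sad16x16_c; /* portable fallback */
  if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2;
}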

View file

@@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%define program_name vpx
%include "third_party/x86inc/x86inc.asm"
SECTION .text
@@ -209,9 +211,9 @@ SECTION .text
HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro
; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; void vpx_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; uint8_t *ref[4], int ref_stride,
; unsigned int res[4]);
; uint32_t res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
%macro HIGH_SADNXN4D 2
%if UNIX64

View file

@@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%define program_name vpx
%include "third_party/x86inc/x86inc.asm"
SECTION .text
@@ -50,7 +52,7 @@ cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
%endif
%endmacro
; unsigned int vp9_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
; unsigned int vpx_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
HIGH_SAD_FN 64, %1, 5, %2
@@ -157,7 +159,7 @@ HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
; unsigned int vp9_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
HIGH_SAD_FN 32, %1, 5, %2
@@ -225,7 +227,7 @@ HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
; unsigned int vp9_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
HIGH_SAD_FN 16, %1, 5, %2
@@ -294,7 +296,7 @@ HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
; unsigned int vp9_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
HIGH_SAD_FN 8, %1, 7, %2

View file

@@ -10,11 +10,11 @@
#include <immintrin.h> // AVX2
#include "vpx/vpx_integer.h"
void vp9_sad32x32x4d_avx2(uint8_t *src,
void vpx_sad32x32x4d_avx2(uint8_t *src,
int src_stride,
uint8_t *ref[4],
int ref_stride,
unsigned int res[4]) {
uint32_t res[4]) {
__m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
__m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
__m256i sum_mlow, sum_mhigh;
@@ -80,11 +80,11 @@ void vp9_sad32x32x4d_avx2(uint8_t *src,
}
}
void vp9_sad64x64x4d_avx2(uint8_t *src,
void vpx_sad64x64x4d_avx2(uint8_t *src,
int src_stride,
uint8_t *ref[4],
int ref_stride,
unsigned int res[4]) {
uint32_t res[4]) {
__m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
__m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
__m256i ref3_reg, ref3next_reg;
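The AVX2 kernels whose signatures change above are built on the vpsadbw instruction, which sums absolute byte differences in 8-byte groups. A standalone illustration of that building block (not the function body from this file):

#include <immintrin.h>
#include <stdint.h>

/* One 32-pixel row: _mm256_sad_epu8 (vpsadbw) leaves four partial sums,
 * one in each 64-bit lane, ready to be accumulated across rows. */
static __m256i row_sad32(const uint8_t *src, const uint8_t *ref) {
  const __m256i s = _mm256_loadu_si256((const __m256i *)src);
  const __m256i r = _mm256_loadu_si256((const __m256i *)ref);
  return _mm256_sad_epu8(s, r);
}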

View file

@@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%define program_name vpx
%include "third_party/x86inc/x86inc.asm"
SECTION .text
@@ -167,9 +169,9 @@ SECTION .text
PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro
; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; uint8_t *ref[4], int ref_stride,
; unsigned int res[4]);
; uint32_t res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
%macro SADNXN4D 2
%if UNIX64

View file

@@ -11,7 +11,7 @@
#include "vpx_ports/mem.h"
#define FSAD64_H(h) \
unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride) { \
@@ -40,7 +40,7 @@ unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
}
#define FSAD32_H(h) \
unsigned int vp9_sad32x##h##_avx2(const uint8_t *src_ptr, \
unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride) { \
@@ -89,7 +89,7 @@ FSAD32;
#undef FSAD32_H
#define FSADAVG64_H(h) \
unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride, \
@@ -124,7 +124,7 @@ unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
}
#define FSADAVG32_H(h) \
unsigned int vp9_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride, \
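The FSAD*_H and FSADAVG*_H macros above stamp out one AVX2 function per block height from a single body. The pattern, reduced to a plain-C illustration (names here are made up for the sketch):

#include <stdint.h>

/* Same macro-instantiation pattern as FSAD64_H/FSAD32_H: one body,
 * one function per block size. */
#define MAKE_SAD(w, h)                                                   \
  static unsigned int sketch_sad##w##x##h(                               \
      const uint8_t *src, int src_stride, const uint8_t *ref,            \
      int ref_stride) {                                                  \
    unsigned int sad = 0;                                                \
    int x, y;                                                            \
    for (y = 0; y < (h); ++y)                                            \
      for (x = 0; x < (w); ++x) {                                        \
        const int d = src[y * src_stride + x] - ref[y * ref_stride + x]; \
        sad += (unsigned int)(d < 0 ? -d : d);                           \
      }                                                                  \
    return sad;                                                          \
  }

MAKE_SAD(64, 64) /* defines sketch_sad64x64 */
MAKE_SAD(64, 32) /* defines sketch_sad64x32 */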

View file

@@ -11,18 +11,18 @@
%include "vpx_ports/x86_abi_support.asm"
global sym(vp8_sad16x16_mmx) PRIVATE
global sym(vp8_sad8x16_mmx) PRIVATE
global sym(vp8_sad8x8_mmx) PRIVATE
global sym(vp8_sad4x4_mmx) PRIVATE
global sym(vp8_sad16x8_mmx) PRIVATE
global sym(vpx_sad16x16_mmx) PRIVATE
global sym(vpx_sad8x16_mmx) PRIVATE
global sym(vpx_sad8x8_mmx) PRIVATE
global sym(vpx_sad4x4_mmx) PRIVATE
global sym(vpx_sad16x8_mmx) PRIVATE
;unsigned int vp8_sad16x16_mmx(
;unsigned int vpx_sad16x16_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
sym(vp8_sad16x16_mmx):
sym(vpx_sad16x16_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -109,12 +109,12 @@ sym(vp8_sad16x16_mmx):
ret
;unsigned int vp8_sad8x16_mmx(
;unsigned int vpx_sad8x16_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
sym(vp8_sad8x16_mmx):
sym(vpx_sad8x16_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -181,12 +181,12 @@ sym(vp8_sad8x16_mmx):
ret
;unsigned int vp8_sad8x8_mmx(
;unsigned int vpx_sad8x8_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
sym(vp8_sad8x8_mmx):
sym(vpx_sad8x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -251,12 +251,12 @@ sym(vp8_sad8x8_mmx):
ret
;unsigned int vp8_sad4x4_mmx(
;unsigned int vpx_sad4x4_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
sym(vp8_sad4x4_mmx):
sym(vpx_sad4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -340,12 +340,12 @@ sym(vp8_sad4x4_mmx):
ret
;unsigned int vp8_sad16x8_mmx(
;unsigned int vpx_sad16x8_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
sym(vp8_sad16x8_mmx):
sym(vpx_sad16x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4

View file

@@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%define program_name vpx
%include "third_party/x86inc/x86inc.asm"
SECTION .text
@@ -44,7 +46,7 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
%endif ; %3 == 7
%endmacro
; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
SAD_FN 64, %1, 5, %2
@@ -87,7 +89,7 @@ SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
SAD_FN 32, %1, 5, %2
@@ -132,7 +134,7 @@ SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
SAD_FN 16, %1, 7, %2
@@ -178,7 +180,7 @@ SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2
; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
SAD_FN 8, %1, 7, %2
@@ -222,7 +224,7 @@ SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2
; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
; unsigned int vpx_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
SAD_FN 4, %1, 7, %2

View file

@@ -19,7 +19,6 @@
%define end_ptr rcx
%define ret_var rbx
%define result_ptr arg(4)
%define max_err arg(4)
%define height dword ptr arg(4)
push rbp
mov rbp, rsp
@@ -42,7 +41,6 @@
%define end_ptr r10
%define ret_var r11
%define result_ptr [rsp+xmm_stack_space+8+4*8]
%define max_err [rsp+xmm_stack_space+8+4*8]
%define height dword ptr [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
@@ -52,7 +50,6 @@
%define end_ptr r9
%define ret_var r10
%define result_ptr r8
%define max_err r8
%define height r8
%endif
%endif
@@ -67,7 +64,6 @@
%define end_ptr
%define ret_var
%define result_ptr
%define max_err
%define height
%if ABI_IS_32BIT
@@ -169,14 +165,14 @@
paddw mm7, mm3
%endmacro
;void vp9_sad16x16x3_sse3(
;void vpx_sad16x16x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp9_sad16x16x3_sse3) PRIVATE
sym(vp9_sad16x16x3_sse3):
global sym(vpx_sad16x16x3_sse3) PRIVATE
sym(vpx_sad16x16x3_sse3):
STACK_FRAME_CREATE_X3
@@ -211,14 +207,14 @@ sym(vp9_sad16x16x3_sse3):
STACK_FRAME_DESTROY_X3
;void vp9_sad16x8x3_sse3(
;void vpx_sad16x8x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp9_sad16x8x3_sse3) PRIVATE
sym(vp9_sad16x8x3_sse3):
global sym(vpx_sad16x8x3_sse3) PRIVATE
sym(vpx_sad16x8x3_sse3):
STACK_FRAME_CREATE_X3
@@ -249,14 +245,14 @@ sym(vp9_sad16x8x3_sse3):
STACK_FRAME_DESTROY_X3
;void vp9_sad8x16x3_sse3(
;void vpx_sad8x16x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp9_sad8x16x3_sse3) PRIVATE
sym(vp9_sad8x16x3_sse3):
global sym(vpx_sad8x16x3_sse3) PRIVATE
sym(vpx_sad8x16x3_sse3):
STACK_FRAME_CREATE_X3
@@ -278,14 +274,14 @@ sym(vp9_sad8x16x3_sse3):
STACK_FRAME_DESTROY_X3
;void vp9_sad8x8x3_sse3(
;void vpx_sad8x8x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp9_sad8x8x3_sse3) PRIVATE
sym(vp9_sad8x8x3_sse3):
global sym(vpx_sad8x8x3_sse3) PRIVATE
sym(vpx_sad8x8x3_sse3):
STACK_FRAME_CREATE_X3
@@ -303,14 +299,14 @@ sym(vp9_sad8x8x3_sse3):
STACK_FRAME_DESTROY_X3
;void vp9_sad4x4x3_sse3(
;void vpx_sad4x4x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp9_sad4x4x3_sse3) PRIVATE
sym(vp9_sad4x4x3_sse3):
global sym(vpx_sad4x4x3_sse3) PRIVATE
sym(vpx_sad4x4x3_sse3):
STACK_FRAME_CREATE_X3

View file

@@ -165,14 +165,14 @@
movdqa [rdi + 16], xmm2
%endmacro
;void vp9_sad16x16x8_sse4(
;void vpx_sad16x16x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array);
global sym(vp9_sad16x16x8_sse4) PRIVATE
sym(vp9_sad16x16x8_sse4):
global sym(vpx_sad16x16x8_sse4_1) PRIVATE
sym(vpx_sad16x16x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -205,15 +205,15 @@ sym(vp9_sad16x16x8_sse4):
ret
;void vp9_sad16x8x8_sse4(
;void vpx_sad16x8x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp9_sad16x8x8_sse4) PRIVATE
sym(vp9_sad16x8x8_sse4):
global sym(vpx_sad16x8x8_sse4_1) PRIVATE
sym(vpx_sad16x8x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -242,15 +242,15 @@ sym(vp9_sad16x8x8_sse4):
ret
;void vp9_sad8x8x8_sse4(
;void vpx_sad8x8x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp9_sad8x8x8_sse4) PRIVATE
sym(vp9_sad8x8x8_sse4):
global sym(vpx_sad8x8x8_sse4_1) PRIVATE
sym(vpx_sad8x8x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -279,15 +279,15 @@ sym(vp9_sad8x8x8_sse4):
ret
;void vp9_sad8x16x8_sse4(
;void vpx_sad8x16x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp9_sad8x16x8_sse4) PRIVATE
sym(vp9_sad8x16x8_sse4):
global sym(vpx_sad8x16x8_sse4_1) PRIVATE
sym(vpx_sad8x16x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -320,15 +320,15 @@ sym(vp9_sad8x16x8_sse4):
ret
;void vp9_sad4x4x8_c(
;void vpx_sad4x4x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
global sym(vp9_sad4x4x8_sse4) PRIVATE
sym(vp9_sad4x4x8_sse4):
global sym(vpx_sad4x4x8_sse4_1) PRIVATE
sym(vpx_sad4x4x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
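The _sse4_1 suffix adopted throughout this file names the instruction set these kernels actually need: MPSADBW is an SSE4.1 instruction, and it is what makes the x8 layout cheap. A standalone intrinsics illustration of why one instruction yields eight SADs (not code from this file):

#include <immintrin.h>
#include <stdint.h>

/* _mm_mpsadbw_epu8 compares a single 4-byte group of the second operand
 * against eight overlapping 4-byte windows of the first, producing eight
 * 16-bit SADs: the "blocks 1 pixel apart" pattern of the x8 functions. */
static __m128i sad_4x1x8(const uint8_t *src, const uint8_t *ref) {
  const __m128i s = _mm_loadu_si128((const __m128i *)src);
  const __m128i r = _mm_loadu_si128((const __m128i *)ref);
  return _mm_mpsadbw_epu8(r, s, 0); /* imm 0: src group 0, ref offset 0 */
}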

View file

@@ -146,14 +146,14 @@
%endmacro
;void vp8_sad16x16x3_ssse3(
;void vpx_sad16x16x3_ssse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp8_sad16x16x3_ssse3) PRIVATE
sym(vp8_sad16x16x3_ssse3):
global sym(vpx_sad16x16x3_ssse3) PRIVATE
sym(vpx_sad16x16x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -169,31 +169,31 @@ sym(vp8_sad16x16x3_ssse3):
mov rdx, 0xf
and rdx, rdi
jmp .vp8_sad16x16x3_ssse3_skiptable
.vp8_sad16x16x3_ssse3_jumptable:
dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_skiptable:
jmp .vpx_sad16x16x3_ssse3_skiptable
.vpx_sad16x16x3_ssse3_jumptable:
dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_skiptable:
call .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_do_jump:
call .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
add rcx, rax
@@ -203,23 +203,23 @@ sym(vp8_sad16x16x3_ssse3):
jmp rcx
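What the call/pop and jumptable arithmetic above implement is, in effect, a dispatch on the low four bits of the reference pointer, so each alignment case runs a load sequence specialized for that offset. Reduced to C for orientation (purely illustrative):

#include <stdint.h>

typedef void (*sad_kernel_fn)(const uint8_t *src, const uint8_t *ref);

/* Sketch of the alignment dispatch: 'and rdx, rdi' with mask 0xf in the
 * asm picks the table slot; the aligned_by_N labels are the bodies. */
static void dispatch_by_alignment(const uint8_t *src, const uint8_t *ref,
                                  const sad_kernel_fn table[16]) {
  table[(uintptr_t)ref & 0xf](src, ref);
}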
PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
.vp8_sad16x16x3_ssse3_aligned_by_15:
.vpx_sad16x16x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
@@ -229,7 +229,7 @@ sym(vp8_sad16x16x3_ssse3):
PROCESS_16X2X3 0
PROCESS_16X2X3 0
.vp8_sad16x16x3_ssse3_store_off:
.vpx_sad16x16x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
@@ -259,14 +259,14 @@ sym(vp8_sad16x16x3_ssse3):
pop rbp
ret
;void vp8_sad16x8x3_ssse3(
;void vpx_sad16x8x3_ssse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
global sym(vp8_sad16x8x3_ssse3) PRIVATE
sym(vp8_sad16x8x3_ssse3):
global sym(vpx_sad16x8x3_ssse3) PRIVATE
sym(vpx_sad16x8x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -282,31 +282,31 @@ sym(vp8_sad16x8x3_ssse3):
mov rdx, 0xf
and rdx, rdi
jmp .vp8_sad16x8x3_ssse3_skiptable
.vp8_sad16x8x3_ssse3_jumptable:
dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_skiptable:
jmp .vpx_sad16x8x3_ssse3_skiptable
.vpx_sad16x8x3_ssse3_jumptable:
dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_skiptable:
call .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_do_jump:
call .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
add rcx, rax
@@ -316,30 +316,30 @@ sym(vp8_sad16x8x3_ssse3):
jmp rcx
PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
.vp8_sad16x8x3_ssse3_aligned_by_15:
.vpx_sad16x8x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
.vp8_sad16x8x3_ssse3_store_off:
.vpx_sad16x8x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5