Merge remote branch 'internal/upstream' into HEAD

This commit is contained in:
John Koleszar 2011-03-10 00:05:04 -05:00
Родитель 0c8bb2f168 a0306ea660
Коммит 820b2b927f
10 изменённых файлов: 488 добавлений и 610 удалений

Просмотреть файл

@ -153,7 +153,7 @@ endif
# #
obj_int_extract: build/make/obj_int_extract.c obj_int_extract: build/make/obj_int_extract.c
$(if $(quiet),echo " [HOSTCC] $@") $(if $(quiet),echo " [HOSTCC] $@")
$(qexec)$(HOSTCC) -I. -o $@ $< $(qexec)$(HOSTCC) -I. -I$(SRC_PATH_BARE) -o $@ $<
CLEAN-OBJS += obj_int_extract CLEAN-OBJS += obj_int_extract
# #

Просмотреть файл

@ -14,7 +14,7 @@
#include "vpx_config.h" #include "vpx_config.h"
#if defined(_MSC_VER) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <io.h> #include <io.h>
#include <share.h> #include <share.h>
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
@ -816,7 +816,7 @@ bail:
#endif #endif
#if defined(_MSC_VER) #if defined(_MSC_VER) || defined(__MINGW32__)
/* See "Microsoft Portable Executable and Common Object File Format Specification" /* See "Microsoft Portable Executable and Common Object File Format Specification"
for reference. for reference.
*/ */
@ -830,7 +830,6 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
unsigned int i; unsigned int i;
unsigned __int8 *ptr; unsigned __int8 *ptr;
unsigned __int32 symoffset; unsigned __int32 symoffset;
FILE *fp;
char **sectionlist; //this array holds all section names in their correct order. char **sectionlist; //this array holds all section names in their correct order.
//it is used to check if the symbol is in .bss or .data section. //it is used to check if the symbol is in .bss or .data section.
@ -871,14 +870,6 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
//log_msg("COFF: Symbol table at offset %u\n", symtab_ptr); //log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
//log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr); //log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);
fp = fopen("assembly_offsets.asm", "w");
if (fp == NULL)
{
perror("open file");
goto bail;
}
/* The compiler puts the data with non-zero offset in .data section, but puts the data with /* The compiler puts the data with non-zero offset in .data section, but puts the data with
zero offset in .bss section. So, if the data in in .bss section, set offset=0. zero offset in .bss section. So, if the data in in .bss section, set offset=0.
Note from Wiki: In an object module compiled from C, the bss section contains Note from Wiki: In an object module compiled from C, the bss section contains
@ -912,13 +903,13 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
strncpy(name, ptr, 8); strncpy(name, ptr, 8);
//log_msg("COFF: Parsing symbol %s\n",name); //log_msg("COFF: Parsing symbol %s\n",name);
fprintf(fp, "%-40s EQU ", name); printf("%-40s EQU ", name + 1);
} }
else else
{ {
//log_msg("COFF: Parsing symbol %s\n", //log_msg("COFF: Parsing symbol %s\n",
// buf + strtab_ptr + get_le32(ptr+4)); // buf + strtab_ptr + get_le32(ptr+4));
fprintf(fp, "%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4)); printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4) + 1);
} }
if (!(strcmp(sectionlist[section-1], ".bss"))) if (!(strcmp(sectionlist[section-1], ".bss")))
@ -935,14 +926,13 @@ int parse_coff(unsigned __int8 *buf, size_t sz)
//log_msg(" Address: %u\n",get_le32(ptr+8)); //log_msg(" Address: %u\n",get_le32(ptr+8));
//log_msg(" Offset: %u\n", symoffset); //log_msg(" Offset: %u\n", symoffset);
fprintf(fp, "%5d\n", symoffset); printf("%5d\n", symoffset);
} }
ptr += 18; ptr += 18;
} }
fprintf(fp, " END\n"); printf(" END\n");
fclose(fp);
for (i = 0; i < nsections; i++) for (i = 0; i < nsections; i++)
{ {
@ -992,11 +982,7 @@ int main(int argc, char **argv)
else else
f = argv[1]; f = argv[1];
if (_sopen_s(&fd, f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE)) fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE);
{
perror("Unable to open file");
goto bail;
}
if (_fstat(fd, &stat_buf)) if (_fstat(fd, &stat_buf))
{ {

Просмотреть файл

@ -1583,252 +1583,29 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cpi->oxcf = *oxcf; cpi->oxcf = *oxcf;
switch (cpi->oxcf.Mode)
{
case MODE_REALTIME:
cpi->pass = 0;
cpi->compressor_speed = 2;
if (cpi->oxcf.cpu_used < -16)
{
cpi->oxcf.cpu_used = -16;
}
if (cpi->oxcf.cpu_used > 16)
cpi->oxcf.cpu_used = 16;
break;
#if !(CONFIG_REALTIME_ONLY)
case MODE_GOODQUALITY:
cpi->pass = 0;
cpi->compressor_speed = 1;
if (cpi->oxcf.cpu_used < -5)
{
cpi->oxcf.cpu_used = -5;
}
if (cpi->oxcf.cpu_used > 5)
cpi->oxcf.cpu_used = 5;
break;
case MODE_BESTQUALITY:
cpi->pass = 0;
cpi->compressor_speed = 0;
break;
case MODE_FIRSTPASS:
cpi->pass = 1;
cpi->compressor_speed = 1;
break;
case MODE_SECONDPASS:
cpi->pass = 2;
cpi->compressor_speed = 1;
if (cpi->oxcf.cpu_used < -5)
{
cpi->oxcf.cpu_used = -5;
}
if (cpi->oxcf.cpu_used > 5)
cpi->oxcf.cpu_used = 5;
break;
case MODE_SECONDPASS_BEST:
cpi->pass = 2;
cpi->compressor_speed = 0;
break;
#endif
}
if (cpi->pass == 0)
cpi->auto_worst_q = 1;
cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
if (oxcf->fixed_q >= 0)
{
if (oxcf->worst_allowed_q < 0)
cpi->oxcf.fixed_q = q_trans[0];
else
cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
if (oxcf->alt_q < 0)
cpi->oxcf.alt_q = q_trans[0];
else
cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
if (oxcf->key_q < 0)
cpi->oxcf.key_q = q_trans[0];
else
cpi->oxcf.key_q = q_trans[oxcf->key_q];
if (oxcf->gold_q < 0)
cpi->oxcf.gold_q = q_trans[0];
else
cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
}
cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
//cpi->use_golden_frame_only = 0;
//cpi->use_last_frame_only = 0;
cm->refresh_golden_frame = 0;
cm->refresh_last_frame = 1;
cm->refresh_entropy_probs = 1;
if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
setup_features(cpi);
{
int i;
for (i = 0; i < MAX_MB_SEGMENTS; i++)
cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
}
// At the moment the first order values may not be > MAXQ
if (cpi->oxcf.fixed_q > MAXQ)
cpi->oxcf.fixed_q = MAXQ;
// local file playback mode == really big buffer
if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
{
cpi->oxcf.starting_buffer_level = 60000;
cpi->oxcf.optimal_buffer_level = 60000;
cpi->oxcf.maximum_buffer_size = 240000;
}
// Convert target bandwidth from Kbit/s to Bit/s // Convert target bandwidth from Kbit/s to Bit/s
cpi->oxcf.target_bandwidth *= 1000; cpi->oxcf.target_bandwidth *= 1000;
cpi->oxcf.starting_buffer_level = cpi->oxcf.starting_buffer_level =
rescale(cpi->oxcf.starting_buffer_level, rescale(cpi->oxcf.starting_buffer_level,
cpi->oxcf.target_bandwidth, 1000); cpi->oxcf.target_bandwidth, 1000);
if (cpi->oxcf.optimal_buffer_level == 0) cpi->buffer_level = cpi->oxcf.starting_buffer_level;
cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
else
cpi->oxcf.optimal_buffer_level =
rescale(cpi->oxcf.optimal_buffer_level,
cpi->oxcf.target_bandwidth, 1000);
if (cpi->oxcf.maximum_buffer_size == 0)
cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
else
cpi->oxcf.maximum_buffer_size =
rescale(cpi->oxcf.maximum_buffer_size,
cpi->oxcf.target_bandwidth, 1000);
cpi->buffer_level = cpi->oxcf.starting_buffer_level;
cpi->bits_off_target = cpi->oxcf.starting_buffer_level; cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
cpi->worst_quality = cpi->oxcf.worst_allowed_q;
cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
cpi->best_quality = cpi->oxcf.best_allowed_q;
cpi->active_best_quality = cpi->oxcf.best_allowed_q; cpi->active_best_quality = cpi->oxcf.best_allowed_q;
cpi->cq_target_quality = cpi->oxcf.cq_level; cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
cpi->rolling_target_bits = cpi->av_per_frame_bandwidth; cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth; cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth; cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->total_actual_bits = 0; cpi->total_actual_bits = 0;
cpi->total_target_vs_actual = 0; cpi->total_target_vs_actual = 0;
// Only allow dropped frames in buffered mode // change includes all joint functionality
cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; vp8_change_config(ptr, oxcf);
cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
if (!cm->use_bilinear_mc_filter)
cm->mcomp_filter_type = SIXTAP;
else
cm->mcomp_filter_type = BILINEAR;
cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
cm->Width = cpi->oxcf.Width ;
cm->Height = cpi->oxcf.Height ;
cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
cm->horiz_scale = cpi->horiz_scale;
cm->vert_scale = cpi->vert_scale ;
// VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
if (cpi->oxcf.Sharpness > 7)
cpi->oxcf.Sharpness = 7;
cm->sharpness_level = cpi->oxcf.Sharpness;
if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
{
int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
Scale2Ratio(cm->horiz_scale, &hr, &hs);
Scale2Ratio(cm->vert_scale, &vr, &vs);
// always go to the next whole number
cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
}
if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
{
alloc_raw_frame_buffers(cpi);
vp8_alloc_compressor_data(cpi);
}
// Clamp KF frame size to quarter of data rate
if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
cpi->intra_frame_target = cpi->target_bandwidth >> 2;
if (cpi->oxcf.fixed_q >= 0)
{
cpi->last_q[0] = cpi->oxcf.fixed_q;
cpi->last_q[1] = cpi->oxcf.fixed_q;
}
cpi->Speed = cpi->oxcf.cpu_used;
// force to allowlag to 0 if lag_in_frames is 0;
if (cpi->oxcf.lag_in_frames == 0)
{
cpi->oxcf.allow_lag = 0;
}
// Limit on lag buffers as these are not currently dynamically allocated
else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
// YX Temp
cpi->last_alt_ref_sei = -1;
cpi->is_src_frame_alt_ref = 0;
cpi->is_next_src_alt_ref = 0;
#if 0
// Experimental RD Code
cpi->frame_distortion = 0;
cpi->last_frame_distortion = 0;
#endif
#if VP8_TEMPORAL_ALT_REF #if VP8_TEMPORAL_ALT_REF
@ -1845,12 +1622,6 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
#endif #endif
} }
/*
* This function needs more clean up, i.e. be more tuned towards
* change_config rather than init_config !!!!!!!!!!!!!!!!
* YX - 5/28/2009
*
*/
void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
{ {
@ -2001,10 +1772,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
// Convert target bandwidth from Kbit/s to Bit/s // Convert target bandwidth from Kbit/s to Bit/s
cpi->oxcf.target_bandwidth *= 1000; cpi->oxcf.target_bandwidth *= 1000;
cpi->oxcf.starting_buffer_level =
rescale(cpi->oxcf.starting_buffer_level,
cpi->oxcf.target_bandwidth, 1000);
if (cpi->oxcf.optimal_buffer_level == 0) if (cpi->oxcf.optimal_buffer_level == 0)
cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
else else
@ -2019,29 +1786,36 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
rescale(cpi->oxcf.maximum_buffer_size, rescale(cpi->oxcf.maximum_buffer_size,
cpi->oxcf.target_bandwidth, 1000); cpi->oxcf.target_bandwidth, 1000);
cpi->buffer_level = cpi->oxcf.starting_buffer_level;
cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
cpi->worst_quality = cpi->oxcf.worst_allowed_q; cpi->worst_quality = cpi->oxcf.worst_allowed_q;
cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
cpi->best_quality = cpi->oxcf.best_allowed_q; cpi->best_quality = cpi->oxcf.best_allowed_q;
cpi->active_best_quality = cpi->oxcf.best_allowed_q;
// active values should only be modified if out of new range
if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q)
{
cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
}
// less likely
else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q)
{
cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
}
if (cpi->active_best_quality < cpi->oxcf.best_allowed_q)
{
cpi->active_best_quality = cpi->oxcf.best_allowed_q;
}
// less likely
else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q)
{
cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
}
cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE; cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
cpi->cq_target_quality = cpi->oxcf.cq_level; cpi->cq_target_quality = cpi->oxcf.cq_level;
cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->total_actual_bits = 0;
cpi->total_target_vs_actual = 0;
// Only allow dropped frames in buffered mode // Only allow dropped frames in buffered mode
cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
@ -3613,6 +3387,7 @@ static void encode_frame_to_data_rate
int drop_mark50 = drop_mark / 4; int drop_mark50 = drop_mark / 4;
int drop_mark25 = drop_mark / 8; int drop_mark25 = drop_mark / 8;
// Clear down mmx registers to allow floating point in what follows // Clear down mmx registers to allow floating point in what follows
vp8_clear_system_state(); vp8_clear_system_state();

Просмотреть файл

@ -790,7 +790,7 @@ filter_block2d_bil_variance:
ret ret
;void vp8_half_horiz_vert_variance16x_h_sse2 ;void vp8_half_horiz_vert_variance8x_h_sse2
;( ;(
; unsigned char *ref_ptr, ; unsigned char *ref_ptr,
; int ref_pixels_per_line, ; int ref_pixels_per_line,
@ -800,8 +800,8 @@ filter_block2d_bil_variance:
; int *sum, ; int *sum,
; unsigned int *sumsquared ; unsigned int *sumsquared
;) ;)
global sym(vp8_half_horiz_vert_variance16x_h_sse2) global sym(vp8_half_horiz_vert_variance8x_h_sse2)
sym(vp8_half_horiz_vert_variance16x_h_sse2): sym(vp8_half_horiz_vert_variance8x_h_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 7 SHADOW_ARGS_TO_STACK 7
@ -835,7 +835,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2):
add rsi, r8 add rsi, r8
%endif %endif
vp8_half_horiz_vert_variance16x_h_1: vp8_half_horiz_vert_variance8x_h_1:
movq xmm1, QWORD PTR [rsi] ; movq xmm1, QWORD PTR [rsi] ;
movq xmm2, QWORD PTR [rsi+1] ; movq xmm2, QWORD PTR [rsi+1] ;
@ -863,7 +863,7 @@ vp8_half_horiz_vert_variance16x_h_1:
%endif %endif
sub rcx, 1 ; sub rcx, 1 ;
jnz vp8_half_horiz_vert_variance16x_h_1 ; jnz vp8_half_horiz_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ; movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ; movdq2q mm7, xmm7 ;
@ -910,8 +910,7 @@ vp8_half_horiz_vert_variance16x_h_1:
pop rbp pop rbp
ret ret
;void vp8_half_horiz_vert_variance16x_h_sse2
;void vp8_half_vert_variance16x_h_sse2
;( ;(
; unsigned char *ref_ptr, ; unsigned char *ref_ptr,
; int ref_pixels_per_line, ; int ref_pixels_per_line,
@ -921,8 +920,124 @@ vp8_half_horiz_vert_variance16x_h_1:
; int *sum, ; int *sum,
; unsigned int *sumsquared ; unsigned int *sumsquared
;) ;)
; vp8_half_horiz_vert_variance16x_h_sse2
; Processes a block 16 pixels wide and Height rows tall. Each reference row is
; first averaged horizontally (byte i with byte i+1, via pavgb), then averaged
; vertically with the horizontally-averaged row below it. The result is
; compared against the src rows:
;   *sum        (arg 5) = sum of (averaged ref - src)
;   *sumsquared (arg 6) = sum of (averaged ref - src)^2
; Reads 17 bytes per reference row and Height+1 reference rows.
global sym(vp8_half_vert_variance16x_h_sse2) global sym(vp8_half_horiz_vert_variance16x_h_sse2)
sym(vp8_half_vert_variance16x_h_sse2): sym(vp8_half_horiz_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error (difference) accumulator, eight 16-bit lanes
pxor xmm7, xmm7 ; sse accumulator, four 32-bit lanes
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ; xmm0 = 0, used as the zero operand for byte->word unpacking
; Prime the loop: horizontally averaged first reference row goes into xmm5.
movdqu xmm5, XMMWORD PTR [rsi]
movdqu xmm3, XMMWORD PTR [rsi+1]
pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
lea rsi, [rsi + rax]
vp8_half_horiz_vert_variance16x_h_1:
movdqu xmm1, XMMWORD PTR [rsi] ;
movdqu xmm2, XMMWORD PTR [rsi+1] ;
pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
pavgb xmm5, xmm1 ; xmm5 = vertical average of line i and line i+1
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = low 8 averaged bytes as words
punpckhbw xmm4, xmm0 ; xmm4 = high 8 averaged bytes as words
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
movq xmm3, QWORD PTR [rdi+8] ; xmm3 = d8..d15
punpcklbw xmm3, xmm0
psubw xmm4, xmm3
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; square each word and add adjacent pairs into dwords
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm1 ; save xmm1 for use on the next row
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz vp8_half_horiz_vert_variance16x_h_1 ;
; Horizontal reduction. xmm0 is still zero here, so interleaving it below the
; words of xmm6 and shifting right arithmetically by 16 sign-extends each
; 16-bit partial sum to 32 bits.
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
; Fold the four dword lanes of the sse (xmm7 -> xmm6) and sum (xmm0) vectors
; down to a single dword in lane 0.
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_vert_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_vert_variance8x_h_sse2)
sym(vp8_half_vert_variance8x_h_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 7 SHADOW_ARGS_TO_STACK 7
@ -945,7 +1060,7 @@ sym(vp8_half_vert_variance16x_h_sse2):
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ; pxor xmm0, xmm0 ;
vp8_half_vert_variance16x_h_1: vp8_half_vert_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9 movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9
@ -969,7 +1084,7 @@ vp8_half_vert_variance16x_h_1:
%endif %endif
sub rcx, 1 ; sub rcx, 1 ;
jnz vp8_half_vert_variance16x_h_1 ; jnz vp8_half_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ; movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ; movdq2q mm7, xmm7 ;
@ -1016,8 +1131,7 @@ vp8_half_vert_variance16x_h_1:
pop rbp pop rbp
ret ret
;void vp8_half_vert_variance16x_h_sse2
;void vp8_half_horiz_variance16x_h_sse2
;( ;(
; unsigned char *ref_ptr, ; unsigned char *ref_ptr,
; int ref_pixels_per_line, ; int ref_pixels_per_line,
@ -1027,8 +1141,116 @@ vp8_half_vert_variance16x_h_1:
; int *sum, ; int *sum,
; unsigned int *sumsquared ; unsigned int *sumsquared
;) ;)
; vp8_half_vert_variance16x_h_sse2
; Processes a block 16 pixels wide and Height rows tall. Each reference row is
; averaged (pavgb) with the reference row below it and compared against the
; src rows:
;   *sum        (arg 5) = sum of (averaged ref - src)
;   *sumsquared (arg 6) = sum of (averaged ref - src)^2
; Reads Height+1 reference rows.
global sym(vp8_half_horiz_variance16x_h_sse2) global sym(vp8_half_vert_variance16x_h_sse2)
sym(vp8_half_horiz_variance16x_h_sse2): sym(vp8_half_vert_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error (difference) accumulator, eight 16-bit lanes
pxor xmm7, xmm7 ; sse accumulator, four 32-bit lanes
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
; Prime the loop: xmm5 holds the first reference row.
movdqu xmm5, XMMWORD PTR [rsi]
lea rsi, [rsi + rax ]
pxor xmm0, xmm0 ; xmm0 = 0, used as the zero operand for byte->word unpacking
vp8_half_vert_variance16x_h_1:
movdqu xmm3, XMMWORD PTR [rsi] ; xmm3 = next reference row
pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3), vertical average of rows i and i+1
movdqa xmm4, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = low 8 averaged bytes as words
punpckhbw xmm4, xmm0 ; xmm4 = high 8 averaged bytes as words
movq xmm2, QWORD PTR [rdi] ; xmm2 = src bytes 0..7
punpcklbw xmm2, xmm0
psubw xmm5, xmm2
movq xmm2, QWORD PTR [rdi+8] ; xmm2 = src bytes 8..15
punpcklbw xmm2, xmm0
psubw xmm4, xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm4
pmaddwd xmm5, xmm5 ; square each word and add adjacent pairs into dwords
pmaddwd xmm4, xmm4
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm4
movdqa xmm5, xmm3 ; keep the current row as the upper row for the next iteration
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1
jnz vp8_half_vert_variance16x_h_1
; Horizontal reduction. xmm0 is still zero here, so interleaving it below the
; words of xmm6 and shifting right arithmetically by 16 sign-extends each
; 16-bit partial sum to 32 bits.
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
; Fold the four dword lanes of the sse (xmm7 -> xmm6) and sum (xmm0) vectors
; down to a single dword in lane 0.
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_half_horiz_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp8_half_horiz_variance8x_h_sse2)
sym(vp8_half_horiz_variance8x_h_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
SHADOW_ARGS_TO_STACK 7 SHADOW_ARGS_TO_STACK 7
@ -1050,7 +1272,7 @@ sym(vp8_half_horiz_variance16x_h_sse2):
movsxd rcx, dword ptr arg(4) ;Height ; movsxd rcx, dword ptr arg(4) ;Height ;
pxor xmm0, xmm0 ; pxor xmm0, xmm0 ;
vp8_half_horiz_variance16x16_1: vp8_half_horiz_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
@ -1073,7 +1295,7 @@ vp8_half_horiz_variance16x16_1:
add rdi, r9 add rdi, r9
%endif %endif
sub rcx, 1 ; sub rcx, 1 ;
jnz vp8_half_horiz_variance16x16_1 ; jnz vp8_half_horiz_variance8x_h_1 ;
movdq2q mm6, xmm6 ; movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ; movdq2q mm7, xmm7 ;
@ -1120,6 +1342,109 @@ vp8_half_horiz_variance16x16_1:
pop rbp pop rbp
ret ret
;void vp8_half_horiz_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int *sum,
; unsigned int *sumsquared
;)
; Processes a block 16 pixels wide and Height rows tall. Each reference byte is
; averaged (pavgb) with its right-hand neighbour and compared against src:
;   *sum        = sum of (averaged ref - src)
;   *sumsquared = sum of (averaged ref - src)^2
; Reads 17 bytes from every reference row.
global sym(vp8_half_horiz_variance16x_h_sse2)
sym(vp8_half_horiz_variance16x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
; end prolog
pxor xmm6, xmm6 ; error (difference) accumulator, eight 16-bit lanes
pxor xmm7, xmm7 ; sse accumulator, four 32-bit lanes
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ; xmm0 = 0, used as the zero operand for byte->word unpacking
vp8_half_horiz_variance16x_h_1:
movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
movdqa xmm1, xmm5
punpcklbw xmm5, xmm0 ; xmm5 = low 8 averaged bytes as words
punpckhbw xmm1, xmm0 ; xmm1 = high 8 averaged bytes as words
movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
punpcklbw xmm3, xmm0 ; xmm3 = words of above
movq xmm2, QWORD PTR [rdi+8] ; xmm2 = d8,d9..d15
punpcklbw xmm2, xmm0 ; xmm2 = words of above
psubw xmm5, xmm3 ; xmm5 -= xmm3
psubw xmm1, xmm2 ; xmm1 -= xmm2
paddw xmm6, xmm5 ; xmm6 += accumulated column differences
paddw xmm6, xmm1
pmaddwd xmm5, xmm5 ; square each word and add adjacent pairs into dwords
pmaddwd xmm1, xmm1
paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
paddd xmm7, xmm1
lea rsi, [rsi + rax]
lea rdi, [rdi + rdx]
sub rcx, 1 ;
jnz vp8_half_horiz_variance16x_h_1 ;
; Horizontal reduction. xmm0 is still zero here, so interleaving it below the
; words of xmm6 and shifting right arithmetically by 16 sign-extends each
; 16-bit partial sum to 32 bits.
pxor xmm1, xmm1
pxor xmm5, xmm5
punpcklwd xmm0, xmm6
punpckhwd xmm1, xmm6
psrad xmm0, 16
psrad xmm1, 16
paddd xmm0, xmm1
movdqa xmm1, xmm0
; Fold the four dword lanes of the sse (xmm7 -> xmm6) and sum (xmm0) vectors
; down to a single dword in lane 0.
movdqa xmm6, xmm7
punpckldq xmm6, xmm5
punpckhdq xmm7, xmm5
paddd xmm6, xmm7
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm7, xmm6
movdqa xmm1, xmm0
psrldq xmm7, 8
psrldq xmm1, 8
paddd xmm6, xmm7
paddd xmm0, xmm1
mov rsi, arg(5) ;[Sum]
mov rdi, arg(6) ;[SSE]
movd [rsi], xmm0
movd [rdi], xmm6
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};

Просмотреть файл

@ -456,146 +456,6 @@ unsigned int vp8_sub_pixel_variance8x16_mmx
return (xxsum - ((xsum * xsum) >> 7)); return (xxsum - ((xsum * xsum) >> 7));
} }
/*
 * Combines four vp8_get8x8var_mmx() results into one 16x16 measurement.
 *
 * The four 8x8 calls start at offsets 0, 8, (stride >> 1) and
 * (stride >> 1) + 8 from src_ptr / ref_ptr; the strides themselves are passed
 * through unchanged.
 * NOTE(review): the half-stride offsets suggest callers pass a doubled
 * (per-field) stride -- confirm against the call sites.
 *
 * src_ptr, source_stride: first buffer and its stride.
 * ref_ptr, recon_stride:  second buffer and its stride.
 * sse: out; receives the total of the four 8x8 SSE values.
 *
 * Returns sse - ((sum * sum) >> 8), where sum is the total of the four 8x8
 * sums.
 */
unsigned int vp8_i_variance16x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;
    *sse = var;

    /*
     * Square in unsigned arithmetic. Assuming each 8x8 sum is a sum of 64
     * 8-bit pixel differences (TODO confirm against vp8_get8x8var_mmx), |avg|
     * can reach 255 * 256 = 65280, whose square exceeds INT_MAX but still fits
     * in 32 unsigned bits. A signed multiply would overflow, which is
     * undefined behaviour.
     */
    return (var - (((unsigned int)avg * avg) >> 8));
}
/*
 * Combines two vp8_get8x8var_mmx() results into one 8x16 measurement.
 *
 * The second 8x8 call starts half a stride past src_ptr / ref_ptr; the strides
 * themselves are passed through unchanged.
 *
 * sse: out; receives the total of the two 8x8 SSE values.
 * Returns sse - ((sum * sum) >> 7), where sum is the total of the two 8x8 sums.
 */
unsigned int vp8_i_variance8x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    const unsigned char *src_second = src_ptr + (source_stride >> 1);
    const unsigned char *ref_second = ref_ptr + (recon_stride >> 1);
    unsigned int sq_first, sq_second, sq_total;
    int diff_first, diff_second, diff_total;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_first, &diff_first);
    vp8_get8x8var_mmx(src_second, source_stride, ref_second, recon_stride,
                      &sq_second, &diff_second);

    sq_total = sq_first + sq_second;
    diff_total = diff_first + diff_second;

    *sse = sq_total;
    return sq_total - ((diff_total * diff_total) >> 7);
}
/*
 * Combines four vp8_filter_block2d_bil_var_mmx() calls (each with a height
 * argument of 8) into one 16x16 sub-pixel measurement.
 *
 * The calls start at offsets 0, 8, (stride >> 1) and (stride >> 1) + 8 from
 * src_ptr / dst_ptr. xoffset and yoffset index vp8_vp7_bilinear_filters_mmx
 * to select the horizontal and vertical filter passed to each call.
 *
 * sse: out; receives the accumulated sum of squares from the four calls.
 * Returns sse - ((sum * sum) >> 8), where sum is the accumulated sum from the
 * four calls.
 */
unsigned int vp8_i_sub_pixel_variance16x16_mmx
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;
    int f2soffset = (src_pixels_per_line >> 1);
    int f2doffset = (dst_pixels_per_line >> 1);

    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum0, &xxsum0
    );

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + f2soffset, src_pixels_per_line,
        dst_ptr + f2doffset, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + f2soffset + 8, src_pixels_per_line,
        dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;
    *sse = xxsum0;

    /*
     * Square in unsigned arithmetic. Assuming each call returns a sum of 64
     * 8-bit pixel differences (TODO confirm against
     * vp8_filter_block2d_bil_var_mmx), |xsum0| can reach 255 * 256 = 65280,
     * whose square exceeds INT_MAX but still fits in 32 unsigned bits. A
     * signed multiply would overflow, which is undefined behaviour.
     */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
/*
 * Combines two vp8_filter_block2d_bil_var_mmx() calls (each with a height
 * argument of 8) into one 8x16 sub-pixel measurement.
 *
 * The second call starts half a stride past src_ptr / dst_ptr. xoffset and
 * yoffset index vp8_vp7_bilinear_filters_mmx to select the two filters.
 *
 * sse: out; receives the accumulated sum of squares from both calls.
 * Returns sse - ((sum * sum) >> 7), where sum is the accumulated sum.
 */
unsigned int vp8_i_sub_pixel_variance8x16_mmx
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    const unsigned char *src_second = src_ptr + (src_pixels_per_line >> 1);
    const unsigned char *dst_second = dst_ptr + (dst_pixels_per_line >> 1);
    int sum_total, sum_part;
    unsigned int sq_total, sq_part;

    /* First 8x8 part. */
    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &sum_total, &sq_total
    );

    /* Second 8x8 part, half a stride further on. */
    vp8_filter_block2d_bil_var_mmx(
        src_second, src_pixels_per_line,
        dst_second, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &sum_part, &sq_part
    );

    sum_total = sum_total + sum_part;
    sq_total = sq_total + sq_part;

    *sse = sq_total;
    return sq_total - ((sum_total * sum_total) >> 7);
}
unsigned int vp8_variance_halfpixvar16x16_h_mmx( unsigned int vp8_variance_halfpixvar16x16_h_mmx(
const unsigned char *src_ptr, const unsigned char *src_ptr,

Просмотреть файл

@ -81,6 +81,16 @@ void vp8_filter_block2d_bil_var_sse2
int *sum, int *sum,
unsigned int *sumsquared unsigned int *sumsquared
); );
/*
 * Prototype for the routine used by vp8_sub_pixel_variance8x8_wmt when
 * xoffset == 4 && yoffset == 4. It takes a reference and a source buffer with
 * their strides plus a row count, and returns its results through the sum
 * and sumsquared out-parameters.
 */
void vp8_half_horiz_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_vert_variance16x_h_sse2 void vp8_half_horiz_vert_variance16x_h_sse2
( (
const unsigned char *ref_ptr, const unsigned char *ref_ptr,
@ -91,6 +101,16 @@ void vp8_half_horiz_vert_variance16x_h_sse2
int *sum, int *sum,
unsigned int *sumsquared unsigned int *sumsquared
); );
/*
 * Prototype for the routine used by vp8_sub_pixel_variance8x8_wmt when
 * xoffset == 4 && yoffset == 0. It takes a reference and a source buffer with
 * their strides plus a row count, and returns its results through the sum
 * and sumsquared out-parameters.
 */
void vp8_half_horiz_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_horiz_variance16x_h_sse2 void vp8_half_horiz_variance16x_h_sse2
( (
const unsigned char *ref_ptr, const unsigned char *ref_ptr,
@ -101,6 +121,16 @@ void vp8_half_horiz_variance16x_h_sse2
int *sum, int *sum,
unsigned int *sumsquared unsigned int *sumsquared
); );
/*
 * Prototype for the routine used by vp8_sub_pixel_variance8x8_wmt when
 * xoffset == 0 && yoffset == 4. It takes a reference and a source buffer with
 * their strides plus a row count, and returns its results through the sum
 * and sumsquared out-parameters.
 */
void vp8_half_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
int *sum,
unsigned int *sumsquared
);
void vp8_half_vert_variance16x_h_sse2 void vp8_half_vert_variance16x_h_sse2
( (
const unsigned char *ref_ptr, const unsigned char *ref_ptr,
@ -262,21 +292,21 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
if (xoffset == 4 && yoffset == 0) if (xoffset == 4 && yoffset == 0)
{ {
vp8_half_horiz_variance16x_h_sse2( vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8, dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum); &xsum, &xxsum);
} }
else if (xoffset == 0 && yoffset == 4) else if (xoffset == 0 && yoffset == 4)
{ {
vp8_half_vert_variance16x_h_sse2( vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8, dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum); &xsum, &xxsum);
} }
else if (xoffset == 4 && yoffset == 4) else if (xoffset == 4 && yoffset == 4)
{ {
vp8_half_horiz_vert_variance16x_h_sse2( vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8, dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum); &xsum, &xxsum);
@ -317,11 +347,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_horiz_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
} }
else if (xoffset == 0 && yoffset == 4) else if (xoffset == 0 && yoffset == 4)
{ {
@ -329,11 +354,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
} }
else if (xoffset == 4 && yoffset == 4) else if (xoffset == 4 && yoffset == 4)
{ {
@ -341,11 +361,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
} }
else else
{ {
@ -356,17 +371,16 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
&xsum0, &xxsum0 &xsum0, &xxsum0
); );
vp8_filter_block2d_bil_var_sse2( vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line, src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16, dst_ptr + 8, dst_pixels_per_line, 16,
xoffset, yoffset, xoffset, yoffset,
&xsum1, &xxsum1 &xsum1, &xxsum1
); );
xsum0 += xsum1;
xxsum0 += xxsum1;
} }
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0; *sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8)); return (xxsum0 - ((xsum0 * xsum0) >> 8));
} }
@ -406,11 +420,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8, dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_horiz_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
&xsum1, &xxsum1);
} }
else if (xoffset == 0 && yoffset == 4) else if (xoffset == 0 && yoffset == 4)
{ {
@ -418,11 +427,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8, dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
&xsum1, &xxsum1);
} }
else if (xoffset == 4 && yoffset == 4) else if (xoffset == 4 && yoffset == 4)
{ {
@ -430,11 +434,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8, dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
&xsum1, &xxsum1);
} }
else else
{ {
@ -449,11 +448,10 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
dst_ptr + 8, dst_pixels_per_line, 8, dst_ptr + 8, dst_pixels_per_line, 8,
xoffset, yoffset, xoffset, yoffset,
&xsum1, &xxsum1); &xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
} }
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0; *sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 7)); return (xxsum0 - ((xsum0 * xsum0) >> 7));
} }
@ -474,21 +472,21 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
if (xoffset == 4 && yoffset == 0) if (xoffset == 4 && yoffset == 0)
{ {
vp8_half_horiz_variance16x_h_sse2( vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum); &xsum, &xxsum);
} }
else if (xoffset == 0 && yoffset == 4) else if (xoffset == 0 && yoffset == 4)
{ {
vp8_half_vert_variance16x_h_sse2( vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum); &xsum, &xxsum);
} }
else if (xoffset == 4 && yoffset == 4) else if (xoffset == 4 && yoffset == 4)
{ {
vp8_half_horiz_vert_variance16x_h_sse2( vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum); &xsum, &xxsum);
@ -506,81 +504,6 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
return (xxsum - ((xsum * xsum) >> 7)); return (xxsum - ((xsum * xsum) >> 7));
} }
/*
 * Variance over a region covered by four vp8_get8x8var_sse2() calls.
 *
 * The four calls start at byte offsets 0, 8, (stride >> 1) and
 * (stride >> 1) + 8 from src_ptr / ref_ptr, and each call is handed the
 * full stride that was passed in.
 * NOTE(review): the half-stride offset suggests callers pass a doubled
 * (interlaced) stride -- confirm against the call sites.
 *
 * src_ptr, source_stride : first input block and its stride
 * ref_ptr, recon_stride  : second input block and its stride
 * sse                    : receives the total of the four per-call sse values
 *
 * Returns the total sse minus ((total sum)^2 >> 8).
 */
unsigned int vp8_i_variance16x16_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride,
                       ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2);
    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride,
                       ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;
    *sse = var;

    /*
     * Presumably each 8x8 sum is a sum of 64 differences of 8-bit pixels
     * (TODO confirm against vp8_get8x8var_sse2); in that case |avg| can reach
     * 4 * 64 * 255 = 65280 and avg * avg no longer fits in a signed int.
     * Multiply in unsigned arithmetic: the square still fits in 32 bits and
     * the result is unchanged whenever the signed product did not overflow.
     */
    return (var - (((unsigned int)avg * avg) >> 8));
}
/*
 * Variance built from two vp8_get8x8var_sse2() calls.
 *
 * The first call starts at src_ptr / ref_ptr; the second starts half a
 * stride further on (source_stride >> 1 and recon_stride >> 1). Both calls
 * receive the full strides that were passed in.
 * NOTE(review): the half-stride offset suggests callers pass a doubled
 * (interlaced) stride -- confirm against the call sites.
 *
 * Writes the total of both sse values to *sse and returns that total minus
 * ((total sum)^2 >> 7).
 */
unsigned int vp8_i_variance8x16_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    const unsigned char *src_second = src_ptr + (source_stride >> 1);
    const unsigned char *ref_second = ref_ptr + (recon_stride >> 1);
    unsigned int sse_first, sse_second, sse_total;
    int sum_first, sum_second, sum_total;

    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_first, &sum_first);
    vp8_get8x8var_sse2(src_second, source_stride,
                       ref_second, recon_stride,
                       &sse_second, &sum_second);

    sse_total = sse_first + sse_second;
    sum_total = sum_first + sum_second;

    *sse = sse_total;
    return (sse_total - ((sum_total * sum_total) >> 7));
}
/*
 * Thin wrapper around vp8_sub_pixel_variance16x16_wmt(): forwards every
 * argument unchanged except the two strides, which are halved (>> 1) first.
 * Returns whatever the wrapped function returns; *sse is written by it.
 */
unsigned int vp8_i_sub_pixel_variance16x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    const int src_half_stride = src_pixels_per_line >> 1;
    const int dst_half_stride = dst_pixels_per_line >> 1;

    return vp8_sub_pixel_variance16x16_wmt(src_ptr, src_half_stride,
                                           xoffset, yoffset,
                                           dst_ptr, dst_half_stride,
                                           sse);
}
/*
 * Thin wrapper around vp8_sub_pixel_variance8x16_wmt(): forwards every
 * argument unchanged except the two strides, which are halved (>> 1) first.
 * Returns whatever the wrapped function returns; *sse is written by it.
 */
unsigned int vp8_i_sub_pixel_variance8x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    const int src_half_stride = src_pixels_per_line >> 1;
    const int dst_half_stride = dst_pixels_per_line >> 1;

    return vp8_sub_pixel_variance8x16_wmt(src_ptr, src_half_stride,
                                          xoffset, yoffset,
                                          dst_ptr, dst_half_stride,
                                          sse);
}
unsigned int vp8_variance_halfpixvar16x16_h_wmt( unsigned int vp8_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr, const unsigned char *src_ptr,
@ -589,21 +512,14 @@ unsigned int vp8_variance_halfpixvar16x16_h_wmt(
int dst_pixels_per_line, int dst_pixels_per_line,
unsigned int *sse) unsigned int *sse)
{ {
int xsum0, xsum1; int xsum0;
unsigned int xxsum0, xxsum1; unsigned int xxsum0;
vp8_half_horiz_variance16x_h_sse2( vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_horiz_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0; *sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8)); return (xxsum0 - ((xsum0 * xsum0) >> 8));
} }
@ -616,21 +532,13 @@ unsigned int vp8_variance_halfpixvar16x16_v_wmt(
int dst_pixels_per_line, int dst_pixels_per_line,
unsigned int *sse) unsigned int *sse)
{ {
int xsum0, xsum1; int xsum0;
unsigned int xxsum0, xxsum1; unsigned int xxsum0;
vp8_half_vert_variance16x_h_sse2( vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0; *sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8)); return (xxsum0 - ((xsum0 * xsum0) >> 8));
} }
@ -643,21 +551,14 @@ unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
int dst_pixels_per_line, int dst_pixels_per_line,
unsigned int *sse) unsigned int *sse)
{ {
int xsum0, xsum1; int xsum0;
unsigned int xxsum0, xxsum1; unsigned int xxsum0;
vp8_half_horiz_vert_variance16x_h_sse2( vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
*sse = xxsum0; *sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8)); return (xxsum0 - ((xsum0 * xsum0) >> 8));
} }

Просмотреть файл

@ -76,8 +76,8 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3
unsigned int *sse unsigned int *sse
) )
{ {
int xsum0, xsum1; int xsum0;
unsigned int xxsum0, xxsum1; unsigned int xxsum0;
// note we could avoid these if statements if the calling function // note we could avoid these if statements if the calling function
// just called the appropriate functions inside. // just called the appropriate functions inside.
@ -87,14 +87,6 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_horiz_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
} }
else if (xoffset == 0 && yoffset == 4) else if (xoffset == 0 && yoffset == 4)
{ {
@ -102,14 +94,6 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
} }
else if (xoffset == 4 && yoffset == 4) else if (xoffset == 4 && yoffset == 4)
{ {
@ -117,24 +101,65 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0); &xsum0, &xxsum0);
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
&xsum1, &xxsum1);
xsum0 += xsum1;
xxsum0 += xxsum1;
} }
else else
{ {
vp8_filter_block2d_bil_var_ssse3( vp8_filter_block2d_bil_var_ssse3(
src_ptr, src_pixels_per_line, src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16, dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset, xoffset, yoffset,
&xsum0, &xxsum0); &xsum0, &xxsum0);
} }
*sse = xxsum0; *sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8)); return (xxsum0 - ((xsum0 * xsum0) >> 8));
} }
/*
 * Sub-pixel variance entry point that dispatches on (xoffset, yoffset).
 *
 * Three offset pairs are routed to dedicated helpers, all called with a
 * height argument of 8:
 *   (4, 4) -> vp8_half_horiz_vert_variance16x_h_sse2
 *   (4, 0) -> vp8_half_horiz_variance16x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance16x_h_sse2
 * Every other pair goes to vp8_filter_block2d_bil_var_ssse3, which also
 * receives the offsets themselves.
 *
 * The selected helper fills in a sum and a sum of squares. The sum of
 * squares is stored in *sse, and the return value is
 * sum_sq - ((sum * sum) >> 7).
 */
unsigned int vp8_sub_pixel_variance16x8_ssse3
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int sum;
    unsigned int sum_sq;

    /* The three special cases are mutually exclusive, so test order is free. */
    if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            &sum, &sum_sq);
    }
    else if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            &sum, &sum_sq);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            &sum, &sum_sq);
    }
    else
    {
        /* General case: the helper takes both offsets directly. */
        vp8_filter_block2d_bil_var_ssse3(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            xoffset, yoffset,
            &sum, &sum_sq);
    }

    *sse = sum_sq;
    return (sum_sq - ((sum * sum) >> 7));
}

Просмотреть файл

@ -286,6 +286,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
#if HAVE_SSSE3 #if HAVE_SSSE3
extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3); extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_ssse3);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3); extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
#if !CONFIG_RUNTIME_CPU_DETECT #if !CONFIG_RUNTIME_CPU_DETECT
@ -295,6 +296,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
#undef vp8_variance_sad16x8x3 #undef vp8_variance_sad16x8x3
#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3 #define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
#undef vp8_variance_subpixvar16x8
#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_ssse3
#undef vp8_variance_subpixvar16x16 #undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3

Просмотреть файл

@ -334,6 +334,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_ssse3;
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3; cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;

Просмотреть файл

@ -24,6 +24,7 @@ VP8_COMMON_SRCS-yes += common/entropymode.c
VP8_COMMON_SRCS-yes += common/entropymv.c VP8_COMMON_SRCS-yes += common/entropymv.c
VP8_COMMON_SRCS-yes += common/extend.c VP8_COMMON_SRCS-yes += common/extend.c
VP8_COMMON_SRCS-yes += common/filter.c VP8_COMMON_SRCS-yes += common/filter.c
VP8_COMMON_SRCS-yes += common/filter.h
VP8_COMMON_SRCS-yes += common/findnearmv.c VP8_COMMON_SRCS-yes += common/findnearmv.c
VP8_COMMON_SRCS-yes += common/generic/systemdependent.c VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
VP8_COMMON_SRCS-yes += common/idctllm.c VP8_COMMON_SRCS-yes += common/idctllm.c