From 84f7f209852e00ade24a08aecb989fa76feda9a8 Mon Sep 17 00:00:00 2001 From: Mikhal Shemer Date: Tue, 1 Mar 2011 15:07:47 -0800 Subject: [PATCH 1/6] Configuration updates:Making a clear distinction between Init and Change Change-Id: I7b2fb326e1aabc08b032177a7b914a5b8bb7376f --- vp8/encoder/onyx_if.c | 285 +++++------------------------------------- 1 file changed, 30 insertions(+), 255 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 39610a73f..797e18b30 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1517,252 +1517,29 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->oxcf = *oxcf; - switch (cpi->oxcf.Mode) - { - - case MODE_REALTIME: - cpi->pass = 0; - cpi->compressor_speed = 2; - - if (cpi->oxcf.cpu_used < -16) - { - cpi->oxcf.cpu_used = -16; - } - - if (cpi->oxcf.cpu_used > 16) - cpi->oxcf.cpu_used = 16; - - break; - -#if !(CONFIG_REALTIME_ONLY) - case MODE_GOODQUALITY: - cpi->pass = 0; - cpi->compressor_speed = 1; - - if (cpi->oxcf.cpu_used < -5) - { - cpi->oxcf.cpu_used = -5; - } - - if (cpi->oxcf.cpu_used > 5) - cpi->oxcf.cpu_used = 5; - - break; - - case MODE_BESTQUALITY: - cpi->pass = 0; - cpi->compressor_speed = 0; - break; - - case MODE_FIRSTPASS: - cpi->pass = 1; - cpi->compressor_speed = 1; - break; - case MODE_SECONDPASS: - cpi->pass = 2; - cpi->compressor_speed = 1; - - if (cpi->oxcf.cpu_used < -5) - { - cpi->oxcf.cpu_used = -5; - } - - if (cpi->oxcf.cpu_used > 5) - cpi->oxcf.cpu_used = 5; - - break; - case MODE_SECONDPASS_BEST: - cpi->pass = 2; - cpi->compressor_speed = 0; - break; -#endif - } - - if (cpi->pass == 0) - cpi->auto_worst_q = 1; - - cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q]; - cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q]; - cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level]; - - if (oxcf->fixed_q >= 0) - { - if (oxcf->worst_allowed_q < 0) - cpi->oxcf.fixed_q = q_trans[0]; - else - cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q]; - - if (oxcf->alt_q < 
0) - cpi->oxcf.alt_q = q_trans[0]; - else - cpi->oxcf.alt_q = q_trans[oxcf->alt_q]; - - if (oxcf->key_q < 0) - cpi->oxcf.key_q = q_trans[0]; - else - cpi->oxcf.key_q = q_trans[oxcf->key_q]; - - if (oxcf->gold_q < 0) - cpi->oxcf.gold_q = q_trans[0]; - else - cpi->oxcf.gold_q = q_trans[oxcf->gold_q]; - - } - - cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; - cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG; - - //cpi->use_golden_frame_only = 0; - //cpi->use_last_frame_only = 0; - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 1; - cm->refresh_entropy_probs = 1; - - if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3) - cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions; - - setup_features(cpi); - - { - int i; - - for (i = 0; i < MAX_MB_SEGMENTS; i++) - cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; - } - - // At the moment the first order values may not be > MAXQ - if (cpi->oxcf.fixed_q > MAXQ) - cpi->oxcf.fixed_q = MAXQ; - - // local file playback mode == really big buffer - if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) - { - cpi->oxcf.starting_buffer_level = 60000; - cpi->oxcf.optimal_buffer_level = 60000; - cpi->oxcf.maximum_buffer_size = 240000; - - } - - // Convert target bandwidth from Kbit/s to Bit/s cpi->oxcf.target_bandwidth *= 1000; cpi->oxcf.starting_buffer_level = rescale(cpi->oxcf.starting_buffer_level, cpi->oxcf.target_bandwidth, 1000); - if (cpi->oxcf.optimal_buffer_level == 0) - cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; - else - cpi->oxcf.optimal_buffer_level = - rescale(cpi->oxcf.optimal_buffer_level, - cpi->oxcf.target_bandwidth, 1000); - - if (cpi->oxcf.maximum_buffer_size == 0) - cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; - else - cpi->oxcf.maximum_buffer_size = - rescale(cpi->oxcf.maximum_buffer_size, - cpi->oxcf.target_bandwidth, 1000); - - cpi->buffer_level = 
cpi->oxcf.starting_buffer_level; + cpi->buffer_level = cpi->oxcf.starting_buffer_level; cpi->bits_off_target = cpi->oxcf.starting_buffer_level; - vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); - cpi->worst_quality = cpi->oxcf.worst_allowed_q; cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; - cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; - cpi->best_quality = cpi->oxcf.best_allowed_q; cpi->active_best_quality = cpi->oxcf.best_allowed_q; - cpi->cq_target_quality = cpi->oxcf.cq_level; - - cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE; + cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; cpi->rolling_target_bits = cpi->av_per_frame_bandwidth; cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth; - cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; - cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth; + cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; + cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth; cpi->total_actual_bits = 0; - cpi->total_target_vs_actual = 0; + cpi->total_target_vs_actual = 0; - // Only allow dropped frames in buffered mode - cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; - - cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; - - if (!cm->use_bilinear_mc_filter) - cm->mcomp_filter_type = SIXTAP; - else - cm->mcomp_filter_type = BILINEAR; - - cpi->target_bandwidth = cpi->oxcf.target_bandwidth; - - cm->Width = cpi->oxcf.Width ; - cm->Height = cpi->oxcf.Height ; - - cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8 - - cm->horiz_scale = cpi->horiz_scale; - cm->vert_scale = cpi->vert_scale ; - - // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) - if (cpi->oxcf.Sharpness > 7) - cpi->oxcf.Sharpness = 7; - - cm->sharpness_level = cpi->oxcf.Sharpness; - - if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) - { - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int 
UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); - - Scale2Ratio(cm->horiz_scale, &hr, &hs); - Scale2Ratio(cm->vert_scale, &vr, &vs); - - // always go to the next whole number - cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs; - cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs; - } - - if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width || - ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height || - cm->yv12_fb[cm->lst_fb_idx].y_width == 0) - { - alloc_raw_frame_buffers(cpi); - vp8_alloc_compressor_data(cpi); - } - - // Clamp KF frame size to quarter of data rate - if (cpi->intra_frame_target > cpi->target_bandwidth >> 2) - cpi->intra_frame_target = cpi->target_bandwidth >> 2; - - if (cpi->oxcf.fixed_q >= 0) - { - cpi->last_q[0] = cpi->oxcf.fixed_q; - cpi->last_q[1] = cpi->oxcf.fixed_q; - } - - cpi->Speed = cpi->oxcf.cpu_used; - - // force to allowlag to 0 if lag_in_frames is 0; - if (cpi->oxcf.lag_in_frames == 0) - { - cpi->oxcf.allow_lag = 0; - } - // Limit on lag buffers as these are not currently dynamically allocated - else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) - cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS; - - // YX Temp - cpi->last_alt_ref_sei = -1; - cpi->is_src_frame_alt_ref = 0; - cpi->is_next_src_alt_ref = 0; - -#if 0 - // Experimental RD Code - cpi->frame_distortion = 0; - cpi->last_frame_distortion = 0; -#endif + // change includes all joint functionality + vp8_change_config(ptr, oxcf); #if VP8_TEMPORAL_ALT_REF @@ -1779,12 +1556,6 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) #endif } -/* - * This function needs more clean up, i.e. be more tuned torwards - * change_config rather than init_config !!!!!!!!!!!!!!!! 
- * YX - 5/28/2009 - * - */ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) { @@ -1935,10 +1706,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) // Convert target bandwidth from Kbit/s to Bit/s cpi->oxcf.target_bandwidth *= 1000; - cpi->oxcf.starting_buffer_level = - rescale(cpi->oxcf.starting_buffer_level, - cpi->oxcf.target_bandwidth, 1000); - if (cpi->oxcf.optimal_buffer_level == 0) cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; else @@ -1953,29 +1720,36 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) rescale(cpi->oxcf.maximum_buffer_size, cpi->oxcf.target_bandwidth, 1000); - cpi->buffer_level = cpi->oxcf.starting_buffer_level; - cpi->bits_off_target = cpi->oxcf.starting_buffer_level; - vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); cpi->worst_quality = cpi->oxcf.worst_allowed_q; - cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; - cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; cpi->best_quality = cpi->oxcf.best_allowed_q; - cpi->active_best_quality = cpi->oxcf.best_allowed_q; + + // active values should only be modified if out of new range + if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) + { + cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; + } + // less likely + else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) + { + cpi->active_worst_quality = cpi->oxcf.best_allowed_q; + } + if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) + { + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + } + // less likely + else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) + { + cpi->active_best_quality = cpi->oxcf.worst_allowed_q; + } + cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? 
TRUE : FALSE; cpi->cq_target_quality = cpi->oxcf.cq_level; - cpi->rolling_target_bits = cpi->av_per_frame_bandwidth; - cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth; - cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; - cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth; - - cpi->total_actual_bits = 0; - cpi->total_target_vs_actual = 0; - // Only allow dropped frames in buffered mode - cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; + cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; @@ -3542,6 +3316,7 @@ static void encode_frame_to_data_rate int drop_mark50 = drop_mark / 4; int drop_mark25 = drop_mark / 8; + // Clear down mmx registers to allow floating point in what follows vp8_clear_system_state(); From 419f638910245f5501fcad4eede1efcab0bd22ee Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Tue, 8 Mar 2011 16:25:06 -0500 Subject: [PATCH 2/6] Improve SSE2 half-pixel filter functions Rewrote these functions to process 16 pixels at once instead of 8. 
Change-Id: Ic67e80124467a446a3df4cfecfb76a4248602adb --- vp8/encoder/x86/variance_impl_sse2.asm | 359 +++++++++++++++++++++++-- vp8/encoder/x86/variance_sse2.c | 116 ++++---- vp8/encoder/x86/variance_ssse3.c | 34 +-- 3 files changed, 393 insertions(+), 116 deletions(-) diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index 5d1a17d44..c2c30deb2 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -790,7 +790,7 @@ filter_block2d_bil_variance: ret -;void vp8_half_horiz_vert_variance16x_h_sse2 +;void vp8_half_horiz_vert_variance8x_h_sse2 ;( ; unsigned char *ref_ptr, ; int ref_pixels_per_line, @@ -800,8 +800,8 @@ filter_block2d_bil_variance: ; int *sum, ; unsigned int *sumsquared ;) -global sym(vp8_half_horiz_vert_variance16x_h_sse2) -sym(vp8_half_horiz_vert_variance16x_h_sse2): +global sym(vp8_half_horiz_vert_variance8x_h_sse2) +sym(vp8_half_horiz_vert_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -835,7 +835,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2): add rsi, r8 %endif -vp8_half_horiz_vert_variance16x_h_1: +vp8_half_horiz_vert_variance8x_h_1: movq xmm1, QWORD PTR [rsi] ; movq xmm2, QWORD PTR [rsi+1] ; @@ -863,7 +863,7 @@ vp8_half_horiz_vert_variance16x_h_1: %endif sub rcx, 1 ; - jnz vp8_half_horiz_vert_variance16x_h_1 ; + jnz vp8_half_horiz_vert_variance8x_h_1 ; movdq2q mm6, xmm6 ; movdq2q mm7, xmm7 ; @@ -910,8 +910,7 @@ vp8_half_horiz_vert_variance16x_h_1: pop rbp ret - -;void vp8_half_vert_variance16x_h_sse2 +;void vp8_half_horiz_vert_variance16x_h_sse2 ;( ; unsigned char *ref_ptr, ; int ref_pixels_per_line, @@ -921,8 +920,124 @@ vp8_half_horiz_vert_variance16x_h_1: ; int *sum, ; unsigned int *sumsquared ;) -global sym(vp8_half_vert_variance16x_h_sse2) -sym(vp8_half_vert_variance16x_h_sse2): +global sym(vp8_half_horiz_vert_variance16x_h_sse2) +sym(vp8_half_horiz_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 
+ GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + pxor xmm0, xmm0 ; + + movdqu xmm5, XMMWORD PTR [rsi] + movdqu xmm3, XMMWORD PTR [rsi+1] + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 + + lea rsi, [rsi + rax] + +vp8_half_horiz_vert_variance16x_h_1: + movdqu xmm1, XMMWORD PTR [rsi] ; + movdqu xmm2, XMMWORD PTR [rsi+1] ; + pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 + + pavgb xmm5, xmm1 ; xmm = vertical average of the above + + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm4, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + psubw xmm5, xmm3 ; xmm5 -= xmm3 + + movq xmm3, QWORD PTR [rdi+8] + punpcklbw xmm3, xmm0 + psubw xmm4, xmm3 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm1 ; save xmm1 for use on the next row + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz vp8_half_horiz_vert_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi 
+ RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_half_vert_variance8x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_vert_variance8x_h_sse2) +sym(vp8_half_vert_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -945,7 +1060,7 @@ sym(vp8_half_vert_variance16x_h_sse2): movsxd rax, dword ptr arg(1) ;ref_pixels_per_line pxor xmm0, xmm0 ; -vp8_half_vert_variance16x_h_1: +vp8_half_vert_variance8x_h_1: movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9 @@ -969,7 +1084,7 @@ vp8_half_vert_variance16x_h_1: %endif sub rcx, 1 ; - jnz vp8_half_vert_variance16x_h_1 ; + jnz vp8_half_vert_variance8x_h_1 ; movdq2q mm6, xmm6 ; movdq2q mm7, xmm7 ; @@ -1016,8 +1131,7 @@ vp8_half_vert_variance16x_h_1: pop rbp ret - -;void vp8_half_horiz_variance16x_h_sse2 +;void vp8_half_vert_variance16x_h_sse2 ;( ; unsigned char *ref_ptr, ; int ref_pixels_per_line, @@ -1027,8 +1141,116 @@ vp8_half_vert_variance16x_h_1: ; int *sum, ; unsigned int *sumsquared ;) -global sym(vp8_half_horiz_variance16x_h_sse2) -sym(vp8_half_horiz_variance16x_h_sse2): +global sym(vp8_half_vert_variance16x_h_sse2) +sym(vp8_half_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr + + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + movdqu xmm5, XMMWORD PTR [rsi] + lea rsi, [rsi + rax ] + pxor xmm0, xmm0 + +vp8_half_vert_variance16x_h_1: + movdqu xmm3, XMMWORD PTR [rsi] + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 + 
punpckhbw xmm4, xmm0 + + movq xmm2, QWORD PTR [rdi] + punpcklbw xmm2, xmm0 + psubw xmm5, xmm2 + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + psubw xmm4, xmm2 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm3 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 + jnz vp8_half_vert_variance16x_h_1 + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_half_horiz_variance8x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_variance8x_h_sse2) +sym(vp8_half_horiz_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -1050,7 +1272,7 @@ sym(vp8_half_horiz_variance16x_h_sse2): movsxd rcx, dword ptr arg(4) ;Height ; pxor xmm0, xmm0 ; -vp8_half_horiz_variance16x16_1: +vp8_half_horiz_variance8x_h_1: movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 @@ -1073,7 +1295,7 @@ vp8_half_horiz_variance16x16_1: add rdi, r9 %endif sub rcx, 1 ; - jnz vp8_half_horiz_variance16x16_1 ; + jnz vp8_half_horiz_variance8x_h_1 ; movdq2q mm6, xmm6 ; movdq2q mm7, xmm7 ; @@ -1120,6 +1342,109 
@@ vp8_half_horiz_variance16x16_1: pop rbp ret +;void vp8_half_horiz_variance16x_h_sse2 +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vp8_half_horiz_variance16x_h_sse2) +sym(vp8_half_horiz_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + movsxd rdx, dword ptr arg(3) ;src_pixels_per_line + + pxor xmm0, xmm0 ; + +vp8_half_horiz_variance16x_h_1: + movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 + movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm1, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm1, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + psubw xmm1, xmm2 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm1 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm1, xmm1 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz vp8_half_horiz_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd 
xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret SECTION_RODATA ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c index 7cf6a6308..4612a6711 100644 --- a/vp8/encoder/x86/variance_sse2.c +++ b/vp8/encoder/x86/variance_sse2.c @@ -81,6 +81,16 @@ void vp8_filter_block2d_bil_var_sse2 int *sum, unsigned int *sumsquared ); +void vp8_half_horiz_vert_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); void vp8_half_horiz_vert_variance16x_h_sse2 ( const unsigned char *ref_ptr, @@ -91,6 +101,16 @@ void vp8_half_horiz_vert_variance16x_h_sse2 int *sum, unsigned int *sumsquared ); +void vp8_half_horiz_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); void vp8_half_horiz_variance16x_h_sse2 ( const unsigned char *ref_ptr, @@ -101,6 +121,16 @@ void vp8_half_horiz_variance16x_h_sse2 int *sum, unsigned int *sumsquared ); +void vp8_half_vert_variance8x_h_sse2 +( + const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + int *sum, + unsigned int *sumsquared +); void vp8_half_vert_variance16x_h_sse2 ( const unsigned char *ref_ptr, @@ -262,21 +292,21 @@ unsigned int vp8_sub_pixel_variance8x8_wmt if (xoffset == 4 && yoffset == 0) { - vp8_half_horiz_variance16x_h_sse2( + vp8_half_horiz_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum, &xxsum); } else if (xoffset == 0 && yoffset == 4) { - 
vp8_half_vert_variance16x_h_sse2( + vp8_half_vert_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum, &xxsum); } else if (xoffset == 4 && yoffset == 4) { - vp8_half_horiz_vert_variance16x_h_sse2( + vp8_half_horiz_vert_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum, &xxsum); @@ -317,11 +347,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - - vp8_half_horiz_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); } else if (xoffset == 0 && yoffset == 4) { @@ -329,11 +354,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - - vp8_half_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); } else if (xoffset == 4 && yoffset == 4) { @@ -341,11 +361,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); } else { @@ -356,17 +371,16 @@ unsigned int vp8_sub_pixel_variance16x16_wmt &xsum0, &xxsum0 ); - vp8_filter_block2d_bil_var_sse2( src_ptr + 8, src_pixels_per_line, dst_ptr + 8, dst_pixels_per_line, 16, xoffset, yoffset, &xsum1, &xxsum1 ); + xsum0 += xsum1; + xxsum0 += xxsum1; } - xsum0 += xsum1; - xxsum0 += xxsum1; *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 8)); } @@ -406,11 +420,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum0, &xxsum0); - - vp8_half_horiz_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - &xsum1, &xxsum1); } else if (xoffset == 0 && yoffset == 4) { @@ 
-418,11 +427,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum0, &xxsum0); - - vp8_half_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - &xsum1, &xxsum1); } else if (xoffset == 4 && yoffset == 4) { @@ -430,11 +434,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 8, &xsum0, &xxsum0); - - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - &xsum1, &xxsum1); } else { @@ -449,11 +448,10 @@ unsigned int vp8_sub_pixel_variance16x8_wmt dst_ptr + 8, dst_pixels_per_line, 8, xoffset, yoffset, &xsum1, &xxsum1); + xsum0 += xsum1; + xxsum0 += xxsum1; } - xsum0 += xsum1; - xxsum0 += xxsum1; - *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 7)); } @@ -474,21 +472,21 @@ unsigned int vp8_sub_pixel_variance8x16_wmt if (xoffset == 4 && yoffset == 0) { - vp8_half_horiz_variance16x_h_sse2( + vp8_half_horiz_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum, &xxsum); } else if (xoffset == 0 && yoffset == 4) { - vp8_half_vert_variance16x_h_sse2( + vp8_half_vert_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum, &xxsum); } else if (xoffset == 4 && yoffset == 4) { - vp8_half_horiz_vert_variance16x_h_sse2( + vp8_half_horiz_vert_variance8x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum, &xxsum); @@ -589,21 +587,14 @@ unsigned int vp8_variance_halfpixvar16x16_h_wmt( int dst_pixels_per_line, unsigned int *sse) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; + int xsum0; + unsigned int xxsum0; vp8_half_horiz_variance16x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - vp8_half_horiz_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); 
- - xsum0 += xsum1; - xxsum0 += xxsum1; *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 8)); } @@ -616,21 +607,13 @@ unsigned int vp8_variance_halfpixvar16x16_v_wmt( int dst_pixels_per_line, unsigned int *sse) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - + int xsum0; + unsigned int xxsum0; vp8_half_vert_variance16x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - vp8_half_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); - - xsum0 += xsum1; - xxsum0 += xxsum1; *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 8)); } @@ -643,21 +626,14 @@ unsigned int vp8_variance_halfpixvar16x16_hv_wmt( int dst_pixels_per_line, unsigned int *sse) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; + int xsum0; + unsigned int xxsum0; vp8_half_horiz_vert_variance16x_h_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); - - xsum0 += xsum1; - xxsum0 += xxsum1; *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 8)); } diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c index 750ae8b86..d50ae3ade 100644 --- a/vp8/encoder/x86/variance_ssse3.c +++ b/vp8/encoder/x86/variance_ssse3.c @@ -87,14 +87,6 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3 src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - - vp8_half_horiz_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); - - xsum0 += xsum1; - xxsum0 += xxsum1; } else if (xoffset == 0 && yoffset == 4) { @@ -102,14 +94,6 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3 src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - - vp8_half_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - 
dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); - - xsum0 += xsum1; - xxsum0 += xxsum1; } else if (xoffset == 4 && yoffset == 4) { @@ -117,22 +101,14 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3 src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, &xsum0, &xxsum0); - - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - &xsum1, &xxsum1); - - xsum0 += xsum1; - xxsum0 += xxsum1; } else { - vp8_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0); + vp8_filter_block2d_bil_var_ssse3( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum0, &xxsum0); } *sse = xxsum0; From fb037ec05b2bf138b5e7c4f4ab8ac75de41187d6 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 8 Mar 2011 17:41:45 -0500 Subject: [PATCH 3/6] fix obj_int_extract for MinGW failed to find headers in the source directory output to stdout instead of a hardcoded file MinGW doesn't support _sopen_s _fstat catches nonexistent files Change-Id: I24e0aacc6f6f26e6bcfc25f9ee7821aa3c8cc7e7 --- build/make/Makefile | 2 +- build/make/obj_int_extract.c | 28 +++++++--------------------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/build/make/Makefile b/build/make/Makefile index 40fa6d50c..62d139ea4 100755 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -153,7 +153,7 @@ endif # obj_int_extract: build/make/obj_int_extract.c $(if $(quiet),echo " [HOSTCC] $@") - $(qexec)$(HOSTCC) -I. -o $@ $< + $(qexec)$(HOSTCC) -I. 
-I$(SRC_PATH_BARE) -o $@ $< CLEAN-OBJS += obj_int_extract # diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c index 26cf45782..22c5cf2ab 100644 --- a/build/make/obj_int_extract.c +++ b/build/make/obj_int_extract.c @@ -14,7 +14,7 @@ #include "vpx_config.h" -#if defined(_MSC_VER) +#if defined(_MSC_VER) || defined(__MINGW32__) #include #include #include "vpx/vpx_integer.h" @@ -816,7 +816,7 @@ bail: #endif -#if defined(_MSC_VER) +#if defined(_MSC_VER) || defined(__MINGW32__) /* See "Microsoft Portable Executable and Common Object File Format Specification" for reference. */ @@ -830,7 +830,6 @@ int parse_coff(unsigned __int8 *buf, size_t sz) unsigned int i; unsigned __int8 *ptr; unsigned __int32 symoffset; - FILE *fp; char **sectionlist; //this array holds all section names in their correct order. //it is used to check if the symbol is in .bss or .data section. @@ -871,14 +870,6 @@ int parse_coff(unsigned __int8 *buf, size_t sz) //log_msg("COFF: Symbol table at offset %u\n", symtab_ptr); //log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr); - fp = fopen("assembly_offsets.asm", "w"); - - if (fp == NULL) - { - perror("open file"); - goto bail; - } - /* The compiler puts the data with non-zero offset in .data section, but puts the data with zero offset in .bss section. So, if the data in in .bss section, set offset=0. 
Note from Wiki: In an object module compiled from C, the bss section contains @@ -912,13 +903,13 @@ int parse_coff(unsigned __int8 *buf, size_t sz) char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; strncpy(name, ptr, 8); //log_msg("COFF: Parsing symbol %s\n",name); - fprintf(fp, "%-40s EQU ", name); + printf("%-40s EQU ", name + 1); } else { //log_msg("COFF: Parsing symbol %s\n", // buf + strtab_ptr + get_le32(ptr+4)); - fprintf(fp, "%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4)); + printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4) + 1); } if (!(strcmp(sectionlist[section-1], ".bss"))) @@ -935,14 +926,13 @@ int parse_coff(unsigned __int8 *buf, size_t sz) //log_msg(" Address: %u\n",get_le32(ptr+8)); //log_msg(" Offset: %u\n", symoffset); - fprintf(fp, "%5d\n", symoffset); + printf("%5d\n", symoffset); } ptr += 18; } - fprintf(fp, " END\n"); - fclose(fp); + printf(" END\n"); for (i = 0; i < nsections; i++) { @@ -992,11 +982,7 @@ int main(int argc, char **argv) else f = argv[1]; - if (_sopen_s(&fd, f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE)) - { - perror("Unable to open file"); - goto bail; - } + fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE); if (_fstat(fd, &stat_buf)) { From 4561109a69338336d7ab1875fd2fd62f80392b14 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Wed, 9 Mar 2011 10:45:03 -0500 Subject: [PATCH 4/6] Remove unused functions Removed some unused functions Change-Id: Ifdfc27453e53cfc75997b38492901d193a16b245 --- vp8/encoder/x86/variance_mmx.c | 140 -------------------------------- vp8/encoder/x86/variance_sse2.c | 75 ----------------- 2 files changed, 215 deletions(-) diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c index 6eed98e07..07358c0c7 100644 --- a/vp8/encoder/x86/variance_mmx.c +++ b/vp8/encoder/x86/variance_mmx.c @@ -456,146 +456,6 @@ unsigned int vp8_sub_pixel_variance8x16_mmx return (xxsum - ((xsum * xsum) >> 7)); } -unsigned int vp8_i_variance16x16_mmx( - const unsigned char *src_ptr, - int 
source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - *sse = var; - return (var - ((avg * avg) >> 8)); - -} - -unsigned int vp8_i_variance8x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - - *sse = var; - return (var - ((avg * avg) >> 7)); - -} - -unsigned int vp8_i_sub_pixel_variance16x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - int f2soffset = (src_pixels_per_line >> 1); - int f2doffset = (dst_pixels_per_line >> 1); - - - vp8_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - - vp8_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - 
vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - vp8_filter_block2d_bil_var_mmx( - src_ptr + f2soffset, src_pixels_per_line, - dst_ptr + f2doffset, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - vp8_filter_block2d_bil_var_mmx( - src_ptr + f2soffset + 8, src_pixels_per_line, - dst_ptr + f2doffset + 8, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - *sse = xxsum0; - return (xxsum0 - ((xsum0 * xsum0) >> 8)); -} - - -unsigned int vp8_i_sub_pixel_variance8x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - int f2soffset = (src_pixels_per_line >> 1); - int f2doffset = (dst_pixels_per_line >> 1); - - - vp8_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - - vp8_filter_block2d_bil_var_mmx( - src_ptr + f2soffset, src_pixels_per_line, - dst_ptr + f2doffset, dst_pixels_per_line, 8, - vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - *sse = xxsum0; - return (xxsum0 - ((xsum0 * xsum0) >> 7)); -} - unsigned int vp8_variance_halfpixvar16x16_h_mmx( const unsigned char *src_ptr, diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c index 4612a6711..0edda3062 100644 --- a/vp8/encoder/x86/variance_sse2.c +++ b/vp8/encoder/x86/variance_sse2.c @@ -504,81 +504,6 @@ unsigned int 
vp8_sub_pixel_variance8x16_wmt return (xxsum - ((xsum * xsum) >> 7)); } -unsigned int vp8_i_variance16x16_wmt( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ; - vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - - *sse = var; - return (var - ((avg * avg) >> 8)); - -} - -unsigned int vp8_i_variance8x16_wmt( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - - *sse = var; - return (var - ((avg * avg) >> 7)); - -} - - -unsigned int vp8_i_sub_pixel_variance16x16_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); -} - - -unsigned int vp8_i_sub_pixel_variance8x16_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int 
dst_pixels_per_line, - unsigned int *sse -) -{ - - return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse); -} - unsigned int vp8_variance_halfpixvar16x16_h_wmt( const unsigned char *src_ptr, From 7b8e7f0f3ae13ebf29200324b0c4d7fe64780a58 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Wed, 9 Mar 2011 11:16:30 -0500 Subject: [PATCH 5/6] Add vp8_sub_pixel_variance16x8_ssse3 function Added SSSE3 function Change-Id: I8c304c92458618d93fda3a2f62bd09ccb63e75ad --- vp8/encoder/x86/variance_ssse3.c | 53 +++++++++++++++++++++++++- vp8/encoder/x86/variance_x86.h | 4 ++ vp8/encoder/x86/x86_csystemdependent.c | 1 + 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c index d50ae3ade..eb5d486bf 100644 --- a/vp8/encoder/x86/variance_ssse3.c +++ b/vp8/encoder/x86/variance_ssse3.c @@ -76,8 +76,8 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3 unsigned int *sse ) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; + int xsum0; + unsigned int xxsum0; // note we could avoid these if statements if the calling function // just called the appropriate functions inside. 
@@ -114,3 +114,52 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3 *sse = xxsum0; return (xxsum0 - ((xsum0 * xsum0) >> 8)); } + +unsigned int vp8_sub_pixel_variance16x8_ssse3 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse + +) +{ + int xsum0; + unsigned int xxsum0; + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + } + else + { + vp8_filter_block2d_bil_var_ssse3( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum0, &xxsum0); + } + + *sse = xxsum0; + return (xxsum0 - ((xsum0 * xsum0) >> 7)); +} diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h index 1e2fb3490..3560f7413 100644 --- a/vp8/encoder/x86/variance_x86.h +++ b/vp8/encoder/x86/variance_x86.h @@ -286,6 +286,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3); #if HAVE_SSSE3 extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3); extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_ssse3); extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3); #if !CONFIG_RUNTIME_CPU_DETECT @@ -295,6 +296,9 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3); #undef vp8_variance_sad16x8x3 #define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3 +#undef vp8_variance_subpixvar16x8 +#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_ssse3 + #undef 
vp8_variance_subpixvar16x16 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3 diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index c7639a7e4..3158ac12b 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -334,6 +334,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_ssse3; cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3; cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; From 5c240715042057b326953afca140324533ada3e8 Mon Sep 17 00:00:00 2001 From: John Koleszar Date: Wed, 9 Mar 2011 13:43:31 -0500 Subject: [PATCH 6/6] Add missing filter.h to build system Missing file causes 'make dist' to not include a complete copy of the source. Change-Id: I3f55aeb5a86d0e81234e4e4588cb8086ba4cfc4a --- vp8/vp8_common.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 4daadee32..ba9caa7ce 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -24,6 +24,7 @@ VP8_COMMON_SRCS-yes += common/entropymode.c VP8_COMMON_SRCS-yes += common/entropymv.c VP8_COMMON_SRCS-yes += common/extend.c VP8_COMMON_SRCS-yes += common/filter.c +VP8_COMMON_SRCS-yes += common/filter.h VP8_COMMON_SRCS-yes += common/findnearmv.c VP8_COMMON_SRCS-yes += common/generic/systemdependent.c VP8_COMMON_SRCS-yes += common/idctllm.c