Merge "Made AltRef filter adaptive & added motion compensation"

2010-09-28 08:34:44 -07:00 · 2010-09-28 08:34:44 -07:00 · 0090328164
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@ -545,31 +545,29 @@ void vp8_encode_frame(VP8_COMP *cpi)
    int segment_counts[MAX_MB_SEGMENTS];
    int totalrate;

-    if (cm->frame_type != KEY_FRAME)
+    // Set up prediction functions for all frame types so we can use MC in AltRef
+    if (cm->mcomp_filter_type == SIXTAP)
    {
-        if (cm->mcomp_filter_type == SIXTAP)
-        {
-            xd->subpixel_predict     = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap4x4);
-            xd->subpixel_predict8x4      = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap8x4);
-            xd->subpixel_predict8x8      = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap8x8);
-            xd->subpixel_predict16x16    = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap16x16);
-        }
-        else
-        {
-            xd->subpixel_predict     = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear4x4);
-            xd->subpixel_predict8x4      = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear8x4);
-            xd->subpixel_predict8x8      = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear8x8);
-            xd->subpixel_predict16x16    = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear16x16);
-        }
+        xd->subpixel_predict        = SUBPIX_INVOKE(
+                                        &cpi->common.rtcd.subpix, sixtap4x4);
+        xd->subpixel_predict8x4     = SUBPIX_INVOKE(
+                                        &cpi->common.rtcd.subpix, sixtap8x4);
+        xd->subpixel_predict8x8     = SUBPIX_INVOKE(
+                                        &cpi->common.rtcd.subpix, sixtap8x8);
+        xd->subpixel_predict16x16   = SUBPIX_INVOKE(
+                                        &cpi->common.rtcd.subpix, sixtap16x16);
+    }
+    else
+    {
+        xd->subpixel_predict        = SUBPIX_INVOKE(
+                                        &cpi->common.rtcd.subpix, bilinear4x4);
+        xd->subpixel_predict8x4     = SUBPIX_INVOKE(
+                                        &cpi->common.rtcd.subpix, bilinear8x4);
+        xd->subpixel_predict8x8     = SUBPIX_INVOKE(
+                                        &cpi->common.rtcd.subpix, bilinear8x8);
+        xd->subpixel_predict16x16   = SUBPIX_INVOKE(
+                                      &cpi->common.rtcd.subpix, bilinear16x16);
    }
-
-    //else  // Key Frame
-    //{
-    // For key frames make sure the intra ref frame probability value
-    // is set to "all intra"
-    //cpi->prob_intra_coded = 255;
-    //}
-

    x->gf_active_ptr = (signed char *)cpi->gf_active_flags;     // Point to base of GF active flags data structure

--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@ -258,7 +258,7 @@ void vp8_output_stats(struct vpx_codec_pkt_list *pktlist,
    vpx_codec_pkt_list_add(pktlist, &pkt);

 // TEMP debug code
-#ifdef OUTPUT_FPF
+#if OUTPUT_FPF
    {
        FILE *fpfile;
        fpfile = fopen("firstpass.stt", "a");
@ -369,50 +369,33 @@ void vp8_fpmm_reset_pos(VP8_COMP *cpi, int target_pos)

 void vp8_advance_fpmm(VP8_COMP *cpi, int count)
 {
-#ifdef FIRSTPASS_MM
+#if FIRSTPASS_MM
    fseek(cpi->fp_motion_mapfile, (int)(count * cpi->common.MBs), SEEK_CUR);
 #endif
 }

-void vp8_input_fpmm(VP8_COMP *cpi, int count)
+void vp8_input_fpmm(VP8_COMP *cpi)
 {
-#ifdef FIRSTPASS_MM
-
-    unsigned char *tmp_motion_map;
-    int i, j;
+#if FIRSTPASS_MM
+    int MBs = cpi->common.MBs;
+    int max_frames = cpi->active_arnr_frames;

    if (!cpi->fp_motion_mapfile)
        return;                 // Error

-    // Create the first pass motion map structure and set to 0
-    CHECK_MEM_ERROR(tmp_motion_map, vpx_calloc(cpi->common.MBs, 1));
-
-    // Reset the state of the global map
-    vpx_memset(cpi->fp_motion_map, 0, cpi->common.MBs);
-
-    // Read the specified number of frame maps and set the global map to the highest value seen for each mb.
-    for (i = 0; i < count; i++)
+    // Read the specified number of frame motion maps
+    if (fread(cpi->fp_motion_map, 1,
+              max_frames * MBs,
+              cpi->fp_motion_mapfile) != max_frames*MBs)
    {
-        if (fread(tmp_motion_map, 1, cpi->common.MBs, cpi->fp_motion_mapfile) == cpi->common.MBs)
-        {
-            for (j = 0; j < cpi->common.MBs; j++)
-            {
-                if (tmp_motion_map[j] > 1)
-                    cpi->fp_motion_map[j] += 5;   // Intra is flagged
-                else
-                    cpi->fp_motion_map[j] += tmp_motion_map[j];
-            }
-        }
-        else
-            break;  // Read error
-
+        // Read error
+        return;
    }

-    if (tmp_motion_map != 0)
-        vpx_free(tmp_motion_map);
+    // Flag the use of weights in the temporal filter
+    cpi->use_weighted_temporal_filter = 1;

 #endif
-
 }

 void vp8_init_first_pass(VP8_COMP *cpi)
@ -438,7 +421,7 @@ void vp8_end_first_pass(VP8_COMP *cpi)
 {
    vp8_output_stats(cpi->output_pkt_list, &cpi->total_stats);

-#ifdef FIRSTPASS_MM
+#if FIRSTPASS_MM

    if (cpi->fp_motion_mapfile)
        fclose(cpi->fp_motion_mapfile);
@ -603,6 +586,8 @@ void vp8_first_pass(VP8_COMP *cpi)
        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
        {
            int this_error;
+            int zero_error;
+            int zz_to_best_ratio;
            int gf_motion_error = INT_MAX;
            int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);

@ -624,7 +609,7 @@ void vp8_first_pass(VP8_COMP *cpi)
            intra_error += this_error;

            // Indicate default assumption of intra in the motion map
-            *fp_motion_map_ptr = 2;
+            *fp_motion_map_ptr = 0;

            // Set up limit values for motion vectors to prevent them extending outside the UMV borders
            x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
@ -646,6 +631,9 @@ void vp8_first_pass(VP8_COMP *cpi)
                d->bmi.mv.as_mv.row = 0;
                d->bmi.mv.as_mv.col = 0;

+                // Save (0,0) error for later use
+                zero_error = motion_error;
+
                // Test last reference frame using the previous best mv as the
                // starting point (best reference) for the search
                vp8_first_pass_motion_search(cpi, x, &best_ref_mv,
@ -719,8 +707,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                    {
                        mvcount++;

-                        *fp_motion_map_ptr = 1;
-
                        // Does the Row vector point inwards or outwards
                        if (mb_row < cm->mb_rows / 2)
                        {
@ -752,12 +738,30 @@ void vp8_first_pass(VP8_COMP *cpi)
                            else if (d->bmi.mv.as_mv.col < 0)
                                sum_in_vectors--;
                        }
+
+                        // Compute how close (0,0) predictor is to best
+                        // predictor in terms of their prediction error
+                        zz_to_best_ratio = (10*zero_error + this_error/2)
+                                            / (this_error+!this_error);
+
+                        if ((zero_error < 50000) &&
+                            (zz_to_best_ratio <= 11) )
+                            *fp_motion_map_ptr = 1;
+                        else
+                            *fp_motion_map_ptr = 0;
                    }
                    else
-                        *fp_motion_map_ptr = 0;    // 0,0 mv was best
+                    {
+                        // 0,0 mv was best
+                        if( zero_error<50000 )
+                            *fp_motion_map_ptr = 2;
+                        else
+                            *fp_motion_map_ptr = 1;
+                    }
                }
                else
                {
+                    // Intra was best
                    best_ref_mv.row = 0;
                    best_ref_mv.col = 0;
                }
@ -839,7 +843,7 @@ void vp8_first_pass(VP8_COMP *cpi)
        vp8_output_stats(cpi->output_pkt_list, &cpi->this_frame_stats);
        vp8_accumulate_stats(&cpi->total_stats, &fps);

-#ifdef FIRSTPASS_MM
+#if FIRSTPASS_MM
        fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, cpi->fp_motion_mapfile);
 #endif
    }
@ -1180,7 +1184,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)

    }

-#ifdef FIRSTPASS_MM
+#if FIRSTPASS_MM
    cpi->fp_motion_mapfile = 0;
    cpi->fp_motion_mapfile = fopen("fpmotionmap.stt", "rb");
 #endif
@ -1189,7 +1193,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)

 void vp8_end_second_pass(VP8_COMP *cpi)
 {
-#ifdef FIRSTPASS_MM
+#if FIRSTPASS_MM

    if (cpi->fp_motion_mapfile)
        fclose(cpi->fp_motion_mapfile);
@ -1230,7 +1234,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

    int max_bits = frame_max_bits(cpi);    // Max for a single frame

-#ifdef FIRSTPASS_MM
+#if FIRSTPASS_MM
    int fpmm_pos;
 #endif

@ -1239,7 +1243,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)

    vp8_clear_system_state();  //__asm emms;

-#ifdef FIRSTPASS_MM
+#if FIRSTPASS_MM
    fpmm_pos = vp8_fpmm_get_pos(cpi);
 #endif

@ -1452,6 +1456,11 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        // Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames.
        if (tmp_q < cpi->worst_quality)
        {
+            int half_gf_int;
+            int frames_after_arf;
+            int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+            int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+
            cpi->source_alt_ref_pending = TRUE;

            // For alt ref frames the error score for the end frame of the group (the alt ref frame) should not contribute to the group total and hence
@ -1462,20 +1471,63 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
            // The future frame itself is part of the next group
            cpi->baseline_gf_interval = i - 1;

-#ifdef FIRSTPASS_MM
-            // Read through the motion map to load up the entry for the ARF
-            {
-                int j;
+            // Define the arnr filter width for this group of frames:
+            // We only filter frames that lie within a distance of half
+            // the GF interval from the ARF frame. We also have to trap
+            // cases where the filter extends beyond the end of clip.
+            // Note: this_frame->frame has been updated in the loop
+            // so it now points at the ARF frame.
+            half_gf_int = cpi->baseline_gf_interval >> 1;
+            frames_after_arf = cpi->total_stats.count - this_frame->frame - 1;

-                // Advance to the region of interest
-                // Current default 2 frames before to 2 frames after the ARF frame itsef
+            switch (cpi->oxcf.arnr_type)
+            {
+            case 1: // Backward filter
+                frames_fwd = 0;
+                if (frames_bwd > half_gf_int)
+                    frames_bwd = half_gf_int;
+                break;
+
+            case 2: // Forward filter
+                if (frames_fwd > half_gf_int)
+                    frames_fwd = half_gf_int;
+                if (frames_fwd > frames_after_arf)
+                    frames_fwd = frames_after_arf;
+                frames_bwd = 0;
+                break;
+
+            case 3: // Centered filter
+            default:
+                frames_fwd >>= 1;
+                if (frames_fwd > frames_after_arf)
+                    frames_fwd = frames_after_arf;
+                if (frames_fwd > half_gf_int)
+                    frames_fwd = half_gf_int;
+
+                frames_bwd = frames_fwd;
+
+                // For even length filter there is one more frame backward
+                // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+                if (frames_bwd < half_gf_int)
+                    frames_bwd += (cpi->oxcf.arnr_max_frames+1) & 0x1;
+                break;
+            }
+
+            cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+
+#if FIRSTPASS_MM
+            {
+                // Advance to & read in the motion map for those frames
+                // to be considered for filtering based on the position
+                // of the ARF
                vp8_fpmm_reset_pos(cpi, cpi->fpmm_pos);

-                for (j = 0; j < cpi->baseline_gf_interval - 2; j++)
-                    vp8_advance_fpmm(cpi, 1);
+                // Position at the 'earliest' frame to be filtered
+                vp8_advance_fpmm(cpi,
+                    cpi->baseline_gf_interval - frames_bwd);

                // Read / create a motion map for the region of interest
-                vp8_input_fpmm(cpi, 5);
+                vp8_input_fpmm(cpi);
            }
 #endif
        }
@ -1713,7 +1765,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
        reset_fpf_position(cpi, start_pos);
    }

-#ifdef FIRSTPASS_MM
+#if FIRSTPASS_MM
    // Reset the First pass motion map file position
    vp8_fpmm_reset_pos(cpi, fpmm_pos);
 #endif
@ -1798,10 +1850,13 @@ void vp8_second_pass(VP8_COMP *cpi)
    if (EOF == vp8_input_stats(cpi, &this_frame))
        return;

-#ifdef FIRSTPASS_MM
-    vpx_memset(cpi->fp_motion_map, 0, cpi->common.MBs);
+#if FIRSTPASS_MM
+    vpx_memset(cpi->fp_motion_map, 0,
+                cpi->oxcf.arnr_max_frames*cpi->common.MBs);
    cpi->fpmm_pos = vp8_fpmm_get_pos(cpi);
-    vp8_advance_fpmm(cpi, 1);         // Read this frame's first pass motion map
+
+    // Step over this frame's first pass motion map
+    vp8_advance_fpmm(cpi, 1);
 #endif

    this_frame_error = this_frame.ssim_weighted_pred_err;
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@ -43,6 +43,9 @@
 #define RTCD(x) NULL
 #endif

+#define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
+#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
+
 extern void vp8cx_init_mv_bits_sadcost();
 extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
 extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
@ -1662,13 +1665,16 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 #endif

 #if VP8_TEMPORAL_ALT_REF
+
+    cpi->use_weighted_temporal_filter = 0;
+
    {
        int i;

        cpi->fixed_divide[0] = 0;

-        for (i = 1; i < 255; i++)
-            cpi->fixed_divide[i] = 0x10000 / i;
+        for (i = 1; i < 512; i++)
+            cpi->fixed_divide[i] = 0x80000 / i;
    }
 #endif
 }
@ -2042,7 +2048,8 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->active_map_enabled = 0;

    // Create the first pass motion map structure and set to 0
-    CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(cpi->common.MBs, 1));
+    // Allocate space for maximum of 15 buffers
+    CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(15*cpi->common.MBs, 1));

 #if 0
    // Experimental code for lagged and one pass
@ -3290,97 +3297,479 @@ static int modifier_lut[7][19] =
    {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 2, 1}
 };
 #endif
-static void vp8cx_temp_blur1_c
+static void build_predictors_mb
 (
-    VP8_COMP *cpi,
-    unsigned char **frames,
-    int frame_count,
-    unsigned char *src,
-    unsigned char *dst,
-    int width,
+    MACROBLOCKD *x,
+    unsigned char *y_mb_ptr,
+    unsigned char *u_mb_ptr,
+    unsigned char *v_mb_ptr,
    int stride,
-    int height,
-    int strength,
-    int *fixed_divide,
-    unsigned char *motion_map_ptr,
-    unsigned char block_size
+    int mv_row,
+    int mv_col,
+    unsigned char *pred
+)
+{
+    int offset;
+    unsigned char *yptr, *uptr, *vptr;
+
+    // Y
+    yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
+
+    if ((mv_row | mv_col) & 7)
+    {
+//        vp8_sixtap_predict16x16_c(yptr, stride,
+//                                    mv_col & 7, mv_row & 7, &pred[0], 16);
+        x->subpixel_predict16x16(yptr, stride,
+                                    mv_col & 7, mv_row & 7, &pred[0], 16);
+    }
+    else
+    {
+        //vp8_copy_mem16x16_c (yptr, stride, &pred[0], 16);
+        RECON_INVOKE(&x->rtcd->recon, copy16x16)(yptr, stride, &pred[0], 16);
+    }
+
+    // U & V
+    mv_row >>= 1;
+    mv_col >>= 1;
+    stride >>= 1;
+    offset = (mv_row >> 3) * stride + (mv_col >> 3);
+    uptr = u_mb_ptr + offset;
+    vptr = v_mb_ptr + offset;
+
+    if ((mv_row | mv_col) & 7)
+    {
+        x->subpixel_predict8x8(uptr, stride,
+                            mv_col & 7, mv_row & 7, &pred[256], 8);
+        x->subpixel_predict8x8(vptr, stride,
+                            mv_col & 7, mv_row & 7, &pred[320], 8);
+    }
+    else
+    {
+        RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, stride, &pred[256], 8);
+        RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, stride, &pred[320], 8);
+    }
+}
+static void apply_temporal_filter
+(
+    unsigned char *frame1,
+    unsigned int stride,
+    unsigned char *frame2,
+    unsigned int block_size,
+    int strength,
+    int filter_weight,
+    int *accumulator,
+    int *count
 )
 {
-    int byte = 0;         // Buffer offset for current pixel being filtered
-    int frame = 0;
-    int modifier = 0;
    int i, j, k;
-    int block_ofset;
-    int cols;
-    unsigned char Shift = (block_size == 16) ? 4 : 3;
+    int modifier;
+    int byte = 0;
+
 #if USE_FILTER_LUT
    int *lut = modifier_lut[strength];
 #endif

-    cols = cpi->common.mb_cols;
-
-    for (i = 0; i < height; i++)
+    for (i = 0,k = 0; i < block_size; i++)
    {
-        block_ofset = (i >> Shift) * cols;
-
-        for (j = 0; j < cols; j ++)
+        for (j = 0; j < block_size; j++, k++)
        {
-            if (motion_map_ptr[block_ofset] > 2)
-            {
-                vpx_memcpy(&dst[byte], &src[byte], block_size);
-                byte += block_size;
-            }
+
+            int src_byte = frame1[byte];
+            int pixel_value = *frame2++;
+
+#if USE_FILTER_LUT
+            // LUT implementation --
+            // improves precision of filter
+            modifier = abs(src_byte-pixel_value);
+            modifier = modifier>18 ? 0 : lut[modifier];
+#else
+            modifier   = src_byte;
+            modifier  -= pixel_value;
+            modifier  *= modifier;
+            modifier >>= strength;
+            modifier  *= 3;
+
+            if (modifier > 16)
+                modifier = 16;
+
+            modifier = 16 - modifier;
+#endif
+            modifier *= filter_weight;
+            
+            count[k] += modifier;
+            accumulator[k] += modifier * pixel_value;
+
+            byte++;
+        }
+
+        byte += stride - block_size;
+    }
+}
+
+#if ALT_REF_MC_ENABLED
+static int dummy_cost[2*mv_max+1];
+
+static int find_matching_mb
+(
+    VP8_COMP *cpi,
+    YV12_BUFFER_CONFIG *arf_frame,
+    YV12_BUFFER_CONFIG *frame_ptr,
+    int mb_offset,
+    int error_thresh
+)
+{
+    MACROBLOCK *x = &cpi->mb;
+    int thissme;
+    int step_param;
+    int further_steps;
+    int n = 0;
+    int sadpb = x->sadperbit16;
+    int bestsme = INT_MAX;
+    int num00 = 0;
+
+    BLOCK *b = &x->block[0];
+    BLOCKD *d = &x->e_mbd.block[0];
+    MV best_ref_mv1 = {0,0};
+
+    int *mvcost[2]    = { &dummy_cost[mv_max+1], &dummy_cost[mv_max+1] };
+    int *mvsadcost[2] = { &dummy_cost[mv_max+1], &dummy_cost[mv_max+1] };
+
+    // Save input state
+    unsigned char **base_src = b->base_src;
+    int src = b->src;
+    int src_stride = b->src_stride;
+    unsigned char **base_pre = d->base_pre;
+    int pre = d->pre;
+    int pre_stride = d->pre_stride;
+
+    // Setup frame pointers
+    b->base_src = &arf_frame->y_buffer;
+    b->src_stride = arf_frame->y_stride;
+    b->src = mb_offset;
+
+    d->base_pre = &frame_ptr->y_buffer;
+    d->pre_stride = frame_ptr->y_stride;
+    d->pre = mb_offset;
+
+    // Choose the first search step and number of further steps from cpi->Speed
+    if (cpi->Speed < 8)
+    {
+        step_param = cpi->sf.first_step +
+                    ((cpi->Speed > 5) ? 1 : 0);
+        further_steps =
+            (cpi->sf.max_step_search_steps - 1)-step_param;
+    }
+    else
+    {
+        step_param = cpi->sf.first_step + 2;
+        further_steps = 0;
+    }
+
+    if (1/*cpi->sf.search_method == HEX*/)
+    {
+        // TODO Check that the 16x16 vf & sdf are selected here
+        bestsme = vp8_hex_search(x, b, d,
+            &best_ref_mv1, &d->bmi.mv.as_mv,
+            step_param,
+            sadpb/*x->errorperbit*/,
+            &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf,
+            mvsadcost, mvcost);
+    }
+    else
+    {
+        int mv_x, mv_y;
+
+        bestsme = cpi->diamond_search_sad(x, b, d,
+            &best_ref_mv1, &d->bmi.mv.as_mv,
+            step_param,
+            sadpb / 2/*x->errorperbit*/,
+            &num00, &cpi->fn_ptr,
+            mvsadcost, mvcost); //sadpb < 9
+
+        // Further step/diamond searches as necessary
+        n = 0;
+        //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+        n = num00;
+        num00 = 0;
+
+        while (n < further_steps)
+        {
+            n++;
+
+            if (num00)
+                num00--;
            else
            {
-                for (k = 0; k < block_size; k++)
+                thissme = cpi->diamond_search_sad(x, b, d,
+                    &best_ref_mv1, &d->bmi.mv.as_mv,
+                    step_param + n,
+                    sadpb / 4/*x->errorperbit*/,
+                    &num00, &cpi->fn_ptr,
+                    mvsadcost, mvcost); //sadpb = 9
+
+                if (thissme < bestsme)
                {
-                    int accumulator = 0;
-                    int count = 0;
-                    int src_byte = src[byte];
+                    bestsme = thissme;
+                    mv_y = d->bmi.mv.as_mv.row;
+                    mv_x = d->bmi.mv.as_mv.col;
+                }
+                else
+                {
+                    d->bmi.mv.as_mv.row = mv_y;
+                    d->bmi.mv.as_mv.col = mv_x;
+                }
+            }
+        }
+    }

-                    for (frame = 0; frame < frame_count; frame++)
-                    {
-                        // get current frame pixel value
-                        int pixel_value = frames[frame][byte];
-#if USE_FILTER_LUT
-                        // LUT implementation --
-                        // improves precision of filter
-                        modifier = abs(src_byte-pixel_value);
-                        modifier = modifier>18 ? 0 : lut[modifier];
-#else
-                        modifier   = src_byte;
-                        modifier  -= pixel_value;
-                        modifier  *= modifier;
-                        modifier >>= strength;
-                        modifier  *= 3;
-
-                        if (modifier > 16)
-                            modifier = 16;
-
-                        modifier = 16 - modifier;
+#if ALT_REF_SUBPEL_ENABLED
+    // Try sub-pixel MC?
+    //if (bestsme > error_thresh && bestsme < INT_MAX)
+    {
+        bestsme = cpi->find_fractional_mv_step(x, b, d,
+                    &d->bmi.mv.as_mv, &best_ref_mv1,
+                    x->errorperbit, cpi->fn_ptr.svf,
+                    cpi->fn_ptr.vf, cpi->mb.mvcost);
+    }
 #endif
-                        accumulator += modifier * pixel_value;

-                        count += modifier;
+    // Restore input state
+    b->base_src = base_src;
+    b->src = src;
+    b->src_stride = src_stride;
+    d->base_pre = base_pre;
+    d->pre = pre;
+    d->pre_stride = pre_stride;
+
+    return bestsme;
+}
+#endif
+
+static void vp8cx_temp_blur1_c
+(
+    VP8_COMP *cpi,
+    int frame_count,
+    int alt_ref_index,
+    int strength
+)
+{
+    int byte;
+    int frame;
+    int mb_col, mb_row;
+    unsigned int filter_weight[MAX_LAG_BUFFERS];
+    unsigned char *mm_ptr = cpi->fp_motion_map;
+    int cols = cpi->common.mb_cols;
+    int rows = cpi->common.mb_rows;
+    int MBs  = cpi->common.MBs;
+    int mb_y_offset = 0;
+    int mb_uv_offset = 0;
+    unsigned int accumulator[384];
+    unsigned int count[384];
+    MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+    YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+    unsigned char *dst1, *dst2;
+    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
+    
+    // Save input state
+    unsigned char *y_buffer = mbd->pre.y_buffer;
+    unsigned char *u_buffer = mbd->pre.u_buffer;
+    unsigned char *v_buffer = mbd->pre.v_buffer;
+
+    if (!cpi->use_weighted_temporal_filter)
+    {
+        // Temporal filtering is unweighted
+        for (frame = 0; frame < frame_count; frame++)
+            filter_weight[frame] = 1;
+    }
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+#if ALT_REF_MC_ENABLED
+        // Reduced search extent by 3 for 6-tap filter & smaller UMV border
+        cpi->mb.mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 19));
+        cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+                                + (VP8BORDERINPIXELS - 19);
+#endif
+
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+            int i, j, k, w;
+            int weight_cap;
+            int stride;
+
+            vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
+            vpx_memset(count, 0, 384*sizeof(unsigned int));
+
+#if ALT_REF_MC_ENABLED
+            // Reduced search extent by 3 for 6-tap filter & smaller UMV border
+            cpi->mb.mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 19));
+            cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+                                    + (VP8BORDERINPIXELS - 19);
+#endif
+
+            // Read & process macroblock weights from motion map
+            if (cpi->use_weighted_temporal_filter)
+            {
+                weight_cap = 2;
+
+                for (frame = alt_ref_index-1; frame >= 0; frame--)
+                {
+                    w = *(mm_ptr + (frame+1)*MBs);
+                    filter_weight[frame] = w < weight_cap ? w : weight_cap;
+                    weight_cap = w;
+                }
+
+                filter_weight[alt_ref_index] = 2;
+
+                weight_cap = 2;
+
+                for (frame = alt_ref_index+1; frame < frame_count; frame++)
+                {
+                    w = *(mm_ptr + frame*MBs);
+                    filter_weight[frame] = w < weight_cap ? w : weight_cap;
+                    weight_cap = w;
+                }
+
+            }
+
+            for (frame = 0; frame < frame_count; frame++)
+            {
+                int err;
+
+                if (cpi->frames[frame] == NULL)
+                    continue;
+
+                mbd->block[0].bmi.mv.as_mv.row = 0;
+                mbd->block[0].bmi.mv.as_mv.col = 0;
+
+#if ALT_REF_MC_ENABLED
+                //if (filter_weight[frame] == 0)
+                {
+#define THRESH_LOW   10000
+#define THRESH_HIGH  20000
+
+                    // Correlation has been lost, so try MC
+                    err = find_matching_mb ( cpi,
+                                             cpi->frames[alt_ref_index],
+                                             cpi->frames[frame],
+                                             mb_y_offset,
+                                             THRESH_LOW );
+
+                    if (filter_weight[frame] < 2)
+                    {
+                        // Set weight depending on error
+                        filter_weight[frame] = err<THRESH_LOW
+                                                ? 2 : err<THRESH_HIGH ? 1 : 0;
                    }
+                }
+#endif
+                if (filter_weight[frame] != 0)
+                {
+                    // Construct the predictors
+                    build_predictors_mb (
+                              mbd,
+                              cpi->frames[frame]->y_buffer + mb_y_offset,
+                              cpi->frames[frame]->u_buffer + mb_uv_offset,
+                              cpi->frames[frame]->v_buffer + mb_uv_offset,
+                              cpi->frames[frame]->y_stride,
+                              mbd->block[0].bmi.mv.as_mv.row,
+                              mbd->block[0].bmi.mv.as_mv.col,
+                              predictor );

-                    accumulator += (count >> 1);
-                    accumulator *= fixed_divide[count];
-                    accumulator >>= 16;
+                    // Apply the filter (YUV)
+                    apply_temporal_filter ( f->y_buffer + mb_y_offset,
+                                            f->y_stride,
+                                            predictor,
+                                            16,
+                                            strength,
+                                            filter_weight[frame],
+                                            accumulator,
+                                            count );

-                    dst[byte] = accumulator;
+                    apply_temporal_filter ( f->u_buffer + mb_uv_offset,
+                                            f->uv_stride,
+                                            predictor + 256,
+                                            8,
+                                            strength,
+                                            filter_weight[frame],
+                                            accumulator + 256,
+                                            count + 256 );
+
+                    apply_temporal_filter ( f->v_buffer + mb_uv_offset,
+                                            f->uv_stride,
+                                            predictor + 320,
+                                            8,
+                                            strength,
+                                            filter_weight[frame],
+                                            accumulator + 320,
+                                            count + 320 );
+                }
+            }
+
+            // Normalize filter output to produce AltRef frame
+            dst1 = cpi->alt_ref_buffer.source_buffer.y_buffer;
+            stride = cpi->alt_ref_buffer.source_buffer.y_stride;
+            byte = mb_y_offset;
+            for (i = 0,k = 0; i < 16; i++)
+            {
+                for (j = 0; j < 16; j++, k++)
+                {
+                    unsigned int pval = accumulator[k] + (count[k] >> 1);
+                    pval *= cpi->fixed_divide[count[k]];
+                    pval >>= 19;
+
+                    dst1[byte] = (unsigned char)pval;

                    // move to next pixel
                    byte++;
                }
+
+                byte += stride - 16;
            }

-            block_ofset++;
+            dst1 = cpi->alt_ref_buffer.source_buffer.u_buffer;
+            dst2 = cpi->alt_ref_buffer.source_buffer.v_buffer;
+            stride = cpi->alt_ref_buffer.source_buffer.uv_stride;
+            byte = mb_uv_offset;
+            for (i = 0,k = 256; i < 8; i++)
+            {
+                for (j = 0; j < 8; j++, k++)
+                {
+                    int m=k+64;
+
+                    // U
+                    unsigned int pval = accumulator[k] + (count[k] >> 1);
+                    pval *= cpi->fixed_divide[count[k]];
+                    pval >>= 19;
+                    dst1[byte] = (unsigned char)pval;
+
+                    // V
+                    pval = accumulator[m] + (count[m] >> 1);
+                    pval *= cpi->fixed_divide[count[m]];
+                    pval >>= 19;
+                    dst2[byte] = (unsigned char)pval;
+
+                    // move to next pixel
+                    byte++;
+                }
+
+                byte += stride - 8;
+            }
+
+            mm_ptr++;
+            mb_y_offset += 16;
+            mb_uv_offset += 8;
        }

-        // Step byte on over the UMV border to the start of the next line
-        byte += stride - width;
+        mb_y_offset += 16*f->y_stride-f->y_width;
+        mb_uv_offset += 8*f->uv_stride-f->uv_width;
    }
+
+    // Restore input state
+    mbd->pre.y_buffer = y_buffer;
+    mbd->pre.u_buffer = u_buffer;
+    mbd->pre.v_buffer = v_buffer;
 }

 static void vp8cx_temp_filter_c
@ -3388,11 +3777,7 @@ static void vp8cx_temp_filter_c
    VP8_COMP *cpi
 )
 {
-    YV12_BUFFER_CONFIG *temp_source_buffer;
-    int *fixed_divide = cpi->fixed_divide;
-
    int frame = 0;
-    int max_frames = 11;

    int num_frames_backward = 0;
    int num_frames_forward = 0;
@ -3400,15 +3785,13 @@ static void vp8cx_temp_filter_c
    int frames_to_blur_forward = 0;
    int frames_to_blur = 0;
    int start_frame = 0;
+    unsigned int filtered = 0;

    int strength = cpi->oxcf.arnr_strength;

    int blur_type = cpi->oxcf.arnr_type;

-    int new_max_frames = cpi->oxcf.arnr_max_frames;
-
-    if (new_max_frames > 0)
-        max_frames = new_max_frames;
+    int max_frames = cpi->active_arnr_frames;

    num_frames_backward = cpi->last_alt_ref_sei - cpi->source_encode_index;

@ -3455,8 +3838,9 @@ static void vp8cx_temp_filter_c
        if (frames_to_blur_backward > frames_to_blur_forward)
            frames_to_blur_backward = frames_to_blur_forward;

-        if (frames_to_blur_forward > (max_frames / 2))
-            frames_to_blur_forward = (max_frames / 2);
+        // When max_frames is even, we have 1 more frame backward than forward
+        if (frames_to_blur_forward > (max_frames - 1) / 2)
+            frames_to_blur_forward = ((max_frames - 1) / 2);

        if (frames_to_blur_backward > (max_frames / 2))
            frames_to_blur_backward = (max_frames / 2);
@ -3488,7 +3872,8 @@ static void vp8cx_temp_filter_c
        break;
    }

-    start_frame = (cpi->last_alt_ref_sei + frames_to_blur_forward) % cpi->oxcf.lag_in_frames;
+    start_frame = (cpi->last_alt_ref_sei
+                    + frames_to_blur_forward) % cpi->oxcf.lag_in_frames;

 #ifdef DEBUGFWG
    // DEBUG FWG
@ -3504,6 +3889,8 @@ static void vp8cx_temp_filter_c
           , start_frame);
 #endif

+    // Set up frame pointers; NULL indicates a frame not included in the filter
+    vpx_memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
    for (frame = 0; frame < frames_to_blur; frame++)
    {
        int which_buffer =  start_frame - frame;
@ -3511,80 +3898,26 @@ static void vp8cx_temp_filter_c
        if (which_buffer < 0)
            which_buffer += cpi->oxcf.lag_in_frames;

-        cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.y_buffer;
+        cpi->frames[frames_to_blur-1-frame]
+                = &cpi->src_buffer[which_buffer].source_buffer;
    }

-    temp_source_buffer = &cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer;
-
-    // Blur Y
-    vp8cx_temp_blur1_c(
+    vp8cx_temp_blur1_c (
        cpi,
-        cpi->frames,
        frames_to_blur,
-        temp_source_buffer->y_buffer,  // cpi->Source->y_buffer,
-        cpi->alt_ref_buffer.source_buffer.y_buffer,  // cpi->Source->y_buffer,
-        temp_source_buffer->y_width,
-        temp_source_buffer->y_stride,
-        temp_source_buffer->y_height,
-        //temp_source_buffer->y_height * temp_source_buffer->y_stride,
-        strength,
-        fixed_divide,
-        cpi->fp_motion_map, 16);
-
-    for (frame = 0; frame < frames_to_blur; frame++)
-    {
-        int which_buffer =  start_frame - frame;
-
-        if (which_buffer < 0)
-            which_buffer += cpi->oxcf.lag_in_frames;
-
-        cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.u_buffer;
-    }
-
-    // Blur U
-    vp8cx_temp_blur1_c(
-        cpi,
-        cpi->frames,
-        frames_to_blur,
-        temp_source_buffer->u_buffer,
-        cpi->alt_ref_buffer.source_buffer.u_buffer,  // cpi->Source->u_buffer,
-        temp_source_buffer->uv_width,
-        temp_source_buffer->uv_stride,
-        temp_source_buffer->uv_height,
-        //temp_source_buffer->uv_height * temp_source_buffer->uv_stride,
-        strength,
-        fixed_divide,
-        cpi->fp_motion_map, 8);
-
-    for (frame = 0; frame < frames_to_blur; frame++)
-    {
-        int which_buffer =  start_frame - frame;
-
-        if (which_buffer < 0)
-            which_buffer += cpi->oxcf.lag_in_frames;
-
-        cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.v_buffer;
-    }
-
-    // Blur V
-    vp8cx_temp_blur1_c(
-        cpi,
-        cpi->frames,
-        frames_to_blur,
-        temp_source_buffer->v_buffer,
-        cpi->alt_ref_buffer.source_buffer.v_buffer,  // cpi->Source->v_buffer,
-        temp_source_buffer->uv_width,
-        temp_source_buffer->uv_stride,
-        //temp_source_buffer->uv_height * temp_source_buffer->uv_stride,
-        temp_source_buffer->uv_height,
-        strength,
-        fixed_divide,
-        cpi->fp_motion_map, 8);
+        frames_to_blur_backward,
+        strength );
 }
 #endif


-static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
+static void encode_frame_to_data_rate
+(
+    VP8_COMP *cpi,
+    unsigned long *size,
+    unsigned char *dest,
+    unsigned int *frame_flags
+)
 {
    int Q;
    int frame_over_shoot_limit;
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@ -378,6 +378,7 @@ typedef struct
    int max_gf_interval;
    int baseline_gf_interval;
    int gf_decay_rate;
+    int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames

    INT64 key_frame_count;
    INT64 tot_key_frame_bits;
@ -616,9 +617,11 @@ typedef struct
 #endif
 #if VP8_TEMPORAL_ALT_REF
    SOURCE_SAMPLE alt_ref_buffer;
-    unsigned char *frames[MAX_LAG_BUFFERS];
-    int fixed_divide[255];
+    YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+    int fixed_divide[512];
 #endif
+    // Flag to indicate temporal filter method
+    int use_weighted_temporal_filter;

 #if CONFIG_PSNR
    int    count;