diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm
index 09d7338d9..a86ed5d0a 100644
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -15,19 +15,19 @@
     AREA    |.text|, CODE, READONLY  ; name this block of code
 
 ;-------------------------------------
-; r0    unsigned char *src_ptr,
-; r1    unsigned short *output_ptr,
-; r2    unsigned int src_pixels_per_line,
-; r3    unsigned int output_height,
-; stack    unsigned int output_width,
-; stack    const short *vp8_filter
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
 ;-------------------------------------
 ; The output is transposed stroed in output array to make it easy for second pass filtering.
 |vp8_filter_block2d_bil_first_pass_armv6| PROC
     stmdb   sp!, {r4 - r11, lr}
 
     ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width
 
     mov     r12, r3                         ; outer-loop counter
     sub     r2, r2, r4                      ; src increment for height loop
@@ -38,10 +38,10 @@
 
     ldr     r5, [r11]                       ; load up filter coefficients
 
-    mov     r3, r3, lsl #1                  ; output_height*2
+    mov     r3, r3, lsl #1                  ; height*2
     add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
 
-    mov     r11, r1                         ; save output_ptr for each row
+    mov     r11, r1                         ; save dst_ptr for each row
 
     cmp     r5, #128                        ; if filter coef = 128, then skip the filter
     beq     bil_null_1st_filter
@@ -140,17 +140,17 @@
 
 ;---------------------------------
 ; r0    unsigned short *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    int output_pitch,
-; r3    unsigned int  output_height,
-; stack unsigned int  output_width,
-; stack const short *vp8_filter
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
 ;---------------------------------
 |vp8_filter_block2d_bil_second_pass_armv6| PROC
     stmdb   sp!, {r4 - r11, lr}
 
     ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width
 
     ldr     r5, [r11]                       ; load up filter coefficients
     mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c
index 65afb41a1..961d142c9 100644
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -10,128 +10,48 @@
 
 
 #include <math.h>
+#include "filter.h"
 #include "subpixel.h"
 
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-static const short bilinear_filters[8][2] =
-{
-    { 128,   0 },
-    { 112,  16 },
-    {  96,  32 },
-    {  80,  48 },
-    {  64,  64 },
-    {  48,  80 },
-    {  32,  96 },
-    {  16, 112 }
-};
-
-
 extern void vp8_filter_block2d_bil_first_pass_armv6
 (
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
+    unsigned char  *src_ptr,
+    unsigned short *dst_ptr,
+    unsigned int    src_pitch,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 );
 
 extern void vp8_filter_block2d_bil_second_pass_armv6
 (
     unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
+    unsigned char  *dst_ptr,
+    int             dst_pitch,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 );
 
-#if 0
-void vp8_filter_block2d_bil_first_pass_6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int i, j;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply bilinear filter */
-            output_ptr[j] = ( ( (int)src_ptr[0]          * vp8_filter[0]) +
-                               ((int)src_ptr[1] * vp8_filter[1]) +
-                                (VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-void vp8_filter_block2d_bil_second_pass_6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int  i,j;
-    int  Temp;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply filter */
-            Temp =  ((int)src_ptr[0]         * vp8_filter[0]) +
-                    ((int)src_ptr[output_width] * vp8_filter[1]) +
-                    (VP8_FILTER_WEIGHT/2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
-            src_ptr++;
-        }
-
-        /* Next row... */
-        /*src_ptr    += src_pixels_per_line - output_width;*/
-        output_ptr += output_pitch;
-    }
-}
-#endif
-
 void vp8_filter_block2d_bil_armv6
 (
     unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
     unsigned int   dst_pitch,
-    const short      *HFilter,
-    const short      *VFilter,
+    const short   *HFilter,
+    const short   *VFilter,
     int            Width,
     int            Height
 )
 {
-
-    unsigned short FData[36*16]; /* Temp data bufffer used in filtering */
+    unsigned short FData[36*16]; /* Temp data buffer used in filtering */
 
     /* First filter 1-D horizontally... */
-    /* pixel_step = 1; */
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
 
     /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }
 
 
@@ -148,8 +68,8 @@ void vp8_bilinear_predict4x4_armv6
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
 }
@@ -167,8 +87,8 @@ void vp8_bilinear_predict8x8_armv6
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
 }
@@ -186,8 +106,8 @@ void vp8_bilinear_predict8x4_armv6
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
 }
@@ -205,8 +125,8 @@ void vp8_bilinear_predict16x16_armv6
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c
index b4f2fe6ca..2612fc18a 100644
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -11,26 +11,10 @@
 
 #include "vpx_ports/config.h"
 #include <math.h>
+#include "filter.h"
 #include "subpixel.h"
 #include "vpx_ports/mem.h"
 
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
-{
-    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
-    { 0, -6,  123,   12,  -1,  0 },
-    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
-    { 0, -9,   93,   50,  -6,  0 },
-    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
-    { 0, -6,   50,   93,  -9,  0 },
-    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
-    { 0, -1,   12,  123,  -6,  0 },
-};
-
-
 extern void vp8_filter_block2d_first_pass_armv6
 (
     unsigned char *src_ptr,
@@ -93,11 +77,11 @@ void vp8_sixtap_predict_armv6
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
 
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     /* Vfilter is null. First pass only */
     if (xoffset && !yoffset)
@@ -129,47 +113,6 @@ void vp8_sixtap_predict_armv6
     }
 }
 
-#if 0
-void vp8_sixtap_predict8x4_armv6
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const short  *HFilter;
-    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
-
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
-
-
-    /*if (xoffset && !yoffset)
-    {
-        vp8_filter_block2d_first_pass_only_armv6 (  src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
-    }*/
-    /* Hfilter is null. Second pass only */
-    /*else if (!xoffset && yoffset)
-    {
-        vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
-    }
-    else
-    {
-        if (yoffset & 0x1)
-            vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
-        else*/
-
-        vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
-
-        vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
-    /*}*/
-}
-#endif
-
 void vp8_sixtap_predict8x8_armv6
 (
     unsigned char  *src_ptr,
@@ -182,10 +125,10 @@ void vp8_sixtap_predict8x8_armv6
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     if (xoffset && !yoffset)
     {
@@ -224,10 +167,10 @@ void vp8_sixtap_predict16x16_armv6
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data buffer used in filtering */
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     if (xoffset && !yoffset)
     {
diff --git a/vp8/common/filter_c.c b/vp8/common/filter.c
similarity index 66%
rename from vp8/common/filter_c.c
rename to vp8/common/filter.c
index 399a847d5..6e364a94d 100644
--- a/vp8/common/filter_c.c
+++ b/vp8/common/filter.c
@@ -10,13 +10,10 @@
 
 
 #include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"
 
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-
-static const int bilinear_filters[8][2] =
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
 {
     { 128,   0 },
     { 112,  16 },
@@ -28,8 +25,7 @@ static const int bilinear_filters[8][2] =
     {  16, 112 }
 };
 
-
-static const short sub_pel_filters[8][6] =
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
 {
 
     { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
@@ -40,9 +36,6 @@ static const short sub_pel_filters[8][6] =
     { 0, -6,   50,   93,  -9,  0 },
     { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
     { 0, -1,   12,  123,  -6,  0 },
-
-
-
 };
 
 void vp8_filter_block2d_first_pass
@@ -146,7 +139,7 @@ void vp8_filter_block2d
     const short  *VFilter
 )
 {
-    int FData[9*4]; /* Temp data bufffer used in filtering */
+    int FData[9*4]; /* Temp data buffer used in filtering */
 
     /* First filter 1-D horizontally... */
     vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
@@ -195,8 +188,8 @@ void vp8_sixtap_predict_c
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
 }
@@ -212,10 +205,10 @@ void vp8_sixtap_predict8x8_c
 {
     const short  *HFilter;
     const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
+    int FData[13*16];   /* Temp data buffer used in filtering */
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     /* First filter 1-D horizontally... */
     vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
@@ -238,10 +231,10 @@ void vp8_sixtap_predict8x4_c
 {
     const short  *HFilter;
     const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
+    int FData[13*16];   /* Temp data buffer used in filtering */
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     /* First filter 1-D horizontally... */
     vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
@@ -264,11 +257,11 @@ void vp8_sixtap_predict16x16_c
 {
     const short  *HFilter;
     const short  *VFilter;
-    int FData[21*24];   /* Temp data bufffer used in filtering */
+    int FData[21*24];   /* Temp data buffer used in filtering */
 
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     /* First filter 1-D horizontally... */
     vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
@@ -283,57 +276,50 @@ void vp8_sixtap_predict16x16_c
  *
  *  ROUTINE       : filter_block2d_bil_first_pass
  *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT32  *vp8_filter : Array of 2 bi-linear filter taps.
  *
- *  OUTPUTS       : INT32 *output_ptr        : Pointer to filtered block.
+ *  OUTPUTS       : INT32  *dst_ptr    : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement first-pass of 2-D separable filter.
  *
  *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
  *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
  *
  ****************************************************************************/
 void vp8_filter_block2d_bil_first_pass
 (
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const int *vp8_filter
+    unsigned char  *src_ptr,
+    unsigned short *dst_ptr,
+    unsigned int    src_stride,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 )
 {
     unsigned int i, j;
 
-    for (i = 0; i < output_height; i++)
+    for (i = 0; i < height; i++)
     {
-        for (j = 0; j < output_width; j++)
+        for (j = 0; j < width; j++)
         {
             /* Apply bilinear filter */
-            output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +
-                             ((int)src_ptr[pixel_step] * vp8_filter[1]) +
-                             (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+            dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+                          ((int)src_ptr[1] * vp8_filter[1]) +
+                          (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
             src_ptr++;
         }
 
         /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
+        src_ptr += src_stride - width;
+        dst_ptr += width;
     }
 }
 
@@ -341,60 +327,51 @@ void vp8_filter_block2d_bil_first_pass
  *
  *  ROUTINE       : filter_block2d_bil_second_pass
  *
- *  INPUTS        : INT32  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *  INPUTS        : INT32  *src_ptr    : Pointer to source block.
+ *                  UINT32  dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT32  *vp8_filter : Array of 2 bi-linear filter taps.
  *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement second-pass of 2-D separable filter.
  *
  *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
  *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
  *
  ****************************************************************************/
 void vp8_filter_block2d_bil_second_pass
 (
     unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  src_pixels_per_line,
-    unsigned int  pixel_step,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const int *vp8_filter
+    unsigned char  *dst_ptr,
+    int             dst_pitch,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 )
 {
     unsigned int  i, j;
     int  Temp;
 
-    for (i = 0; i < output_height; i++)
+    for (i = 0; i < height; i++)
     {
-        for (j = 0; j < output_width; j++)
+        for (j = 0; j < width; j++)
         {
             /* Apply filter */
-            Temp = ((int)src_ptr[0]         * vp8_filter[0]) +
-                   ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+            Temp = ((int)src_ptr[0]     * vp8_filter[0]) +
+                   ((int)src_ptr[width] * vp8_filter[1]) +
                    (VP8_FILTER_WEIGHT / 2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+            dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
             src_ptr++;
         }
 
         /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_pitch;
+        dst_ptr += dst_pitch;
     }
 }
 
@@ -404,11 +381,14 @@ void vp8_filter_block2d_bil_second_pass
  *  ROUTINE       : filter_block2d_bil
  *
  *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  INT32  *HFilter         : Array of 2 horizontal filter taps.
- *                  INT32  *VFilter         : Array of 2 vertical filter taps.
+ *                  UINT32  src_pitch        : Stride of source block.
+ *                  UINT32  dst_pitch        : Stride of destination block.
+ *                  INT32  *HFilter          : Array of 2 horizontal filter taps.
+ *                  INT32  *VFilter          : Array of 2 vertical filter taps.
+ *                  INT32  Width             : Block width
+ *                  INT32  Height            : Block height
  *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : UINT16 *dst_ptr       : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
@@ -422,23 +402,23 @@ void vp8_filter_block2d_bil_second_pass
 void vp8_filter_block2d_bil
 (
     unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
     unsigned int   dst_pitch,
-    const int      *HFilter,
-    const int      *VFilter,
+    const short   *HFilter,
+    const short   *VFilter,
     int            Width,
     int            Height
 )
 {
 
-    unsigned short FData[17*16];    /* Temp data bufffer used in filtering */
+    unsigned short FData[17*16];    /* Temp data buffer used in filtering */
 
     /* First filter 1-D horizontally... */
-    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter);
+    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
 
     /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter);
+    vp8_filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }
 
 
@@ -452,11 +432,11 @@ void vp8_bilinear_predict4x4_c
     int dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 #if 0
     {
         int i;
@@ -490,11 +470,11 @@ void vp8_bilinear_predict8x8_c
     int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
 
@@ -510,11 +490,11 @@ void vp8_bilinear_predict8x4_c
     int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
 
@@ -530,11 +510,11 @@ void vp8_bilinear_predict16x16_c
     int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
diff --git a/vp8/common/filter.h b/vp8/common/filter.h
new file mode 100644
index 000000000..0f225c25a
--- /dev/null
+++ b/vp8/common/filter.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef FILTER_H
+#define FILTER_H
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT  7
+
+extern const short vp8_bilinear_filters[8][2];
+extern const short vp8_sub_pel_filters[8][6];
+
+#endif //FILTER_H
diff --git a/vp8/common/threading.h b/vp8/common/threading.h
index 0dd7bbc4c..bfd4916c6 100644
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -14,7 +14,7 @@
 
 #define VPXINFINITE 10000       /* 10second. */
 
-#if CONFIG_OS_SUPPORT
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
 
 /* Thread management macros */
 #ifdef _WIN32
@@ -90,8 +90,6 @@
 #define x86_pause_hint()
 #endif
 
-#else /* CONFIG_OS_SUPPORT = 0 */
-#define THREAD_FUNCTION void *
-#endif /* CONFIG_OS_SUPPORT */
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
 
 #endif
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 9bd6bc514..c97fa7f5f 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -484,9 +484,11 @@ static void setup_token_decoder(VP8D_COMP *pbi,
         bool_decoder++;
     }
 
+#if CONFIG_MULTITHREAD
     /* Clamp number of decoder threads */
     if (pbi->decoding_thread_count > num_part - 1)
         pbi->decoding_thread_count = num_part - 1;
+#endif
 }
 
 
@@ -849,7 +851,9 @@ int vp8_decode_frame(VP8D_COMP *pbi)
 #endif
 
     /* set up frame new frame for intra coded blocks */
+#if CONFIG_MULTITHREAD
     if (!(pbi->b_multithreaded_rd) || pc->multi_token_partition == ONE_PARTITION || !(pc->filter_level))
+#endif
         vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
 
     vp8_setup_block_dptrs(xd);
@@ -869,6 +873,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
 
     vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));
 
+#if CONFIG_MULTITHREAD
     if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
     {
         vp8mt_decode_mb_rows(pbi, xd);
@@ -883,6 +888,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
         vp8_yv12_extend_frame_borders_ptr(&pc->yv12_fb[pc->new_fb_idx]);    /*cm->frame_to_show);*/
     }
     else
+#endif
     {
         int ibc = 0;
         int num_part = 1 << pc->multi_token_partition;
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index ad0bf376e..f0ee8e157 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -113,8 +113,10 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
     pbi->ready_for_new_data = 1;
 
     pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/
+#if CONFIG_MULTITHREAD
     pbi->max_threads = oxcf->max_threads;
     vp8_decoder_create_threads(pbi);
+#endif
 
     /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
      *  unnecessary calling of vp8cx_init_de_quantizer() for every frame.
@@ -152,8 +154,8 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr)
 #if CONFIG_MULTITHREAD
     if (pbi->b_multithreaded_rd)
         vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
-#endif
     vp8_decoder_remove_threads(pbi);
+#endif
     vp8_remove_common(&pbi->common);
     vpx_free(pbi);
 }
@@ -410,6 +412,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
         return retcode;
     }
 
+#if CONFIG_MULTITHREAD
     if (pbi->b_multithreaded_rd && cm->multi_token_partition != ONE_PARTITION)
     {
         if (swap_frame_buffers (cm))
@@ -427,6 +430,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
             return -1;
         }
     } else
+#endif
     {
         if (swap_frame_buffers (cm))
         {
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index f83c528c4..47b4fe0a5 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -87,13 +87,14 @@ typedef struct VP8Decompressor
     unsigned int time_decoding;
     unsigned int time_loop_filtering;
 
+#if CONFIG_MULTITHREAD
+    /* variable for threading */
+
     volatile int b_multithreaded_rd;
     int max_threads;
     int current_mb_col_main;
     int decoding_thread_count;
     int allocated_decoding_thread_count;
-    /* variable for threading */
-#if CONFIG_MULTITHREAD
     int mt_baseline_filter_level[MAX_MB_SEGMENTS];
     int sync_range;
     int *mt_current_mb_col;                  /* Each row remembers its already decoded column. */
diff --git a/vp8/decoder/reconintra_mt.c b/vp8/decoder/reconintra_mt.c
index ad4324b27..854aba35a 100644
--- a/vp8/decoder/reconintra_mt.c
+++ b/vp8/decoder/reconintra_mt.c
@@ -21,7 +21,6 @@
 
 void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */
     unsigned char *yleft_col;
     unsigned char yleft_buf[16];
@@ -146,17 +145,10 @@ void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row
     case MB_MODE_COUNT:
         break;
     }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
 
 void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */
     unsigned char *yleft_col;
     unsigned char yleft_buf[16];
@@ -289,17 +281,10 @@ void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_r
     case MB_MODE_COUNT:
         break;
     }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
 
 void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     unsigned char *uabove_row;   /* = x->dst.u_buffer - x->dst.uv_stride; */
     unsigned char *uleft_col;    /*[16];*/
     unsigned char uleft_buf[8];
@@ -452,17 +437,10 @@ void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_ro
     case MB_MODE_COUNT:
         break;
     }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
 
 void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     unsigned char *uabove_row;  /* = x->dst.u_buffer - x->dst.uv_stride; */
     unsigned char *uleft_col;   /*[16];*/
     unsigned char uleft_buf[8];
@@ -621,12 +599,6 @@ void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_
     case MB_MODE_COUNT:
         break;
     }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
 
 
@@ -638,7 +610,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
                           int mb_col,
                           int num)
 {
-#if CONFIG_MULTITHREAD
     int i, r, c;
 
     unsigned char *Above;   /* = *(x->base_dst) + x->dst - x->dst_stride; */
@@ -935,15 +906,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
 
 
     }
-#else
-    (void) pbi;
-    (void) xd;
-    (void) b_mode;
-    (void) predictor;
-    (void) mb_row;
-    (void) mb_col;
-    (void) num;
-#endif
 }
 
 /* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
@@ -951,7 +913,6 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
  */
 void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     unsigned char *above_right;   /* = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; */
     unsigned int *src_ptr;
     unsigned int *dst_ptr0;
@@ -973,10 +934,4 @@ void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row
     *dst_ptr0 = *src_ptr;
     *dst_ptr1 = *src_ptr;
     *dst_ptr2 = *src_ptr;
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index ec2cb2b07..271249a8d 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -38,7 +38,6 @@ extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
 
 void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
-#if CONFIG_MULTITHREAD
     VP8_COMMON *const pc = & pbi->common;
     int i, j;
 
@@ -88,18 +87,11 @@ void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC
 
     for (i=0; i< pc->mb_rows; i++)
         pbi->mt_current_mb_col[i]=-1;
-#else
-    (void) pbi;
-    (void) xd;
-    (void) mbrd;
-    (void) count;
-#endif
 }
 
 
 void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     int eobtotal = 0;
     int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
     VP8_COMMON *pc = &pbi->common;
@@ -222,18 +214,11 @@ void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb
                     (xd->qcoeff+16*16, xd->block[16].dequant,
                      xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
                      xd->dst.uv_stride, xd->eobs+16);
-#else
-    (void) pbi;
-    (void) xd;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
 
 
 THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
 {
-#if CONFIG_MULTITHREAD
     int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
     VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
     MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
@@ -438,9 +423,6 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
             sem_post(&pbi->h_event_end_decoding);
         }
     }
-#else
-    (void) p_data;
-#endif
 
     return 0 ;
 }
@@ -448,7 +430,6 @@ THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
 
 void vp8_decoder_create_threads(VP8D_COMP *pbi)
 {
-#if CONFIG_MULTITHREAD
     int core_count = 0;
     int ithread;
 
@@ -482,16 +463,11 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
 
         pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
     }
-
-#else
-    (void) pbi;
-#endif
 }
 
 
 void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
 {
-#if CONFIG_MULTITHREAD
     VP8_COMMON *const pc = & pbi->common;
     int i;
 
@@ -589,15 +565,11 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
             pbi->mt_vleft_col = NULL ;
         }
     }
-#else
-    (void) pbi;
-#endif
 }
 
 
 void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
 {
-#if CONFIG_MULTITHREAD
     VP8_COMMON *const pc = & pbi->common;
     int i;
     int uv_width;
@@ -646,17 +618,11 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
         for (i=0; i< pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
     }
-#else
-    (void) pbi;
-    (void) width;
-#endif
 }
 
 
 void vp8_decoder_remove_threads(VP8D_COMP *pbi)
 {
-#if CONFIG_MULTITHREAD
-
     /* shutdown MB Decoding thread; */
     if (pbi->b_multithreaded_rd)
     {
@@ -702,15 +668,11 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
             pbi->de_thread_data = NULL;
         }
     }
-#else
-    (void) pbi;
-#endif
 }
 
 
 void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
 {
-#if CONFIG_MULTITHREAD
     VP8_COMMON *cm  = &pbi->common;
     MACROBLOCKD *mbd = &pbi->mb;
     /*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/  /*frame_to_show;*/
@@ -752,16 +714,11 @@ void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
         vp8_init_loop_filter(cm);
     else if (frame_type != cm->last_frame_type)
         vp8_frame_init_loop_filter(lfi, frame_type);
-#else
-    (void) pbi;
-    (void) default_filt_lvl;
-#endif
 }
 
 
 void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
-#if CONFIG_MULTITHREAD
     int mb_row;
     VP8_COMMON *pc = &pbi->common;
 
@@ -981,8 +938,4 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
     }
 
     sem_wait(&pbi->h_event_end_decoding);   /* add back for each frame */
-#else
-    (void) pbi;
-    (void) xd;
-#endif
 }
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index c129ee01b..419646c30 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1736,10 +1736,12 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
     {
         vp8_start_encode(&cpi->bc2, cx_data + bc->pos);
 
-        if (!cpi->b_multi_threaded)
-            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
-        else
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded)
             pack_mb_row_tokens(cpi, &cpi->bc2);
+        else
+#endif
+            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
 
         vp8_stop_encode(&cpi->bc2);
         oh.first_partition_length_in_bytes = cpi->bc.pos ;
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 88e8d02b8..2950bb45f 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -853,28 +853,9 @@ void vp8_encode_frame(VP8_COMP *cpi)
         struct vpx_usec_timer  emr_timer;
         vpx_usec_timer_start(&emr_timer);
 
-        if (!cpi->b_multi_threaded)
-        {
-            // for each macroblock row in image
-            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
-            {
-
-                vp8_zero(cm->left_context)
-
-                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
-
-                // adjust to the next row of mbs
-                x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-                x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-                x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-            }
-
-            cpi->tok_count = tp - cpi->tok;
-
-        }
-        else
-        {
 #if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded)
+        {
             int i;
 
             vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);
@@ -939,7 +920,25 @@ void vp8_encode_frame(VP8_COMP *cpi)
                 x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;
             }
 
+        }
+        else
 #endif
+        {
+            // for each macroblock row in image
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+            {
+
+                vp8_zero(cm->left_context)
+
+                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+                // adjust to the next row of mbs
+                x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+                x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+                x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+            }
+
+            cpi->tok_count = tp - cpi->tok;
 
         }
 
@@ -1304,7 +1303,7 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 
         Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16);
 
-        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4);
+        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4, Error16x16);
 
         rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16;
     }
diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
index cce753013..4cb4c6e55 100644
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -128,7 +128,7 @@ static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)
 
         while (--i > 3);
 
-        if (x & 240)
+        if (x & 0xFFF0)
             cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
     }
 
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index d3f3bc4ca..967d2a6a9 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -13,6 +13,8 @@
 #include "common.h"
 #include "extend.h"
 
+#if CONFIG_MULTITHREAD
+
 extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
                                          TOKENEXTRA **t, int recon_yoffset,
                                          int recon_uvoffset);
@@ -25,7 +27,6 @@ extern void vp8_setup_block_ptrs(MACROBLOCK *x);
 static
 THREAD_FUNCTION thread_encoding_proc(void *p_data)
 {
-#if CONFIG_MULTITHREAD
     int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
     VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
     MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
@@ -247,10 +248,6 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
         }
     }
 
-#else
-    (void) p_data;
-#endif
-
     //printf("exit thread %d\n", ithread);
     return 0;
 }
@@ -436,10 +433,6 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
 
     cpi->processor_core_count = 32; //vp8_get_proc_core_count();
 
-    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
-
-#if CONFIG_MULTITHREAD
-
     if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
     {
         int ithread;
@@ -488,13 +481,10 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi)
 
     }
 
-#endif
 }
 
 void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
 {
-#if CONFIG_MULTITHREAD
-
     if (cpi->b_multi_threaded)
     {
         //shutdown other threads
@@ -521,7 +511,5 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
         vpx_free(cpi->en_thread_data);
         vpx_free(cpi->mt_current_mb_col);
     }
-
-#endif
-    vpx_free(cpi->tplist);
 }
+#endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 1821f6c53..60ccc4e6e 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -331,6 +331,9 @@ static void setup_features(VP8_COMP *cpi)
 
 void vp8_dealloc_compressor_data(VP8_COMP *cpi)
 {
+    vpx_free(cpi->tplist);
+    cpi->tplist = NULL;
+
     // Delete last frame MV storage buffers
     if (cpi->lfmv != 0)
         vpx_free(cpi->lfmv);
@@ -1542,6 +1545,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
     else
         cpi->mt_sync_range = 16;
 #endif
+
+    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
 }
 
 
@@ -2492,7 +2497,9 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
     init_mv_ref_counts();
 #endif
 
+#if CONFIG_MULTITHREAD
     vp8cx_create_encoder_threads(cpi);
+#endif
 
     cpi->fn_ptr[BLOCK_16X16].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
     cpi->fn_ptr[BLOCK_16X16].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
@@ -2767,7 +2774,9 @@ void vp8_remove_compressor(VP8_PTR *ptr)
 
     }
 
+#if CONFIG_MULTITHREAD
     vp8cx_remove_encoder_threads(cpi);
+#endif
 
     vp8_dealloc_compressor_data(cpi);
     vpx_free(cpi->mb.ss);
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index d0e48eee2..39d7e95c5 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -593,6 +593,7 @@ typedef struct
     int cyclic_refresh_q;
     signed char *cyclic_refresh_map;
 
+#if CONFIG_MULTITHREAD
     // multithread data
     int * mt_current_mb_col;
     int mt_sync_range;
@@ -600,13 +601,10 @@ typedef struct
     int b_multi_threaded;
     int encoding_thread_count;
 
-#if CONFIG_MULTITHREAD
     pthread_t *h_encoding_thread;
-#endif
     MB_ROW_COMP *mb_row_ei;
     ENCODETHREAD_DATA *en_thread_data;
 
-#if CONFIG_MULTITHREAD
     //events
     sem_t *h_event_start_encoding;
     sem_t h_event_end_encoding;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 39d694dff..80af8fa74 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -645,7 +645,7 @@ static void macro_block_yrd( MACROBLOCK *mb,
     *Rate = vp8_rdcost_mby(mb);
 }
 
-static void rd_pick_intra4x4block(
+static int rd_pick_intra4x4block(
     VP8_COMP *cpi,
     MACROBLOCK *x,
     BLOCK *be,
@@ -711,16 +711,20 @@ static void rd_pick_intra4x4block(
     b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
     vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, b->bmi.mode);
 
+    return best_rd;
+
 }
 
 
-int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, int *Distortion)
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
+                                  int *rate_y, int *Distortion, int best_rd)
 {
     MACROBLOCKD *const xd = &mb->e_mbd;
     int i;
     int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
     int distortion = 0;
     int tot_rate_y = 0;
+    int total_rd = 0;
     ENTROPY_CONTEXT_PLANES t_above, t_left;
     ENTROPY_CONTEXT *ta;
     ENTROPY_CONTEXT *tl;
@@ -742,7 +746,7 @@ int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int
         B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
         int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
 
-        rd_pick_intra4x4block(
+        total_rd += rd_pick_intra4x4block(
             cpi, mb, mb->block + i, xd->block + i, &best_mode, A, L,
             ta + vp8_block2above[i],
             tl + vp8_block2left[i], &r, &ry, &d);
@@ -751,8 +755,14 @@ int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int
         distortion += d;
         tot_rate_y += ry;
         mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode;
+
+        if(total_rd >= best_rd)
+          break;
     }
 
+    if(total_rd >= best_rd)
+      return INT_MAX;
+
     *Rate = cost;
     *rate_y += tot_rate_y;
     *Distortion = distortion;
@@ -2025,15 +2035,28 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         switch (this_mode)
         {
         case B_PRED:
+        {
+            int tmp_rd;
+
             // Note the rate value returned here includes the cost of coding the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED];
-            vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion);
+            tmp_rd = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd);
             rate2 += rate;
             distortion2 += distortion;
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-            break;
+
+            if(tmp_rd < best_yrd)
+            {
+                rate2 += uv_intra_rate;
+                rate_uv = uv_intra_rate_tokenonly;
+                distortion2 += uv_intra_distortion;
+                distortion_uv = uv_intra_distortion;
+            }
+            else
+            {
+                this_rd = INT_MAX;
+                disable_skip = 1;
+            }
+        }
+        break;
 
         case SPLITMV:
         {
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
index d87440998..72ba9a0b5 100644
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -12,7 +12,7 @@
 #ifndef __INC_RDOPT_H
 #define __INC_RDOPT_H
 void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
-int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion);
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion, int best_rd);
 int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion);
 int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_to, int *distortion);
 extern int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index f28daaff9..3c6d1a4d9 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -70,7 +70,7 @@ static void vp8_temporal_filter_predictors_mb_c
     // U & V
     mv_row >>= 1;
     mv_col >>= 1;
-    stride >>= 1;
+    stride = (stride + 1) >> 1;
     offset = (mv_row >> 3) * stride + (mv_col >> 3);
     uptr = u_mb_ptr + offset;
     vptr = v_mb_ptr + offset;
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 270006ed8..bf9fb513c 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -35,7 +35,7 @@ VP8_COMMON_SRCS-yes += common/entropy.c
 VP8_COMMON_SRCS-yes += common/entropymode.c
 VP8_COMMON_SRCS-yes += common/entropymv.c
 VP8_COMMON_SRCS-yes += common/extend.c
-VP8_COMMON_SRCS-yes += common/filter_c.c
+VP8_COMMON_SRCS-yes += common/filter.c
 VP8_COMMON_SRCS-yes += common/findnearmv.c
 VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
 VP8_COMMON_SRCS-yes += common/idctllm.c
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 5b1fc581e..7e8211ae4 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -934,8 +934,8 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
         ctx->preview_img.x_chroma_shift = 1;
         ctx->preview_img.y_chroma_shift = 1;
 
-        ctx->preview_img.d_w = ctx->cfg.g_w;
-        ctx->preview_img.d_h = ctx->cfg.g_h;
+        ctx->preview_img.d_w = sd.y_width;
+        ctx->preview_img.d_h = sd.y_height;
         ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
         ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
         ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 932f145e6..1ec26e69e 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -42,7 +42,7 @@ VP8_CX_SRCS-yes += encoder/encodeframe.c
 VP8_CX_SRCS-yes += encoder/encodeintra.c
 VP8_CX_SRCS-yes += encoder/encodemb.c
 VP8_CX_SRCS-yes += encoder/encodemv.c
-VP8_CX_SRCS-yes += encoder/ethreading.c
+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
 VP8_CX_SRCS-yes += encoder/firstpass.c
 VP8_CX_SRCS-yes += encoder/generic/csystemdependent.c
 VP8_CX_SRCS-yes += encoder/block.h
diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk
index 1acd67453..62f6211f6 100644
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -65,7 +65,7 @@ VP8_DX_SRCS-yes += decoder/detokenize.h
 VP8_DX_SRCS-yes += decoder/onyxd_int.h
 VP8_DX_SRCS-yes += decoder/treereader.h
 VP8_DX_SRCS-yes += decoder/onyxd_if.c
-VP8_DX_SRCS-yes += decoder/threading.c
+VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
 VP8_DX_SRCS-yes += decoder/idct_blk.c
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c