Move variance functions to vpx_dsp

subpel functions will be moved in another patch.

Change-Id: Idb2e049bad0b9b32ac42cc7731cd6903de2826ce
Johann 2015-05-15 11:52:03 -07:00
Parent 976f7f42c1
Commit c3bdffb0a5
53 changed files with 3224 additions and 4230 deletions
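Most of this change is mechanical: callers switch from the vp8_-prefixed whole-pixel variance helpers to the vpx_-prefixed ones now provided by vpx_dsp, with unchanged signatures, while the sub-pixel variants keep their vp8_ names until the follow-up patch. A minimal sketch of the new call pattern, mirroring the multiframe_quality_enhance_block hunk further down (the wrapper function and the zeros buffer here are illustrative, not code from the tree):

    #include "./vpx_dsp_rtcd.h"   /* declares the vpx_variance* kernels */

    /* Illustrative: per-block activity, as in vp8/common/mfqe.c, now
     * calling the vpx_dsp whole-pixel variance kernel. */
    static unsigned int activity_16x16(const unsigned char *y, int y_stride)
    {
        static const unsigned char zeros[16] = { 0 };  /* stands in for VP8_ZEROS */
        unsigned int sse;

        /* was: vp8_variance16x16(y, y_stride, zeros, 0, &sse) */
        return (vpx_variance16x16(y, y_stride, zeros, 0, &sse) + 128) >> 8;
    }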

File diff suppressed because it is too large.

View File

@@ -1,154 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance16x16_armv6|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_armv6| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
loop
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r5, [r2, #0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r5, [r2, #4] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r5, [r2, #8] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r5, [r2, #12] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
END
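For reference, the register comments above spell out the standard variance computation: r8 accumulates the signed sum of differences, r11 the sum of squared differences, and the epilogue returns sse minus sum squared divided by 256. A plain-C model of the same result (a sketch, not part of the commit; the ARMv6 code reaches it with packed 8-bit arithmetic on four pixels per word):

    /* Scalar model of vp8_variance16x16_armv6. */
    static unsigned int variance16x16_ref(const unsigned char *src, int src_stride,
                                          const unsigned char *ref, int ref_stride,
                                          unsigned int *sse)
    {
        int r, c, sum = 0;    /* r8 in the assembly  */
        unsigned int sq = 0;  /* r11 in the assembly */

        for (r = 0; r < 16; ++r) {
            for (c = 0; c < 16; ++c) {
                const int diff = src[c] - ref[c];
                sum += diff;
                sq += diff * diff;
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = sq;
        return sq - (((unsigned int)sum * sum) >> 8);  /* sub r0, r11, r0, lsr #8 */
    }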

View File

@@ -1,101 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance8x8_armv6|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_armv6| PROC
push {r4-r10, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #8 ; set loop counter to 8 (=block height)
mov r4, #0 ; initialize sum = 0
mov r5, #0 ; initialize sse = 0
loop
; 1st 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels
ldr r7, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r6, r7 ; calculate difference
pld [r0, r1, lsl #1]
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r0, #0x4] ; load 4 src pixels
ldr r7, [r2, #0x4] ; load 4 ref pixels
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
usub8 r8, r6, r7 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1 ; next row
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r8, [sp, #32] ; get address of sse
mul r1, r4, r4 ; sum * sum
str r5, [r8] ; store sse
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
pop {r4-r10, pc}
ENDP
END
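The 8x8 routine follows the same scheme with a >> 6 normalization for its 64 pixels. The usub8/sel/usad8 sequence is what lets both routines stay in 8-bit registers: positive and negative byte differences are selected separately, usad8 sums each set, and the two partial sums are added to and subtracted from the running total. An illustrative C model of one four-pixel group (a sketch, not code from the tree):

    /* Model of one usub8/sel/usad8 group of four packed pixels. */
    static void accumulate4(const unsigned char *s, const unsigned char *r,
                            int *sum, unsigned int *sse)
    {
        int i;
        for (i = 0; i < 4; ++i) {
            const int pos = s[i] >= r[i] ? s[i] - r[i] : 0;  /* usub8 + sel         */
            const int neg = r[i] >= s[i] ? r[i] - s[i] : 0;  /* reversed operands   */
            *sum += pos - neg;                  /* usad8 sums, then add/sub         */
            *sse += (pos + neg) * (pos + neg);  /* orr + uxtb16 + smlad             */
        }
    }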

View File

@@ -1,320 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vpx_ports/mem.h"
unsigned int vp8_variance16x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) {
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance16x8_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 4; i++) { // variance16x8_neon_loop
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance8x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
uint8x8_t d0u8, d2u8, d4u8, d6u8;
int16x4_t d22s16, d23s16, d24s16, d25s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) { // variance8x16_neon_loop
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d2u8, d6u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance8x8_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 2; i++) { // variance8x8_neon_loop
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d1u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d3u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d5u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d7u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d1u8, d5u8);
q13u16 = vsubl_u8(d2u8, d6u8);
q14u16 = vsubl_u8(d3u8, d7u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
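These NEON kernels accumulate the signed differences in q8s32 (vpadalq_s16) and the squared differences in q9s32/q10s32 (vmlal_s16), then collapse the lanes and apply the same sse - ((sum * sum) >> log2(w*h)) normalization as the scalar code (shifts of 8, 7, 7 and 6 for the four block sizes). A hypothetical spot check against plain scalar arithmetic, useful when touching this file on an ARM target; the harness below is illustrative and not part of the tree:

    #include <stdio.h>
    #include <stdlib.h>

    unsigned int vp8_variance16x16_neon(const unsigned char *src_ptr, int source_stride,
                                        const unsigned char *ref_ptr, int recon_stride,
                                        unsigned int *sse);

    int main(void)
    {
        unsigned char src[256], ref[256];
        unsigned int sse_simd, sse_c = 0;
        int i, sum = 0;

        srand(0);
        for (i = 0; i < 256; ++i) {
            src[i] = (unsigned char)(rand() & 0xff);
            ref[i] = (unsigned char)(rand() & 0xff);
            sum += src[i] - ref[i];
            sse_c += (unsigned int)((src[i] - ref[i]) * (src[i] - ref[i]));
        }
        /* A stride of 16 makes the 16x16 block contiguous in both buffers. */
        if (vp8_variance16x16_neon(src, 16, ref, 16, &sse_simd) !=
                sse_c - (((unsigned int)sum * sum) >> 8) || sse_simd != sse_c) {
            printf("mismatch\n");
            return 1;
        }
        printf("ok\n");
        return 0;
    }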

View File

@@ -9,10 +9,14 @@
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/variance.h"
#include "vp8/common/filter.h"
// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
#if CONFIG_VP8_ENCODER
#if HAVE_MEDIA
#include "vp8/common/arm/bilinearfilter_arm.h"
@@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
8, 8, 8, VFilter);
return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
return vpx_variance8x8_media(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
}
unsigned int vp8_sub_pixel_variance16x16_armv6
@@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
16, 16, 16, VFilter);
var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
}
return var;
}
#endif /* HAVE_MEDIA */
#endif // HAVE_MEDIA
#if HAVE_NEON
@@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon
return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
}
#endif
#endif // HAVE_NEON
#endif // CONFIG_VP8_ENCODER

View File

@@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block
if (blksize == 16)
{
actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
#ifdef USE_SSD
vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 128)>>8;
vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 32)>>6;
vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 32)>>6;
#else
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
@@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block
}
else /* if (blksize == 8) */
{
actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
#ifdef USE_SSD
vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 32)>>6;
vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 8)>>4;
vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 8)>>4;
#else
sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
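The rounding constants in this file are half of each block's pixel count, so every one of these expressions is a round-to-nearest per-pixel mean: 256 pixels for 16x16 (+128, >> 8), 64 for 8x8 (+32, >> 6), 16 for 4x4 (+8, >> 4). A small illustration with made-up values:

    /* Made-up values; shows the rounding used in the hunks above. */
    static void rounding_example(void)
    {
        unsigned int sse16 = 5000;                /* 16x16 block, 256 pixels */
        unsigned int act   = (sse16 + 128) >> 8;  /* 5000/256 = 19.5 -> 20   */
        unsigned int sse8  = 700;                 /* 8x8 block, 64 pixels    */
        unsigned int usad  = (sse8 + 32) >> 6;    /* 700/64 = 10.9 -> 11     */
        (void)act;
        (void)usad;
    }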

View File

@@ -236,31 +236,6 @@ add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch,
specialize qw/vp8_bilinear_predict4x4 mmx media neon/;
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
#
# Whole-pixel Variance
#
add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance4x4 mmx sse2/;
$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance8x8 mmx sse2 media neon/;
$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
$vp8_variance8x8_media=vp8_variance8x8_armv6;
add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance8x16 mmx sse2 neon/;
$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance16x8 mmx sse2 neon/;
$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance16x16 mmx sse2 media neon/;
$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
$vp8_variance16x16_media=vp8_variance16x16_armv6;
#
# Sub-pixel Variance
#
@@ -308,12 +283,6 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
#
if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
#
# Sum of squares (vector)
#
add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
specialize qw/vp8_get_mb_ss mmx sse2/;
#
# SSE (Sum Squared Error)
#
@@ -321,14 +290,6 @@ add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_pt
specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;
add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_mse16x16 mmx sse2 media neon/;
$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
$vp8_mse16x16_media=vp8_mse16x16_armv6;
add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
specialize qw/vp8_get4x4sse_cs mmx neon/;
#
# Block copy
#

View File

@@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)(
const unsigned char *ref_array,
int ref_stride,
unsigned int *sad_array);
typedef void (*vpx_sad_multi_d_fn_t)
(
const unsigned char *src_ptr,
@@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t)
unsigned int *sad_array
);
typedef unsigned int (*vp8_variance_fn_t)
typedef unsigned int (*vpx_variance_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
@@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t)
unsigned int *sse
);
typedef void (*vp8_ssimpf_fn_t)
(
unsigned char *s,
int sp,
unsigned char *r,
int rp,
unsigned long *sum_s,
unsigned long *sum_r,
unsigned long *sum_sq_s,
unsigned long *sum_sq_r,
unsigned long *sum_sxr
);
typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
typedef unsigned int (*vp8_get16x16prederror_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int ref_stride
);
typedef struct variance_vtable
{
vpx_sad_fn_t sdf;
vp8_variance_fn_t vf;
vpx_variance_fn_t vf;
vp8_subpixvariance_fn_t svf;
vp8_variance_fn_t svf_halfpix_h;
vp8_variance_fn_t svf_halfpix_v;
vp8_variance_fn_t svf_halfpix_hv;
vpx_variance_fn_t svf_halfpix_h;
vpx_variance_fn_t svf_halfpix_v;
vpx_variance_fn_t svf_halfpix_hv;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
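This vtable is what the encoder fills with RTCD-selected pointers; after this patch the whole-pixel slot takes a vpx_dsp function while the sub-pixel slots keep their vp8_ implementations. A hypothetical wiring of a 16x16 entry (the field names come from the struct above and the function names appear elsewhere in this diff, but this helper itself is illustrative):

    #include "./vp8_rtcd.h"
    #include "./vpx_dsp_rtcd.h"
    #include "vp8/common/variance.h"

    /* Illustrative: wire one variance_vtable entry after the move. */
    static void setup_16x16_fns(struct variance_vtable *v)
    {
        v->sdf = vpx_sad16x16;                 /* SAD already lives in vpx_dsp */
        v->vf  = vpx_variance16x16;            /* moved by this commit         */
        v->svf = vp8_sub_pixel_variance16x16;  /* still vp8_, next patch       */
    }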

View File

@@ -8,44 +8,34 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "filter.h"
#include "variance.h"
unsigned int vp8_get_mb_ss_c
(
const short *src_ptr
)
{
unsigned int i = 0, sum = 0;
do
{
sum += (src_ptr[i] * src_ptr[i]);
i++;
}
while (i < 256);
return sum;
/* This is a bad idea.
* ctz = count trailing zeros */
static int ctz(int a) {
int b = 0;
while (a != 1) {
a >>= 1;
b++;
}
return b;
}
static void variance(
static unsigned int variance(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
int w,
int h,
unsigned int *sse,
int *sum)
unsigned int *sse)
{
int i, j;
int diff;
int diff, sum;
*sum = 0;
sum = 0;
*sse = 0;
for (i = 0; i < h; i++)
@@ -53,114 +43,17 @@ static void variance(
for (j = 0; j < w; j++)
{
diff = src_ptr[j] - ref_ptr[j];
*sum += diff;
sum += diff;
*sse += diff * diff;
}
src_ptr += source_stride;
ref_ptr += recon_stride;
}
return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
}
unsigned int vp8_variance16x16_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 8));
}
unsigned int vp8_variance8x16_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_variance16x8_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_variance8x8_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vp8_variance4x4_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
}
unsigned int vp8_mse16x16_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
*sse = var;
return var;
}
/****************************************************************************
*
* ROUTINE : filter_block2d_bil_first_pass
@@ -304,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c
/* Now filter Verticaly */
var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
}
@@ -329,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
}
unsigned int vp8_sub_pixel_variance16x16_c
@@ -353,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
}
@@ -429,7 +322,7 @@ unsigned int vp8_sub_pixel_variance16x8_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
}
unsigned int vp8_sub_pixel_variance8x16_c
@@ -455,5 +348,5 @@ unsigned int vp8_sub_pixel_variance8x16_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
}
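The per-size wrappers deleted above are folded into the single static variance() helper shown earlier in this file: the normalization shift is recovered from the block dimensions, since ctz(w) + ctz(h) equals log2(w*h) for the power-of-two sizes involved, matching the hard-coded shifts that were removed. A standalone check of the shift values (illustrative):

    #include <assert.h>

    /* ctz() copied from the new static helper above (power-of-two input only). */
    static int ctz(int a) { int b = 0; while (a != 1) { a >>= 1; b++; } return b; }

    int main(void)
    {
        assert(ctz(16) + ctz(16) == 8);  /* 16x16: sum*sum >> 8, i.e. /256 */
        assert(ctz(16) + ctz(8)  == 7);  /* 16x8 and 8x16:        >> 7     */
        assert(ctz(8)  + ctz(8)  == 6);  /* 8x8:                  >> 6     */
        assert(ctz(4)  + ctz(4)  == 4);  /* 4x4:                  >> 4     */
        return 0;
    }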

View File

@@ -11,504 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
global sym(vp8_get_mb_ss_mmx) PRIVATE
sym(vp8_get_mb_ss_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
sub rsp, 8
; end prolog
mov rax, arg(0) ;src_ptr
mov rcx, 16
pxor mm4, mm4
.NEXTROW:
movq mm0, [rax]
movq mm1, [rax+8]
movq mm2, [rax+16]
movq mm3, [rax+24]
pmaddwd mm0, mm0
pmaddwd mm1, mm1
pmaddwd mm2, mm2
pmaddwd mm3, mm3
paddd mm4, mm0
paddd mm4, mm1
paddd mm4, mm2
paddd mm4, mm3
add rax, 32
dec rcx
ja .NEXTROW
movq QWORD PTR [rsp], mm4
;return sum[0]+sum[1];
movsxd rax, dword ptr [rsp]
movsxd rcx, dword ptr [rsp+4]
add rax, rcx
; begin epilog
add rsp, 8
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_get8x8var_mmx
;(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vp8_get8x8var_mmx) PRIVATE
sym(vp8_get8x8var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
push rsi
push rdi
push rbx
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mmx6
pxor mm6, mm6 ; Blank mmx7
pxor mm7, mm7 ; Blank mmx7
mov rax, arg(0) ;[src_ptr] ; Load base addresses
mov rbx, arg(2) ;[ref_ptr]
movsxd rcx, dword ptr arg(1) ;[source_stride]
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm1, [rbx] ; Copy eight bytes to mm1
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 2
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 3
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 4
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 5
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
; movq mm4, [rbx + rdx]
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 6
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 7
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 8
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
movsx rdx, WORD PTR [rsp+8]
movsx rcx, WORD PTR [rsp+10]
movsx rbx, WORD PTR [rsp+12]
movsx rax, WORD PTR [rsp+14]
add rdx, rcx
add rbx, rax
add rdx, rbx ;XSum
movsxd rax, DWORD PTR [rsp]
movsxd rcx, DWORD PTR [rsp+4]
add rax, rcx ;XXSum
mov rsi, arg(4) ;SSE
mov rdi, arg(5) ;Sum
mov dword ptr [rsi], eax
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;unsigned int
;vp8_get4x4var_mmx
;(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vp8_get4x4var_mmx) PRIVATE
sym(vp8_get4x4var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
push rsi
push rdi
push rbx
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mmx6
pxor mm6, mm6 ; Blank mmx7
pxor mm7, mm7 ; Blank mmx7
mov rax, arg(0) ;[src_ptr] ; Load base addresses
mov rbx, arg(2) ;[ref_ptr]
movsxd rcx, dword ptr arg(1) ;[source_stride]
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movd mm0, [rax] ; Copy four bytes to mm0
movd mm1, [rbx] ; Copy four bytes to mm1
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 3
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 4
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
paddd mm7, mm0 ; accumulate in mm7
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
movsx rdx, WORD PTR [rsp+8]
movsx rcx, WORD PTR [rsp+10]
movsx rbx, WORD PTR [rsp+12]
movsx rax, WORD PTR [rsp+14]
add rdx, rcx
add rbx, rax
add rdx, rbx ;XSum
movsxd rax, DWORD PTR [rsp]
movsxd rcx, DWORD PTR [rsp+4]
add rax, rcx ;XXSum
mov rsi, arg(4) ;SSE
mov rdi, arg(5) ;Sum
mov dword ptr [rsi], eax
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;unsigned int
;vp8_get4x4sse_cs_mmx
;(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride
;)
global sym(vp8_get4x4sse_cs_mmx) PRIVATE
sym(vp8_get4x4sse_cs_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
push rbx
; end prolog
pxor mm6, mm6 ; Blank mmx7
pxor mm7, mm7 ; Blank mmx7
mov rax, arg(0) ;[src_ptr] ; Load base addresses
mov rbx, arg(2) ;[ref_ptr]
movsxd rcx, dword ptr arg(1) ;[source_stride]
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movd mm0, [rax] ; Copy eight bytes to mm0
movd mm1, [rbx] ; Copy eight bytes to mm1
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
movd mm0, [rax] ; Copy eight bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 3
movd mm0, [rax] ; Copy eight bytes to mm0
punpcklbw mm1, mm6
punpcklbw mm0, mm6 ; unpack to higher prrcision
psubsw mm0, mm1 ; A-B (low order) to MM0
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 4
movd mm0, [rax] ; Copy eight bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
pmaddwd mm0, mm0 ; square and accumulate
paddd mm7, mm0 ; accumulate in mm7
movq mm0, mm7 ;
psrlq mm7, 32
paddd mm0, mm7
movq rax, mm0
; begin epilog
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
%define mmx_filter_shift 7
;void vp8_filter_block2d_bil4x4_var_mmx

View File

@@ -13,393 +13,6 @@
%define xmm_filter_shift 7
;unsigned int vp8_get_mb_ss_sse2
;(
; short *src_ptr
;)
global sym(vp8_get_mb_ss_sse2) PRIVATE
sym(vp8_get_mb_ss_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 1
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
mov rax, arg(0) ;[src_ptr]
mov rcx, 8
pxor xmm4, xmm4
.NEXTROW:
movdqa xmm0, [rax]
movdqa xmm1, [rax+16]
movdqa xmm2, [rax+32]
movdqa xmm3, [rax+48]
pmaddwd xmm0, xmm0
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
pmaddwd xmm3, xmm3
paddd xmm0, xmm1
paddd xmm2, xmm3
paddd xmm4, xmm0
paddd xmm4, xmm2
add rax, 0x40
dec rcx
ja .NEXTROW
movdqa xmm3,xmm4
psrldq xmm4,8
paddd xmm4,xmm3
movdqa xmm3,xmm4
psrldq xmm4,4
paddd xmm4,xmm3
movq rax,xmm4
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_get16x16var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
global sym(vp8_get16x16var_sse2) PRIVATE
sym(vp8_get16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
; Prefetch data
lea rcx, [rax+rax*2]
prefetcht0 [rsi]
prefetcht0 [rsi+rax]
prefetcht0 [rsi+rax*2]
prefetcht0 [rsi+rcx]
lea rbx, [rsi+rax*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax*2]
prefetcht0 [rbx+rcx]
lea rcx, [rdx+rdx*2]
prefetcht0 [rdi]
prefetcht0 [rdi+rdx]
prefetcht0 [rdi+rdx*2]
prefetcht0 [rdi+rcx]
lea rbx, [rdi+rdx*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx*2]
prefetcht0 [rbx+rcx]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 16
.var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
prefetcht0 [rsi+rax*8]
prefetcht0 [rdi+rdx*8]
movdqa xmm3, xmm1
movdqa xmm4, xmm2
punpcklbw xmm1, xmm0
punpckhbw xmm3, xmm0
punpcklbw xmm2, xmm0
punpckhbw xmm4, xmm0
psubw xmm1, xmm2
psubw xmm3, xmm4
paddw xmm7, xmm1
pmaddwd xmm1, xmm1
paddw xmm7, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm1
paddd xmm6, xmm3
add rsi, rax
add rdi, rdx
sub rcx, 1
jnz .var16loop
movdqa xmm1, xmm6
pxor xmm6, xmm6
pxor xmm5, xmm5
punpcklwd xmm6, xmm7
punpckhwd xmm5, xmm7
psrad xmm5, 16
psrad xmm6, 16
paddd xmm6, xmm5
movdqa xmm2, xmm1
punpckldq xmm1, xmm0
punpckhdq xmm2, xmm0
movdqa xmm7, xmm6
paddd xmm1, xmm2
punpckldq xmm6, xmm0
punpckhdq xmm7, xmm0
paddd xmm6, xmm7
movdqa xmm2, xmm1
movdqa xmm7, xmm6
psrldq xmm1, 8
psrldq xmm6, 8
paddd xmm7, xmm6
paddd xmm1, xmm2
mov rax, arg(5) ;[Sum]
mov rdi, arg(4) ;[SSE]
movd DWORD PTR [rax], xmm7
movd DWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_get8x8var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
global sym(vp8_get8x8var_sse2) PRIVATE
sym(vp8_get8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
movq xmm1, QWORD PTR [rsi]
movq xmm2, QWORD PTR [rdi]
punpcklbw xmm1, xmm0
punpcklbw xmm2, xmm0
psubsw xmm1, xmm2
paddw xmm7, xmm1
pmaddwd xmm1, xmm1
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
movq xmm2, QWORD PTR[rsi + rax * 2]
movq xmm3, QWORD PTR[rdi + rdx * 2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
lea rsi, [rsi + rax * 2]
lea rdi, [rdi + rdx * 2]
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
movq xmm2, QWORD PTR[rsi + rax *2]
movq xmm3, QWORD PTR[rdi + rdx *2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
lea rsi, [rsi + rax * 2]
lea rdi, [rdi + rdx * 2]
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
movq xmm2, QWORD PTR[rsi + rax *2]
movq xmm3, QWORD PTR[rdi + rdx *2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
lea rsi, [rsi + rax * 2]
lea rdi, [rdi + rdx * 2]
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
movdqa xmm6, xmm7
punpcklwd xmm6, xmm0
punpckhwd xmm7, xmm0
movdqa xmm2, xmm1
paddw xmm6, xmm7
punpckldq xmm1, xmm0
punpckhdq xmm2, xmm0
movdqa xmm7, xmm6
paddd xmm1, xmm2
punpckldq xmm6, xmm0
punpckhdq xmm7, xmm0
paddw xmm6, xmm7
movdqa xmm2, xmm1
movdqa xmm7, xmm6
psrldq xmm1, 8
psrldq xmm6, 8
paddw xmm7, xmm6
paddd xmm1, xmm2
mov rax, arg(5) ;[Sum]
mov rdi, arg(4) ;[SSE]
movq rdx, xmm7
movsx rcx, dx
mov dword ptr [rax], ecx
movd DWORD PTR [rdi], xmm1
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,

View File

@@ -35,25 +35,6 @@ extern void filter_block1d_v6_mmx
short *filter
);
extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
extern unsigned int vp8_get8x8var_mmx
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern unsigned int vp8_get4x4var_mmx
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
@@ -78,127 +59,6 @@ extern void vp8_filter_block2d_bil_var_mmx
unsigned int *sumsquared
);
unsigned int vp8_variance4x4_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
}
unsigned int vp8_variance8x8_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vp8_mse16x16_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
*sse = var;
return var;
}
unsigned int vp8_variance16x16_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 8));
}
unsigned int vp8_variance16x8_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_variance8x16_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_sub_pixel_variance4x4_mmx
(
const unsigned char *src_ptr,

View File

@@ -31,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
unsigned int *sumsquared
);
extern unsigned int vp8_get4x4var_mmx
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
unsigned int vp8_get_mb_ss_sse2
(
const short *src_ptr
);
unsigned int vp8_get16x16var_sse2
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
unsigned int vp8_get8x8var_sse2
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
void vp8_filter_block2d_bil_var_sse2
(
const unsigned char *ref_ptr,
@@ -136,115 +104,6 @@ void vp8_half_vert_variance16x_h_sse2
unsigned int *sumsquared
);
unsigned int vp8_variance4x4_wmt(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
}
unsigned int vp8_variance8x8_wmt
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vp8_variance16x16_wmt
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0;
int sum0;
vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
*sse = sse0;
return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
}
unsigned int vp8_mse16x16_wmt(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0;
int sum0;
vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
*sse = sse0;
return sse0;
}
unsigned int vp8_variance16x8_wmt
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_variance8x16_wmt
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,

View File

@@ -13,15 +13,6 @@
#include "vp8/common/variance.h"
#include "vpx_ports/mem.h"
extern unsigned int vp8_get16x16var_sse2
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,

View File

@@ -1,138 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_mse16x16_armv6|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;
;note: Based on vp8_variance16x16_armv6. In this function, sum is never used.
; So, we can remove this part of calculation.
|vp8_mse16x16_armv6| PROC
push {r4-r9, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #16 ; set loop counter to 16 (=block height)
mov r4, #0 ; initialize sse = 0
loop
; 1st 4 pixels
ldr r5, [r0, #0x0] ; load 4 src pixels
ldr r6, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r5, r6 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x4] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r2, #0x4] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x8] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r6, [r2, #0x8] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0xc] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r6, [r2, #0xc] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
subs r12, r12, #1 ; next row
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r1, [sp, #28] ; get address of sse
mov r0, r4 ; return sse
str r4, [r1] ; store sse
pop {r4-r9, pc}
ENDP
END
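As a plain-C companion to the ARMv6 routine above (an illustrative sketch, not code from this change): each loop iteration handles four packed pixels, with USUB8/SEL producing the byte differences of either sign and UXTB16/SMLAD squaring and accumulating them.

#include <stdint.h>

/* Scalar equivalent of one 4-pixel step of the armv6 loop above. */
static uint32_t sse_of_4_pixels(const uint8_t *src, const uint8_t *ref,
                                uint32_t sse_acc) {
  int i;
  for (i = 0; i < 4; ++i) {
    const int diff = src[i] - ref[i];    /* usub8/sel pair, both signs */
    sse_acc += (uint32_t)(diff * diff);  /* uxtb16 + smlad accumulation */
  }
  return sse_acc;
}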


@ -1,131 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
unsigned int vp8_mse16x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
int64x1_t d0s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
q7s32 = vdupq_n_s32(0);
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q10s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
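A scalar reference for the NEON routine above (an illustrative sketch only): widen each 8-bit difference to 16 bits, square into 32-bit accumulators, and reduce; the MSE functions return the raw SSE with no mean correction.

#include <stdint.h>

static unsigned int mse16x16_scalar(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  unsigned int total = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int diff = src[c] - ref[c];
      total += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = total;
  return total;  /* mse returns the SSE directly */
}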
unsigned int vp8_get4x4sse_cs_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride) {
int16x4_t d22s16, d24s16, d26s16, d28s16;
int64x1_t d0s64;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d1u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d5u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d3u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d7u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d1u8, d5u8);
q13u16 = vsubl_u8(d2u8, d6u8);
q14u16 = vsubl_u8(d3u8, d7u8);
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
q7s32 = vmull_s16(d22s16, d22s16);
q8s32 = vmull_s16(d24s16, d24s16);
q9s32 = vmull_s16(d26s16, d26s16);
q10s32 = vmull_s16(d28s16, d28s16);
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q9s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q9s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
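Both NEON functions above end with the same horizontal reduction; a small standalone sketch of that pattern (assumed helper name):

#include <arm_neon.h>
#include <stdint.h>

/* Fold a 4-lane 32-bit accumulator into one 32-bit total: pairwise-widen to
   two 64-bit lanes, add them, and take the low word. */
static uint32_t horizontal_add_u32x4(int32x4_t v) {
  const int64x2_t pairs = vpaddlq_s32(v);
  const int64x1_t total = vadd_s64(vget_low_s64(pairs), vget_high_s64(pairs));
  return vget_lane_u32(vreinterpret_u32_s64(total), 0);
}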


@ -11,6 +11,7 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "encodemb.h"
#include "encodemv.h"
#include "vp8/common/common.h"
@ -90,7 +91,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
* lambda using a non-linear combination (e.g., the smallest, or second
* smallest, etc.).
*/
act = vp8_variance16x16(x->src.y_buffer,
act = vpx_variance16x16(x->src.y_buffer,
x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
act = act<<4;
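The call above measures macroblock activity as the variance of the source against VP8_VAR_OFFS, a constant zero block read with a ref stride of 0; a hedged sketch of that usage (the local zero buffer stands in for VP8_VAR_OFFS):

#include "./vpx_dsp_rtcd.h"  /* declares vpx_variance16x16 */

static unsigned int measure_activity_16x16(const unsigned char *src,
                                           int src_stride) {
  static const unsigned char zeros[16] = { 0 };
  unsigned int sse;
  /* ref stride 0 re-reads the same zero row for all 16 rows */
  return vpx_variance16x16(src, src_stride, zeros, 0, &sse);
}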


@ -11,6 +11,7 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "quantize.h"
#include "vp8/common/reconintra4x4.h"
#include "encodemb.h"
@ -44,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
}
}
intra_pred_var = vp8_get_mb_ss(x->src_diff);
intra_pred_var = vpx_get_mb_ss(x->src_diff);
return intra_pred_var;
}
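For reference, a scalar sketch (an assumption mirroring the C fallback, not code from this change) of what vpx_get_mb_ss() returns here: the sum of squares of the 16x16 macroblock's residual samples.

#include <stdint.h>

static unsigned int mb_sum_of_squares(const int16_t *src_diff) {
  unsigned int sum = 0;
  int i;
  for (i = 0; i < 256; ++i)
    sum += (unsigned int)(src_diff[i] * src_diff[i]);
  return sum;
}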


@ -12,6 +12,7 @@
#include <limits.h>
#include <stdio.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "block.h"
#include "onyx_int.h"
@ -422,14 +423,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
/* Set up pointers for this macro block raw buffer */
raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
+ d->offset);
vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
(unsigned int *)(raw_motion_err));
vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride,
(unsigned int *)(raw_motion_err));
/* Set up pointers for this macro block recon buffer */
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
(unsigned int *)(best_motion_err));
vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
(unsigned int *)(best_motion_err));
}
static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
@ -453,7 +454,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
int new_mv_mode_penalty = 256;
/* override the default variance function to use MSE */
v_fn_ptr.vf = vp8_mse16x16;
v_fn_ptr.vf = vpx_mse16x16;
/* Set up pointers for this macro block recon buffer */
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
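The override above works because the MSE and variance routines share one function-pointer signature, so the first pass can swap in a metric that also charges for DC (mean) mismatch; a hedged sketch (the typedef shape is assumed from vp8/common/variance.h):

#include "./vpx_dsp_rtcd.h"  /* declares vpx_mse16x16 */

typedef unsigned int (*variance_fn_t)(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride, unsigned int *sse);

static void use_mse_for_first_pass(variance_fn_t *vf) {
  *vf = vpx_mse16x16;  /* unlike variance, MSE keeps the DC error */
}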


@ -2131,7 +2131,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
#endif
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
@ -2141,7 +2141,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
@ -2151,7 +2151,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
@ -2161,7 +2161,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
@ -2171,7 +2171,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
@ -2558,7 +2558,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
{
unsigned int sse;
vp8_mse16x16(orig + col, orig_stride,
vpx_mse16x16(orig + col, orig_stride,
recon + col, recon_stride,
&sse);
total_sse += sse;
@ -3384,7 +3384,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
int index = block_index_row + (j >> 4);
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
unsigned int sse;
Total += vp8_mse16x16(src + j,
Total += vpx_mse16x16(src + j,
source->y_stride,
dst + j, dest->y_stride,
&sse);
@ -3448,7 +3448,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
int index = block_index_row + (j >> 4);
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
unsigned int sse;
const unsigned int var = vp8_variance16x16(src + j,
const unsigned int var = vpx_variance16x16(src + j,
ystride,
dst + j,
ystride,
@ -3458,7 +3458,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
// is small (to avoid effects from lighting change).
if ((sse - var) < 128) {
unsigned int sse2;
const unsigned int act = vp8_variance16x16(src + j,
const unsigned int act = vpx_variance16x16(src + j,
ystride,
const_source,
0,
@ -5993,7 +5993,8 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest)
for (j = 0; j < source->y_width; j += 16)
{
unsigned int sse;
Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
Total += vpx_mse16x16(src + j, source->y_stride,
dst + j, dest->y_stride, &sse);
}
src += 16 * source->y_stride;


@ -11,6 +11,7 @@
#include <limits.h>
#include "vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "onyx_int.h"
#include "modecosts.h"
#include "encodeintra.h"
@ -215,33 +216,6 @@ int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
}
unsigned int vp8_get4x4sse_cs_c
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride
)
{
int distortion = 0;
int r, c;
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
int diff = src_ptr[c] - ref_ptr[c];
distortion += diff * diff;
}
src_ptr += source_stride;
ref_ptr += recon_stride;
}
return distortion;
}
static int get_prediction_error(BLOCK *be, BLOCKD *b)
{
unsigned char *sptr;
@ -249,7 +223,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b)
sptr = (*(be->base_src) + be->src);
dptr = b->predictor;
return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
}
@ -1037,7 +1011,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
else
{
rate2 += rate;
distortion2 = vp8_variance16x16(
distortion2 = vpx_variance16x16(
*(b->base_src), b->src_stride,
x->e_mbd.predictor, 16, &sse);
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
@ -1066,7 +1040,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
xd->dst.y_stride,
xd->predictor,
16);
distortion2 = vp8_variance16x16
distortion2 = vpx_variance16x16
(*(b->base_src), b->src_stride,
x->e_mbd.predictor, 16, &sse);
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
@ -1547,7 +1521,7 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
xd->dst.y_stride,
xd->predictor,
16);
distortion = vp8_variance16x16
distortion = vpx_variance16x16
(*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
rate = x->mbmode_cost[xd->frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);


@ -9,6 +9,7 @@
*/
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp8/common/onyxc_int.h"
#include "onyx_int.h"
@ -83,7 +84,7 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
for (j = 0; j < source->y_width; j += 16)
{
unsigned int sse;
Total += vp8_mse16x16(src + j, source->y_stride,
Total += vpx_mse16x16(src + j, source->y_stride,
dst + j, dest->y_stride,
&sse);
}


@ -15,6 +15,7 @@
#include <assert.h>
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "tokenize.h"
#include "treewriter.h"
#include "onyx_int.h"
@ -507,9 +508,9 @@ int VP8_UVSSE(MACROBLOCK *x)
}
else
{
vp8_variance8x8(uptr, pre_stride,
vpx_variance8x8(uptr, pre_stride,
upred_ptr, uv_stride, &sse2);
vp8_variance8x8(vptr, pre_stride,
vpx_variance8x8(vptr, pre_stride,
vpred_ptr, uv_stride, &sse1);
sse2 += sse1;
}
@ -1783,7 +1784,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4],
if(threshold < x->encode_breakout)
threshold = x->encode_breakout;
var = vp8_variance16x16
var = vpx_variance16x16
(*(b->base_src), b->src_stride,
x->e_mbd.predictor, 16, &sse);


@ -145,8 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
@ -168,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))


@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
#File list for media
# encoder
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
@ -27,5 +26,4 @@ VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c


@ -171,13 +171,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
if (bs == BLOCK_16X16) {
vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
} else if (bs == BLOCK_32X32) {
vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
} else /* if (bs == BLOCK_64X64) */ {
vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
}


@ -797,51 +797,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
# variance
add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance4x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance4x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
@ -922,21 +877,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon/;
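For context on what the removed add_proto/specialize lines generate (a hedged, hand-written C-level sketch; the real glue is emitted by the rtcd scripts, and these entries now live in vpx_dsp): each prototype becomes a symbol that runtime CPU detection can point at the best available specialization.

#include <stdint.h>

/* Hypothetical equivalent of the generated dispatch for one prototype. */
unsigned int vpx_variance16x16_c(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 unsigned int *sse);
unsigned int vpx_variance16x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse);

static unsigned int (*variance16x16_ptr)(const uint8_t *, int,
                                         const uint8_t *, int,
                                         unsigned int *) = vpx_variance16x16_c;

static void setup_variance_rtcd(int have_sse2) {
  if (have_sse2) variance16x16_ptr = vpx_variance16x16_sse2;
}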
@ -1141,142 +1081,6 @@ specialize qw/vp9_temporal_filter_apply sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# variance
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x4/;
add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance4x8/;
add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance4x4/;
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x4/;
add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance4x8/;
add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance4x4/;
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x4/;
add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance4x8/;
add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance4x4/;
add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
@ -1511,41 +1315,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x16/;
add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x8/;
add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x16/;
add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse16x8/;
add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x16/;
add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse16x8/;
add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
# ENCODEMB INVOKE


@ -10,6 +10,7 @@
#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx_ports/mem.h"
@ -20,82 +21,6 @@
#include "vp9/encoder/vp9_variance.h"
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
const int32x4_t a = vpaddlq_s16(v_16x8);
const int64x2_t b = vpaddlq_s32(a);
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
vreinterpret_s32_s64(vget_high_s64(b)));
return vget_lane_s32(c, 0);
}
static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
const int64x2_t b = vpaddlq_s32(v_32x4);
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
vreinterpret_s32_s64(vget_high_s64(b)));
return vget_lane_s32(c, 0);
}
// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, uint32_t *sse, int *sum) {
int i, j;
int16x8_t v_sum = vdupq_n_s16(0);
int32x4_t v_sse_lo = vdupq_n_s32(0);
int32x4_t v_sse_hi = vdupq_n_s32(0);
for (i = 0; i < h; ++i) {
for (j = 0; j < w; j += 8) {
const uint8x8_t v_a = vld1_u8(&a[j]);
const uint8x8_t v_b = vld1_u8(&b[j]);
const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
v_sum = vaddq_s16(v_sum, sv_diff);
v_sse_lo = vmlal_s16(v_sse_lo,
vget_low_s16(sv_diff),
vget_low_s16(sv_diff));
v_sse_hi = vmlal_s16(v_sse_hi,
vget_high_s16(sv_diff),
vget_high_s16(sv_diff));
}
a += a_stride;
b += b_stride;
}
*sum = horizontal_add_s16x8(v_sum);
*sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}
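A quick back-of-the-envelope check of the overflow note above (an added illustration, not part of the change): each of the eight int16 lanes of v_sum accumulates at most (w * h / 8) differences of magnitude 255, so up to 1024 pixels per call stay within int16 range, which is why the larger blocks below are processed as 32x32 or 64x16 slices.

#include <assert.h>
#include <stdint.h>

static void check_v_sum_lane_bound(int w, int h) {
  /* worst case: every difference in one lane is +255 (or -255) */
  const int32_t worst_lane_sum = (w * h / 8) * 255;
  assert(worst_lane_sum <= INT16_MAX);  /* holds for w * h <= 1024 */
}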
void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,
8, sse, sum);
}
unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8
}
void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,
16, sse, sum);
}
unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16
}
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
uint8_t *output_ptr,
unsigned int src_pixels_per_line,
@ -162,7 +87,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
8, BILINEAR_FILTERS_2TAP(yoffset));
return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
@ -180,77 +105,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
16, BILINEAR_FILTERS_2TAP(yoffset));
return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,
32, sse, sum);
}
unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32
}
unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
variance_neon_w8(a + (32 * a_stride), a_stride,
b + (32 * b_stride), b_stride, 32, 32,
&sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
}
unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
variance_neon_w8(a + (16 * a_stride), a_stride,
b + (16 * b_stride), b_stride, 64, 16,
&sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
}
unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
variance_neon_w8(a + (16 * a_stride), a_stride,
b + (16 * b_stride), b_stride, 64, 16,
&sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
b + (16 * 2 * b_stride), b_stride,
64, 16, &sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
b + (16 * 3 * b_stride), b_stride,
64, 16, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
@ -268,7 +123,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
32, BILINEAR_FILTERS_2TAP(yoffset));
return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
@ -286,5 +141,5 @@ unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
64, BILINEAR_FILTERS_2TAP(yoffset));
return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}


@ -98,9 +98,9 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
int avg;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride,
CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
&sse, &avg);
highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
&sse, &avg);
sse >>= 2 * (xd->bd - 8);
avg >>= (xd->bd - 8);
} else {


@ -13,6 +13,7 @@
#include <stdio.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx_ports/mem.h"
@ -3672,15 +3673,15 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
if (cm->use_highbitdepth) {
switch (cm->bit_depth) {
case VPX_BITS_8:
vp9_highbd_get16x16var(src, src_stride, last_src, last_stride,
vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
break;
case VPX_BITS_10:
vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
break;
case VPX_BITS_12:
vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
break;
default:
@ -3689,11 +3690,11 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
return -1;
}
} else {
vp9_get16x16var(src, src_stride, last_src, last_stride,
vpx_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
}
#else
vp9_get16x16var(src, src_stride, last_src, last_stride,
vpx_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
#endif // CONFIG_VP9_HIGHBITDEPTH
var16->var = var16->sse -


@ -998,7 +998,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X16,
vpx_highbd_sad32x16_bits8,
vpx_highbd_sad32x16_avg_bits8,
vp9_highbd_variance32x16,
vpx_highbd_8_variance32x16,
vp9_highbd_sub_pixel_variance32x16,
vp9_highbd_sub_pixel_avg_variance32x16,
NULL,
@ -1008,7 +1008,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X32,
vpx_highbd_sad16x32_bits8,
vpx_highbd_sad16x32_avg_bits8,
vp9_highbd_variance16x32,
vpx_highbd_8_variance16x32,
vp9_highbd_sub_pixel_variance16x32,
vp9_highbd_sub_pixel_avg_variance16x32,
NULL,
@ -1018,7 +1018,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X32,
vpx_highbd_sad64x32_bits8,
vpx_highbd_sad64x32_avg_bits8,
vp9_highbd_variance64x32,
vpx_highbd_8_variance64x32,
vp9_highbd_sub_pixel_variance64x32,
vp9_highbd_sub_pixel_avg_variance64x32,
NULL,
@ -1028,7 +1028,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X64,
vpx_highbd_sad32x64_bits8,
vpx_highbd_sad32x64_avg_bits8,
vp9_highbd_variance32x64,
vpx_highbd_8_variance32x64,
vp9_highbd_sub_pixel_variance32x64,
vp9_highbd_sub_pixel_avg_variance32x64,
NULL,
@ -1038,7 +1038,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X32,
vpx_highbd_sad32x32_bits8,
vpx_highbd_sad32x32_avg_bits8,
vp9_highbd_variance32x32,
vpx_highbd_8_variance32x32,
vp9_highbd_sub_pixel_variance32x32,
vp9_highbd_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits8,
@ -1048,7 +1048,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X64,
vpx_highbd_sad64x64_bits8,
vpx_highbd_sad64x64_avg_bits8,
vp9_highbd_variance64x64,
vpx_highbd_8_variance64x64,
vp9_highbd_sub_pixel_variance64x64,
vp9_highbd_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits8,
@ -1058,7 +1058,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X16,
vpx_highbd_sad16x16_bits8,
vpx_highbd_sad16x16_avg_bits8,
vp9_highbd_variance16x16,
vpx_highbd_8_variance16x16,
vp9_highbd_sub_pixel_variance16x16,
vp9_highbd_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits8,
@ -1068,7 +1068,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X8,
vpx_highbd_sad16x8_bits8,
vpx_highbd_sad16x8_avg_bits8,
vp9_highbd_variance16x8,
vpx_highbd_8_variance16x8,
vp9_highbd_sub_pixel_variance16x8,
vp9_highbd_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits8,
@ -1078,7 +1078,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X16,
vpx_highbd_sad8x16_bits8,
vpx_highbd_sad8x16_avg_bits8,
vp9_highbd_variance8x16,
vpx_highbd_8_variance8x16,
vp9_highbd_sub_pixel_variance8x16,
vp9_highbd_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits8,
@ -1088,7 +1088,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X8,
vpx_highbd_sad8x8_bits8,
vpx_highbd_sad8x8_avg_bits8,
vp9_highbd_variance8x8,
vpx_highbd_8_variance8x8,
vp9_highbd_sub_pixel_variance8x8,
vp9_highbd_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits8,
@ -1098,7 +1098,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X4,
vpx_highbd_sad8x4_bits8,
vpx_highbd_sad8x4_avg_bits8,
vp9_highbd_variance8x4,
vpx_highbd_8_variance8x4,
vp9_highbd_sub_pixel_variance8x4,
vp9_highbd_sub_pixel_avg_variance8x4,
NULL,
@ -1108,7 +1108,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X8,
vpx_highbd_sad4x8_bits8,
vpx_highbd_sad4x8_avg_bits8,
vp9_highbd_variance4x8,
vpx_highbd_8_variance4x8,
vp9_highbd_sub_pixel_variance4x8,
vp9_highbd_sub_pixel_avg_variance4x8,
NULL,
@ -1118,7 +1118,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X4,
vpx_highbd_sad4x4_bits8,
vpx_highbd_sad4x4_avg_bits8,
vp9_highbd_variance4x4,
vpx_highbd_8_variance4x4,
vp9_highbd_sub_pixel_variance4x4,
vp9_highbd_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits8,
@ -1130,7 +1130,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X16,
vpx_highbd_sad32x16_bits10,
vpx_highbd_sad32x16_avg_bits10,
vp9_highbd_10_variance32x16,
vpx_highbd_10_variance32x16,
vp9_highbd_10_sub_pixel_variance32x16,
vp9_highbd_10_sub_pixel_avg_variance32x16,
NULL,
@ -1140,7 +1140,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X32,
vpx_highbd_sad16x32_bits10,
vpx_highbd_sad16x32_avg_bits10,
vp9_highbd_10_variance16x32,
vpx_highbd_10_variance16x32,
vp9_highbd_10_sub_pixel_variance16x32,
vp9_highbd_10_sub_pixel_avg_variance16x32,
NULL,
@ -1150,7 +1150,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X32,
vpx_highbd_sad64x32_bits10,
vpx_highbd_sad64x32_avg_bits10,
vp9_highbd_10_variance64x32,
vpx_highbd_10_variance64x32,
vp9_highbd_10_sub_pixel_variance64x32,
vp9_highbd_10_sub_pixel_avg_variance64x32,
NULL,
@ -1160,7 +1160,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X64,
vpx_highbd_sad32x64_bits10,
vpx_highbd_sad32x64_avg_bits10,
vp9_highbd_10_variance32x64,
vpx_highbd_10_variance32x64,
vp9_highbd_10_sub_pixel_variance32x64,
vp9_highbd_10_sub_pixel_avg_variance32x64,
NULL,
@ -1170,7 +1170,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X32,
vpx_highbd_sad32x32_bits10,
vpx_highbd_sad32x32_avg_bits10,
vp9_highbd_10_variance32x32,
vpx_highbd_10_variance32x32,
vp9_highbd_10_sub_pixel_variance32x32,
vp9_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits10,
@ -1180,7 +1180,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X64,
vpx_highbd_sad64x64_bits10,
vpx_highbd_sad64x64_avg_bits10,
vp9_highbd_10_variance64x64,
vpx_highbd_10_variance64x64,
vp9_highbd_10_sub_pixel_variance64x64,
vp9_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits10,
@ -1190,7 +1190,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X16,
vpx_highbd_sad16x16_bits10,
vpx_highbd_sad16x16_avg_bits10,
vp9_highbd_10_variance16x16,
vpx_highbd_10_variance16x16,
vp9_highbd_10_sub_pixel_variance16x16,
vp9_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits10,
@ -1200,7 +1200,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X8,
vpx_highbd_sad16x8_bits10,
vpx_highbd_sad16x8_avg_bits10,
vp9_highbd_10_variance16x8,
vpx_highbd_10_variance16x8,
vp9_highbd_10_sub_pixel_variance16x8,
vp9_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits10,
@ -1210,7 +1210,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X16,
vpx_highbd_sad8x16_bits10,
vpx_highbd_sad8x16_avg_bits10,
vp9_highbd_10_variance8x16,
vpx_highbd_10_variance8x16,
vp9_highbd_10_sub_pixel_variance8x16,
vp9_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits10,
@ -1220,7 +1220,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X8,
vpx_highbd_sad8x8_bits10,
vpx_highbd_sad8x8_avg_bits10,
vp9_highbd_10_variance8x8,
vpx_highbd_10_variance8x8,
vp9_highbd_10_sub_pixel_variance8x8,
vp9_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits10,
@ -1230,7 +1230,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X4,
vpx_highbd_sad8x4_bits10,
vpx_highbd_sad8x4_avg_bits10,
vp9_highbd_10_variance8x4,
vpx_highbd_10_variance8x4,
vp9_highbd_10_sub_pixel_variance8x4,
vp9_highbd_10_sub_pixel_avg_variance8x4,
NULL,
@ -1240,7 +1240,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X8,
vpx_highbd_sad4x8_bits10,
vpx_highbd_sad4x8_avg_bits10,
vp9_highbd_10_variance4x8,
vpx_highbd_10_variance4x8,
vp9_highbd_10_sub_pixel_variance4x8,
vp9_highbd_10_sub_pixel_avg_variance4x8,
NULL,
@ -1250,7 +1250,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X4,
vpx_highbd_sad4x4_bits10,
vpx_highbd_sad4x4_avg_bits10,
vp9_highbd_10_variance4x4,
vpx_highbd_10_variance4x4,
vp9_highbd_10_sub_pixel_variance4x4,
vp9_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits10,
@ -1262,7 +1262,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X16,
vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12,
vp9_highbd_12_variance32x16,
vpx_highbd_12_variance32x16,
vp9_highbd_12_sub_pixel_variance32x16,
vp9_highbd_12_sub_pixel_avg_variance32x16,
NULL,
@ -1272,7 +1272,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X32,
vpx_highbd_sad16x32_bits12,
vpx_highbd_sad16x32_avg_bits12,
vp9_highbd_12_variance16x32,
vpx_highbd_12_variance16x32,
vp9_highbd_12_sub_pixel_variance16x32,
vp9_highbd_12_sub_pixel_avg_variance16x32,
NULL,
@ -1282,7 +1282,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X32,
vpx_highbd_sad64x32_bits12,
vpx_highbd_sad64x32_avg_bits12,
vp9_highbd_12_variance64x32,
vpx_highbd_12_variance64x32,
vp9_highbd_12_sub_pixel_variance64x32,
vp9_highbd_12_sub_pixel_avg_variance64x32,
NULL,
@ -1292,7 +1292,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X64,
vpx_highbd_sad32x64_bits12,
vpx_highbd_sad32x64_avg_bits12,
vp9_highbd_12_variance32x64,
vpx_highbd_12_variance32x64,
vp9_highbd_12_sub_pixel_variance32x64,
vp9_highbd_12_sub_pixel_avg_variance32x64,
NULL,
@ -1302,7 +1302,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X32,
vpx_highbd_sad32x32_bits12,
vpx_highbd_sad32x32_avg_bits12,
vp9_highbd_12_variance32x32,
vpx_highbd_12_variance32x32,
vp9_highbd_12_sub_pixel_variance32x32,
vp9_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits12,
@ -1312,7 +1312,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X64,
vpx_highbd_sad64x64_bits12,
vpx_highbd_sad64x64_avg_bits12,
vp9_highbd_12_variance64x64,
vpx_highbd_12_variance64x64,
vp9_highbd_12_sub_pixel_variance64x64,
vp9_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits12,
@ -1322,7 +1322,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X16,
vpx_highbd_sad16x16_bits12,
vpx_highbd_sad16x16_avg_bits12,
vp9_highbd_12_variance16x16,
vpx_highbd_12_variance16x16,
vp9_highbd_12_sub_pixel_variance16x16,
vp9_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits12,
@ -1332,7 +1332,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X8,
vpx_highbd_sad16x8_bits12,
vpx_highbd_sad16x8_avg_bits12,
vp9_highbd_12_variance16x8,
vpx_highbd_12_variance16x8,
vp9_highbd_12_sub_pixel_variance16x8,
vp9_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits12,
@ -1342,7 +1342,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X16,
vpx_highbd_sad8x16_bits12,
vpx_highbd_sad8x16_avg_bits12,
vp9_highbd_12_variance8x16,
vpx_highbd_12_variance8x16,
vp9_highbd_12_sub_pixel_variance8x16,
vp9_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits12,
@ -1352,7 +1352,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X8,
vpx_highbd_sad8x8_bits12,
vpx_highbd_sad8x8_avg_bits12,
vp9_highbd_12_variance8x8,
vpx_highbd_12_variance8x8,
vp9_highbd_12_sub_pixel_variance8x8,
vp9_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits12,
@ -1362,7 +1362,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X4,
vpx_highbd_sad8x4_bits12,
vpx_highbd_sad8x4_avg_bits12,
vp9_highbd_12_variance8x4,
vpx_highbd_12_variance8x4,
vp9_highbd_12_sub_pixel_variance8x4,
vp9_highbd_12_sub_pixel_avg_variance8x4,
NULL,
@ -1372,7 +1372,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X8,
vpx_highbd_sad4x8_bits12,
vpx_highbd_sad4x8_avg_bits12,
vp9_highbd_12_variance4x8,
vpx_highbd_12_variance4x8,
vp9_highbd_12_sub_pixel_variance4x8,
vp9_highbd_12_sub_pixel_avg_variance4x8,
NULL,
@ -1382,7 +1382,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X4,
vpx_highbd_sad4x4_bits12,
vpx_highbd_sad4x4_avg_bits12,
vp9_highbd_12_variance4x4,
vpx_highbd_12_variance4x4,
vp9_highbd_12_sub_pixel_variance4x4,
vp9_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits12,
@ -1805,61 +1805,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
vp9_variance32x16, vp9_sub_pixel_variance32x16,
vpx_variance32x16, vp9_sub_pixel_variance32x16,
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
vp9_variance16x32, vp9_sub_pixel_variance16x32,
vpx_variance16x32, vp9_sub_pixel_variance16x32,
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
vp9_variance64x32, vp9_sub_pixel_variance64x32,
vpx_variance64x32, vp9_sub_pixel_variance64x32,
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
vp9_variance32x64, vp9_sub_pixel_variance32x64,
vpx_variance32x64, vp9_sub_pixel_variance32x64,
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
vp9_variance32x32, vp9_sub_pixel_variance32x32,
vpx_variance32x32, vp9_sub_pixel_variance32x32,
vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
vpx_sad32x32x4d)
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
vp9_variance64x64, vp9_sub_pixel_variance64x64,
vpx_variance64x64, vp9_sub_pixel_variance64x64,
vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
vpx_sad64x64x4d)
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
vp9_variance16x16, vp9_sub_pixel_variance16x16,
vpx_variance16x16, vp9_sub_pixel_variance16x16,
vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
vpx_sad16x16x4d)
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
vp9_variance16x8, vp9_sub_pixel_variance16x8,
vpx_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8,
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
vp9_variance8x16, vp9_sub_pixel_variance8x16,
vpx_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16,
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
vp9_variance8x8, vp9_sub_pixel_variance8x8,
vpx_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8,
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
vp9_variance8x4, vp9_sub_pixel_variance8x4,
vpx_variance8x4, vp9_sub_pixel_variance8x4,
vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
vp9_variance4x8, vp9_sub_pixel_variance4x8,
vpx_variance4x8, vp9_sub_pixel_variance4x8,
vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
vp9_variance4x4, vp9_sub_pixel_variance4x4,
vpx_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4,
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
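For context, the BFP macro itself sits above this hunk; judging from the visible cpi->fn_ptr[BT].sdx4df = SDX4DF; line and the vp9_variance_fn_ptr_t declaration later in this patch, it presumably just stores each argument into the per-block-size function table. A hedged sketch (the field names other than sdx4df are assumptions, not confirmed by this diff):
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
  cpi->fn_ptr[BT].sdf = SDF;       /* full-pel SAD                */ \
  cpi->fn_ptr[BT].sdaf = SDAF;     /* SAD against averaged pred   */ \
  cpi->fn_ptr[BT].vf = VF;         /* variance (now in vpx_dsp)   */ \
  cpi->fn_ptr[BT].svf = SVF;       /* sub-pel variance            */ \
  cpi->fn_ptr[BT].svaf = SVAF;     /* sub-pel avg variance        */ \
  cpi->fn_ptr[BT].sdx3f = SDX3F;   /* SAD at 3 offsets            */ \
  cpi->fn_ptr[BT].sdx8f = SDX8F;   /* SAD at 8 offsets            */ \
  cpi->fn_ptr[BT].sdx4df = SDX4DF; /* SAD against 4 references    */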
@ -2079,7 +2079,7 @@ static int64_t get_sse(const uint8_t *a, int a_stride,
const uint8_t *pa = a;
const uint8_t *pb = b;
for (x = 0; x < width / 16; ++x) {
vp9_mse16x16(pa, a_stride, pb, b_stride, &sse);
vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
pa += 16;
@ -2124,21 +2124,21 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
unsigned int sse = 0;
int sum = 0;
if (dw > 0) {
highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
dw, height, &sse, &sum);
highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
dw, height, &sse, &sum);
total_sse += sse;
}
if (dh > 0) {
highbd_variance(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride,
width - dw, dh, &sse, &sum);
highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride,
width - dw, dh, &sse, &sum);
total_sse += sse;
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
for (x = 0; x < width / 16; ++x) {
vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse);
vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
pa += 16;
pb += 16;


@ -12,6 +12,7 @@
#include <math.h>
#include <stdio.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_mem/vpx_mem.h"
@ -267,13 +268,13 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
switch (bsize) {
case BLOCK_8X8:
return vp9_mse8x8;
return vpx_mse8x8;
case BLOCK_16X8:
return vp9_mse16x8;
return vpx_mse16x8;
case BLOCK_8X16:
return vp9_mse8x16;
return vpx_mse8x16;
default:
return vp9_mse16x16;
return vpx_mse16x16;
}
}
@ -293,37 +294,37 @@ static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
default:
switch (bsize) {
case BLOCK_8X8:
return vp9_highbd_mse8x8;
return vpx_highbd_8_mse8x8;
case BLOCK_16X8:
return vp9_highbd_mse16x8;
return vpx_highbd_8_mse16x8;
case BLOCK_8X16:
return vp9_highbd_mse8x16;
return vpx_highbd_8_mse8x16;
default:
return vp9_highbd_mse16x16;
return vpx_highbd_8_mse16x16;
}
break;
case 10:
switch (bsize) {
case BLOCK_8X8:
return vp9_highbd_10_mse8x8;
return vpx_highbd_10_mse8x8;
case BLOCK_16X8:
return vp9_highbd_10_mse16x8;
return vpx_highbd_10_mse16x8;
case BLOCK_8X16:
return vp9_highbd_10_mse8x16;
return vpx_highbd_10_mse8x16;
default:
return vp9_highbd_10_mse16x16;
return vpx_highbd_10_mse16x16;
}
break;
case 12:
switch (bsize) {
case BLOCK_8X8:
return vp9_highbd_12_mse8x8;
return vpx_highbd_12_mse8x8;
case BLOCK_16X8:
return vp9_highbd_12_mse16x8;
return vpx_highbd_12_mse16x8;
case BLOCK_8X16:
return vp9_highbd_12_mse8x16;
return vpx_highbd_12_mse8x16;
default:
return vp9_highbd_12_mse16x16;
return vpx_highbd_12_mse16x16;
}
break;
}
@ -634,7 +635,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
xd->mi[0]->mbmi.tx_size = use_dc_pred ?
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
vp9_encode_intra_block_plane(x, bsize, 0);
this_error = vp9_get_mb_ss(x->plane[0].src_diff);
this_error = vpx_get_mb_ss(x->plane[0].src_diff);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
switch (cm->bit_depth) {


@ -13,6 +13,7 @@
#include <stdio.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@ -303,13 +304,13 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
if (second_pred != NULL) {
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
y_stride);
besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
sse1);
} else {
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
}
} else {
@ -321,7 +322,7 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
(void) xd;
if (second_pred != NULL) {
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
} else {
besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);


@ -14,6 +14,7 @@
#include <stdio.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@ -215,7 +216,7 @@ static void block_variance(const uint8_t *src, int src_stride,
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
vp9_get8x8var(src + src_stride * i + j, src_stride,
vpx_get8x8var(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride,
&sse8x8[k], &sum8x8[k]);
*sse += sse8x8[k];


@ -9,6 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
@ -18,26 +19,6 @@
#include "vp9/encoder/vp9_variance.h"
void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// first-pass of 2-D separable filter.
@ -100,25 +81,6 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
}
}
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
unsigned int i, sum = 0;
for (i = 0; i < 256; ++i) {
sum += src_ptr[i] * src_ptr[i];
}
return sum;
}
#define VAR(W, H) \
unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
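The identity used here is the integer form of the variance, var = sse - sum^2 / (W * H). As a quick check, take a 4x4 block whose per-pixel difference is a constant +2: sum = 32, sse = 64, and 64 - (32 * 32) / 16 = 0; a pure DC offset between the two blocks contributes nothing to the variance, only to the sum term.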
#define SUBPIX_VAR(W, H) \
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
@ -133,7 +95,7 @@ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
}
#define SUBPIX_AVG_VAR(W, H) \
@ -152,178 +114,51 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
}
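The two filter passes in these macros implement a separable 2-tap bilinear interpolation. Assuming the usual vp9 convention that each BILINEAR_FILTERS_2TAP(offset) pair of taps sums to 128 (FILTER_BITS = 7), every filtered pixel is out = ROUND_POWER_OF_TWO(a * f0 + b * f1, 7), where a and b are the two neighbouring source pixels: offset 0 degenerates to a copy (taps 128, 0) and offset 4, the half-pel position, to a rounded average (taps 64, 64). The filtered W x H block is then passed to the plain vpx_variance##W##x##H##_c, optionally after being averaged with second_pred.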
void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
}
void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
}
unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum);
return *sse;
}
unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum);
return *sse;
}
unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum);
return *sse;
}
unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum);
return *sse;
}
VAR(4, 4)
SUBPIX_VAR(4, 4)
SUBPIX_AVG_VAR(4, 4)
VAR(4, 8)
SUBPIX_VAR(4, 8)
SUBPIX_AVG_VAR(4, 8)
VAR(8, 4)
SUBPIX_VAR(8, 4)
SUBPIX_AVG_VAR(8, 4)
VAR(8, 8)
SUBPIX_VAR(8, 8)
SUBPIX_AVG_VAR(8, 8)
VAR(8, 16)
SUBPIX_VAR(8, 16)
SUBPIX_AVG_VAR(8, 16)
VAR(16, 8)
SUBPIX_VAR(16, 8)
SUBPIX_AVG_VAR(16, 8)
VAR(16, 16)
SUBPIX_VAR(16, 16)
SUBPIX_AVG_VAR(16, 16)
VAR(16, 32)
SUBPIX_VAR(16, 32)
SUBPIX_AVG_VAR(16, 32)
VAR(32, 16)
SUBPIX_VAR(32, 16)
SUBPIX_AVG_VAR(32, 16)
VAR(32, 32)
SUBPIX_VAR(32, 32)
SUBPIX_AVG_VAR(32, 32)
VAR(32, 64)
SUBPIX_VAR(32, 64)
SUBPIX_AVG_VAR(32, 64)
VAR(64, 32)
SUBPIX_VAR(64, 32)
SUBPIX_AVG_VAR(64, 32)
VAR(64, 64)
SUBPIX_VAR(64, 64)
SUBPIX_AVG_VAR(64, 64)
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
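vp9_comp_avg_pred (being replaced by vpx_comp_avg_pred) averages the two predictors with round-to-nearest: ROUND_POWER_OF_TWO(x, 1) expands to (x + 1) >> 1, so for pred = 3 and ref = 4 the compound pixel is (3 + 4 + 1) >> 1 = 4; ties round up instead of truncating.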
#if CONFIG_VP9_HIGHBITDEPTH
void highbd_variance64(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, uint64_t *sse,
uint64_t *sum) {
int i, j;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
void highbd_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse,
int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)sse_long;
*sum = (int)sum_long;
}
void highbd_10_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse,
int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
}
void highbd_12_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse,
int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
}
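The per-bit-depth rounding keeps every variance comparable in the 8-bit domain: a depth-b pixel difference can be 2^(b-8) times larger than its 8-bit counterpart, so the sum scales by 2^(b-8) and the sum of squares by 2^(2*(b-8)). For 10-bit data that factor is 4, hence sum >> 2 and sse >> 4; for 12-bit it is 16, hence sum >> 4 and sse >> 8 (all via ROUND_POWER_OF_TWO, i.e. rounded rather than truncated). The plain highbd_variance path applies no scaling because its input already uses the 8-bit range.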
static void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8,
uint16_t *output_ptr,
@ -374,35 +209,6 @@ static void highbd_var_filter_block2d_bil_second_pass(
}
}
#define HIGHBD_VAR(W, H) \
unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define HIGHBD_SUBPIX_VAR(W, H) \
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
@ -417,7 +223,7 @@ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
dst_stride, sse); \
} \
\
@ -434,7 +240,7 @@ unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
} \
\
@ -451,7 +257,7 @@ unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
}
@ -471,10 +277,10 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
dst_stride, sse); \
} \
\
@ -493,10 +299,10 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
} \
\
@ -515,137 +321,49 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
}
#define HIGHBD_GET_VAR(S) \
void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}
#define HIGHBD_MSE(W, H) \
unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
}
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)
HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
HIGHBD_VAR(4, 4)
HIGHBD_SUBPIX_VAR(4, 4)
HIGHBD_SUBPIX_AVG_VAR(4, 4)
HIGHBD_VAR(4, 8)
HIGHBD_SUBPIX_VAR(4, 8)
HIGHBD_SUBPIX_AVG_VAR(4, 8)
HIGHBD_VAR(8, 4)
HIGHBD_SUBPIX_VAR(8, 4)
HIGHBD_SUBPIX_AVG_VAR(8, 4)
HIGHBD_VAR(8, 8)
HIGHBD_SUBPIX_VAR(8, 8)
HIGHBD_SUBPIX_AVG_VAR(8, 8)
HIGHBD_VAR(8, 16)
HIGHBD_SUBPIX_VAR(8, 16)
HIGHBD_SUBPIX_AVG_VAR(8, 16)
HIGHBD_VAR(16, 8)
HIGHBD_SUBPIX_VAR(16, 8)
HIGHBD_SUBPIX_AVG_VAR(16, 8)
HIGHBD_VAR(16, 16)
HIGHBD_SUBPIX_VAR(16, 16)
HIGHBD_SUBPIX_AVG_VAR(16, 16)
HIGHBD_VAR(16, 32)
HIGHBD_SUBPIX_VAR(16, 32)
HIGHBD_SUBPIX_AVG_VAR(16, 32)
HIGHBD_VAR(32, 16)
HIGHBD_SUBPIX_VAR(32, 16)
HIGHBD_SUBPIX_AVG_VAR(32, 16)
HIGHBD_VAR(32, 32)
HIGHBD_SUBPIX_VAR(32, 32)
HIGHBD_SUBPIX_AVG_VAR(32, 32)
HIGHBD_VAR(32, 64)
HIGHBD_SUBPIX_VAR(32, 64)
HIGHBD_SUBPIX_AVG_VAR(32, 64)
HIGHBD_VAR(64, 32)
HIGHBD_SUBPIX_VAR(64, 32)
HIGHBD_SUBPIX_AVG_VAR(64, 32)
HIGHBD_VAR(64, 64)
HIGHBD_SUBPIX_VAR(64, 64)
HIGHBD_SUBPIX_AVG_VAR(64, 64)
void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH


@ -12,31 +12,64 @@
#define VP9_ENCODER_VP9_VARIANCE_H_
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h,
unsigned int *sse, int *sum);
// TODO(johannkoenig): All functions which depend on
// [highbd_][8|10|12_]variance should be refactored or moved to vpx_dsp.
static void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
#if CONFIG_VP9_HIGHBITDEPTH
void highbd_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h,
unsigned int *sse, int *sum);
static void highbd_variance64(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, uint64_t *sse, uint64_t *sum) {
int i, j;
void highbd_10_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h,
unsigned int *sse, int *sum);
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
*sum = 0;
*sse = 0;
void highbd_12_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h,
unsigned int *sse, int *sum);
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
static void highbd_8_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)sse_long;
*sum = (int)sum_long;
}
#endif
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
@ -95,15 +128,6 @@ typedef struct vp9_variance_vtable {
vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred,
int width, int height,
const uint8_t *ref, int ref_stride);
#endif
#ifdef __cplusplus
} // extern "C"
#endif


@ -13,237 +13,6 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
static void highbd_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
}
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
}
#define HIGH_GET_VAR(S) \
void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
} \
\
void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
HIGH_GET_VAR(16);
HIGH_GET_VAR(8);
#undef HIGH_GET_VAR
#define VAR_FN(w, h, block_size, shift) \
uint32_t vp9_highbd_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_10_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_12_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
}
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);
#undef VAR_FN
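In these SSE2 versions the division by the pixel count becomes a right shift, so the last VAR_FN argument is simply log2(w * h): 64 * 64 = 4096 gives shift 12, 32 * 16 = 512 gives 9, 16 * 8 = 128 gives 7, and so on down the list, matching the >> shift in the macro body.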
unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \


@ -13,18 +13,6 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
int x_offset, int y_offset,
const uint8_t *dst, int dst_stride,
@ -42,81 +30,6 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int height,
unsigned int *sseptr);
static void variance_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
get_var_avx2 var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += 16) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(&src[src_stride * i + j], src_stride,
&ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_get16x16var_avx2, 16);
return *sse - (((unsigned int)sum * sum) >> 8);
}
unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse;
}
unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
sse, &sum, vp9_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
sse, &sum, vp9_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 10);
}
unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
sse, &sum, vp9_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 12);
}
unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
sse, &sum, vp9_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,


@ -16,299 +16,6 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef void (*variance_fn_t)(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
__m128i vsum = _mm_setzero_si128();
int i;
for (i = 0; i < 32; ++i) {
const __m128i v = _mm_loadu_si128((const __m128i *)src);
vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
src += 8;
}
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
return _mm_cvtsi128_si32(vsum);
}
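vp9_get_mb_ss_sse2 (its C reference was removed earlier in this patch) returns the raw sum of squares of a 16x16 macroblock of int16 residuals: 32 iterations of 8 values cover all 32 * 8 = 256 = 16 * 16 elements, _mm_madd_epi16(v, v) accumulates src[i] * src[i] pairwise into four 32-bit lanes, and the two shift-and-add steps fold those lanes into the scalar result.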
#define READ64(p, stride, i) \
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
static void get4x4var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
// sum
__m128i vsum = _mm_add_epi16(diff0, diff1);
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
// sse
vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
_mm_madd_epi16(diff1, diff1));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
*sse = _mm_cvtsi128_si32(vsum);
}
void vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
__m128i vsse = _mm_setzero_si128();
int i;
for (i = 0; i < 8; i += 2) {
const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(src + i * src_stride)), zero);
const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(ref + i * ref_stride)), zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(src + (i + 1) * src_stride)), zero);
const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(ref + (i + 1) * ref_stride)), zero);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
vsum = _mm_add_epi16(vsum, diff0);
vsum = _mm_add_epi16(vsum, diff1);
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
}
// sum
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
// sse
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
*sse = _mm_cvtsi128_si32(vsse);
}
void vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
__m128i vsse = _mm_setzero_si128();
int i;
for (i = 0; i < 16; ++i) {
const __m128i s = _mm_loadu_si128((const __m128i *)src);
const __m128i r = _mm_loadu_si128((const __m128i *)ref);
const __m128i src0 = _mm_unpacklo_epi8(s, zero);
const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i src1 = _mm_unpackhi_epi8(s, zero);
const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
vsum = _mm_add_epi16(vsum, diff0);
vsum = _mm_add_epi16(vsum, diff1);
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
src += src_stride;
ref += ref_stride;
}
// sum
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
*sum = (int16_t)_mm_extract_epi16(vsum, 0) +
(int16_t)_mm_extract_epi16(vsum, 1);
// sse
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
*sse = _mm_cvtsi128_si32(vsse);
}
static void variance_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
variance_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 4);
}
unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
sse, &sum, get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
sse, &sum, get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 6);
}
unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
sse, &sum, vp9_get8x8var_sse2, 8);
return *sse - (((unsigned int)sum * sum) >> 7);
}
unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
sse, &sum, vp9_get8x8var_sse2, 8);
return *sse - (((unsigned int)sum * sum) >> 7);
}
unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 8);
}
unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 10);
}
unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 12);
}
unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
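Unlike the variance functions above, these mse wrappers return the uncorrected sum of squared differences: they run the matching variance kernel only for its sse side effect and return *sse without subtracting the (sum * sum) >> shift mean term, so vp9_mse16x16 is the plain sum of (src - ref)^2 over the 16x16 block rather than a per-pixel average.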
// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \


@ -102,13 +102,11 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif


@ -0,0 +1,363 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vpx_variance16x16_media|
EXPORT |vpx_variance8x8_media|
EXPORT |vpx_mse16x16_media|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vpx_variance16x16_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
loop16x16
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r5, [r2, #0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r5, [r2, #4] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r5, [r2, #8] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r5, [r2, #12] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop16x16
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
END
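The final sub r0, r11, r0, lsr #8 returns sse - (sum * sum) / 256, i.e. the integer variance identity for a 16x16 block (16 * 16 = 256 = 2^8); the 8x8 routine below does the same with ASR #6, since 8 * 8 = 64 = 2^6.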
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vpx_variance8x8_media| PROC
push {r4-r10, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #8 ; set loop counter to 8 (=block height)
mov r4, #0 ; initialize sum = 0
mov r5, #0 ; initialize sse = 0
loop8x8
; 1st 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels
ldr r7, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r6, r7 ; calculate difference
pld [r0, r1, lsl #1]
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r0, #0x4] ; load 4 src pixels
ldr r7, [r2, #0x4] ; load 4 ref pixels
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
usub8 r8, r6, r7 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1 ; next row
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
bne loop8x8
; return stuff
ldr r8, [sp, #32] ; get address of sse
mul r1, r4, r4 ; sum * sum
str r5, [r8] ; store sse
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
pop {r4-r10, pc}
ENDP
END
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;
;note: Based on vpx_variance16x16_media. In this function, sum is never used.
; So, we can remove this part of calculation.
|vpx_mse16x16_media| PROC
push {r4-r9, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #16 ; set loop counter to 16 (=block height)
mov r4, #0 ; initialize sse = 0
loopmse
; 1st 4 pixels
ldr r5, [r0, #0x0] ; load 4 src pixels
ldr r6, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r5, r6 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x4] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r2, #0x4] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x8] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r6, [r2, #0x8] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0xc] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r6, [r2, #0xc] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
subs r12, r12, #1 ; next row
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
bne loopmse
; return stuff
ldr r1, [sp, #28] ; get address of sse
mov r0, r4 ; return sse
str r4, [r1] ; store sse
pop {r4-r9, pc}
ENDP
END

vpx_dsp/arm/variance_neon.c (new file, 417 lines)

@ -0,0 +1,417 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
const int32x4_t a = vpaddlq_s16(v_16x8);
const int64x2_t b = vpaddlq_s32(a);
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
vreinterpret_s32_s64(vget_high_s64(b)));
return vget_lane_s32(c, 0);
}
static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
const int64x2_t b = vpaddlq_s32(v_32x4);
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
vreinterpret_s32_s64(vget_high_s64(b)));
return vget_lane_s32(c, 0);
}
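Both helpers reduce a vector to a scalar with widening pairwise adds: eight int16 lanes collapse to four int32 sums, then to two int64 sums, and the two halves are added back as 32-bit values. For example the lanes {1, 2, 3, 4, 5, 6, 7, 8} become {3, 7, 11, 15}, then {10, 26}, and the returned total is 36.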
// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, uint32_t *sse, int *sum) {
int i, j;
int16x8_t v_sum = vdupq_n_s16(0);
int32x4_t v_sse_lo = vdupq_n_s32(0);
int32x4_t v_sse_hi = vdupq_n_s32(0);
for (i = 0; i < h; ++i) {
for (j = 0; j < w; j += 8) {
const uint8x8_t v_a = vld1_u8(&a[j]);
const uint8x8_t v_b = vld1_u8(&b[j]);
const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
v_sum = vaddq_s16(v_sum, sv_diff);
v_sse_lo = vmlal_s16(v_sse_lo,
vget_low_s16(sv_diff),
vget_low_s16(sv_diff));
v_sse_hi = vmlal_s16(v_sse_hi,
vget_high_s16(sv_diff),
vget_high_s16(sv_diff));
}
a += a_stride;
b += b_stride;
}
*sum = horizontal_add_s16x8(v_sum);
*sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}
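The w * h < 2048 bound in the comment above comes from the int16 accumulator: each of the eight v_sum lanes collects w * h / 8 differences, each in [-255, 255], so its worst-case magnitude is (w * h / 8) * 255. At 32x32 that is 128 * 255 = 32640, which still fits in int16, whereas a 64x32 block would need 256 * 255 = 65280 and overflow; this is why the 32x64, 64x32 and 64x64 functions below split the block into 32x32 or 64x16 pieces and add the partial sse/sum results.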
void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse, int *sum) {
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
}
void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse, int *sum) {
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
}
unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8
}
unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16
}
unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32
}
unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
variance_neon_w8(a + (32 * a_stride), a_stride,
b + (32 * b_stride), b_stride, 32, 32,
&sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
}
unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
variance_neon_w8(a + (16 * a_stride), a_stride,
b + (16 * b_stride), b_stride, 64, 16,
&sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 64 * 32
}
unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
variance_neon_w8(a + (16 * a_stride), a_stride,
b + (16 * b_stride), b_stride, 64, 16,
&sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
b + (16 * 2 * b_stride), b_stride,
64, 16, &sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
b + (16 * 3 * b_stride), b_stride,
64, 16, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
}
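/* vpx_variance16x8_neon and vpx_variance8x16_neon below accumulate the sum of
 * differences in q8 and the sum of squared differences in q9/q10, then form
 * sse - ((sum * sum) >> 7) at the end (16 * 8 = 8 * 16 = 128 = 2^7). */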
unsigned int vpx_variance16x8_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 4; i++) {
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vpx_variance8x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
uint8x8_t d0u8, d2u8, d4u8, d6u8;
int16x4_t d22s16, d23s16, d24s16, d25s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) {
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d2u8, d6u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vpx_mse16x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
int64x1_t d0s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
q7s32 = vdupq_n_s32(0);
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q10s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
unsigned int vpx_get4x4sse_cs_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride) {
int16x4_t d22s16, d24s16, d26s16, d28s16;
int64x1_t d0s64;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d1u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d5u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d3u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d7u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d1u8, d5u8);
q13u16 = vsubl_u8(d2u8, d6u8);
q14u16 = vsubl_u8(d3u8, d7u8);
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
q7s32 = vmull_s16(d22s16, d22s16);
q8s32 = vmull_s16(d24s16, d24s16);
q9s32 = vmull_s16(d26s16, d26s16);
q10s32 = vmull_s16(d28s16, d28s16);
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q9s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q9s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
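All of the NEON wrappers above, and the C reference code later in this patch, finish with the same identity: variance = sse - sum^2 / (w * h). Because w * h is always a power of two here, the division becomes a right shift (16x16 -> >> 8, 32x32 -> >> 10, 64x64 -> >> 12). A minimal scalar sketch of that last step (the helper name is illustrative, not part of the patch):
#include <stdint.h>
static uint32_t variance_from_sums(uint32_t sse, int sum, int w, int h) {
  /* sum and sse are the per-block totals produced by a kernel such as
   * variance_neon_w8(). */
  return sse - (uint32_t)(((int64_t)sum * sum) / (w * h));
}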


@ -33,6 +33,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
return sad;
}
// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
* The function averages every corresponding element of the buffers and stores
* the value in a third buffer, comp_pred.

vpx_dsp/variance.c (new file, 306 lines added)

@ -0,0 +1,306 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride) {
int distortion = 0;
int r, c;
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int diff = a[c] - b[c];
distortion += diff * diff;
}
a += a_stride;
b += b_stride;
}
return distortion;
}
unsigned int vpx_get_mb_ss_c(const int16_t *a) {
unsigned int i, sum = 0;
for (i = 0; i < 256; ++i) {
sum += a[i] * a[i];
}
return sum;
}
static void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
#define VAR(W, H) \
unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value by reference instead of returning
 * sse - sum^2 / (w * h).
 */
#define GET_VAR(W, H) \
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse, int *sum) { \
variance(a, a_stride, b, b_stride, W, H, sse, sum); \
}
/* Identical to the variance call except it does not subtract the
 * sum^2 / (w * h) term; it stores sse through the pointer and also
 * returns it.
 */
#define MSE(W, H) \
unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse; \
}
VAR(64, 64)
VAR(64, 32)
VAR(32, 64)
VAR(32, 32)
VAR(32, 16)
VAR(16, 32)
VAR(16, 16)
VAR(16, 8)
VAR(8, 16)
VAR(8, 8)
VAR(8, 4)
VAR(4, 8)
VAR(4, 4)
GET_VAR(16, 16)
GET_VAR(8, 8)
MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
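/* For reference, VAR(16, 16) above expands (modulo whitespace) to:
 *
 *   unsigned int vpx_variance16x16_c(const uint8_t *a, int a_stride,
 *                                    const uint8_t *b, int b_stride,
 *                                    unsigned int *sse) {
 *     int sum;
 *     variance(a, a_stride, b, b_stride, 16, 16, sse, &sum);
 *     return *sse - (((int64_t)sum * sum) / (16 * 16));
 *   }
 */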
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
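/* Note: ROUND_POWER_OF_TWO(tmp, 1) above computes the rounding average,
 * (pred[j] + ref[j] + 1) >> 1. */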
#if CONFIG_VP9_HIGHBITDEPTH
static void highbd_variance64(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, uint64_t *sse, uint64_t *sum) {
int i, j;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
static void highbd_8_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)sse_long;
*sum = (int)sum_long;
}
static void highbd_10_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}
static void highbd_12_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
#define HIGHBD_VAR(W, H) \
unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define HIGHBD_GET_VAR(S) \
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}
#define HIGHBD_MSE(W, H) \
unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
}
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)
HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
HIGHBD_VAR(64, 64)
HIGHBD_VAR(64, 32)
HIGHBD_VAR(32, 64)
HIGHBD_VAR(32, 32)
HIGHBD_VAR(32, 16)
HIGHBD_VAR(16, 32)
HIGHBD_VAR(16, 16)
HIGHBD_VAR(16, 8)
HIGHBD_VAR(8, 16)
HIGHBD_VAR(8, 8)
HIGHBD_VAR(8, 4)
HIGHBD_VAR(4, 8)
HIGHBD_VAR(4, 4)
void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH


@ -17,6 +17,7 @@ DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
@ -29,9 +30,28 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS
ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += variance.c
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
DSP_SRCS-yes += vpx_dsp_rtcd.c


@ -392,4 +392,212 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS
if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x64 sse2 avx2 neon/;
add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x32 sse2 avx2 neon/;
add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x32 sse2 avx2 neon/;
add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x16 sse2 avx2/;
add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x32 sse2/;
add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/;
add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x8 mmx sse2 neon/;
add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x16 mmx sse2 neon/;
add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x8 mmx sse2 media neon/;
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x4 sse2/;
add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x8 sse2/;
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x4 mmx sse2/;
add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get16x16var sse2 avx2 neon/;
add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get8x8var mmx sse2 neon/;
add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/;
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse16x8 sse2/;
add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse8x16 sse2/;
add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse8x8 sse2/;
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
specialize qw/vpx_get_mb_ss mmx sse2/;
add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
specialize qw/vpx_get4x4sse_cs neon/;
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x64 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x32 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance32x64 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance32x32 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance32x16 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance16x32 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance16x8 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance8x16 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance8x8 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance64x64 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance64x32 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance32x64 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance32x32 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance32x16 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance16x32 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance16x8 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance8x16 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance8x8 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance64x64 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance64x32 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance32x64 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance32x32 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance32x16 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance16x32 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance16x8 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance8x16 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance8x8 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_mse16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_mse8x8 sse2/;
add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_mse16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_mse8x8 sse2/;
add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_mse16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_mse8x8 sse2/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
1;
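Each add_proto line above declares the generic prototype, and each specialize line lists the optimized implementations the run-time dispatcher may pick from. Callers include the generated header and use only the generic name; a minimal sketch of typical usage (illustrative, not part of the patch):
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
static unsigned int block_variance_16x16(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  /* Resolves to the C, MMX, SSE2, AVX2, media or NEON version depending on
   * the build configuration and the CPU detected at run time. */
  return vpx_variance16x16(src, src_stride, ref, ref_stride, &sse);
}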


@ -11,7 +11,7 @@
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vp9_highbd_calc16x16var_sse2
;unsigned int vpx_highbd_calc16x16var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
@ -20,8 +20,8 @@
; unsigned int * SSE,
; int * Sum
;)
global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
sym(vp9_highbd_calc16x16var_sse2):
global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
sym(vpx_highbd_calc16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@ -164,7 +164,7 @@ sym(vp9_highbd_calc16x16var_sse2):
ret
;unsigned int vp9_highbd_calc8x8var_sse2
;unsigned int vpx_highbd_calc8x8var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
@ -173,8 +173,8 @@ sym(vp9_highbd_calc16x16var_sse2):
; unsigned int * SSE,
; int * Sum
;)
global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
sym(vp9_highbd_calc8x8var_sse2):
global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
sym(vpx_highbd_calc8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6


@ -0,0 +1,245 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
}
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
}
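/* The 10- and 12-bit helpers above rescale the accumulated totals back to
 * 8-bit precision (sum >> 2, sse >> 4 for 10-bit input; sum >> 4, sse >> 8
 * for 12-bit, with rounding), so the block-size shifts used by VAR_FN below
 * stay the same as in the 8-bit path. */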
#define HIGH_GET_VAR(S) \
void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
HIGH_GET_VAR(16);
HIGH_GET_VAR(8);
#undef HIGH_GET_VAR
#define VAR_FN(w, h, block_size, shift) \
uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vpx_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_10_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_12_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
}
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);
#undef VAR_FN
unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}


@ -0,0 +1,93 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
static void variance_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
get_var_avx2 var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += 16) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(&src[src_stride * i + j], src_stride,
&ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vpx_get16x16var_avx2, 16);
return *sse - (((unsigned int)sum * sum) >> 8);
}
unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse;
}
unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 10);
}
unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 12);
}
unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 11);
}


@ -10,9 +10,9 @@
#include <immintrin.h> // AVX2
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
void vp9_get16x16var_avx2(const unsigned char *src_ptr,
void vpx_get16x16var_avx2(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
@ -123,7 +123,7 @@ void vp9_get16x16var_avx2(const unsigned char *src_ptr,
}
}
void vp9_get32x32var_avx2(const unsigned char *src_ptr,
void vpx_get32x32var_avx2(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,


@ -0,0 +1,424 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
sub rsp, 8
; end prolog
mov rax, arg(0) ;src_ptr
mov rcx, 16
pxor mm4, mm4
.NEXTROW:
movq mm0, [rax]
movq mm1, [rax+8]
movq mm2, [rax+16]
movq mm3, [rax+24]
pmaddwd mm0, mm0
pmaddwd mm1, mm1
pmaddwd mm2, mm2
pmaddwd mm3, mm3
paddd mm4, mm0
paddd mm4, mm1
paddd mm4, mm2
paddd mm4, mm3
add rax, 32
dec rcx
ja .NEXTROW
movq QWORD PTR [rsp], mm4
;return sum[0]+sum[1];
movsxd rax, dword ptr [rsp]
movsxd rcx, dword ptr [rsp+4]
add rax, rcx
; begin epilog
add rsp, 8
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vpx_get8x8var_mmx
;(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vpx_get8x8var_mmx) PRIVATE
sym(vpx_get8x8var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
push rsi
push rdi
push rbx
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mm5 (difference accumulator)
pxor mm6, mm6 ; Blank mm6 (zero register for unpacking)
pxor mm7, mm7 ; Blank mm7 (squared-difference accumulator)
mov rax, arg(0) ;[src_ptr] ; Load base addresses
mov rbx, arg(2) ;[ref_ptr]
movsxd rcx, dword ptr arg(1) ;[source_stride]
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm1, [rbx] ; Copy eight bytes to mm1
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 2
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 3
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 4
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 5
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
; movq mm4, [rbx + rdx]
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 6
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 7
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 8
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
movsx rdx, WORD PTR [rsp+8]
movsx rcx, WORD PTR [rsp+10]
movsx rbx, WORD PTR [rsp+12]
movsx rax, WORD PTR [rsp+14]
add rdx, rcx
add rbx, rax
add rdx, rbx ;XSum
movsxd rax, DWORD PTR [rsp]
movsxd rcx, DWORD PTR [rsp+4]
add rax, rcx ;XXSum
mov rsi, arg(4) ;SSE
mov rdi, arg(5) ;Sum
mov dword ptr [rsi], eax
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void
;vpx_get4x4var_mmx
;(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vpx_get4x4var_mmx) PRIVATE
sym(vpx_get4x4var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
push rsi
push rdi
push rbx
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mm5 (difference accumulator)
pxor mm6, mm6 ; Blank mm6 (zero register for unpacking)
pxor mm7, mm7 ; Blank mm7 (squared-difference accumulator)
mov rax, arg(0) ;[src_ptr] ; Load base addresses
mov rbx, arg(2) ;[ref_ptr]
movsxd rcx, dword ptr arg(1) ;[source_stride]
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movd mm0, [rax] ; Copy four bytes to mm0
movd mm1, [rbx] ; Copy four bytes to mm1
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 3
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 4
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
paddd mm7, mm0 ; accumulate in mm7
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
movsx rdx, WORD PTR [rsp+8]
movsx rcx, WORD PTR [rsp+10]
movsx rbx, WORD PTR [rsp+12]
movsx rax, WORD PTR [rsp+14]
add rdx, rcx
add rbx, rax
add rdx, rbx ;XSum
movsxd rax, DWORD PTR [rsp]
movsxd rcx, DWORD PTR [rsp+4]
add rax, rcx ;XXSum
mov rsi, arg(4) ;SSE
mov rdi, arg(5) ;Sum
mov dword ptr [rsi], eax
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret

vpx_dsp/x86/variance_mmx.c (new file, 107 lines added)

@ -0,0 +1,107 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse, int *sum);
unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
}
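All the wrappers in this file apply the same identity, variance = SSE - sum^2 / N, where N is the pixel count of the block, so the shift is log2(N): 4 for the 4x4 case above, 6 for 8x8, 7 for 16x8 and 8x16, 8 for 16x16. A hypothetical helper expressing the pattern (the name and the 64-bit widening are illustrative; the wrappers here square in 32 bits, which is exact up to 16x16):

/* Hypothetical helper, for illustration only. */
static unsigned int variance_from_sse_sum(unsigned int sse, int sum,
                                          int log2_pixel_count) {
  /* variance = SSE - (sum * sum) / N, with N = 1 << log2_pixel_count */
  return sse - (unsigned int)(((int64_t)sum * sum) >> log2_pixel_count);
}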
unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3;
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
b + 8 * b_stride, b_stride, &sse2, &sum2);
vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
*sse = var;
return var;
}
unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
b + 8 * b_stride, b_stride, &sse2, &sum2);
vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 8));
}
unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
b + 8 * b_stride, b_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}

vpx_dsp/x86/variance_sse2.c (new file)
@@ -0,0 +1,309 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
__m128i vsum = _mm_setzero_si128();
int i;
for (i = 0; i < 32; ++i) {
const __m128i v = _mm_loadu_si128((const __m128i *)src);
vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
src += 8;
}
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
return _mm_cvtsi128_si32(vsum);
}
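vpx_get_mb_ss_sse2 is the sum-of-squares kernel for a full 16x16 macroblock of 16-bit residuals (32 iterations of 8 values = 256 entries). A scalar sketch of the same computation, for illustration only:

/* Illustrative scalar equivalent of vpx_get_mb_ss_sse2 (not in the patch). */
static unsigned int get_mb_ss_scalar(const int16_t *src) {
  unsigned int ss = 0;
  int i;
  for (i = 0; i < 256; ++i)
    ss += (unsigned int)(src[i] * src[i]);
  return ss;
}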
#define READ64(p, stride, i) \
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
static void get4x4var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
// sum
__m128i vsum = _mm_add_epi16(diff0, diff1);
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
// sse
vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
_mm_madd_epi16(diff1, diff1));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
*sse = _mm_cvtsi128_si32(vsum);
}
void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
__m128i vsse = _mm_setzero_si128();
int i;
for (i = 0; i < 8; i += 2) {
const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(src + i * src_stride)), zero);
const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(ref + i * ref_stride)), zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(src + (i + 1) * src_stride)), zero);
const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(ref + (i + 1) * ref_stride)), zero);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
vsum = _mm_add_epi16(vsum, diff0);
vsum = _mm_add_epi16(vsum, diff1);
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
}
// sum
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
// sse
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
*sse = _mm_cvtsi128_si32(vsse);
}
void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
__m128i vsse = _mm_setzero_si128();
int i;
for (i = 0; i < 16; ++i) {
const __m128i s = _mm_loadu_si128((const __m128i *)src);
const __m128i r = _mm_loadu_si128((const __m128i *)ref);
const __m128i src0 = _mm_unpacklo_epi8(s, zero);
const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i src1 = _mm_unpackhi_epi8(s, zero);
const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
vsum = _mm_add_epi16(vsum, diff0);
vsum = _mm_add_epi16(vsum, diff1);
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
src += src_stride;
ref += ref_stride;
}
// sum
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
*sum = (int16_t)_mm_extract_epi16(vsum, 0) +
(int16_t)_mm_extract_epi16(vsum, 1);
// sse
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
*sse = _mm_cvtsi128_si32(vsse);
}
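Note the slightly different tail in vpx_get16x16var_sse2: the halfword reduction stops one step early and lanes 0 and 1 are added in scalar code. After the two srli/add steps each 16-bit lane holds the sum of at most 128 differences (|value| <= 128 * 255 = 32640, still within int16 range), while one further packed 16-bit add could reach 256 * 255 = 65280 and overflow, which is presumably why the final addition is done on the two extracted halves in 32-bit arithmetic instead.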
static void variance_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
getNxMvar_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
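variance_sse2 simply tiles the W x H block with the given N x N kernel and accumulates the partial SSE/sum pairs. As an illustration (not part of the patch), the 32x16 case further below expands to two side-by-side 16x16 calls:

/* Illustration only: manual expansion of
 * variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, sum,
 *               vpx_get16x16var_sse2, 16) for one 32x16 block. */
static void variance32x16_tiles(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
  unsigned int sse0, sse1;
  int sum0, sum1;
  /* one row of tiles (h == block_size), tiles at columns 0 and 16 */
  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, &sse0, &sum0);
  vpx_get16x16var_sse2(src + 16, src_stride, ref + 16, ref_stride, &sse1, &sum1);
  *sse = sse0 + sse1;
  *sum = sum0 + sum1;
}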
unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 4);
}
unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
sse, &sum, get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
sse, &sum, get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 6);
}
unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
sse, &sum, vpx_get8x8var_sse2, 8);
return *sse - (((unsigned int)sum * sum) >> 7);
}
unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
sse, &sum, vpx_get8x8var_sse2, 8);
return *sse - (((unsigned int)sum * sum) >> 7);
}
unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 8);
}
unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 10);
}
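From 32x32 upwards the wrappers widen the sum to int64_t before squaring: the worst-case |sum| for a 32x32 block is 1024 * 255 = 261120, whose square (about 6.8e10) no longer fits in 32 bits, whereas for 16x16 and smaller (|sum| <= 65280) the 32-bit unsigned product is still exact.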
unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 12);
}
unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
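A minimal usage sketch for the new vpx_dsp entry points, illustrative only (assumes two 16x16 blocks stored with stride 16; real callers typically go through the RTCD dispatch declared in vpx_dsp_rtcd.h rather than calling the _sse2 symbol directly):

#include <stdio.h>
#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

static void print_variance16x16(const uint8_t *src, const uint8_t *ref) {
  unsigned int sse;
  const unsigned int var = vpx_variance16x16_sse2(src, 16, ref, 16, &sse);
  printf("sse=%u variance=%u\n", sse, var); /* sse is also written through the out-param */
}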