From 4a8336fa9da0e338f22012cb603be314c1414264 Mon Sep 17 00:00:00 2001
From: James Yu <james.yu@linaro.org>
Date: Tue, 17 Dec 2013 23:16:03 +0800
Subject: [PATCH] VP8 for ARMv8 by using NEON intrinsics 11

Add mbloopfilter_neon.c
- vp8_mbloop_filter_horizontal_edge_y_neon
- vp8_mbloop_filter_horizontal_edge_uv_neon
- vp8_mbloop_filter_vertical_edge_y_neon
- vp8_mbloop_filter_vertical_edge_uv_neon

Change-Id: Ia9084e0892d4d49412d9cf2b165a0f719f2382d7
Signed-off-by: James Yu <james.yu@linaro.org>
---
 vp8/common/arm/neon/mbloopfilter_neon.asm | 481 ------------------
 vp8/common/arm/neon/mbloopfilter_neon.c   | 592 ++++++++++++++++++++++
 vp8/vp8_common.mk                         |   2 +-
 3 files changed, 593 insertions(+), 482 deletions(-)
 delete mode 100644 vp8/common/arm/neon/mbloopfilter_neon.asm
 create mode 100644 vp8/common/arm/neon/mbloopfilter_neon.c

diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm
deleted file mode 100644
index d200c3090..000000000
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ /dev/null
@@ -1,481 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mbloop_filter_horizontal_edge_y_neon|
-    EXPORT  |vp8_mbloop_filter_horizontal_edge_uv_neon|
-    EXPORT  |vp8_mbloop_filter_vertical_edge_y_neon|
-    EXPORT  |vp8_mbloop_filter_vertical_edge_uv_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-;                                               const unsigned char *blimit,
-;                                               const unsigned char *limit,
-;                                               const unsigned char *thresh)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    add         r1, r1, r1                  ; double stride
-    ldr         r12, [sp, #68]              ; load thresh
-    sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
-    vdup.u8     q2, r12                     ; thresh
-    add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line
-
-    vld1.u8     {q3}, [r0@128], r1              ; p3
-    vld1.u8     {q4}, [r12@128], r1             ; p2
-    vld1.u8     {q5}, [r0@128], r1              ; p1
-    vld1.u8     {q6}, [r12@128], r1             ; p0
-    vld1.u8     {q7}, [r0@128], r1              ; q0
-    vld1.u8     {q8}, [r12@128], r1             ; q1
-    vld1.u8     {q9}, [r0@128], r1              ; q2
-    vld1.u8     {q10}, [r12@128], r1            ; q3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #2
-    add         r0, r12, r1, lsr #1
-
-    vst1.u8     {q4}, [r12@128],r1         ; store op2
-    vst1.u8     {q5}, [r0@128],r1          ; store op1
-    vst1.u8     {q6}, [r12@128], r1        ; store op0
-    vst1.u8     {q7}, [r0@128],r1          ; store oq0
-    vst1.u8     {q8}, [r12@128]            ; store oq1
-    vst1.u8     {q9}, [r0@128]             ; store oq2
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
-
-; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
-;                                                const unsigned char *blimit,
-;                                                const unsigned char *limit,
-;                                                const unsigned char *thresh,
-;                                                unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-
-|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    ldr         r12, [sp, #68]                ; load thresh
-    sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
-    vdup.u8     q2, r12                       ; thresh
-    ldr         r12, [sp, #72]                ; load v ptr
-    sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines
-
-    vld1.u8     {d6}, [r0@64], r1              ; p3
-    vld1.u8     {d7}, [r12@64], r1              ; p3
-    vld1.u8     {d8}, [r0@64], r1              ; p2
-    vld1.u8     {d9}, [r12@64], r1              ; p2
-    vld1.u8     {d10}, [r0@64], r1             ; p1
-    vld1.u8     {d11}, [r12@64], r1             ; p1
-    vld1.u8     {d12}, [r0@64], r1             ; p0
-    vld1.u8     {d13}, [r12@64], r1             ; p0
-    vld1.u8     {d14}, [r0@64], r1             ; q0
-    vld1.u8     {d15}, [r12@64], r1             ; q0
-    vld1.u8     {d16}, [r0@64], r1             ; q1
-    vld1.u8     {d17}, [r12@64], r1             ; q1
-    vld1.u8     {d18}, [r0@64], r1             ; q2
-    vld1.u8     {d19}, [r12@64], r1             ; q2
-    vld1.u8     {d20}, [r0@64], r1             ; q3
-    vld1.u8     {d21}, [r12@64], r1             ; q3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r0, r0, r1, lsl #3
-    sub         r12, r12, r1, lsl #3
-
-    add         r0, r0, r1
-    add         r12, r12, r1
-
-    vst1.u8     {d8}, [r0@64], r1              ; store u op2
-    vst1.u8     {d9}, [r12@64], r1              ; store v op2
-    vst1.u8     {d10}, [r0@64], r1             ; store u op1
-    vst1.u8     {d11}, [r12@64], r1             ; store v op1
-    vst1.u8     {d12}, [r0@64], r1             ; store u op0
-    vst1.u8     {d13}, [r12@64], r1             ; store v op0
-    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
-    vst1.u8     {d15}, [r12@64], r1             ; store v oq0
-    vst1.u8     {d16}, [r0@64], r1             ; store u oq1
-    vst1.u8     {d17}, [r12@64], r1             ; store v oq1
-    vst1.u8     {d18}, [r0@64], r1             ; store u oq2
-    vst1.u8     {d19}, [r12@64], r1             ; store v oq2
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
-
-; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                             const unsigned char *blimit,
-;                                             const unsigned char *limit,
-;                                             const unsigned char *thresh)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp8_mbloop_filter_vertical_edge_y_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    ldr         r12, [sp, #68]              ; load thresh
-    sub         r0, r0, #4                  ; move src pointer down by 4 columns
-    vdup.s8     q2, r12                     ; thresh
-    add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines
-
-    vld1.u8     {d6}, [r0], r1              ; load first 8-line src data
-    vld1.u8     {d7}, [r12], r1             ; load second 8-line src data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r12], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r12], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r12], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r12], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         r0, r0, r1, lsl #3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #3
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r12], r1
-    vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r12], r1
-    vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r12], r1
-    vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r12], r1
-    vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r12], r1
-    vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r12], r1
-    vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r12], r1
-    vst1.8      {d20}, [r0]
-    vst1.8      {d21}, [r12]
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
-
-; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-;                                              const unsigned char *blimit,
-;                                              const unsigned char *limit,
-;                                              const unsigned char *thresh,
-;                                              unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  unsigned char *v
-|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
-    push        {lr}
-    vpush       {d8-d15}
-
-    ldr         r12, [sp, #68]              ; load thresh
-    sub         r0, r0, #4                  ; move u pointer down by 4 columns
-    vdup.u8     q2, r12                     ; thresh
-    ldr         r12, [sp, #72]              ; load v ptr
-    sub         r12, r12, #4                ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r0], r1              ;load u data
-    vld1.u8     {d7}, [r12], r1             ;load v data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r12], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r12], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r12], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r12], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         r0, r0, r1, lsl #3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #3
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r12], r1
-    vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r12], r1
-    vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r12], r1
-    vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r12], r1
-    vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r12], r1
-    vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r12], r1
-    vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r12], r1
-    vst1.8      {d20}, [r0]
-    vst1.8      {d21}, [r12]
-
-    vpop        {d8-d15}
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
-
-; void vp8_mbloop_filter_neon()
-; This is a helper function for the macroblock loopfilters. The individual
-; functions do the necessary load, transpose (if necessary), preserve (if
-; necessary) and store.
-
-; r0,r1 PRESERVE
-; r2    mblimit
-; r3    limit
-
-; q2    thresh
-; q3    p3 PRESERVE
-; q4    p2
-; q5    p1
-; q6    p0
-; q7    q0
-; q8    q1
-; q9    q2
-; q10   q3 PRESERVE
-
-|vp8_mbloop_filter_neon| PROC
-
-    ; vp8_filter_mask
-    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
-    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
-    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
-    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q1, q9, q8                  ; abs(q2 - q1)
-    vabd.u8     q0, q10, q9                 ; abs(q3 - q2)
-
-    vmax.u8     q11, q11, q12
-    vmax.u8     q12, q13, q14
-    vmax.u8     q1, q1, q0
-    vmax.u8     q15, q11, q12
-
-    vabd.u8     q12, q6, q7                 ; abs(p0 - q0)
-
-    ; vp8_hevmask
-    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh) * -1
-    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh) * -1
-    vmax.u8     q15, q15, q1
-
-    vdup.u8     q1, r3                      ; limit
-    vdup.u8     q2, r2                      ; mblimit
-
-    vmov.u8     q0, #0x80                   ; 0x80
-
-    vcge.u8     q15, q1, q15
-
-    vabd.u8     q1, q5, q8                  ; a = abs(p1 - q1)
-    vqadd.u8    q12, q12, q12               ; b = abs(p0 - q0) * 2
-    vmov.u16    q11, #3                     ; #3
-
-    ; vp8_filter
-    ; convert to signed
-    veor        q7, q7, q0                  ; qs0
-    vshr.u8     q1, q1, #1                  ; a = a / 2
-    veor        q6, q6, q0                  ; ps0
-    veor        q5, q5, q0                  ; ps1
-
-    vqadd.u8    q12, q12, q1                ; a = b + a
-
-    veor        q8, q8, q0                  ; qs1
-    veor        q4, q4, q0                  ; ps2
-    veor        q9, q9, q0                  ; qs2
-
-    vorr        q14, q13, q14               ; vp8_hevmask
-
-    vcge.u8     q12, q2, q12                ; (a > flimit * 2 + limit) * -1
-
-    vsubl.s8    q2, d14, d12                ; qs0 - ps0
-    vsubl.s8    q13, d15, d13
-
-    vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
-
-    vmul.i16    q2, q2, q11                 ; 3 * ( qs0 - ps0)
-
-    vand        q15, q15, q12               ; vp8_filter_mask
-
-    vmul.i16    q13, q13, q11
-
-    vmov.u8     q12, #3                     ; #3
-
-    vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d3
-
-    vmov.u8     q11, #4                     ; #4
-
-    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d2, q2
-    vqmovn.s16  d3, q13
-
-    vand        q1, q1, q15                 ; vp8_filter &= mask
-
-    vmov.u16    q15, #63                    ; #63
-
-    vand        q13, q1, q14                ; Filter2 &= hev
-
-    vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
-    vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)
-
-    vmov        q0, q15
-
-    vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
-    vshr.s8     q13, q13, #3                ; Filter2 >>= 3
-
-    vmov        q11, q15
-    vmov        q12, q15
-
-    vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)
-
-    vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)
-
-    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
-
-    ; roughly 1/7th difference across boundary
-    ; roughly 2/7th difference across boundary
-    ; roughly 3/7th difference across boundary
-
-    vmov.u8     d5, #9                      ; #9
-    vmov.u8     d4, #18                     ; #18
-
-    vmov        q13, q15
-    vmov        q14, q15
-
-    vmlal.s8    q0, d2, d5                  ; 63 + Filter2 * 9
-    vmlal.s8    q11, d3, d5
-    vmov.u8     d5, #27                     ; #27
-    vmlal.s8    q12, d2, d4                 ; 63 + Filter2 * 18
-    vmlal.s8    q13, d3, d4
-    vmlal.s8    q14, d2, d5                 ; 63 + Filter2 * 27
-    vmlal.s8    q15, d3, d5
-
-    vqshrn.s16  d0, q0, #7                  ; u = clamp((63 + Filter2 * 9)>>7)
-    vqshrn.s16  d1, q11, #7
-    vqshrn.s16  d24, q12, #7                ; u = clamp((63 + Filter2 * 18)>>7)
-    vqshrn.s16  d25, q13, #7
-    vqshrn.s16  d28, q14, #7                ; u = clamp((63 + Filter2 * 27)>>7)
-    vqshrn.s16  d29, q15, #7
-
-    vmov.u8     q1, #0x80                   ; 0x80
-
-    vqsub.s8    q11, q9, q0                 ; s = clamp(qs2 - u)
-    vqadd.s8    q0, q4, q0                  ; s = clamp(ps2 + u)
-    vqsub.s8    q13, q8, q12                ; s = clamp(qs1 - u)
-    vqadd.s8    q12, q5, q12                ; s = clamp(ps1 + u)
-    vqsub.s8    q15, q7, q14                ; s = clamp(qs0 - u)
-    vqadd.s8    q14, q6, q14                ; s = clamp(ps0 + u)
-
-    veor        q9, q11, q1                 ; *oq2 = s^0x80
-    veor        q4, q0, q1                  ; *op2 = s^0x80
-    veor        q8, q13, q1                 ; *oq1 = s^0x80
-    veor        q5, q12, q1                 ; *op2 = s^0x80
-    veor        q7, q15, q1                 ; *oq0 = s^0x80
-    veor        q6, q14, q1                 ; *op0 = s^0x80
-
-    bx          lr
-    ENDP        ; |vp8_mbloop_filter_neon|
-
-;-----------------
-
-    END
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.c b/vp8/common/arm/neon/mbloopfilter_neon.c
new file mode 100644
index 000000000..385ce36e7
--- /dev/null
+++ b/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -0,0 +1,592 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static inline void vp8_mbloop_filter_neon(
+        uint8x16_t qblimit,  // mblimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p2
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q4r,     // p1
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r,     // q1
+        uint8x16_t *q9r) {   // q1
+    uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
+    uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
+    int8x16_t q0s8, q12s8, q14s8, q15s8;
+    int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
+
+    q11u8 = vabdq_u8(q3, q4);
+    q12u8 = vabdq_u8(q4, q5);
+    q13u8 = vabdq_u8(q5, q6);
+    q14u8 = vabdq_u8(q8, q7);
+    q1u8  = vabdq_u8(q9, q8);
+    q0u8  = vabdq_u8(q10, q9);
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q1u8  = vmaxq_u8(q1u8, q0u8);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q12u8 = vabdq_u8(q6, q7);
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);
+    q14u8 = vcgtq_u8(q14u8, qthresh);
+    q15u8 = vmaxq_u8(q15u8, q1u8);
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);
+
+    q1u8 = vabdq_u8(q5, q8);
+    q12u8 = vqaddq_u8(q12u8, q12u8);
+
+    // vp8_filter() function
+    // convert to signed
+    q0u8 = vdupq_n_u8(0x80);
+    q9 = veorq_u8(q9, q0u8);
+    q8 = veorq_u8(q8, q0u8);
+    q7 = veorq_u8(q7, q0u8);
+    q6 = veorq_u8(q6, q0u8);
+    q5 = veorq_u8(q5, q0u8);
+    q4 = veorq_u8(q4, q0u8);
+
+    q1u8 = vshrq_n_u8(q1u8, 1);
+    q12u8 = vqaddq_u8(q12u8, q1u8);
+
+    q14u8 = vorrq_u8(q13u8, q14u8);
+    q12u8 = vcgeq_u8(qblimit, q12u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                     vreinterpretq_s8_u8(q8));
+
+    q11s16 = vdupq_n_s16(3);
+    q2s16  = vmulq_s16(q2s16, q11s16);
+    q13s16 = vmulq_s16(q13s16, q11s16);
+
+    q15u8 = vandq_u8(q15u8, q12u8);
+
+    q2s16  = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
+
+    q12u8 = vdupq_n_u8(3);
+    q11u8 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2 = vqmovn_s16(q2s16);
+    d3 = vqmovn_s16(q13s16);
+    q1s8 = vcombine_s8(d2, d3);
+    q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
+    q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
+    q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q13s8 = vshrq_n_s8(q13s8, 3);
+
+    q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
+    q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
+
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
+    d5 = vdup_n_s8(9);
+    d4 = vdup_n_s8(18);
+
+    q0s16  = vmlal_s8(vreinterpretq_s16_u16(q0u16),  vget_low_s8(q1s8),  d5);
+    q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
+    d5 = vdup_n_s8(27);
+    q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8),  d4);
+    q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
+    q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8),  d5);
+    q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
+
+    d0  = vqshrn_n_s16(q0s16 , 7);
+    d1  = vqshrn_n_s16(q11s16, 7);
+    d24 = vqshrn_n_s16(q12s16, 7);
+    d25 = vqshrn_n_s16(q13s16, 7);
+    d28 = vqshrn_n_s16(q14s16, 7);
+    d29 = vqshrn_n_s16(q15s16, 7);
+
+    q0s8  = vcombine_s8(d0, d1);
+    q12s8 = vcombine_s8(d24, d25);
+    q14s8 = vcombine_s8(d28, d29);
+
+    q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
+    q0s8  = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
+    q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
+    q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
+    q15s8 = vqsubq_s8((q7s8), q14s8);
+    q14s8 = vqaddq_s8((q6s8), q14s8);
+
+    q1u8 = vdupq_n_u8(0x80);
+    *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
+    *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
+    return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    src -= (pitch << 2);
+
+    q3 = vld1q_u8(src);
+    src += pitch;
+    q4 = vld1q_u8(src);
+    src += pitch;
+    q5 = vld1q_u8(src);
+    src += pitch;
+    q6 = vld1q_u8(src);
+    src += pitch;
+    q7 = vld1q_u8(src);
+    src += pitch;
+    q8 = vld1q_u8(src);
+    src += pitch;
+    q9 = vld1q_u8(src);
+    src += pitch;
+    q10 = vld1q_u8(src);
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    src -= (pitch * 6);
+    vst1q_u8(src, q4);
+    src += pitch;
+    vst1q_u8(src, q5);
+    src += pitch;
+    vst1q_u8(src, q6);
+    src += pitch;
+    vst1q_u8(src, q7);
+    src += pitch;
+    vst1q_u8(src, q8);
+    src += pitch;
+    vst1q_u8(src, q9);
+    return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    u -= (pitch << 2);
+    v -= (pitch << 2);
+
+    d6 = vld1_u8(u);
+    u += pitch;
+    d7 = vld1_u8(v);
+    v += pitch;
+    d8 = vld1_u8(u);
+    u += pitch;
+    d9 = vld1_u8(v);
+    v += pitch;
+    d10 = vld1_u8(u);
+    u += pitch;
+    d11 = vld1_u8(v);
+    v += pitch;
+    d12 = vld1_u8(u);
+    u += pitch;
+    d13 = vld1_u8(v);
+    v += pitch;
+    d14 = vld1_u8(u);
+    u += pitch;
+    d15 = vld1_u8(v);
+    v += pitch;
+    d16 = vld1_u8(u);
+    u += pitch;
+    d17 = vld1_u8(v);
+    v += pitch;
+    d18 = vld1_u8(u);
+    u += pitch;
+    d19 = vld1_u8(v);
+    v += pitch;
+    d20 = vld1_u8(u);
+    d21 = vld1_u8(v);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    u -= (pitch * 6);
+    v -= (pitch * 6);
+    vst1_u8(u, vget_low_u8(q4));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q4));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q5));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q5));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q6));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q6));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q7));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q7));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q8));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q8));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q9));
+    vst1_u8(v, vget_high_u8(q9));
+    return;
+}
+
+void vp8_mbloop_filter_vertical_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    unsigned char *s1, *s2;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    s1 = src - 4;
+    s2 = s1 + 8 * pitch;
+    d6  = vld1_u8(s1);
+    s1 += pitch;
+    d7  = vld1_u8(s2);
+    s2 += pitch;
+    d8  = vld1_u8(s1);
+    s1 += pitch;
+    d9  = vld1_u8(s2);
+    s2 += pitch;
+    d10 = vld1_u8(s1);
+    s1 += pitch;
+    d11 = vld1_u8(s2);
+    s2 += pitch;
+    d12 = vld1_u8(s1);
+    s1 += pitch;
+    d13 = vld1_u8(s2);
+    s2 += pitch;
+    d14 = vld1_u8(s1);
+    s1 += pitch;
+    d15 = vld1_u8(s2);
+    s2 += pitch;
+    d16 = vld1_u8(s1);
+    s1 += pitch;
+    d17 = vld1_u8(s2);
+    s2 += pitch;
+    d18 = vld1_u8(s1);
+    s1 += pitch;
+    d19 = vld1_u8(s2);
+    s2 += pitch;
+    d20 = vld1_u8(s1);
+    d21 = vld1_u8(s2);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    q2tmp0 = vtrnq_u32((uint32x4_t)q3, (uint32x4_t)q7);
+    q2tmp1 = vtrnq_u32((uint32x4_t)q4, (uint32x4_t)q8);
+    q2tmp2 = vtrnq_u32((uint32x4_t)q5, (uint32x4_t)q9);
+    q2tmp3 = vtrnq_u32((uint32x4_t)q6, (uint32x4_t)q10);
+
+    q2tmp4 = vtrnq_u16((uint16x8_t)q2tmp0.val[0], (uint16x8_t)q2tmp2.val[0]);
+    q2tmp5 = vtrnq_u16((uint16x8_t)q2tmp1.val[0], (uint16x8_t)q2tmp3.val[0]);
+    q2tmp6 = vtrnq_u16((uint16x8_t)q2tmp0.val[1], (uint16x8_t)q2tmp2.val[1]);
+    q2tmp7 = vtrnq_u16((uint16x8_t)q2tmp1.val[1], (uint16x8_t)q2tmp3.val[1]);
+
+    q2tmp8  = vtrnq_u8((uint8x16_t)q2tmp4.val[0], (uint8x16_t)q2tmp5.val[0]);
+    q2tmp9  = vtrnq_u8((uint8x16_t)q2tmp4.val[1], (uint8x16_t)q2tmp5.val[1]);
+    q2tmp10 = vtrnq_u8((uint8x16_t)q2tmp6.val[0], (uint8x16_t)q2tmp7.val[0]);
+    q2tmp11 = vtrnq_u8((uint8x16_t)q2tmp6.val[1], (uint8x16_t)q2tmp7.val[1]);
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    q2tmp0 = vtrnq_u32((uint32x4_t)q3, (uint32x4_t)q7);
+    q2tmp1 = vtrnq_u32((uint32x4_t)q4, (uint32x4_t)q8);
+    q2tmp2 = vtrnq_u32((uint32x4_t)q5, (uint32x4_t)q9);
+    q2tmp3 = vtrnq_u32((uint32x4_t)q6, (uint32x4_t)q10);
+
+    q2tmp4 = vtrnq_u16((uint16x8_t)q2tmp0.val[0], (uint16x8_t)q2tmp2.val[0]);
+    q2tmp5 = vtrnq_u16((uint16x8_t)q2tmp1.val[0], (uint16x8_t)q2tmp3.val[0]);
+    q2tmp6 = vtrnq_u16((uint16x8_t)q2tmp0.val[1], (uint16x8_t)q2tmp2.val[1]);
+    q2tmp7 = vtrnq_u16((uint16x8_t)q2tmp1.val[1], (uint16x8_t)q2tmp3.val[1]);
+
+    q2tmp8  = vtrnq_u8((uint8x16_t)q2tmp4.val[0], (uint8x16_t)q2tmp5.val[0]);
+    q2tmp9  = vtrnq_u8((uint8x16_t)q2tmp4.val[1], (uint8x16_t)q2tmp5.val[1]);
+    q2tmp10 = vtrnq_u8((uint8x16_t)q2tmp6.val[0], (uint8x16_t)q2tmp7.val[0]);
+    q2tmp11 = vtrnq_u8((uint8x16_t)q2tmp6.val[1], (uint8x16_t)q2tmp7.val[1]);
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    s1 -= 7 * pitch;
+    s2 -= 7 * pitch;
+
+    vst1_u8(s1, vget_low_u8(q3));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q3));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q4));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q4));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q5));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q5));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q6));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q6));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q7));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q7));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q8));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q8));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q9));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q9));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q10));
+    vst1_u8(s2, vget_high_u8(q10));
+    return;
+}
+
+void vp8_mbloop_filter_vertical_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    unsigned char *us, *ud;
+    unsigned char *vs, *vd;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    us = u - 4;
+    vs = v - 4;
+    d6 = vld1_u8(us);
+    us += pitch;
+    d7 = vld1_u8(vs);
+    vs += pitch;
+    d8 = vld1_u8(us);
+    us += pitch;
+    d9 = vld1_u8(vs);
+    vs += pitch;
+    d10 = vld1_u8(us);
+    us += pitch;
+    d11 = vld1_u8(vs);
+    vs += pitch;
+    d12 = vld1_u8(us);
+    us += pitch;
+    d13 = vld1_u8(vs);
+    vs += pitch;
+    d14 = vld1_u8(us);
+    us += pitch;
+    d15 = vld1_u8(vs);
+    vs += pitch;
+    d16 = vld1_u8(us);
+    us += pitch;
+    d17 = vld1_u8(vs);
+    vs += pitch;
+    d18 = vld1_u8(us);
+    us += pitch;
+    d19 = vld1_u8(vs);
+    vs += pitch;
+    d20 = vld1_u8(us);
+    d21 = vld1_u8(vs);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    q2tmp0 = vtrnq_u32((uint32x4_t)q3, (uint32x4_t)q7);
+    q2tmp1 = vtrnq_u32((uint32x4_t)q4, (uint32x4_t)q8);
+    q2tmp2 = vtrnq_u32((uint32x4_t)q5, (uint32x4_t)q9);
+    q2tmp3 = vtrnq_u32((uint32x4_t)q6, (uint32x4_t)q10);
+
+    q2tmp4 = vtrnq_u16((uint16x8_t)q2tmp0.val[0], (uint16x8_t)q2tmp2.val[0]);
+    q2tmp5 = vtrnq_u16((uint16x8_t)q2tmp1.val[0], (uint16x8_t)q2tmp3.val[0]);
+    q2tmp6 = vtrnq_u16((uint16x8_t)q2tmp0.val[1], (uint16x8_t)q2tmp2.val[1]);
+    q2tmp7 = vtrnq_u16((uint16x8_t)q2tmp1.val[1], (uint16x8_t)q2tmp3.val[1]);
+
+    q2tmp8  = vtrnq_u8((uint8x16_t)q2tmp4.val[0], (uint8x16_t)q2tmp5.val[0]);
+    q2tmp9  = vtrnq_u8((uint8x16_t)q2tmp4.val[1], (uint8x16_t)q2tmp5.val[1]);
+    q2tmp10 = vtrnq_u8((uint8x16_t)q2tmp6.val[0], (uint8x16_t)q2tmp7.val[0]);
+    q2tmp11 = vtrnq_u8((uint8x16_t)q2tmp6.val[1], (uint8x16_t)q2tmp7.val[1]);
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    q2tmp0 = vtrnq_u32((uint32x4_t)q3, (uint32x4_t)q7);
+    q2tmp1 = vtrnq_u32((uint32x4_t)q4, (uint32x4_t)q8);
+    q2tmp2 = vtrnq_u32((uint32x4_t)q5, (uint32x4_t)q9);
+    q2tmp3 = vtrnq_u32((uint32x4_t)q6, (uint32x4_t)q10);
+
+    q2tmp4 = vtrnq_u16((uint16x8_t)q2tmp0.val[0], (uint16x8_t)q2tmp2.val[0]);
+    q2tmp5 = vtrnq_u16((uint16x8_t)q2tmp1.val[0], (uint16x8_t)q2tmp3.val[0]);
+    q2tmp6 = vtrnq_u16((uint16x8_t)q2tmp0.val[1], (uint16x8_t)q2tmp2.val[1]);
+    q2tmp7 = vtrnq_u16((uint16x8_t)q2tmp1.val[1], (uint16x8_t)q2tmp3.val[1]);
+
+    q2tmp8  = vtrnq_u8((uint8x16_t)q2tmp4.val[0], (uint8x16_t)q2tmp5.val[0]);
+    q2tmp9  = vtrnq_u8((uint8x16_t)q2tmp4.val[1], (uint8x16_t)q2tmp5.val[1]);
+    q2tmp10 = vtrnq_u8((uint8x16_t)q2tmp6.val[0], (uint8x16_t)q2tmp7.val[0]);
+    q2tmp11 = vtrnq_u8((uint8x16_t)q2tmp6.val[1], (uint8x16_t)q2tmp7.val[1]);
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    ud = u - 4;
+    vst1_u8(ud, vget_low_u8(q3));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q4));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q5));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q6));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q7));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q8));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q9));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q10));
+
+    vd = v - 4;
+    vst1_u8(vd, vget_high_u8(q3));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q4));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q5));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q6));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q7));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q8));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q9));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q10));
+    return;
+}
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index b63bee6af..1b658352b 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -159,7 +159,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
 
 # common (neon)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/mbloopfilter_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/shortidct4x4llm_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sad8_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sad16_neon$(ASM)
@@ -186,6 +185,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/iwalsh_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfilter_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimpleverticaledge_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/mbloopfilter_neon.c
 
 
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))