Move variance functions to vpx_dsp
subpel functions will be moved in another patch.

Change-Id: Idb2e049bad0b9b32ac42cc7731cd6903de2826ce
This commit is contained in:

Parent: 976f7f42c1
Commit: c3bdffb0a5

(One file's diff is not shown because it is too large.)
@@ -1,154 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp8_variance16x16_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int recon_stride
-; stack unsigned int *sse
-|vp8_variance16x16_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-
-loop
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load 4 src pixels
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load 4 src pixels
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load 4 src pixels
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load 4 src pixels
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set dst_ptr to next row
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    subs    r12, r12, #1
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-    END
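Note on the ARMv6 SIMD idiom above: the usub8/sel pair splits each per-byte difference into its positive and negative parts (exactly one of the two is nonzero for any byte), and usad8 then sums the four bytes of each part. A minimal scalar model of that pattern, with an illustrative helper name:

/* Scalar model of the usub8/sel/usad8 pattern used in the loop above. */
static void split_abs_diffs(const unsigned char a[4], const unsigned char b[4],
                            int *pos_sum, int *neg_sum) {
  int i;
  *pos_sum = 0;  /* models: usub8 r6, r4, r5 / sel r7, r6, lr / usad8 r4, r7, lr */
  *neg_sum = 0;  /* models: usub8 r9, r5, r4 / sel r6, r9, lr / usad8 r5, r6, lr */
  for (i = 0; i < 4; ++i) {
    const int d = a[i] - b[i];
    if (d > 0) *pos_sum += d;
    else       *neg_sum -= d;
  }
}

The running sum is then built by adding pos_sum and subtracting neg_sum (the adds/subs pair), which lets the assembly keep an unsigned byte pipeline while still producing a signed total.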
@@ -1,101 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp8_variance8x8_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int recon_stride
-; stack unsigned int *sse
-|vp8_variance8x8_armv6| PROC
-
-    push    {r4-r10, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #8             ; set loop counter to 8 (=block height)
-    mov     r4, #0              ; initialize sum = 0
-    mov     r5, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r6, [r0, #0x0]      ; load 4 src pixels
-    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r6, r7          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-    ; calculate total sum
-    add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r0, #0x4]      ; load 4 src pixels
-    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r6, r7          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set dst_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-    subs    r12, r12, #1        ; next row
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r8, [sp, #32]       ; get address of sse
-    mul     r1, r4, r4          ; sum * sum
-    str     r5, [r8]            ; store sse
-    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
-
-    pop     {r4-r10, pc}
-
-    ENDP
-
-    END
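Both deleted routines return sse - sum^2 / N, where N is the block's pixel count (256 for 16x16, hence the >> 8; 64 for 8x8, hence the >> 6), per the comments on their final sub instructions. A minimal C sketch of that computation (the function name is illustrative):

/* Reference computation mirrored by the two routines above: accumulate the
 * signed sum and the sum of squared differences, then subtract sum*sum / N. */
static unsigned int variance_ref(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 int w, int h, unsigned int *sse) {
  int i, j, sum = 0;
  unsigned int sq = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];
      sum += diff;
      sq += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sq;
  /* w * h is a power of two here, so the division reduces to a shift:
   * >> 8 for 16x16, >> 6 for 8x8. */
  return sq - (((unsigned int)sum * sum) / (unsigned int)(w * h));
}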
@@ -1,320 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "vpx_ports/mem.h"
-
-unsigned int vp8_variance16x16_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 8; i++) {
-        q0u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q1u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        q2u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance16x8_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
-        q0u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q1u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        q2u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance8x16_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d2u8, d4u8, d6u8;
-    int16x4_t d22s16, d23s16, d24s16, d25s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint16x8_t q11u16, q12u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
-        d0u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d2u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        d4u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d6u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(d0u8, d4u8);
-        q12u16 = vsubl_u8(d2u8, d6u8);
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance8x8_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
-        d0u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d1u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d2u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d3u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-
-        d4u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d5u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d6u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d7u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-
-        q11u16 = vsubl_u8(d0u8, d4u8);
-        q12u16 = vsubl_u8(d1u8, d5u8);
-        q13u16 = vsubl_u8(d2u8, d6u8);
-        q14u16 = vsubl_u8(d3u8, d7u8);
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
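Since moves like this one are regression-prone, a quick equivalence check against the scalar path is handy. A minimal self-check, assuming a build where both this NEON path and the C reference are linked (the harness itself is illustrative):

#include <stdio.h>
#include <stdlib.h>

extern unsigned int vp8_variance16x16_neon(const unsigned char *, int,
                                           const unsigned char *, int,
                                           unsigned int *);
extern unsigned int vp8_variance16x16_c(const unsigned char *, int,
                                        const unsigned char *, int,
                                        unsigned int *);

int main(void) {
  unsigned char src[16 * 16], ref[16 * 16];
  unsigned int sse_neon, sse_c, v_neon, v_c;
  int i;
  for (i = 0; i < 16 * 16; ++i) {
    src[i] = (unsigned char)rand();
    ref[i] = (unsigned char)rand();
  }
  v_neon = vp8_variance16x16_neon(src, 16, ref, 16, &sse_neon);
  v_c = vp8_variance16x16_c(src, 16, ref, 16, &sse_c);
  printf("%s\n", (v_neon == v_c && sse_neon == sse_c) ? "match" : "MISMATCH");
  return 0;
}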
@@ -9,10 +9,14 @@
  */

 #include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp8/common/variance.h"
 #include "vp8/common/filter.h"

+// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
+#if CONFIG_VP8_ENCODER
+
 #if HAVE_MEDIA
 #include "vp8/common/arm/bilinearfilter_arm.h"

@@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6
     vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                              8, 8, 8, VFilter);

-    return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
-                                 dst_pixels_per_line, sse);
+    return vpx_variance8x8_media(second_pass, 8, dst_ptr,
+                                 dst_pixels_per_line, sse);
 }

 unsigned int vp8_sub_pixel_variance16x16_armv6

@@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
         vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                                  16, 16, 16, VFilter);

-        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
-                                      dst_pixels_per_line, sse);
+        var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
+                                      dst_pixels_per_line, sse);
     }
     return var;
 }

-#endif /* HAVE_MEDIA */
+#endif  // HAVE_MEDIA

 #if HAVE_NEON

@@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon
   return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
 }

-#endif
+#endif  // HAVE_NEON
+#endif  // CONFIG_VP8_ENCODER
@@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block
     if (blksize == 16)
     {
-        actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
-        act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
 #ifdef USE_SSD
-        vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
+        vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
         sad = (sse + 128)>>8;
-        vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
+        vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
         usad = (sse + 32)>>6;
-        vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
+        vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
         vsad = (sse + 32)>>6;
 #else
         sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;

@@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block
     }
     else /* if (blksize == 8) */
     {
-        actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
-        act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
 #ifdef USE_SSD
-        vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
+        vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
         sad = (sse + 32)>>6;
-        vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
+        vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
         usad = (sse + 8)>>4;
-        vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
+        vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
         vsad = (sse + 8)>>4;
 #else
         sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
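The rounding constants in these hunks follow one pattern: (sse + N/2) >> log2(N) is sse divided by the block's pixel count, rounded to nearest, with 256 pixels for 16x16 (+128, >>8), 64 for 8x8 (+32, >>6), and 16 for 4x4 (+8, >>4). A small self-contained check (the helper name is illustrative):

#include <assert.h>

/* Rounded division by a power-of-two pixel count, as used above. */
static unsigned int rounded_avg(unsigned int v, int log2_n) {
  return (v + (1u << (log2_n - 1))) >> log2_n;
}

int main(void) {
  assert(rounded_avg(200, 8) == 1);  /* 200/256 = 0.78 -> 1 (16x16 block) */
  assert(rounded_avg(100, 8) == 0);  /* 100/256 = 0.39 -> 0 */
  assert(rounded_avg(96, 6) == 2);   /* 96/64 = 1.5 -> rounds up to 2 (8x8) */
  return 0;
}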
@@ -236,31 +236,6 @@ add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch,
 specialize qw/vp8_bilinear_predict4x4 mmx media neon/;
 $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;

-#
-# Whole-pixel Variance
-#
-add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance4x4 mmx sse2/;
-$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
-
-add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x8 mmx sse2 media neon/;
-$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
-$vp8_variance8x8_media=vp8_variance8x8_armv6;
-
-add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x16 mmx sse2 neon/;
-$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
-
-add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x8 mmx sse2 neon/;
-$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
-
-add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x16 mmx sse2 media neon/;
-$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
-$vp8_variance16x16_media=vp8_variance16x16_armv6;
-
 #
 # Sub-pixel Variance
 #

@@ -308,12 +283,6 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
 #
 if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {

-#
-# Sum of squares (vector)
-#
-add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
-specialize qw/vp8_get_mb_ss mmx sse2/;
-
 #
 # SSE (Sum Squared Error)
 #

@@ -321,14 +290,6 @@ add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_pt
 specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
 $vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;

-add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_mse16x16 mmx sse2 media neon/;
-$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
-$vp8_mse16x16_media=vp8_mse16x16_armv6;
-
-add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
-specialize qw/vp8_get4x4sse_cs mmx neon/;
-
 #
 # Block copy
 #
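For context on what add_proto/specialize do: the RTCD generator turns each proto into per-architecture prototypes plus a dispatch pointer that setup code aims at the best version the CPU supports (the $..._sse2=..._wmt lines map the sse2 slot onto the _wmt symbol). A rough sketch of the generated shape, not the generator's exact output:

unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride,
                               const unsigned char *ref_ptr, int ref_stride,
                               unsigned int *sse);
unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride,
                                 const unsigned char *ref_ptr, int ref_stride,
                                 unsigned int *sse);
unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride,
                                 const unsigned char *ref_ptr, int ref_stride,
                                 unsigned int *sse);
RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr,
                                            int source_stride,
                                            const unsigned char *ref_ptr,
                                            int ref_stride, unsigned int *sse);

/* ...and, approximately, in the generated setup function:
 *   vp8_variance8x8 = vp8_variance8x8_c;
 *   if (flags & HAS_MMX)  vp8_variance8x8 = vp8_variance8x8_mmx;
 *   if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt;     */

Deleting these protos is what pushes callers onto the vpx_ versions dispatched from vpx_dsp_rtcd.h, which this patch now includes.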
@@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)(
     const unsigned char *ref_array,
     int ref_stride,
     unsigned int *sad_array);
+
 typedef void (*vpx_sad_multi_d_fn_t)
 (
     const unsigned char *src_ptr,

@@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t)
     unsigned int *sad_array
 );

-typedef unsigned int (*vp8_variance_fn_t)
+typedef unsigned int (*vpx_variance_fn_t)
 (
     const unsigned char *src_ptr,
     int source_stride,

@@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t)
     unsigned int *sse
 );

-typedef void (*vp8_ssimpf_fn_t)
-(
-    unsigned char *s,
-    int sp,
-    unsigned char *r,
-    int rp,
-    unsigned long *sum_s,
-    unsigned long *sum_r,
-    unsigned long *sum_sq_s,
-    unsigned long *sum_sq_r,
-    unsigned long *sum_sxr
-);
-
-typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp8_get16x16prederror_fn_t)
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int ref_stride
-);
-
 typedef struct variance_vtable
 {
     vpx_sad_fn_t            sdf;
-    vp8_variance_fn_t       vf;
+    vpx_variance_fn_t       vf;
     vp8_subpixvariance_fn_t svf;
-    vp8_variance_fn_t       svf_halfpix_h;
-    vp8_variance_fn_t       svf_halfpix_v;
-    vp8_variance_fn_t       svf_halfpix_hv;
+    vpx_variance_fn_t       svf_halfpix_h;
+    vpx_variance_fn_t       svf_halfpix_v;
+    vpx_variance_fn_t       svf_halfpix_hv;
     vpx_sad_multi_fn_t      sdx3f;
     vpx_sad_multi_fn_t      sdx8f;
     vpx_sad_multi_d_fn_t    sdx4df;
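A sketch of how the retyped fields get used: encoder setup fills one vtable entry per block size, and after this change the whole-pixel and half-pixel slots take vpx_variance_fn_t-typed functions. The wiring below is illustrative (the struct comes from the header above; the wrapper function and its call site are assumptions):

#include "vp8/common/variance.h"

/* Hypothetical wiring of a 16x16 entry after this change. */
static void wire_16x16(struct variance_vtable *v) {
  v->sdf = vpx_sad16x16;                /* vpx_sad_fn_t, already in vpx_dsp */
  v->vf  = vpx_variance16x16;           /* vpx_variance_fn_t, moved by this patch */
  v->svf = vp8_sub_pixel_variance16x16; /* vp8_subpixvariance_fn_t, moves later */
}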
@@ -8,44 +8,34 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include "./vp8_rtcd.h"
 #include "filter.h"
 #include "variance.h"

-unsigned int vp8_get_mb_ss_c
-(
-    const short *src_ptr
-)
-{
-    unsigned int i = 0, sum = 0;
-
-    do
-    {
-        sum += (src_ptr[i] * src_ptr[i]);
-        i++;
-    }
-    while (i < 256);
-
-    return sum;
-}
+/* This is a bad idea.
+ * ctz = count trailing zeros */
+static int ctz(int a) {
+  int b = 0;
+  while (a != 1) {
+    a >>= 1;
+    b++;
+  }
+  return b;
+}

-static void variance(
+static unsigned int variance(
     const unsigned char *src_ptr,
     int  source_stride,
     const unsigned char *ref_ptr,
     int  recon_stride,
     int  w,
     int  h,
-    unsigned int *sse,
-    int *sum)
+    unsigned int *sse)
 {
     int i, j;
-    int diff;
+    int diff, sum;

-    *sum = 0;
+    sum = 0;
     *sse = 0;

     for (i = 0; i < h; i++)

@@ -53,114 +43,17 @@ static void variance(
         for (j = 0; j < w; j++)
         {
             diff = src_ptr[j] - ref_ptr[j];
-            *sum += diff;
+            sum += diff;
             *sse += diff * diff;
         }

         src_ptr += source_stride;
         ref_ptr += recon_stride;
     }

+    return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
 }

-unsigned int vp8_variance16x16_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp8_variance8x16_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vp8_variance16x8_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vp8_variance8x8_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vp8_variance4x4_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-unsigned int vp8_mse16x16_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-    *sse = var;
-    return var;
-}
-
 /****************************************************************************
  *
  *  ROUTINE       : filter_block2d_bil_first_pass

@@ -304,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c
     /* Now filter Vertically */
     var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);

-    return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
 }

@@ -329,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c
     var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
     var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);

-    return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
 }

 unsigned int vp8_sub_pixel_variance16x16_c

@@ -353,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c
     var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
     var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);

-    return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
 }

@@ -429,7 +322,7 @@ unsigned int vp8_sub_pixel_variance16x8_c
     var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
     var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);

-    return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
 }

 unsigned int vp8_sub_pixel_variance8x16_c

@@ -455,5 +348,5 @@ unsigned int vp8_sub_pixel_variance8x16_c
     var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
     var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);

-    return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
 }
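The ctz trick in the new variance() works because the block dimensions are powers of two: ctz(w) + ctz(h) = log2(w) + log2(h) = log2(w*h), so the shift divides sum*sum by the pixel count and reproduces the per-size shifts (>>8, >>7, >>6, >>4) of the deleted wrappers. A standalone check, with ctz copied from the file above:

#include <assert.h>

static int ctz(int a) {  /* as defined in the file above */
  int b = 0;
  while (a != 1) {
    a >>= 1;
    b++;
  }
  return b;
}

int main(void) {
  assert(ctz(16) + ctz(16) == 8);  /* 16x16: 256 pixels -> >> 8 */
  assert(ctz(8) + ctz(16) == 7);   /* 8x16 and 16x8: 128 pixels -> >> 7 */
  assert(ctz(8) + ctz(8) == 6);    /* 8x8: 64 pixels -> >> 6 */
  assert(ctz(4) + ctz(4) == 4);    /* 4x4: 16 pixels -> >> 4 */
  return 0;
}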
@@ -11,504 +11,6 @@

 %include "vpx_ports/x86_abi_support.asm"

-;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
-global sym(vp8_get_mb_ss_mmx) PRIVATE
-sym(vp8_get_mb_ss_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    sub         rsp, 8
-    ; end prolog
-
-    mov         rax, arg(0) ;src_ptr
-    mov         rcx, 16
-    pxor        mm4, mm4
-
-.NEXTROW:
-    movq        mm0, [rax]
-    movq        mm1, [rax+8]
-    movq        mm2, [rax+16]
-    movq        mm3, [rax+24]
-    pmaddwd     mm0, mm0
-    pmaddwd     mm1, mm1
-    pmaddwd     mm2, mm2
-    pmaddwd     mm3, mm3
-
-    paddd       mm4, mm0
-    paddd       mm4, mm1
-    paddd       mm4, mm2
-    paddd       mm4, mm3
-
-    add         rax, 32
-    dec         rcx
-    ja          .NEXTROW
-    movq        QWORD PTR [rsp], mm4
-
-    ;return sum[0]+sum[1];
-    movsxd      rax, dword ptr [rsp]
-    movsxd      rcx, dword ptr [rsp+4]
-    add         rax, rcx
-
-    ; begin epilog
-    add         rsp, 8
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_get8x8var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp8_get8x8var_mmx) PRIVATE
-sym(vp8_get8x8var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    push        rbx
-    sub         rsp, 16
-    ; end prolog
-
-    pxor        mm5, mm5            ; Blank mm5
-    pxor        mm6, mm6            ; Blank mm6
-    pxor        mm7, mm7            ; Blank mm7
-
-    mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-    mov         rbx, arg(2) ;[ref_ptr]
-    movsxd      rcx, dword ptr arg(1) ;[source_stride]
-    movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-    ; Row 1
-    movq        mm0, [rax]          ; Copy eight bytes to mm0
-    movq        mm1, [rbx]          ; Copy eight bytes to mm1
-    movq        mm2, mm0            ; Take copies
-    movq        mm3, mm1            ; Take copies
-
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    punpckhbw   mm2, mm6            ; unpack to higher precision
-    punpckhbw   mm3, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    psubsw      mm2, mm3            ; A-B (high order) to MM2
-
-    paddw       mm5, mm0            ; accumulate differences in mm5
-    paddw       mm5, mm2            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    pmaddwd     mm2, mm2            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movq        mm1, [rbx]          ; Copy eight bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-    paddd       mm7, mm2            ; accumulate in mm7
-
-    ; Row 2
-    movq        mm0, [rax]          ; Copy eight bytes to mm0
-    movq        mm2, mm0            ; Take copies
-    movq        mm3, mm1            ; Take copies
-
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    punpckhbw   mm2, mm6            ; unpack to higher precision
-    punpckhbw   mm3, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    psubsw      mm2, mm3            ; A-B (high order) to MM2
-
-    paddw       mm5, mm0            ; accumulate differences in mm5
-    paddw       mm5, mm2            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    pmaddwd     mm2, mm2            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movq        mm1, [rbx]          ; Copy eight bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-    paddd       mm7, mm2            ; accumulate in mm7
-
-    ; Row 3
-    movq        mm0, [rax]          ; Copy eight bytes to mm0
-    movq        mm2, mm0            ; Take copies
-    movq        mm3, mm1            ; Take copies
-
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    punpckhbw   mm2, mm6            ; unpack to higher precision
-    punpckhbw   mm3, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    psubsw      mm2, mm3            ; A-B (high order) to MM2
-
-    paddw       mm5, mm0            ; accumulate differences in mm5
-    paddw       mm5, mm2            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    pmaddwd     mm2, mm2            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movq        mm1, [rbx]          ; Copy eight bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-    paddd       mm7, mm2            ; accumulate in mm7
-
-    ; Row 4
-    movq        mm0, [rax]          ; Copy eight bytes to mm0
-    movq        mm2, mm0            ; Take copies
-    movq        mm3, mm1            ; Take copies
-
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    punpckhbw   mm2, mm6            ; unpack to higher precision
-    punpckhbw   mm3, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    psubsw      mm2, mm3            ; A-B (high order) to MM2
-
-    paddw       mm5, mm0            ; accumulate differences in mm5
-    paddw       mm5, mm2            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    pmaddwd     mm2, mm2            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movq        mm1, [rbx]          ; Copy eight bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-    paddd       mm7, mm2            ; accumulate in mm7
-
-    ; Row 5
-    movq        mm0, [rax]          ; Copy eight bytes to mm0
-    movq        mm2, mm0            ; Take copies
-    movq        mm3, mm1            ; Take copies
-
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    punpckhbw   mm2, mm6            ; unpack to higher precision
-    punpckhbw   mm3, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    psubsw      mm2, mm3            ; A-B (high order) to MM2
-
-    paddw       mm5, mm0            ; accumulate differences in mm5
-    paddw       mm5, mm2            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    pmaddwd     mm2, mm2            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movq        mm1, [rbx]          ; Copy eight bytes to mm1
-    ;  movq       mm4, [rbx + rdx]
-    paddd       mm7, mm0            ; accumulate in mm7
-    paddd       mm7, mm2            ; accumulate in mm7
-
-    ; Row 6
-    movq        mm0, [rax]          ; Copy eight bytes to mm0
-    movq        mm2, mm0            ; Take copies
-    movq        mm3, mm1            ; Take copies
-
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    punpckhbw   mm2, mm6            ; unpack to higher precision
-    punpckhbw   mm3, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    psubsw      mm2, mm3            ; A-B (high order) to MM2
-
-    paddw       mm5, mm0            ; accumulate differences in mm5
-    paddw       mm5, mm2            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    pmaddwd     mm2, mm2            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movq        mm1, [rbx]          ; Copy eight bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-    paddd       mm7, mm2            ; accumulate in mm7
-
-    ; Row 7
-    movq        mm0, [rax]          ; Copy eight bytes to mm0
-    movq        mm2, mm0            ; Take copies
-    movq        mm3, mm1            ; Take copies
-
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    punpckhbw   mm2, mm6            ; unpack to higher precision
-    punpckhbw   mm3, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    psubsw      mm2, mm3            ; A-B (high order) to MM2
-
-    paddw       mm5, mm0            ; accumulate differences in mm5
-    paddw       mm5, mm2            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    pmaddwd     mm2, mm2            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movq        mm1, [rbx]          ; Copy eight bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-    paddd       mm7, mm2            ; accumulate in mm7
-
-    ; Row 8
-    movq        mm0, [rax]          ; Copy eight bytes to mm0
-    movq        mm2, mm0            ; Take copies
-    movq        mm3, mm1            ; Take copies
-
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    punpckhbw   mm2, mm6            ; unpack to higher precision
-    punpckhbw   mm3, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    psubsw      mm2, mm3            ; A-B (high order) to MM2
-
-    paddw       mm5, mm0            ; accumulate differences in mm5
-    paddw       mm5, mm2            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    pmaddwd     mm2, mm2            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    paddd       mm7, mm0            ; accumulate in mm7
-    paddd       mm7, mm2            ; accumulate in mm7
-
-    ; Now accumulate the final results.
-    movq        QWORD PTR [rsp+8], mm5  ; copy back accumulated results into normal memory
-    movq        QWORD PTR [rsp], mm7    ; copy back accumulated results into normal memory
-    movsx       rdx, WORD PTR [rsp+8]
-    movsx       rcx, WORD PTR [rsp+10]
-    movsx       rbx, WORD PTR [rsp+12]
-    movsx       rax, WORD PTR [rsp+14]
-    add         rdx, rcx
-    add         rbx, rax
-    add         rdx, rbx    ;XSum
-    movsxd      rax, DWORD PTR [rsp]
-    movsxd      rcx, DWORD PTR [rsp+4]
-    add         rax, rcx    ;XXSum
-    mov         rsi, arg(4) ;SSE
-    mov         rdi, arg(5) ;Sum
-    mov         dword ptr [rsi], eax
-    mov         dword ptr [rdi], edx
-    xor         rax, rax    ; return 0
-
-    ; begin epilog
-    add         rsp, 16
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int
-;vp8_get4x4var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp8_get4x4var_mmx) PRIVATE
-sym(vp8_get4x4var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    push        rbx
-    sub         rsp, 16
-    ; end prolog
-
-    pxor        mm5, mm5            ; Blank mm5
-    pxor        mm6, mm6            ; Blank mm6
-    pxor        mm7, mm7            ; Blank mm7
-
-    mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-    mov         rbx, arg(2) ;[ref_ptr]
-    movsxd      rcx, dword ptr arg(1) ;[source_stride]
-    movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-    ; Row 1
-    movd        mm0, [rax]          ; Copy four bytes to mm0
-    movd        mm1, [rbx]          ; Copy four bytes to mm1
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    paddw       mm5, mm0            ; accumulate differences in mm5
-    pmaddwd     mm0, mm0            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movd        mm1, [rbx]          ; Copy four bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-
-    ; Row 2
-    movd        mm0, [rax]          ; Copy four bytes to mm0
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    paddw       mm5, mm0            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movd        mm1, [rbx]          ; Copy four bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-
-    ; Row 3
-    movd        mm0, [rax]          ; Copy four bytes to mm0
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    paddw       mm5, mm0            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movd        mm1, [rbx]          ; Copy four bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-
-    ; Row 4
-    movd        mm0, [rax]          ; Copy four bytes to mm0
-
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-
-    paddw       mm5, mm0            ; accumulate differences in mm5
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    paddd       mm7, mm0            ; accumulate in mm7
-
-    ; Now accumulate the final results.
-    movq        QWORD PTR [rsp+8], mm5  ; copy back accumulated results into normal memory
-    movq        QWORD PTR [rsp], mm7    ; copy back accumulated results into normal memory
-    movsx       rdx, WORD PTR [rsp+8]
-    movsx       rcx, WORD PTR [rsp+10]
-    movsx       rbx, WORD PTR [rsp+12]
-    movsx       rax, WORD PTR [rsp+14]
-    add         rdx, rcx
-    add         rbx, rax
-    add         rdx, rbx    ;XSum
-    movsxd      rax, DWORD PTR [rsp]
-    movsxd      rcx, DWORD PTR [rsp+4]
-    add         rax, rcx    ;XXSum
-    mov         rsi, arg(4) ;SSE
-    mov         rdi, arg(5) ;Sum
-    mov         dword ptr [rsi], eax
-    mov         dword ptr [rdi], edx
-    xor         rax, rax    ; return 0
-
-    ; begin epilog
-    add         rsp, 16
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int
-;vp8_get4x4sse_cs_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride
-;)
-global sym(vp8_get4x4sse_cs_mmx) PRIVATE
-sym(vp8_get4x4sse_cs_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    pxor        mm6, mm6            ; Blank mm6
-    pxor        mm7, mm7            ; Blank mm7
-
-    mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-    mov         rbx, arg(2) ;[ref_ptr]
-    movsxd      rcx, dword ptr arg(1) ;[source_stride]
-    movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-    ; Row 1
-    movd        mm0, [rax]          ; Copy four bytes to mm0
-    movd        mm1, [rbx]          ; Copy four bytes to mm1
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    pmaddwd     mm0, mm0            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movd        mm1, [rbx]          ; Copy four bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-
-    ; Row 2
-    movd        mm0, [rax]          ; Copy four bytes to mm0
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    pmaddwd     mm0, mm0            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movd        mm1, [rbx]          ; Copy four bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-
-    ; Row 3
-    movd        mm0, [rax]          ; Copy four bytes to mm0
-    punpcklbw   mm1, mm6
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-
-    pmaddwd     mm0, mm0            ; square and accumulate
-    add         rbx, rdx            ; Inc pointer into ref data
-    add         rax, rcx            ; Inc pointer into the new data
-    movd        mm1, [rbx]          ; Copy four bytes to mm1
-    paddd       mm7, mm0            ; accumulate in mm7
-
-    ; Row 4
-    movd        mm0, [rax]          ; Copy four bytes to mm0
-    punpcklbw   mm0, mm6            ; unpack to higher precision
-    punpcklbw   mm1, mm6
-    psubsw      mm0, mm1            ; A-B (low order) to MM0
-    pmaddwd     mm0, mm0            ; square and accumulate
-    paddd       mm7, mm0            ; accumulate in mm7
-
-    movq        mm0, mm7
-    psrlq       mm7, 32
-
-    paddd       mm0, mm7
-    movq        rax, mm0
-
-    ; begin epilog
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 %define mmx_filter_shift 7

 ;void vp8_filter_block2d_bil4x4_var_mmx
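The MMX and SSE2 paths both lean on pmaddwd for the squared-difference accumulation: it multiplies adjacent signed 16-bit lanes and adds each pair into a 32-bit lane, so squaring a register against itself yields pairwise sums of squares. A scalar model of one 64-bit MMX register's worth (the helper name is illustrative):

/* Scalar model of pmaddwd mm0, mm0 as used above: with both operands equal,
 * each 32-bit result lane is the sum of two squared 16-bit differences. */
static void pmaddwd_model(const short a[4], const short b[4], int out[2]) {
  out[0] = a[0] * b[0] + a[1] * b[1];
  out[1] = a[2] * b[2] + a[3] * b[3];
}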
@@ -13,393 +13,6 @@

 %define xmm_filter_shift 7

-;unsigned int vp8_get_mb_ss_sse2
-;(
-;    short *src_ptr
-;)
-global sym(vp8_get_mb_ss_sse2) PRIVATE
-sym(vp8_get_mb_ss_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 1
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    sub         rsp, 16
-    ; end prolog
-
-    mov         rax, arg(0) ;[src_ptr]
-    mov         rcx, 8
-    pxor        xmm4, xmm4
-
-.NEXTROW:
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax+16]
-    movdqa      xmm2, [rax+32]
-    movdqa      xmm3, [rax+48]
-    pmaddwd     xmm0, xmm0
-    pmaddwd     xmm1, xmm1
-    pmaddwd     xmm2, xmm2
-    pmaddwd     xmm3, xmm3
-
-    paddd       xmm0, xmm1
-    paddd       xmm2, xmm3
-    paddd       xmm4, xmm0
-    paddd       xmm4, xmm2
-
-    add         rax, 0x40
-    dec         rcx
-    ja          .NEXTROW
-
-    movdqa      xmm3, xmm4
-    psrldq      xmm4, 8
-    paddd       xmm4, xmm3
-    movdqa      xmm3, xmm4
-    psrldq      xmm4, 4
-    paddd       xmm4, xmm3
-    movq        rax, xmm4
-
-    ; begin epilog
-    add         rsp, 16
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_get16x16var_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int  *Sum
-;)
-global sym(vp8_get16x16var_sse2) PRIVATE
-sym(vp8_get16x16var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rsi, arg(0) ;[src_ptr]
-    mov         rdi, arg(2) ;[ref_ptr]
-
-    movsxd      rax, DWORD PTR arg(1) ;[source_stride]
-    movsxd      rdx, DWORD PTR arg(3) ;[recon_stride]
-
-    ; Prefetch data
-    lea         rcx, [rax+rax*2]
-    prefetcht0  [rsi]
-    prefetcht0  [rsi+rax]
-    prefetcht0  [rsi+rax*2]
-    prefetcht0  [rsi+rcx]
-    lea         rbx, [rsi+rax*4]
-    prefetcht0  [rbx]
-    prefetcht0  [rbx+rax]
-    prefetcht0  [rbx+rax*2]
-    prefetcht0  [rbx+rcx]
-
-    lea         rcx, [rdx+rdx*2]
-    prefetcht0  [rdi]
-    prefetcht0  [rdi+rdx]
-    prefetcht0  [rdi+rdx*2]
-    prefetcht0  [rdi+rcx]
-    lea         rbx, [rdi+rdx*4]
-    prefetcht0  [rbx]
-    prefetcht0  [rbx+rdx]
-    prefetcht0  [rbx+rdx*2]
-    prefetcht0  [rbx+rcx]
-
-    pxor        xmm0, xmm0          ; clear xmm0 for unpack
-    pxor        xmm7, xmm7          ; clear xmm7 for accumulating diffs
-
-    pxor        xmm6, xmm6          ; clear xmm6 for accumulating sse
-    mov         rcx, 16
-
-.var16loop:
-    movdqu      xmm1, XMMWORD PTR [rsi]
-    movdqu      xmm2, XMMWORD PTR [rdi]
-
-    prefetcht0  [rsi+rax*8]
-    prefetcht0  [rdi+rdx*8]
-
-    movdqa      xmm3, xmm1
-    movdqa      xmm4, xmm2
-
-    punpcklbw   xmm1, xmm0
-    punpckhbw   xmm3, xmm0
-
-    punpcklbw   xmm2, xmm0
-    punpckhbw   xmm4, xmm0
-
-    psubw       xmm1, xmm2
-    psubw       xmm3, xmm4
-
-    paddw       xmm7, xmm1
-    pmaddwd     xmm1, xmm1
-
-    paddw       xmm7, xmm3
-    pmaddwd     xmm3, xmm3
-
-    paddd       xmm6, xmm1
-    paddd       xmm6, xmm3
-
-    add         rsi, rax
-    add         rdi, rdx
-
-    sub         rcx, 1
-    jnz         .var16loop
-
-    movdqa      xmm1, xmm6
-    pxor        xmm6, xmm6
-
-    pxor        xmm5, xmm5
-    punpcklwd   xmm6, xmm7
-
-    punpckhwd   xmm5, xmm7
-    psrad       xmm5, 16
-
-    psrad       xmm6, 16
-    paddd       xmm6, xmm5
-
-    movdqa      xmm2, xmm1
-    punpckldq   xmm1, xmm0
-
-    punpckhdq   xmm2, xmm0
-    movdqa      xmm7, xmm6
-
-    paddd       xmm1, xmm2
-    punpckldq   xmm6, xmm0
-
-    punpckhdq   xmm7, xmm0
-    paddd       xmm6, xmm7
-
-    movdqa      xmm2, xmm1
-    movdqa      xmm7, xmm6
-
-    psrldq      xmm1, 8
-    psrldq      xmm6, 8
-
-    paddd       xmm7, xmm6
-    paddd       xmm1, xmm2
-
-    mov         rax, arg(5) ;[Sum]
-    mov         rdi, arg(4) ;[SSE]
-
-    movd DWORD PTR [rax], xmm7
-    movd DWORD PTR [rdi], xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_get8x8var_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int  *Sum
-;)
-global sym(vp8_get8x8var_sse2) PRIVATE
-sym(vp8_get8x8var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    sub         rsp, 16
-    ; end prolog
-
-    mov         rsi, arg(0) ;[src_ptr]
-    mov         rdi, arg(2) ;[ref_ptr]
-
-    movsxd      rax, DWORD PTR arg(1) ;[source_stride]
-    movsxd      rdx, DWORD PTR arg(3) ;[recon_stride]
-
-    pxor        xmm0, xmm0          ; clear xmm0 for unpack
-    pxor        xmm7, xmm7          ; clear xmm7 for accumulating diffs
-
-    movq        xmm1, QWORD PTR [rsi]
-    movq        xmm2, QWORD PTR [rdi]
-
-    punpcklbw   xmm1, xmm0
-    punpcklbw   xmm2, xmm0
-
-    psubsw      xmm1, xmm2
-    paddw       xmm7, xmm1
-
-    pmaddwd     xmm1, xmm1
-
-    movq        xmm2, QWORD PTR [rsi + rax]
-    movq        xmm3, QWORD PTR [rdi + rdx]
-
-    punpcklbw   xmm2, xmm0
-    punpcklbw   xmm3, xmm0
-
-    psubsw      xmm2, xmm3
-    paddw       xmm7, xmm2
-
-    pmaddwd     xmm2, xmm2
-    paddd       xmm1, xmm2
-
-    movq        xmm2, QWORD PTR [rsi + rax * 2]
-    movq        xmm3, QWORD PTR [rdi + rdx * 2]
-
-    punpcklbw   xmm2, xmm0
-    punpcklbw   xmm3, xmm0
-
-    psubsw      xmm2, xmm3
-    paddw       xmm7, xmm2
-
-    pmaddwd     xmm2, xmm2
-    paddd       xmm1, xmm2
-
-    lea         rsi, [rsi + rax * 2]
-    lea         rdi, [rdi + rdx * 2]
-    movq        xmm2, QWORD PTR [rsi + rax]
-    movq        xmm3, QWORD PTR [rdi + rdx]
-
-    punpcklbw   xmm2, xmm0
-    punpcklbw   xmm3, xmm0
-
-    psubsw      xmm2, xmm3
-    paddw       xmm7, xmm2
-
-    pmaddwd     xmm2, xmm2
-    paddd       xmm1, xmm2
-
-    movq        xmm2, QWORD PTR [rsi + rax * 2]
-    movq        xmm3, QWORD PTR [rdi + rdx * 2]
-
-    punpcklbw   xmm2, xmm0
-    punpcklbw   xmm3, xmm0
-
-    psubsw      xmm2, xmm3
-    paddw       xmm7, xmm2
-
-    pmaddwd     xmm2, xmm2
-    paddd       xmm1, xmm2
-
-    lea         rsi, [rsi + rax * 2]
-    lea         rdi, [rdi + rdx * 2]
-
-    movq        xmm2, QWORD PTR [rsi + rax]
-    movq        xmm3, QWORD PTR [rdi + rdx]
-
-    punpcklbw   xmm2, xmm0
-    punpcklbw   xmm3, xmm0
-
-    psubsw      xmm2, xmm3
-    paddw       xmm7, xmm2
-
-    pmaddwd     xmm2, xmm2
-    paddd       xmm1, xmm2
-
-    movq        xmm2, QWORD PTR [rsi + rax * 2]
-    movq        xmm3, QWORD PTR [rdi + rdx * 2]
-
-    punpcklbw   xmm2, xmm0
-    punpcklbw   xmm3, xmm0
-
-    psubsw      xmm2, xmm3
-    paddw       xmm7, xmm2
-
-    pmaddwd     xmm2, xmm2
-    paddd       xmm1, xmm2
-
-    lea         rsi, [rsi + rax * 2]
-    lea         rdi, [rdi + rdx * 2]
-
-    movq        xmm2, QWORD PTR [rsi + rax]
-    movq        xmm3, QWORD PTR [rdi + rdx]
-
-    punpcklbw   xmm2, xmm0
-    punpcklbw   xmm3, xmm0
-
-    psubsw      xmm2, xmm3
-    paddw       xmm7, xmm2
-
-    pmaddwd     xmm2, xmm2
-    paddd       xmm1, xmm2
-
-    movdqa      xmm6, xmm7
-    punpcklwd   xmm6, xmm0
-
-    punpckhwd   xmm7, xmm0
-    movdqa      xmm2, xmm1
-
-    paddw       xmm6, xmm7
-    punpckldq   xmm1, xmm0
-
-    punpckhdq   xmm2, xmm0
-    movdqa      xmm7, xmm6
-
-    paddd       xmm1, xmm2
-    punpckldq   xmm6, xmm0
punpckhdq xmm7, xmm0
|
||||
paddw xmm6, xmm7
|
||||
|
||||
movdqa xmm2, xmm1
|
||||
movdqa xmm7, xmm6
|
||||
|
||||
psrldq xmm1, 8
|
||||
psrldq xmm6, 8
|
||||
|
||||
paddw xmm7, xmm6
|
||||
paddd xmm1, xmm2
|
||||
|
||||
mov rax, arg(5) ;[Sum]
|
||||
mov rdi, arg(4) ;[SSE]
|
||||
|
||||
movq rdx, xmm7
|
||||
movsx rcx, dx
|
||||
|
||||
mov dword ptr [rax], ecx
|
||||
movd DWORD PTR [rdi], xmm1
|
||||
|
||||
; begin epilog
|
||||
add rsp, 16
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vp8_filter_block2d_bil_var_sse2
|
||||
;(
|
||||
; unsigned char *ref_ptr,
|
||||
|
|
|
@@ -35,25 +35,6 @@ extern void filter_block1d_v6_mmx
    short *filter
);

extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
extern unsigned int vp8_get8x8var_mmx
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
extern unsigned int vp8_get4x4var_mmx
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
    const unsigned char *ref_ptr,
@@ -78,127 +59,6 @@ extern void vp8_filter_block2d_bil_var_mmx
    unsigned int *sumsquared
);


unsigned int vp8_variance4x4_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 4));
}

unsigned int vp8_variance8x8_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;

    return (var - (((unsigned int)avg * avg) >> 6));
}

unsigned int vp8_mse16x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    *sse = var;
    return var;
}


unsigned int vp8_variance16x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 8));
}
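The wrappers above and below all reduce to the same identity: for an N-pixel block, variance = SSE - Sum^2 / N, so the final shift is log2(N), which is 4 for 4x4, 6 for 8x8, 7 for 16x8 and 8x16, and 8 for 16x16. A minimal sketch of that reduction (the helper name is hypothetical, not part of the patch):

/* Hypothetical helper, for illustration only: computes the block
 * variance from a helper's SSE and Sum outputs. log2_count is log2
 * of the pixel count (4, 6, 7 or 8 for the block sizes here). */
static unsigned int variance_from_sse_sum(unsigned int sse, int sum,
                                          int log2_count)
{
    return sse - (((unsigned int)sum * sum) >> log2_count);
}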

unsigned int vp8_variance16x8_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}


unsigned int vp8_variance8x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;

    return (var - (((unsigned int)avg * avg) >> 7));
}


unsigned int vp8_sub_pixel_variance4x4_mmx
(
    const unsigned char *src_ptr,

@@ -31,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
    unsigned int *sumsquared
);

extern unsigned int vp8_get4x4var_mmx
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);

unsigned int vp8_get_mb_ss_sse2
(
    const short *src_ptr
);
unsigned int vp8_get16x16var_sse2
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
unsigned int vp8_get8x8var_sse2
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
void vp8_filter_block2d_bil_var_sse2
(
    const unsigned char *ref_ptr,
@@ -136,115 +104,6 @@ void vp8_half_vert_variance16x_h_sse2
    unsigned int *sumsquared
);

unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 4));
}

unsigned int vp8_variance8x8_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 6));
}


unsigned int vp8_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;
    return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
}
unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;
    return sse0;
}


unsigned int vp8_variance16x8_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}

unsigned int vp8_variance8x16_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}

unsigned int vp8_sub_pixel_variance4x4_wmt
(
    const unsigned char *src_ptr,

@@ -13,15 +13,6 @@
#include "vp8/common/variance.h"
#include "vpx_ports/mem.h"

extern unsigned int vp8_get16x16var_sse2
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
extern void vp8_half_horiz_vert_variance16x_h_sse2
(
    const unsigned char *ref_ptr,

@@ -1,138 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_mse16x16_armv6|

    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
;
; note: Based on vp8_variance16x16_armv6. In this function, sum is never used,
;       so that part of the calculation has been removed.

|vp8_mse16x16_armv6| PROC

    push    {r4-r9, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     r4, #0              ; initialize sse = 0

loop
    ; 1st 4 pixels
    ldr     r5, [r0, #0x0]      ; load 4 src pixels
    ldr     r6, [r2, #0x0]      ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r8, r5, r6          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels

    ldr     r5, [r0, #0x4]      ; load 4 src pixels

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels
    ldr     r5, [r0, #0x8]      ; load 4 src pixels
    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels

    ldr     r5, [r0, #0xc]      ; load 4 src pixels

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels

    subs    r12, r12, #1        ; next row

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    bne     loop

    ; return stuff
    ldr     r1, [sp, #28]       ; get address of sse
    mov     r0, r4              ; return sse
    str     r4, [r1]            ; store sse

    pop     {r4-r9, pc}

    ENDP

    END

@@ -1,131 +0,0 @@
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

unsigned int vp8_mse16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    int64x1_t d0s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    int32x4_t q7s32, q8s32, q9s32, q10s32;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int64x2_t q1s64;

    q7s32 = vdupq_n_s32(0);
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
        q8s32 = vmlal_s16(q8s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
        q8s32 = vmlal_s16(q8s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q7s32 = vaddq_s32(q7s32, q8s32);
    q9s32 = vaddq_s32(q9s32, q10s32);
    q10s32 = vaddq_s32(q7s32, q9s32);

    q1s64 = vpaddlq_s32(q10s32);
    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
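For reference, the NEON kernel above computes the same value as a plain sum of squared differences over the 16x16 block. A scalar sketch under the same argument conventions (illustrative only, not part of the patch):

/* Scalar equivalent of the NEON kernel above, for illustration. */
static unsigned int mse16x16_scalar_sketch(const unsigned char *src_ptr,
                                           int source_stride,
                                           const unsigned char *ref_ptr,
                                           int recon_stride,
                                           unsigned int *sse)
{
    unsigned int total = 0;
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            int diff = src_ptr[c] - ref_ptr[c];
            total += (unsigned int)(diff * diff);
        }

        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *sse = total;
    return total;
}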

unsigned int vp8_get4x4sse_cs_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride) {
    int16x4_t d22s16, d24s16, d26s16, d28s16;
    int64x1_t d0s64;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int32x4_t q7s32, q8s32, q9s32, q10s32;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int64x2_t q1s64;

    d0u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d4u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d1u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d5u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d2u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d6u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d3u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d7u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;

    q11u16 = vsubl_u8(d0u8, d4u8);
    q12u16 = vsubl_u8(d1u8, d5u8);
    q13u16 = vsubl_u8(d2u8, d6u8);
    q14u16 = vsubl_u8(d3u8, d7u8);

    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));

    q7s32 = vmull_s16(d22s16, d22s16);
    q8s32 = vmull_s16(d24s16, d24s16);
    q9s32 = vmull_s16(d26s16, d26s16);
    q10s32 = vmull_s16(d28s16, d28s16);

    q7s32 = vaddq_s32(q7s32, q8s32);
    q9s32 = vaddq_s32(q9s32, q10s32);
    q9s32 = vaddq_s32(q7s32, q9s32);

    q1s64 = vpaddlq_s32(q9s32);
    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}

@@ -11,6 +11,7 @@

#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "encodemb.h"
#include "encodemv.h"
#include "vp8/common/common.h"
@@ -90,7 +91,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
     * lambda using a non-linear combination (e.g., the smallest, or second
     * smallest, etc.).
     */
    act = vp8_variance16x16(x->src.y_buffer,
    act = vpx_variance16x16(x->src.y_buffer,
                            x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
    act = act<<4;

@@ -11,6 +11,7 @@

#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "quantize.h"
#include "vp8/common/reconintra4x4.h"
#include "encodemb.h"
@@ -44,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
        }
    }

    intra_pred_var = vp8_get_mb_ss(x->src_diff);
    intra_pred_var = vpx_get_mb_ss(x->src_diff);

    return intra_pred_var;
}

@@ -12,6 +12,7 @@
#include <limits.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "block.h"
#include "onyx_int.h"
@@ -422,14 +423,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
    /* Set up pointers for this macro block raw buffer */
    raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
                                + d->offset);
    vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
                   (unsigned int *)(raw_motion_err));
    vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride,
                 (unsigned int *)(raw_motion_err));

    /* Set up pointers for this macro block recon buffer */
    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
    ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset);
    vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
                   (unsigned int *)(best_motion_err));
    vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
                 (unsigned int *)(best_motion_err));
}

static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
@@ -453,7 +454,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
    int new_mv_mode_penalty = 256;

    /* override the default variance function to use MSE */
    v_fn_ptr.vf = vp8_mse16x16;
    v_fn_ptr.vf = vpx_mse16x16;

    /* Set up pointers for this macro block recon buffer */
    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;

@@ -2131,7 +2131,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
#endif

    cpi->fn_ptr[BLOCK_16X16].sdf            = vpx_sad16x16;
    cpi->fn_ptr[BLOCK_16X16].vf             = vp8_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].vf             = vpx_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].svf            = vp8_sub_pixel_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vp8_variance_halfpixvar16x16_h;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vp8_variance_halfpixvar16x16_v;
@@ -2141,7 +2141,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->fn_ptr[BLOCK_16X16].sdx4df         = vpx_sad16x16x4d;

    cpi->fn_ptr[BLOCK_16X8].sdf             = vpx_sad16x8;
    cpi->fn_ptr[BLOCK_16X8].vf              = vp8_variance16x8;
    cpi->fn_ptr[BLOCK_16X8].vf              = vpx_variance16x8;
    cpi->fn_ptr[BLOCK_16X8].svf             = vp8_sub_pixel_variance16x8;
    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h   = NULL;
    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v   = NULL;
@@ -2151,7 +2151,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->fn_ptr[BLOCK_16X8].sdx4df          = vpx_sad16x8x4d;

    cpi->fn_ptr[BLOCK_8X16].sdf             = vpx_sad8x16;
    cpi->fn_ptr[BLOCK_8X16].vf              = vp8_variance8x16;
    cpi->fn_ptr[BLOCK_8X16].vf              = vpx_variance8x16;
    cpi->fn_ptr[BLOCK_8X16].svf             = vp8_sub_pixel_variance8x16;
    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h   = NULL;
    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v   = NULL;
@@ -2161,7 +2161,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->fn_ptr[BLOCK_8X16].sdx4df          = vpx_sad8x16x4d;

    cpi->fn_ptr[BLOCK_8X8].sdf              = vpx_sad8x8;
    cpi->fn_ptr[BLOCK_8X8].vf               = vp8_variance8x8;
    cpi->fn_ptr[BLOCK_8X8].vf               = vpx_variance8x8;
    cpi->fn_ptr[BLOCK_8X8].svf              = vp8_sub_pixel_variance8x8;
    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h    = NULL;
    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v    = NULL;
@@ -2171,7 +2171,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
    cpi->fn_ptr[BLOCK_8X8].sdx4df           = vpx_sad8x8x4d;

    cpi->fn_ptr[BLOCK_4X4].sdf              = vpx_sad4x4;
    cpi->fn_ptr[BLOCK_4X4].vf               = vp8_variance4x4;
    cpi->fn_ptr[BLOCK_4X4].vf               = vpx_variance4x4;
    cpi->fn_ptr[BLOCK_4X4].svf              = vp8_sub_pixel_variance4x4;
    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h    = NULL;
    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v    = NULL;
@@ -2558,7 +2558,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
        {
            unsigned int sse;

            vp8_mse16x16(orig + col, orig_stride,
            vpx_mse16x16(orig + col, orig_stride,
                         recon + col, recon_stride,
                         &sse);
            total_sse += sse;
@@ -3384,7 +3384,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
            int index = block_index_row + (j >> 4);
            if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
                unsigned int sse;
                Total += vp8_mse16x16(src + j,
                Total += vpx_mse16x16(src + j,
                                      source->y_stride,
                                      dst + j, dest->y_stride,
                                      &sse);
@@ -3448,7 +3448,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
            int index = block_index_row + (j >> 4);
            if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
                unsigned int sse;
                const unsigned int var = vp8_variance16x16(src + j,
                const unsigned int var = vpx_variance16x16(src + j,
                                                           ystride,
                                                           dst + j,
                                                           ystride,
@@ -3458,7 +3458,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
                // is small (to avoid effects from lighting change).
                if ((sse - var) < 128) {
                    unsigned int sse2;
                    const unsigned int act = vp8_variance16x16(src + j,
                    const unsigned int act = vpx_variance16x16(src + j,
                                                               ystride,
                                                               const_source,
                                                               0,
@@ -5993,7 +5993,8 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest)
        for (j = 0; j < source->y_width; j += 16)
        {
            unsigned int sse;
            Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
            Total += vpx_mse16x16(src + j, source->y_stride,
                                  dst + j, dest->y_stride, &sse);
        }

        src += 16 * source->y_stride;

@@ -11,6 +11,7 @@

#include <limits.h>
#include "vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "onyx_int.h"
#include "modecosts.h"
#include "encodeintra.h"
@@ -215,33 +216,6 @@ int vp8_get_inter_mbpred_error(MACROBLOCK *mb,

}


unsigned int vp8_get4x4sse_cs_c
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride
)
{
    int distortion = 0;
    int r, c;

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            int diff = src_ptr[c] - ref_ptr[c];
            distortion += diff * diff;
        }

        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    return distortion;
}

static int get_prediction_error(BLOCK *be, BLOCKD *b)
{
    unsigned char *sptr;
@@ -249,7 +223,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b)
    sptr = (*(be->base_src) + be->src);
    dptr = b->predictor;

    return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
    return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16);

}

@@ -1037,7 +1011,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
            else
            {
                rate2 += rate;
                distortion2 = vp8_variance16x16(
                distortion2 = vpx_variance16x16(
                                    *(b->base_src), b->src_stride,
                                    x->e_mbd.predictor, 16, &sse);
                this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
@@ -1066,7 +1040,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                                         xd->dst.y_stride,
                                         xd->predictor,
                                         16);
            distortion2 = vp8_variance16x16
            distortion2 = vpx_variance16x16
                                          (*(b->base_src), b->src_stride,
                                           x->e_mbd.predictor, 16, &sse);
            rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
@@ -1547,7 +1521,7 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
                             xd->dst.y_stride,
                             xd->predictor,
                             16);
        distortion = vp8_variance16x16
        distortion = vpx_variance16x16
            (*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
        rate = x->mbmode_cost[xd->frame_type][mode];
        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);

@@ -9,6 +9,7 @@
 */


#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp8/common/onyxc_int.h"
#include "onyx_int.h"
@@ -83,7 +84,7 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
        for (j = 0; j < source->y_width; j += 16)
        {
            unsigned int sse;
            Total += vp8_mse16x16(src + j, source->y_stride,
            Total += vpx_mse16x16(src + j, source->y_stride,
                                  dst + j, dest->y_stride,
                                  &sse);
        }

@@ -15,6 +15,7 @@
#include <assert.h>
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "tokenize.h"
#include "treewriter.h"
#include "onyx_int.h"
@@ -507,9 +508,9 @@ int VP8_UVSSE(MACROBLOCK *x)
    }
    else
    {
        vp8_variance8x8(uptr, pre_stride,
        vpx_variance8x8(uptr, pre_stride,
                        upred_ptr, uv_stride, &sse2);
        vp8_variance8x8(vptr, pre_stride,
        vpx_variance8x8(vptr, pre_stride,
                        vpred_ptr, uv_stride, &sse1);
        sse2 += sse1;
    }
@@ -1783,7 +1784,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4],
        if(threshold < x->encode_breakout)
            threshold = x->encode_breakout;

        var = vp8_variance16x16
        var = vpx_variance16x16
                (*(b->base_src), b->src_stride,
                 x->e_mbd.predictor, 16, &sse);

@@ -145,8 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
@@ -168,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c

$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))

@@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
#File list for media
# encoder
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)

#File list for neon
@@ -27,5 +26,4 @@ VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c

@@ -171,13 +171,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
  get_thr(bs, qdiff, &sad_thr, &vdiff_thr);

  if (bs == BLOCK_16X16) {
    vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
    vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
    sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
  } else if (bs == BLOCK_32X32) {
    vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
    vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
    sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
  } else /* if (bs == BLOCK_64X64) */ {
    vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
    vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
    sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
  }
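The "+ 128) >> 8", "+ 512) >> 10", and "+ 2048) >> 12" patterns above are rounded per-pixel normalizations: add half the divisor, then shift right by log2 of the block's pixel count (256, 1024, and 4096). A sketch of the rounding step (hypothetical helper, not part of the patch):

/* Round-to-nearest division by 2^bits, as used above to normalize
 * variance and SAD values per pixel. */
static unsigned int round_shift_sketch(unsigned int v, int bits) {
  return (v + (1u << (bits - 1))) >> bits;
}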
@ -797,51 +797,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
|
|||
|
||||
|
||||
# variance
|
||||
add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance8x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance4x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_variance4x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
|
||||
|
||||
|
@ -922,21 +877,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
|
|||
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||
specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||
specialize qw/vp9_mse8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||
specialize qw/vp9_mse16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||
specialize qw/vp9_mse8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
|
||||
specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
|
||||
specialize qw/vp9_avg_8x8 sse2 neon/;
|
||||
|
||||
|
@ -1141,142 +1081,6 @@ specialize qw/vp9_temporal_filter_apply sse2/;
|
|||
|
||||
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
# variance
|
||||
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance8x4/;
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance4x8/;
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance4x4/;
|
||||
|
||||
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance8x4/;
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance4x8/;
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance4x4/;
|
||||
|
||||
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance8x4/;
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance4x8/;
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance4x4/;
|
||||
|
-add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";

-add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";

@@ -1511,41 +1315,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;

-add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";

-add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse8x16/;

-add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse16x8/;

-add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";

-add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";

-add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse8x16/;

-add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse16x8/;

-add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";

-add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";

-add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse8x16/;

-add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse16x8/;

-add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";

# ENCODEMB INVOKE
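Each add_proto/specialize pair above feeds libvpx's rtcd generator, which emits one function pointer per prototype and wires it to the best available specialization at startup. Deleting the vp9_ stanzas here (the same prototypes are re-added under vpx_dsp) hands ownership of the symbols to the new library. As a rough, illustrative sketch of what the generated dispatch looks like for one such function — the exact names come out of the generated rtcd header, so treat everything here as an assumption:

#include <stdint.h>

/* Generated prototypes: a portable C version plus SIMD specializations. */
unsigned int vpx_mse16x16_c(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            unsigned int *sse);
unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse);

/* One function pointer per add_proto; callers go through this. */
unsigned int (*vpx_mse16x16)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse);

/* Hypothetical setup routine: pick the fastest version the CPU supports.
 * HAS_SSE2 stands in for the real CPU-feature flag. */
static void setup_rtcd_sketch(int cpu_flags) {
  vpx_mse16x16 = vpx_mse16x16_c;                      /* portable fallback */
  if (cpu_flags & 0x1 /* HAS_SSE2 */) vpx_mse16x16 = vpx_mse16x16_sse2;
}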
@@ -10,6 +10,7 @@
#include <arm_neon.h>
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx_ports/mem.h"

@@ -20,82 +21,6 @@
#include "vp9/encoder/vp9_variance.h"

-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
-  const int64x2_t b = vpaddlq_s32(a);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride,
-                             const uint8_t *b, int b_stride,
-                             int w, int h, uint32_t *sse, int *sum) {
-  int i, j;
-  int16x8_t v_sum = vdupq_n_s16(0);
-  int32x4_t v_sse_lo = vdupq_n_s32(0);
-  int32x4_t v_sse_hi = vdupq_n_s32(0);
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 8) {
-      const uint8x8_t v_a = vld1_u8(&a[j]);
-      const uint8x8_t v_b = vld1_u8(&b[j]);
-      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
-      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
-      v_sum = vaddq_s16(v_sum, sv_diff);
-      v_sse_lo = vmlal_s16(v_sse_lo,
-                           vget_low_s16(sv_diff),
-                           vget_low_s16(sv_diff));
-      v_sse_hi = vmlal_s16(v_sse_hi,
-                           vget_high_s16(sv_diff),
-                           vget_high_s16(sv_diff));
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-
-  *sum = horizontal_add_s16x8(v_sum);
-  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
-                        const uint8_t *ref_ptr, int ref_stride,
-                        unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,
-                   8, sse, sum);
-}
-
-unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
-                                  const uint8_t *b, int b_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 6);  // >> 6 = / 8 * 8
-}
-
-void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
-                          const uint8_t *ref_ptr, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,
-                   16, sse, sum);
-}
-
-unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 8);  // >> 8 = / 16 * 16
-}
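The NEON kernels removed above (they now live in vpx_dsp) use the one-pass identity variance * N = SSE - sum^2 / N, with the division by the pixel count N = W*H folded into a right shift because every block size here is a power of two (8*8 = 2^6, 16*16 = 2^8). A minimal scalar restatement of that return expression, illustrative only:

#include <stdint.h>

/* Mirrors `*sse - (((int64_t)sum * sum) >> 6)` from above. sum * sum is
 * never negative, so the shift by log2(w * h) is exactly the truncating
 * division by w * h. */
static uint32_t variance_from_partials(uint32_t sse, int sum, int log2_wh) {
  return sse - (uint32_t)(((int64_t)sum * (int64_t)sum) >> log2_wh);
}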

static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
@@ -162,7 +87,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
                            BILINEAR_FILTERS_2TAP(xoffset));
  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
                            8, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}

unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,

@@ -180,77 +105,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
                             BILINEAR_FILTERS_2TAP(xoffset));
  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
                             16, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
-                          const uint8_t *ref_ptr, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,
-                   32, sse, sum);
-}
-
-unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / 32 * 32
-}
-
-unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
-  variance_neon_w8(a + (32 * a_stride), a_stride,
-                   b + (32 * b_stride), b_stride, 32, 32,
-                   &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
-}
-
-unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride,
-                   b + (16 * b_stride), b_stride, 64, 16,
-                   &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 64 * 32
-}
-
-unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride,
-                   b + (16 * b_stride), b_stride, 64, 16,
-                   &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
-                   b + (16 * 2 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
-                   b + (16 * 3 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
+  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
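The removed 32x64/64x32/64x64 kernels split the block into strips so that each variance_neon_w8 call keeps w*h at or below 1024, respecting the "w * h must be less than 2048" bound on the int16x8 accumulator; partial sse/sum values are combined before the final identity is applied once. A generic restatement of that strip decomposition, with a hypothetical callback standing in for variance_neon_w8 (illustrative, not from the patch):

#include <stdint.h>

/* Hypothetical partial-accumulator type matching variance_neon_w8. */
typedef void (*partial_var_fn)(const uint8_t *a, int a_stride,
                               const uint8_t *b, int b_stride,
                               int w, int h, uint32_t *sse, int *sum);

/* Accumulate w x strip_h strips, each small enough for the vector
 * accumulator, then apply variance = sse - sum^2 / (w * h) once. */
static uint32_t strip_variance(partial_var_fn fn,
                               const uint8_t *a, int a_stride,
                               const uint8_t *b, int b_stride,
                               int w, int h, int strip_h, uint32_t *sse) {
  int64_t sum = 0;
  uint32_t sse_acc = 0;
  int y;
  for (y = 0; y < h; y += strip_h) {
    uint32_t sse_part;
    int sum_part;
    fn(a + y * a_stride, a_stride, b + y * b_stride, b_stride,
       w, strip_h, &sse_part, &sum_part);
    sse_acc += sse_part;
    sum += sum_part;
  }
  *sse = sse_acc;
  return sse_acc - (uint32_t)((sum * sum) / (w * h));
}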

unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,

@@ -268,7 +123,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
                             BILINEAR_FILTERS_2TAP(xoffset));
  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
                             32, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}

unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,

@@ -286,5 +141,5 @@ unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
                             BILINEAR_FILTERS_2TAP(xoffset));
  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
                             64, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}

@@ -98,9 +98,9 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
  int avg;
#if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride,
-                    CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
-                    &sse, &avg);
+    highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                      CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
+                      &sse, &avg);
    sse >>= 2 * (xd->bd - 8);
    avg >>= (xd->bd - 8);
  } else {

@@ -13,6 +13,7 @@
#include <stdio.h>

#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx_ports/mem.h"

@@ -3672,15 +3673,15 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
    if (cm->use_highbitdepth) {
      switch (cm->bit_depth) {
        case VPX_BITS_8:
-          vp9_highbd_get16x16var(src, src_stride, last_src, last_stride,
+          vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
                                   &var16->sse, &var16->sum);
          break;
        case VPX_BITS_10:
-          vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
+          vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
                                    &var16->sse, &var16->sum);
          break;
        case VPX_BITS_12:
-          vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
+          vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
                                    &var16->sse, &var16->sum);
          break;
        default:

@@ -3689,11 +3690,11 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
          return -1;
      }
    } else {
-      vp9_get16x16var(src, src_stride, last_src, last_stride,
+      vpx_get16x16var(src, src_stride, last_src, last_stride,
                      &var16->sse, &var16->sum);
    }
#else
-    vp9_get16x16var(src, src_stride, last_src, last_stride,
+    vpx_get16x16var(src, src_stride, last_src, last_stride,
                    &var16->sse, &var16->sum);
#endif  // CONFIG_VP9_HIGHBITDEPTH
    var16->var = var16->sse -

@@ -998,7 +998,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_32X16,
                 vpx_highbd_sad32x16_bits8,
                 vpx_highbd_sad32x16_avg_bits8,
-                vp9_highbd_variance32x16,
+                vpx_highbd_8_variance32x16,
                 vp9_highbd_sub_pixel_variance32x16,
                 vp9_highbd_sub_pixel_avg_variance32x16,
                 NULL,

@@ -1008,7 +1008,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_16X32,
                 vpx_highbd_sad16x32_bits8,
                 vpx_highbd_sad16x32_avg_bits8,
-                vp9_highbd_variance16x32,
+                vpx_highbd_8_variance16x32,
                 vp9_highbd_sub_pixel_variance16x32,
                 vp9_highbd_sub_pixel_avg_variance16x32,
                 NULL,

@@ -1018,7 +1018,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_64X32,
                 vpx_highbd_sad64x32_bits8,
                 vpx_highbd_sad64x32_avg_bits8,
-                vp9_highbd_variance64x32,
+                vpx_highbd_8_variance64x32,
                 vp9_highbd_sub_pixel_variance64x32,
                 vp9_highbd_sub_pixel_avg_variance64x32,
                 NULL,

@@ -1028,7 +1028,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_32X64,
                 vpx_highbd_sad32x64_bits8,
                 vpx_highbd_sad32x64_avg_bits8,
-                vp9_highbd_variance32x64,
+                vpx_highbd_8_variance32x64,
                 vp9_highbd_sub_pixel_variance32x64,
                 vp9_highbd_sub_pixel_avg_variance32x64,
                 NULL,

@@ -1038,7 +1038,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_32X32,
                 vpx_highbd_sad32x32_bits8,
                 vpx_highbd_sad32x32_avg_bits8,
-                vp9_highbd_variance32x32,
+                vpx_highbd_8_variance32x32,
                 vp9_highbd_sub_pixel_variance32x32,
                 vp9_highbd_sub_pixel_avg_variance32x32,
                 vpx_highbd_sad32x32x3_bits8,

@@ -1048,7 +1048,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_64X64,
                 vpx_highbd_sad64x64_bits8,
                 vpx_highbd_sad64x64_avg_bits8,
-                vp9_highbd_variance64x64,
+                vpx_highbd_8_variance64x64,
                 vp9_highbd_sub_pixel_variance64x64,
                 vp9_highbd_sub_pixel_avg_variance64x64,
                 vpx_highbd_sad64x64x3_bits8,

@@ -1058,7 +1058,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_16X16,
                 vpx_highbd_sad16x16_bits8,
                 vpx_highbd_sad16x16_avg_bits8,
-                vp9_highbd_variance16x16,
+                vpx_highbd_8_variance16x16,
                 vp9_highbd_sub_pixel_variance16x16,
                 vp9_highbd_sub_pixel_avg_variance16x16,
                 vpx_highbd_sad16x16x3_bits8,

@@ -1068,7 +1068,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_16X8,
                 vpx_highbd_sad16x8_bits8,
                 vpx_highbd_sad16x8_avg_bits8,
-                vp9_highbd_variance16x8,
+                vpx_highbd_8_variance16x8,
                 vp9_highbd_sub_pixel_variance16x8,
                 vp9_highbd_sub_pixel_avg_variance16x8,
                 vpx_highbd_sad16x8x3_bits8,

@@ -1078,7 +1078,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_8X16,
                 vpx_highbd_sad8x16_bits8,
                 vpx_highbd_sad8x16_avg_bits8,
-                vp9_highbd_variance8x16,
+                vpx_highbd_8_variance8x16,
                 vp9_highbd_sub_pixel_variance8x16,
                 vp9_highbd_sub_pixel_avg_variance8x16,
                 vpx_highbd_sad8x16x3_bits8,

@@ -1088,7 +1088,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_8X8,
                 vpx_highbd_sad8x8_bits8,
                 vpx_highbd_sad8x8_avg_bits8,
-                vp9_highbd_variance8x8,
+                vpx_highbd_8_variance8x8,
                 vp9_highbd_sub_pixel_variance8x8,
                 vp9_highbd_sub_pixel_avg_variance8x8,
                 vpx_highbd_sad8x8x3_bits8,

@@ -1098,7 +1098,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_8X4,
                 vpx_highbd_sad8x4_bits8,
                 vpx_highbd_sad8x4_avg_bits8,
-                vp9_highbd_variance8x4,
+                vpx_highbd_8_variance8x4,
                 vp9_highbd_sub_pixel_variance8x4,
                 vp9_highbd_sub_pixel_avg_variance8x4,
                 NULL,

@@ -1108,7 +1108,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_4X8,
                 vpx_highbd_sad4x8_bits8,
                 vpx_highbd_sad4x8_avg_bits8,
-                vp9_highbd_variance4x8,
+                vpx_highbd_8_variance4x8,
                 vp9_highbd_sub_pixel_variance4x8,
                 vp9_highbd_sub_pixel_avg_variance4x8,
                 NULL,

@@ -1118,7 +1118,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_4X4,
                 vpx_highbd_sad4x4_bits8,
                 vpx_highbd_sad4x4_avg_bits8,
-                vp9_highbd_variance4x4,
+                vpx_highbd_8_variance4x4,
                 vp9_highbd_sub_pixel_variance4x4,
                 vp9_highbd_sub_pixel_avg_variance4x4,
                 vpx_highbd_sad4x4x3_bits8,

@@ -1130,7 +1130,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_32X16,
                 vpx_highbd_sad32x16_bits10,
                 vpx_highbd_sad32x16_avg_bits10,
-                vp9_highbd_10_variance32x16,
+                vpx_highbd_10_variance32x16,
                 vp9_highbd_10_sub_pixel_variance32x16,
                 vp9_highbd_10_sub_pixel_avg_variance32x16,
                 NULL,

@@ -1140,7 +1140,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_16X32,
                 vpx_highbd_sad16x32_bits10,
                 vpx_highbd_sad16x32_avg_bits10,
-                vp9_highbd_10_variance16x32,
+                vpx_highbd_10_variance16x32,
                 vp9_highbd_10_sub_pixel_variance16x32,
                 vp9_highbd_10_sub_pixel_avg_variance16x32,
                 NULL,

@@ -1150,7 +1150,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_64X32,
                 vpx_highbd_sad64x32_bits10,
                 vpx_highbd_sad64x32_avg_bits10,
-                vp9_highbd_10_variance64x32,
+                vpx_highbd_10_variance64x32,
                 vp9_highbd_10_sub_pixel_variance64x32,
                 vp9_highbd_10_sub_pixel_avg_variance64x32,
                 NULL,

@@ -1160,7 +1160,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_32X64,
                 vpx_highbd_sad32x64_bits10,
                 vpx_highbd_sad32x64_avg_bits10,
-                vp9_highbd_10_variance32x64,
+                vpx_highbd_10_variance32x64,
                 vp9_highbd_10_sub_pixel_variance32x64,
                 vp9_highbd_10_sub_pixel_avg_variance32x64,
                 NULL,

@@ -1170,7 +1170,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_32X32,
                 vpx_highbd_sad32x32_bits10,
                 vpx_highbd_sad32x32_avg_bits10,
-                vp9_highbd_10_variance32x32,
+                vpx_highbd_10_variance32x32,
                 vp9_highbd_10_sub_pixel_variance32x32,
                 vp9_highbd_10_sub_pixel_avg_variance32x32,
                 vpx_highbd_sad32x32x3_bits10,

@@ -1180,7 +1180,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_64X64,
                 vpx_highbd_sad64x64_bits10,
                 vpx_highbd_sad64x64_avg_bits10,
-                vp9_highbd_10_variance64x64,
+                vpx_highbd_10_variance64x64,
                 vp9_highbd_10_sub_pixel_variance64x64,
                 vp9_highbd_10_sub_pixel_avg_variance64x64,
                 vpx_highbd_sad64x64x3_bits10,

@@ -1190,7 +1190,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_16X16,
                 vpx_highbd_sad16x16_bits10,
                 vpx_highbd_sad16x16_avg_bits10,
-                vp9_highbd_10_variance16x16,
+                vpx_highbd_10_variance16x16,
                 vp9_highbd_10_sub_pixel_variance16x16,
                 vp9_highbd_10_sub_pixel_avg_variance16x16,
                 vpx_highbd_sad16x16x3_bits10,

@@ -1200,7 +1200,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_16X8,
                 vpx_highbd_sad16x8_bits10,
                 vpx_highbd_sad16x8_avg_bits10,
-                vp9_highbd_10_variance16x8,
+                vpx_highbd_10_variance16x8,
                 vp9_highbd_10_sub_pixel_variance16x8,
                 vp9_highbd_10_sub_pixel_avg_variance16x8,
                 vpx_highbd_sad16x8x3_bits10,

@@ -1210,7 +1210,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_8X16,
                 vpx_highbd_sad8x16_bits10,
                 vpx_highbd_sad8x16_avg_bits10,
-                vp9_highbd_10_variance8x16,
+                vpx_highbd_10_variance8x16,
                 vp9_highbd_10_sub_pixel_variance8x16,
                 vp9_highbd_10_sub_pixel_avg_variance8x16,
                 vpx_highbd_sad8x16x3_bits10,

@@ -1220,7 +1220,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_8X8,
                 vpx_highbd_sad8x8_bits10,
                 vpx_highbd_sad8x8_avg_bits10,
-                vp9_highbd_10_variance8x8,
+                vpx_highbd_10_variance8x8,
                 vp9_highbd_10_sub_pixel_variance8x8,
                 vp9_highbd_10_sub_pixel_avg_variance8x8,
                 vpx_highbd_sad8x8x3_bits10,

@@ -1230,7 +1230,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_8X4,
                 vpx_highbd_sad8x4_bits10,
                 vpx_highbd_sad8x4_avg_bits10,
-                vp9_highbd_10_variance8x4,
+                vpx_highbd_10_variance8x4,
                 vp9_highbd_10_sub_pixel_variance8x4,
                 vp9_highbd_10_sub_pixel_avg_variance8x4,
                 NULL,

@@ -1240,7 +1240,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_4X8,
                 vpx_highbd_sad4x8_bits10,
                 vpx_highbd_sad4x8_avg_bits10,
-                vp9_highbd_10_variance4x8,
+                vpx_highbd_10_variance4x8,
                 vp9_highbd_10_sub_pixel_variance4x8,
                 vp9_highbd_10_sub_pixel_avg_variance4x8,
                 NULL,

@@ -1250,7 +1250,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_4X4,
                 vpx_highbd_sad4x4_bits10,
                 vpx_highbd_sad4x4_avg_bits10,
-                vp9_highbd_10_variance4x4,
+                vpx_highbd_10_variance4x4,
                 vp9_highbd_10_sub_pixel_variance4x4,
                 vp9_highbd_10_sub_pixel_avg_variance4x4,
                 vpx_highbd_sad4x4x3_bits10,

@@ -1262,7 +1262,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_32X16,
                 vpx_highbd_sad32x16_bits12,
                 vpx_highbd_sad32x16_avg_bits12,
-                vp9_highbd_12_variance32x16,
+                vpx_highbd_12_variance32x16,
                 vp9_highbd_12_sub_pixel_variance32x16,
                 vp9_highbd_12_sub_pixel_avg_variance32x16,
                 NULL,

@@ -1272,7 +1272,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_16X32,
                 vpx_highbd_sad16x32_bits12,
                 vpx_highbd_sad16x32_avg_bits12,
-                vp9_highbd_12_variance16x32,
+                vpx_highbd_12_variance16x32,
                 vp9_highbd_12_sub_pixel_variance16x32,
                 vp9_highbd_12_sub_pixel_avg_variance16x32,
                 NULL,

@@ -1282,7 +1282,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_64X32,
                 vpx_highbd_sad64x32_bits12,
                 vpx_highbd_sad64x32_avg_bits12,
-                vp9_highbd_12_variance64x32,
+                vpx_highbd_12_variance64x32,
                 vp9_highbd_12_sub_pixel_variance64x32,
                 vp9_highbd_12_sub_pixel_avg_variance64x32,
                 NULL,

@@ -1292,7 +1292,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_32X64,
                 vpx_highbd_sad32x64_bits12,
                 vpx_highbd_sad32x64_avg_bits12,
-                vp9_highbd_12_variance32x64,
+                vpx_highbd_12_variance32x64,
                 vp9_highbd_12_sub_pixel_variance32x64,
                 vp9_highbd_12_sub_pixel_avg_variance32x64,
                 NULL,

@@ -1302,7 +1302,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_32X32,
                 vpx_highbd_sad32x32_bits12,
                 vpx_highbd_sad32x32_avg_bits12,
-                vp9_highbd_12_variance32x32,
+                vpx_highbd_12_variance32x32,
                 vp9_highbd_12_sub_pixel_variance32x32,
                 vp9_highbd_12_sub_pixel_avg_variance32x32,
                 vpx_highbd_sad32x32x3_bits12,

@@ -1312,7 +1312,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_64X64,
                 vpx_highbd_sad64x64_bits12,
                 vpx_highbd_sad64x64_avg_bits12,
-                vp9_highbd_12_variance64x64,
+                vpx_highbd_12_variance64x64,
                 vp9_highbd_12_sub_pixel_variance64x64,
                 vp9_highbd_12_sub_pixel_avg_variance64x64,
                 vpx_highbd_sad64x64x3_bits12,

@@ -1322,7 +1322,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_16X16,
                 vpx_highbd_sad16x16_bits12,
                 vpx_highbd_sad16x16_avg_bits12,
-                vp9_highbd_12_variance16x16,
+                vpx_highbd_12_variance16x16,
                 vp9_highbd_12_sub_pixel_variance16x16,
                 vp9_highbd_12_sub_pixel_avg_variance16x16,
                 vpx_highbd_sad16x16x3_bits12,

@@ -1332,7 +1332,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_16X8,
                 vpx_highbd_sad16x8_bits12,
                 vpx_highbd_sad16x8_avg_bits12,
-                vp9_highbd_12_variance16x8,
+                vpx_highbd_12_variance16x8,
                 vp9_highbd_12_sub_pixel_variance16x8,
                 vp9_highbd_12_sub_pixel_avg_variance16x8,
                 vpx_highbd_sad16x8x3_bits12,

@@ -1342,7 +1342,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_8X16,
                 vpx_highbd_sad8x16_bits12,
                 vpx_highbd_sad8x16_avg_bits12,
-                vp9_highbd_12_variance8x16,
+                vpx_highbd_12_variance8x16,
                 vp9_highbd_12_sub_pixel_variance8x16,
                 vp9_highbd_12_sub_pixel_avg_variance8x16,
                 vpx_highbd_sad8x16x3_bits12,

@@ -1352,7 +1352,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_8X8,
                 vpx_highbd_sad8x8_bits12,
                 vpx_highbd_sad8x8_avg_bits12,
-                vp9_highbd_12_variance8x8,
+                vpx_highbd_12_variance8x8,
                 vp9_highbd_12_sub_pixel_variance8x8,
                 vp9_highbd_12_sub_pixel_avg_variance8x8,
                 vpx_highbd_sad8x8x3_bits12,

@@ -1362,7 +1362,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_8X4,
                 vpx_highbd_sad8x4_bits12,
                 vpx_highbd_sad8x4_avg_bits12,
-                vp9_highbd_12_variance8x4,
+                vpx_highbd_12_variance8x4,
                 vp9_highbd_12_sub_pixel_variance8x4,
                 vp9_highbd_12_sub_pixel_avg_variance8x4,
                 NULL,

@@ -1372,7 +1372,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_4X8,
                 vpx_highbd_sad4x8_bits12,
                 vpx_highbd_sad4x8_avg_bits12,
-                vp9_highbd_12_variance4x8,
+                vpx_highbd_12_variance4x8,
                 vp9_highbd_12_sub_pixel_variance4x8,
                 vp9_highbd_12_sub_pixel_avg_variance4x8,
                 NULL,

@@ -1382,7 +1382,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
      HIGHBD_BFP(BLOCK_4X4,
                 vpx_highbd_sad4x4_bits12,
                 vpx_highbd_sad4x4_avg_bits12,
-                vp9_highbd_12_variance4x4,
+                vpx_highbd_12_variance4x4,
                 vp9_highbd_12_sub_pixel_variance4x4,
                 vp9_highbd_12_sub_pixel_avg_variance4x4,
                 vpx_highbd_sad4x4x3_bits12,

@@ -1805,61 +1805,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
    cpi->fn_ptr[BT].sdx4df = SDX4DF;

  BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
-      vp9_variance32x16, vp9_sub_pixel_variance32x16,
+      vpx_variance32x16, vp9_sub_pixel_variance32x16,
      vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)

  BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
-      vp9_variance16x32, vp9_sub_pixel_variance16x32,
+      vpx_variance16x32, vp9_sub_pixel_variance16x32,
      vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)

  BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
-      vp9_variance64x32, vp9_sub_pixel_variance64x32,
+      vpx_variance64x32, vp9_sub_pixel_variance64x32,
      vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)

  BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
-      vp9_variance32x64, vp9_sub_pixel_variance32x64,
+      vpx_variance32x64, vp9_sub_pixel_variance32x64,
      vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)

  BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
-      vp9_variance32x32, vp9_sub_pixel_variance32x32,
+      vpx_variance32x32, vp9_sub_pixel_variance32x32,
      vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
      vpx_sad32x32x4d)

  BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
-      vp9_variance64x64, vp9_sub_pixel_variance64x64,
+      vpx_variance64x64, vp9_sub_pixel_variance64x64,
      vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
      vpx_sad64x64x4d)

  BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
-      vp9_variance16x16, vp9_sub_pixel_variance16x16,
+      vpx_variance16x16, vp9_sub_pixel_variance16x16,
      vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
      vpx_sad16x16x4d)

  BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
-      vp9_variance16x8, vp9_sub_pixel_variance16x8,
+      vpx_variance16x8, vp9_sub_pixel_variance16x8,
      vp9_sub_pixel_avg_variance16x8,
      vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)

  BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
-      vp9_variance8x16, vp9_sub_pixel_variance8x16,
+      vpx_variance8x16, vp9_sub_pixel_variance8x16,
      vp9_sub_pixel_avg_variance8x16,
      vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)

  BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
-      vp9_variance8x8, vp9_sub_pixel_variance8x8,
+      vpx_variance8x8, vp9_sub_pixel_variance8x8,
      vp9_sub_pixel_avg_variance8x8,
      vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)

  BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
-      vp9_variance8x4, vp9_sub_pixel_variance8x4,
+      vpx_variance8x4, vp9_sub_pixel_variance8x4,
      vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)

  BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
-      vp9_variance4x8, vp9_sub_pixel_variance4x8,
+      vpx_variance4x8, vp9_sub_pixel_variance4x8,
      vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)

  BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
-      vp9_variance4x4, vp9_sub_pixel_variance4x4,
+      vpx_variance4x4, vp9_sub_pixel_variance4x4,
      vp9_sub_pixel_avg_variance4x4,
      vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
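The BFP/HIGHBD_BFP invocations above fill the per-block-size function table that motion search consults; the one context line the diff shows, `cpi->fn_ptr[BT].sdx4df = SDX4DF;`, indicates the macro is plain field assignment into vp9_variance_fn_ptr_t (the vtable appears in vp9_variance.h further down). A sketch of the full macro under that assumption — field names taken from the vtable, not from this hunk:

/* Assumed shape of BFP, extrapolated from the visible sdx4df assignment. */
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
  cpi->fn_ptr[BT].sdf = SDF;       /* full-pel SAD */             \
  cpi->fn_ptr[BT].sdaf = SDAF;     /* SAD against averaged pred */ \
  cpi->fn_ptr[BT].vf = VF;         /* variance (now vpx_) */       \
  cpi->fn_ptr[BT].svf = SVF;       /* sub-pixel variance */        \
  cpi->fn_ptr[BT].svaf = SVAF;     /* sub-pixel avg variance */    \
  cpi->fn_ptr[BT].sdx3f = SDX3F;   /* SAD at 3 offsets */          \
  cpi->fn_ptr[BT].sdx8f = SDX8F;   /* SAD at 8 offsets */          \
  cpi->fn_ptr[BT].sdx4df = SDX4DF; /* SAD against 4 refs */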

@@ -2079,7 +2079,7 @@ static int64_t get_sse(const uint8_t *a, int a_stride,
    const uint8_t *pa = a;
    const uint8_t *pb = b;
    for (x = 0; x < width / 16; ++x) {
-      vp9_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
      total_sse += sse;

      pa += 16;

@@ -2124,21 +2124,21 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
  unsigned int sse = 0;
  int sum = 0;
  if (dw > 0) {
-    highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
-                    dw, height, &sse, &sum);
+    highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+                      dw, height, &sse, &sum);
    total_sse += sse;
  }
  if (dh > 0) {
-    highbd_variance(&a[(height - dh) * a_stride], a_stride,
-                    &b[(height - dh) * b_stride], b_stride,
-                    width - dw, dh, &sse, &sum);
+    highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
+                      &b[(height - dh) * b_stride], b_stride,
+                      width - dw, dh, &sse, &sum);
    total_sse += sse;
  }
  for (y = 0; y < height / 16; ++y) {
    const uint8_t *pa = a;
    const uint8_t *pb = b;
    for (x = 0; x < width / 16; ++x) {
-      vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
      total_sse += sse;
      pa += 16;
      pb += 16;

@@ -12,6 +12,7 @@
#include <math.h>
#include <stdio.h>

+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"

#include "vpx_mem/vpx_mem.h"

@@ -267,13 +268,13 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
  switch (bsize) {
    case BLOCK_8X8:
-      return vp9_mse8x8;
+      return vpx_mse8x8;
    case BLOCK_16X8:
-      return vp9_mse16x8;
+      return vpx_mse16x8;
    case BLOCK_8X16:
-      return vp9_mse8x16;
+      return vpx_mse8x16;
    default:
-      return vp9_mse16x16;
+      return vpx_mse16x16;
  }
}

@@ -293,37 +294,37 @@ static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
    default:
      switch (bsize) {
        case BLOCK_8X8:
-          return vp9_highbd_mse8x8;
+          return vpx_highbd_8_mse8x8;
        case BLOCK_16X8:
-          return vp9_highbd_mse16x8;
+          return vpx_highbd_8_mse16x8;
        case BLOCK_8X16:
-          return vp9_highbd_mse8x16;
+          return vpx_highbd_8_mse8x16;
        default:
-          return vp9_highbd_mse16x16;
+          return vpx_highbd_8_mse16x16;
      }
      break;
    case 10:
      switch (bsize) {
        case BLOCK_8X8:
-          return vp9_highbd_10_mse8x8;
+          return vpx_highbd_10_mse8x8;
        case BLOCK_16X8:
-          return vp9_highbd_10_mse16x8;
+          return vpx_highbd_10_mse16x8;
        case BLOCK_8X16:
-          return vp9_highbd_10_mse8x16;
+          return vpx_highbd_10_mse8x16;
        default:
-          return vp9_highbd_10_mse16x16;
+          return vpx_highbd_10_mse16x16;
      }
      break;
    case 12:
      switch (bsize) {
        case BLOCK_8X8:
-          return vp9_highbd_12_mse8x8;
+          return vpx_highbd_12_mse8x8;
        case BLOCK_16X8:
-          return vp9_highbd_12_mse16x8;
+          return vpx_highbd_12_mse16x8;
        case BLOCK_8X16:
-          return vp9_highbd_12_mse8x16;
+          return vpx_highbd_12_mse8x16;
        default:
-          return vp9_highbd_12_mse16x16;
+          return vpx_highbd_12_mse16x16;
      }
      break;
  }

@@ -634,7 +635,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
    xd->mi[0]->mbmi.tx_size = use_dc_pred ?
        (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
    vp9_encode_intra_block_plane(x, bsize, 0);
-    this_error = vp9_get_mb_ss(x->plane[0].src_diff);
+    this_error = vpx_get_mb_ss(x->plane[0].src_diff);
#if CONFIG_VP9_HIGHBITDEPTH
    if (cm->use_highbitdepth) {
      switch (cm->bit_depth) {

@@ -13,6 +13,7 @@
#include <stdio.h>

#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"

#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"

@@ -303,13 +304,13 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
  if (second_pred != NULL) {
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
-      vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+      vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
                               y_stride);
      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
                        sse1);
    } else {
      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
-      vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+      vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
    }
  } else {

@@ -321,7 +322,7 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
  (void) xd;
  if (second_pred != NULL) {
    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
-    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+    vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
  } else {
    besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);

@@ -14,6 +14,7 @@
#include <stdio.h>

#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"

#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"

@@ -215,7 +216,7 @@ static void block_variance(const uint8_t *src, int src_stride,

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
-      vp9_get8x8var(src + src_stride * i + j, src_stride,
+      vpx_get8x8var(src + src_stride * i + j, src_stride,
                    ref + ref_stride * i + j, ref_stride,
                    &sse8x8[k], &sum8x8[k]);
      *sse += sse8x8[k];

@@ -9,6 +9,7 @@
 */

#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"

@@ -18,26 +19,6 @@

#include "vp9/encoder/vp9_variance.h"

-void variance(const uint8_t *a, int a_stride,
-              const uint8_t *b, int b_stride,
-              int w, int h, unsigned int *sse, int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// first-pass of 2-D separable filter.

@@ -100,25 +81,6 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
  }
}

-unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
-  unsigned int i, sum = 0;
-
-  for (i = 0; i < 256; ++i) {
-    sum += src_ptr[i] * src_ptr[i];
-  }
-
-  return sum;
-}
-
-#define VAR(W, H) \
-unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                       const uint8_t *b, int b_stride, \
-                                       unsigned int *sse) { \
-  int sum; \
-  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-}
-
#define SUBPIX_VAR(W, H) \
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
  const uint8_t *src, int src_stride, \

@@ -133,7 +95,7 @@ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                     BILINEAR_FILTERS_2TAP(yoffset)); \
\
-  return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
+  return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
}
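SUBPIX_VAR builds its sub-pixel estimate with a separable 2-tap bilinear interpolator: the first pass filters horizontally by xoffset into an intermediate buffer, the second pass filters that vertically by yoffset, and the result is compared against dst with the ordinary full-pel variance (now the vpx_ version). A hedged scalar sketch of the per-tap arithmetic — the real filter tables and FILTER_BITS live elsewhere in libvpx, so the constants here are assumptions for illustration:

#include <stdint.h>

/* Illustrative 2-tap bilinear step: out = (a*f0 + b*f1 + round) >> 7,
 * assuming f0 + f1 == 128 so an offset of 0 leaves the pixel unchanged. */
enum { FILTER_BITS_SKETCH = 7 };

static uint8_t bilinear_tap(uint8_t a, uint8_t b, int f0, int f1) {
  const int sum = a * f0 + b * f1;
  return (uint8_t)((sum + (1 << (FILTER_BITS_SKETCH - 1)))
                   >> FILTER_BITS_SKETCH);
}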

#define SUBPIX_AVG_VAR(W, H) \

@@ -152,178 +114,51 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                     BILINEAR_FILTERS_2TAP(yoffset)); \
\
-  vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+  vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
-  return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
+  return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
}

-void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
-                       const uint8_t *ref_ptr, int ref_stride,
-                       unsigned int *sse, int *sum) {
-  variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
-}
-
-void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
-                     const uint8_t *ref_ptr, int ref_stride,
-                     unsigned int *sse, int *sum) {
-  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
-}
-
-unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,
-                            const uint8_t *ref, int ref_stride,
-                            unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride,
-                          unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum);
-  return *sse;
-}
-
-VAR(4, 4)
SUBPIX_VAR(4, 4)
SUBPIX_AVG_VAR(4, 4)

-VAR(4, 8)
SUBPIX_VAR(4, 8)
SUBPIX_AVG_VAR(4, 8)

-VAR(8, 4)
SUBPIX_VAR(8, 4)
SUBPIX_AVG_VAR(8, 4)

-VAR(8, 8)
SUBPIX_VAR(8, 8)
SUBPIX_AVG_VAR(8, 8)

-VAR(8, 16)
SUBPIX_VAR(8, 16)
SUBPIX_AVG_VAR(8, 16)

-VAR(16, 8)
SUBPIX_VAR(16, 8)
SUBPIX_AVG_VAR(16, 8)

-VAR(16, 16)
SUBPIX_VAR(16, 16)
SUBPIX_AVG_VAR(16, 16)

-VAR(16, 32)
SUBPIX_VAR(16, 32)
SUBPIX_AVG_VAR(16, 32)

-VAR(32, 16)
SUBPIX_VAR(32, 16)
SUBPIX_AVG_VAR(32, 16)

-VAR(32, 32)
SUBPIX_VAR(32, 32)
SUBPIX_AVG_VAR(32, 32)

-VAR(32, 64)
SUBPIX_VAR(32, 64)
SUBPIX_AVG_VAR(32, 64)

-VAR(64, 32)
SUBPIX_VAR(64, 32)
SUBPIX_AVG_VAR(64, 32)

-VAR(64, 64)
SUBPIX_VAR(64, 64)
SUBPIX_AVG_VAR(64, 64)

-void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                       int height, const uint8_t *ref, int ref_stride) {
-  int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
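The removed vp9_comp_avg_pred (re-homed as vpx_comp_avg_pred) blends two predictors for compound prediction with round-to-nearest averaging: ROUND_POWER_OF_TWO(a + b, 1) is (a + b + 1) >> 1. A quick, illustrative self-check of that identity over the full 8-bit range:

#include <assert.h>

/* Not part of the patch: verifies (a + b + 1) >> 1 rounds halves up. */
int main(void) {
  int a, b;
  for (a = 0; a < 256; ++a)
    for (b = 0; b < 256; ++b)
      assert(((a + b + 1) >> 1) == (a + b) / 2 + ((a + b) & 1));
  return 0;
}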

#if CONFIG_VP9_HIGHBITDEPTH
-void highbd_variance64(const uint8_t *a8, int a_stride,
-                       const uint8_t *b8, int b_stride,
-                       int w, int h, uint64_t *sse,
-                       uint64_t *sum) {
-  int i, j;
-
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-void highbd_variance(const uint8_t *a8, int a_stride,
-                     const uint8_t *b8, int b_stride,
-                     int w, int h, unsigned int *sse,
-                     int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
-}
-
-void highbd_10_variance(const uint8_t *a8, int a_stride,
-                        const uint8_t *b8, int b_stride,
-                        int w, int h, unsigned int *sse,
-                        int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-void highbd_12_variance(const uint8_t *a8, int a_stride,
-                        const uint8_t *b8, int b_stride,
-                        int w, int h, unsigned int *sse,
-                        int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
-}
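The 10- and 12-bit variants above normalize back to 8-bit scale before reporting: a pixel difference at bit depth bd can be 2^(bd-8) times larger than at 8 bits, so the sum is shifted down by bd-8 bits and the squared error by 2*(bd-8) bits (2/4 for 10-bit, 4/8 for 12-bit), with round-to-nearest. ROUND_POWER_OF_TWO comes from vpx_ports/mem.h; to the best of my reading it is the usual rounding shift, roughly:

/* Shown for reference only; see vpx_ports/mem.h for the authoritative
 * definition. Adds half of 2^n before shifting so the result rounds
 * to nearest instead of truncating. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))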

static void highbd_var_filter_block2d_bil_first_pass(
    const uint8_t *src_ptr8,
    uint16_t *output_ptr,

@@ -374,35 +209,6 @@ static void highbd_var_filter_block2d_bil_second_pass(
  }
}

-#define HIGHBD_VAR(W, H) \
-unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                              const uint8_t *b, int b_stride, \
-                                              unsigned int *sse) { \
-  int sum; \
-  highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-} \
-\
-unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
-                                                 int a_stride, \
-                                                 const uint8_t *b, \
-                                                 int b_stride, \
-                                                 unsigned int *sse) { \
-  int sum; \
-  highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-} \
-\
-unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
-                                                 int a_stride, \
-                                                 const uint8_t *b, \
-                                                 int b_stride, \
-                                                 unsigned int *sse) { \
-  int sum; \
-  highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-}

#define HIGHBD_SUBPIX_VAR(W, H) \
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
  const uint8_t *src, int src_stride, \

@@ -417,7 +223,7 @@ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            BILINEAR_FILTERS_2TAP(yoffset)); \
\
-  return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
                                            dst_stride, sse); \
} \
\

@@ -434,7 +240,7 @@ unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            BILINEAR_FILTERS_2TAP(yoffset)); \
\
-  return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                             W, dst, dst_stride, sse); \
} \
\

@@ -451,7 +257,7 @@ unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            BILINEAR_FILTERS_2TAP(yoffset)); \
\
-  return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                             W, dst, dst_stride, sse); \
}

@@ -471,10 +277,10 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            BILINEAR_FILTERS_2TAP(yoffset)); \
\
-  vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                           CONVERT_TO_BYTEPTR(temp2), W); \
\
-  return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                            dst_stride, sse); \
} \
\

@@ -493,10 +299,10 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            BILINEAR_FILTERS_2TAP(yoffset)); \
\
-  vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                           CONVERT_TO_BYTEPTR(temp2), W); \
\
-  return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                             W, dst, dst_stride, sse); \
} \
\

@@ -515,137 +321,49 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                            BILINEAR_FILTERS_2TAP(yoffset)); \
\
-  vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                           CONVERT_TO_BYTEPTR(temp2), W); \
\
-  return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                             W, dst, dst_stride, sse); \
}

-#define HIGHBD_GET_VAR(S) \
-void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                    const uint8_t *ref, int ref_stride, \
-                                    unsigned int *sse, int *sum) { \
-  highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-} \
-\
-void vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                       const uint8_t *ref, int ref_stride, \
-                                       unsigned int *sse, int *sum) { \
-  highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-} \
-\
-void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                       const uint8_t *ref, int ref_stride, \
-                                       unsigned int *sse, int *sum) { \
-  highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-}

-#define HIGHBD_MSE(W, H) \
-unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \
-                                         int src_stride, \
-                                         const uint8_t *ref, \
-                                         int ref_stride, \
-                                         unsigned int *sse) { \
-  int sum; \
-  highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
-  return *sse; \
-} \
-\
-unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
-                                            int src_stride, \
-                                            const uint8_t *ref, \
-                                            int ref_stride, \
-                                            unsigned int *sse) { \
-  int sum; \
-  highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
-  return *sse; \
-} \
-\
-unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
-                                            int src_stride, \
-                                            const uint8_t *ref, \
-                                            int ref_stride, \
-                                            unsigned int *sse) { \
-  int sum; \
-  highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
-  return *sse; \
-}

-HIGHBD_GET_VAR(8)
-HIGHBD_GET_VAR(16)

-HIGHBD_MSE(16, 16)
-HIGHBD_MSE(16, 8)
-HIGHBD_MSE(8, 16)
-HIGHBD_MSE(8, 8)

-HIGHBD_VAR(4, 4)
HIGHBD_SUBPIX_VAR(4, 4)
HIGHBD_SUBPIX_AVG_VAR(4, 4)

-HIGHBD_VAR(4, 8)
HIGHBD_SUBPIX_VAR(4, 8)
HIGHBD_SUBPIX_AVG_VAR(4, 8)

-HIGHBD_VAR(8, 4)
HIGHBD_SUBPIX_VAR(8, 4)
HIGHBD_SUBPIX_AVG_VAR(8, 4)

-HIGHBD_VAR(8, 8)
HIGHBD_SUBPIX_VAR(8, 8)
HIGHBD_SUBPIX_AVG_VAR(8, 8)

-HIGHBD_VAR(8, 16)
HIGHBD_SUBPIX_VAR(8, 16)
HIGHBD_SUBPIX_AVG_VAR(8, 16)

-HIGHBD_VAR(16, 8)
HIGHBD_SUBPIX_VAR(16, 8)
HIGHBD_SUBPIX_AVG_VAR(16, 8)

-HIGHBD_VAR(16, 16)
HIGHBD_SUBPIX_VAR(16, 16)
HIGHBD_SUBPIX_AVG_VAR(16, 16)

-HIGHBD_VAR(16, 32)
HIGHBD_SUBPIX_VAR(16, 32)
HIGHBD_SUBPIX_AVG_VAR(16, 32)

-HIGHBD_VAR(32, 16)
HIGHBD_SUBPIX_VAR(32, 16)
HIGHBD_SUBPIX_AVG_VAR(32, 16)

-HIGHBD_VAR(32, 32)
HIGHBD_SUBPIX_VAR(32, 32)
HIGHBD_SUBPIX_AVG_VAR(32, 32)

-HIGHBD_VAR(32, 64)
HIGHBD_SUBPIX_VAR(32, 64)
HIGHBD_SUBPIX_AVG_VAR(32, 64)

-HIGHBD_VAR(64, 32)
HIGHBD_SUBPIX_VAR(64, 32)
HIGHBD_SUBPIX_AVG_VAR(64, 32)

-HIGHBD_VAR(64, 64)
HIGHBD_SUBPIX_VAR(64, 64)
HIGHBD_SUBPIX_AVG_VAR(64, 64)

-void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
-                              int width, int height, const uint8_t *ref8,
-                              int ref_stride) {
-  int i, j;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
#endif  // CONFIG_VP9_HIGHBITDEPTH

@@ -12,31 +12,64 @@
#define VP9_ENCODER_VP9_VARIANCE_H_

#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

#ifdef __cplusplus
extern "C" {
#endif

-void variance(const uint8_t *a, int a_stride,
-              const uint8_t *b, int b_stride,
-              int w, int h,
-              unsigned int *sse, int *sum);
+// TODO(johannkoenig): All functions which depend on
+// [highbd_][8|10|12_]variance should be refactored or moved to vpx_dsp.
+static void variance(const uint8_t *a, int a_stride,
+                     const uint8_t *b, int b_stride,
+                     int w, int h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}

#if CONFIG_VP9_HIGHBITDEPTH
-void highbd_variance(const uint8_t *a8, int a_stride,
-                     const uint8_t *b8, int b_stride,
-                     int w, int h,
-                     unsigned int *sse, int *sum);
-
-void highbd_10_variance(const uint8_t *a8, int a_stride,
-                        const uint8_t *b8, int b_stride,
-                        int w, int h,
-                        unsigned int *sse, int *sum);
-
-void highbd_12_variance(const uint8_t *a8, int a_stride,
-                        const uint8_t *b8, int b_stride,
-                        int w, int h,
-                        unsigned int *sse, int *sum);
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride,
+                              int w, int h, uint64_t *sse, uint64_t *sum) {
+  int i, j;
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride,
+                              int w, int h, unsigned int *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (unsigned int)sse_long;
+  *sum = (int)sum_long;
+}
#endif

typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,

@@ -95,15 +128,6 @@ typedef struct vp9_variance_vtable {
  vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;

-void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                       int height, const uint8_t *ref, int ref_stride);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred,
-                              int width, int height,
-                              const uint8_t *ref, int ref_stride);
-#endif
-
#ifdef __cplusplus
}  // extern "C"
#endif
|
@ -13,237 +13,6 @@
|
|||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
uint32_t *sse, int *sum);
|
||||
|
||||
uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
uint32_t *sse, int *sum);
|
||||
|
||||
uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
uint32_t *sse, int *sum);
|
||||
|
||||
static void highbd_variance_sse2(const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
int w, int h, uint32_t *sse, int *sum,
|
||||
high_variance_fn_t var_fn, int block_size) {
|
||||
int i, j;
|
||||
|
||||
*sse = 0;
|
||||
*sum = 0;
|
||||
|
||||
for (i = 0; i < h; i += block_size) {
|
||||
for (j = 0; j < w; j += block_size) {
|
||||
unsigned int sse0;
|
||||
int sum0;
|
||||
var_fn(src + src_stride * i + j, src_stride,
|
||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
||||
*sse += sse0;
|
||||
*sum += sum0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
int w, int h, uint32_t *sse, int *sum,
|
||||
high_variance_fn_t var_fn, int block_size) {
|
||||
int i, j;
|
||||
uint64_t sse_long = 0;
|
||||
int64_t sum_long = 0;
|
||||
|
||||
for (i = 0; i < h; i += block_size) {
|
||||
for (j = 0; j < w; j += block_size) {
|
||||
unsigned int sse0;
|
||||
int sum0;
|
||||
var_fn(src + src_stride * i + j, src_stride,
|
||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
||||
sse_long += sse0;
|
||||
sum_long += sum0;
|
||||
}
|
||||
}
|
||||
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
|
||||
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
|
||||
}
|
||||
|
||||
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
int w, int h, uint32_t *sse, int *sum,
|
||||
high_variance_fn_t var_fn, int block_size) {
|
||||
int i, j;
|
||||
uint64_t sse_long = 0;
|
||||
int64_t sum_long = 0;
|
||||
|
||||
for (i = 0; i < h; i += block_size) {
|
||||
for (j = 0; j < w; j += block_size) {
|
||||
unsigned int sse0;
|
||||
int sum0;
|
||||
var_fn(src + src_stride * i + j, src_stride,
|
||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
||||
sse_long += sse0;
|
||||
sum_long += sum0;
|
||||
}
|
||||
}
|
||||
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
|
||||
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
|
||||
}

#define HIGH_GET_VAR(S) \
void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                       const uint8_t *ref8, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
} \
\
void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
}

HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR

#define VAR_FN(w, h, block_size, shift) \
uint32_t vp9_highbd_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
                       vp9_highbd_calc##block_size##x##block_size##var_sse2, \
                       block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_10_variance_sse2( \
      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
      vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_12_variance_sse2( \
      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
      vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
}

VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);

#undef VAR_FN
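
Each VAR_FN instantiation passes shift = log2(w * h), so the macro's return line computes the textbook block variance, sse - sum^2 / N, entirely in integers. A small reference sketch of the same identity (the name block_variance is illustrative only):

/* Sketch: the scalar identity behind VAR_FN's return statement.
 * For an N-pixel block, variance = SSE - (sum of diffs)^2 / N, and
 * N = w * h is a power of two, so the divide reduces to a shift. */
static uint32_t block_variance(uint32_t sse, int sum, int w, int h) {
  int shift = 0;
  while ((1 << shift) < w * h) ++shift;  /* shift = log2(w * h) */
  return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
}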

unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                      const uint8_t *ref8, int ref_stride,
                                      unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                       sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                    const uint8_t *ref8, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                       sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
  return *sse;
}

#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
                                               ptrdiff_t src_stride, \
@@ -13,18 +13,6 @@
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse, int *sum);
|
||||
|
||||
void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse, int *sum);
|
||||
|
||||
void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse, int *sum);
|
||||
|
||||
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
|
||||
int x_offset, int y_offset,
|
||||
const uint8_t *dst, int dst_stride,
|
||||
|
@ -42,81 +30,6 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
|
|||
int height,
|
||||
unsigned int *sseptr);
|
||||
|
||||
static void variance_avx2(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
int w, int h, unsigned int *sse, int *sum,
|
||||
get_var_avx2 var_fn, int block_size) {
|
||||
int i, j;
|
||||
|
||||
*sse = 0;
|
||||
*sum = 0;
|
||||
|
||||
for (i = 0; i < h; i += 16) {
|
||||
for (j = 0; j < w; j += block_size) {
|
||||
unsigned int sse0;
|
||||
int sum0;
|
||||
var_fn(&src[src_stride * i + j], src_stride,
|
||||
&ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
|
||||
*sse += sse0;
|
||||
*sum += sum0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse) {
|
||||
int sum;
|
||||
variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
|
||||
sse, &sum, vp9_get16x16var_avx2, 16);
|
||||
return *sse - (((unsigned int)sum * sum) >> 8);
|
||||
}
|
||||
|
||||
unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse) {
|
||||
int sum;
|
||||
vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
|
||||
return *sse;
|
||||
}
|
||||
|
||||
unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse) {
|
||||
int sum;
|
||||
variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
|
||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
||||
return *sse - (((int64_t)sum * sum) >> 9);
|
||||
}
|
||||
|
||||
unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse) {
|
||||
int sum;
|
||||
variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
|
||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
||||
return *sse - (((int64_t)sum * sum) >> 10);
|
||||
}
|
||||
|
||||
unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse) {
|
||||
int sum;
|
||||
variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
|
||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
||||
return *sse - (((int64_t)sum * sum) >> 12);
|
||||
}
|
||||
|
||||
unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse) {
|
||||
int sum;
|
||||
variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
|
||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
||||
return *sse - (((int64_t)sum * sum) >> 11);
|
||||
}
|
||||
|
||||
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
|
||||
int src_stride,
|
||||
int x_offset,
|
||||
|
|
|
@@ -16,299 +16,6 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"

typedef void (*variance_fn_t)(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              unsigned int *sse, int *sum);

unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}
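
The two _mm_srli_si128 steps above are a horizontal add of the four 32-bit lanes left by _mm_madd_epi16. A sketch of that reduction written out on its own, assuming <emmintrin.h> (the helper name hadd_epi32 is illustrative, not part of the patch):

/* Sketch: the horizontal reduction done by the two byte-shift adds.
 * The register holds four 32-bit partial sums {s0, s1, s2, s3}; shifting
 * right by 8 bytes and adding yields {s0+s2, s1+s3, ...}; shifting by 4
 * more leaves s0+s1+s2+s3 in lane 0. */
static int hadd_epi32(__m128i v) {
  v = _mm_add_epi32(v, _mm_srli_si128(v, 8));  /* fold upper half onto lower */
  v = _mm_add_epi32(v, _mm_srli_si128(v, 4));  /* fold the remaining pair */
  return _mm_cvtsi128_si32(v);                 /* extract lane 0 */
}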

#define READ64(p, stride, i) \
  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
                    _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))

static void get4x4var_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
                       _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);
}

void vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride,
                        unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

void vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
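
One subtlety worth a note here (an editorial aside, not part of the patch):

/* Editorial note: vp9_get16x16var_sse2 stops folding vsum one step earlier
 * than the 8x8 version and adds lanes 0 and 1 as separately sign-extended
 * int16 values. Each of those two lanes accumulates 128 differences in
 * [-255, 255] (at most +/-32640, which fits int16), while the full
 * 256-difference total can reach +/-65280 and would not. */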

static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 4);
}

unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 6);
}

unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 8);
}

unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

// The two unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
@@ -102,13 +102,11 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h

VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif
@@ -0,0 +1,363 @@
;
;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vpx_variance16x16_media|
    EXPORT  |vpx_variance8x8_media|
    EXPORT  |vpx_mse16x16_media|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vpx_variance16x16_media| PROC

    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; initialize sum = 0
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)

loop16x16
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load 4 src pixels
    ldr     r5, [r2, #0]        ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r6, r4, r5          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    adds    r8, r8, r4          ; add positive differences to sum
    subs    r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load 4 src pixels
    ldr     r5, [r2, #4]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load 4 src pixels
    ldr     r5, [r2, #8]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load 4 src pixels
    ldr     r5, [r2, #12]       ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set ref_ptr to next row
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)


    subs    r12, r12, #1

    bne     loop16x16

    ; return stuff
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

    END


; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vpx_variance8x8_media| PROC

    push    {r4-r10, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #8             ; set loop counter to 8 (=block height)
    mov     r4, #0              ; initialize sum = 0
    mov     r5, #0              ; initialize sse = 0

loop8x8
    ; 1st 4 pixels
    ldr     r6, [r0, #0x0]      ; load 4 src pixels
    ldr     r7, [r2, #0x0]      ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r8, r6, r7          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; differences of all 4 pixels
    ; calculate total sum
    add     r4, r4, r6          ; add positive differences to sum
    sub     r4, r4, r7          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; byte (two pixels) to halfwords
    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r6, [r0, #0x4]      ; load 4 src pixels
    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)

    usub8   r8, r6, r7          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set ref_ptr to next row
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; differences of all 4 pixels

    ; calculate total sum
    add     r4, r4, r6          ; add positive differences to sum
    sub     r4, r4, r7          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; byte (two pixels) to halfwords
    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
    subs    r12, r12, #1        ; next row
    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)

    bne     loop8x8

    ; return stuff
    ldr     r8, [sp, #32]       ; get address of sse
    mul     r1, r4, r4          ; sum * sum
    str     r5, [r8]            ; store sse
    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))

    pop     {r4-r10, pc}

    ENDP

    END

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
;
; note: Based on vpx_variance16x16_media. In this function the sum is never
; used, so that part of the calculation has been removed.

|vpx_mse16x16_media| PROC

    push    {r4-r9, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     r4, #0              ; initialize sse = 0

loopmse
    ; 1st 4 pixels
    ldr     r5, [r0, #0x0]      ; load 4 src pixels
    ldr     r6, [r2, #0x0]      ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r8, r5, r6          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels

    ldr     r5, [r0, #0x4]      ; load 4 src pixels

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels
    ldr     r5, [r0, #0x8]      ; load 4 src pixels
    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels

    ldr     r5, [r0, #0xc]      ; load 4 src pixels

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set ref_ptr to next row
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels

    subs    r12, r12, #1        ; next row

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    bne     loopmse

    ; return stuff
    ldr     r1, [sp, #28]       ; get address of sse
    mov     r0, r4              ; return sse
    str     r4, [r1]            ; store sse

    pop     {r4-r9, pc}

    ENDP

    END
@@ -0,0 +1,417 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
  const int32x4_t a = vpaddlq_s16(v_16x8);
  const int64x2_t b = vpaddlq_s32(a);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
}

static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
  const int64x2_t b = vpaddlq_s32(v_32x4);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
}

// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, uint32_t *sse, int *sum) {
  int i, j;
  int16x8_t v_sum = vdupq_n_s16(0);
  int32x4_t v_sse_lo = vdupq_n_s32(0);
  int32x4_t v_sse_hi = vdupq_n_s32(0);

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; j += 8) {
      const uint8x8_t v_a = vld1_u8(&a[j]);
      const uint8x8_t v_b = vld1_u8(&b[j]);
      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
      v_sum = vaddq_s16(v_sum, sv_diff);
      v_sse_lo = vmlal_s16(v_sse_lo,
                           vget_low_s16(sv_diff),
                           vget_low_s16(sv_diff));
      v_sse_hi = vmlal_s16(v_sse_hi,
                           vget_high_s16(sv_diff),
                           vget_high_s16(sv_diff));
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = horizontal_add_s16x8(v_sum);
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}
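
The 2048-pixel limit in the comment above comes from the int16 lanes of v_sum: each of the eight lanes accumulates (w * h) / 8 signed byte differences, so the worst case per lane is 255 * (w * h) / 8, which must stay within INT16_MAX. A quick sketch of the bound, illustrative only (the helper name is hypothetical):

/* Sketch: the overflow bound behind variance_neon_w8's w * h < 2048 rule.
 * Each of the 8 int16 lanes of v_sum accumulates (w * h) / 8 differences,
 * each in [-255, 255]; the magnitude must fit in int16. */
#include <assert.h>
static void check_variance_neon_w8_bound(int w, int h) {
  assert(255 * ((w * h) / 8) <= 32767);  /* holds for w * h <= 1024 */
}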

void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
                        const uint8_t *b, int b_stride,
                        unsigned int *sse, int *sum) {
  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
}

void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
                          const uint8_t *b, int b_stride,
                          unsigned int *sse, int *sum) {
  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
}

unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride,
                                  unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
  return *sse - (((int64_t)sum * sum) >> 6);  // >> 6 = / (8 * 8)
}

unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
  return *sse - (((int64_t)sum * sum) >> 8);  // >> 8 = / (16 * 16)
}

unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / (32 * 32)
}

unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum1, sum2;
  uint32_t sse1, sse2;
  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
  variance_neon_w8(a + (32 * a_stride), a_stride,
                   b + (32 * b_stride), b_stride, 32, 32,
                   &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (32 * 64)
}

unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum1, sum2;
  uint32_t sse1, sse2;
  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
  variance_neon_w8(a + (16 * a_stride), a_stride,
                   b + (16 * b_stride), b_stride, 64, 16,
                   &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (64 * 32)
}

unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum1, sum2;
  uint32_t sse1, sse2;

  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
  variance_neon_w8(a + (16 * a_stride), a_stride,
                   b + (16 * b_stride), b_stride, 64, 16,
                   &sse2, &sum2);
  sse1 += sse2;
  sum1 += sum2;

  variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
                   b + (16 * 2 * b_stride), b_stride,
                   64, 16, &sse2, &sum2);
  sse1 += sse2;
  sum1 += sum2;

  variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
                   b + (16 * 3 * b_stride), b_stride,
                   64, 16, &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / (64 * 64)
}

unsigned int vpx_variance16x8_neon(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse) {
  int i;
  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
  uint32x2_t d0u32, d10u32;
  int64x1_t d0s64, d1s64;
  uint8x16_t q0u8, q1u8, q2u8, q3u8;
  uint16x8_t q11u16, q12u16, q13u16, q14u16;
  int32x4_t q8s32, q9s32, q10s32;
  int64x2_t q0s64, q1s64, q5s64;

  q8s32 = vdupq_n_s32(0);
  q9s32 = vdupq_n_s32(0);
  q10s32 = vdupq_n_s32(0);

  for (i = 0; i < 4; i++) {
    q0u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    q1u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    __builtin_prefetch(src_ptr);

    q2u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    q3u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    __builtin_prefetch(ref_ptr);

    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
  }

  q10s32 = vaddq_s32(q10s32, q9s32);
  q0s64 = vpaddlq_s32(q8s32);
  q1s64 = vpaddlq_s32(q10s32);

  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                    vreinterpret_s32_s64(d0s64));
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

  return vget_lane_u32(d0u32, 0);
}

unsigned int vpx_variance8x16_neon(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse) {
  int i;
  uint8x8_t d0u8, d2u8, d4u8, d6u8;
  int16x4_t d22s16, d23s16, d24s16, d25s16;
  uint32x2_t d0u32, d10u32;
  int64x1_t d0s64, d1s64;
  uint16x8_t q11u16, q12u16;
  int32x4_t q8s32, q9s32, q10s32;
  int64x2_t q0s64, q1s64, q5s64;

  q8s32 = vdupq_n_s32(0);
  q9s32 = vdupq_n_s32(0);
  q10s32 = vdupq_n_s32(0);

  for (i = 0; i < 8; i++) {
    d0u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d2u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    __builtin_prefetch(src_ptr);

    d4u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d6u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    __builtin_prefetch(ref_ptr);

    q11u16 = vsubl_u8(d0u8, d4u8);
    q12u16 = vsubl_u8(d2u8, d6u8);

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
  }

  q10s32 = vaddq_s32(q10s32, q9s32);
  q0s64 = vpaddlq_s32(q8s32);
  q1s64 = vpaddlq_s32(q10s32);

  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                    vreinterpret_s32_s64(d0s64));
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

  return vget_lane_u32(d0u32, 0);
}

unsigned int vpx_mse16x16_neon(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse) {
  int i;
  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
  int64x1_t d0s64;
  uint8x16_t q0u8, q1u8, q2u8, q3u8;
  int32x4_t q7s32, q8s32, q9s32, q10s32;
  uint16x8_t q11u16, q12u16, q13u16, q14u16;
  int64x2_t q1s64;

  q7s32 = vdupq_n_s32(0);
  q8s32 = vdupq_n_s32(0);
  q9s32 = vdupq_n_s32(0);
  q10s32 = vdupq_n_s32(0);

  for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
    q0u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    q1u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    q2u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    q3u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;

    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
    q8s32 = vmlal_s16(q8s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
    q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
    q8s32 = vmlal_s16(q8s32, d27s16, d27s16);

    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
  }

  q7s32 = vaddq_s32(q7s32, q8s32);
  q9s32 = vaddq_s32(q9s32, q10s32);
  q10s32 = vaddq_s32(q7s32, q9s32);

  q1s64 = vpaddlq_s32(q10s32);
  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}

unsigned int vpx_get4x4sse_cs_neon(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride) {
  int16x4_t d22s16, d24s16, d26s16, d28s16;
  int64x1_t d0s64;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
  int32x4_t q7s32, q8s32, q9s32, q10s32;
  uint16x8_t q11u16, q12u16, q13u16, q14u16;
  int64x2_t q1s64;

  d0u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d4u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;
  d1u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d5u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;
  d2u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d6u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;
  d3u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d7u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;

  q11u16 = vsubl_u8(d0u8, d4u8);
  q12u16 = vsubl_u8(d1u8, d5u8);
  q13u16 = vsubl_u8(d2u8, d6u8);
  q14u16 = vsubl_u8(d3u8, d7u8);

  d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
  d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
  d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
  d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));

  q7s32 = vmull_s16(d22s16, d22s16);
  q8s32 = vmull_s16(d24s16, d24s16);
  q9s32 = vmull_s16(d26s16, d26s16);
  q10s32 = vmull_s16(d28s16, d28s16);

  q7s32 = vaddq_s32(q7s32, q8s32);
  q9s32 = vaddq_s32(q9s32, q10s32);
  q9s32 = vaddq_s32(q7s32, q9s32);

  q1s64 = vpaddlq_s32(q9s32);
  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
@@ -33,6 +33,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
  return sad;
}

// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
 * The function averages every corresponding element of the buffers and stores
 * the value in a third buffer, comp_pred.
@@ -0,0 +1,306 @@
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"

unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
                                const unsigned char *b, int b_stride) {
  int distortion = 0;
  int r, c;

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      int diff = a[c] - b[c];
      distortion += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }

  return distortion;
}

unsigned int vpx_get_mb_ss_c(const int16_t *a) {
  unsigned int i, sum = 0;

  for (i = 0; i < 256; ++i) {
    sum += a[i] * a[i];
  }

  return sum;
}

static void variance(const uint8_t *a, int a_stride,
                     const uint8_t *b, int b_stride,
                     int w, int h, unsigned int *sse, int *sum) {
  int i, j;

  *sum = 0;
  *sse = 0;

  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
}

#define VAR(W, H) \
unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                       const uint8_t *b, int b_stride, \
                                       unsigned int *sse) { \
  int sum; \
  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
}

/* Identical to the variance call except it takes an additional parameter,
 * sum, and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / (w * h).
 */
#define GET_VAR(W, H) \
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
                             const uint8_t *b, int b_stride, \
                             unsigned int *sse, int *sum) { \
  variance(a, a_stride, b, b_stride, W, H, sse, sum); \
}

/* Identical to the variance call except it does not calculate
 * sse - sum^2 / (w * h) and returns sse in addition to modifying the
 * passed-in variable.
 */
#define MSE(W, H) \
unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                  const uint8_t *b, int b_stride, \
                                  unsigned int *sse) { \
  int sum; \
  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse; \
}

VAR(64, 64)
VAR(64, 32)
VAR(32, 64)
VAR(32, 32)
VAR(32, 16)
VAR(16, 32)
VAR(16, 16)
VAR(16, 8)
VAR(8, 16)
VAR(8, 8)
VAR(8, 4)
VAR(4, 8)
VAR(4, 4)

GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
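
The macros above expand into the full family of C reference functions; for instance, VAR(8, 8) defines vpx_variance8x8_c and MSE(8, 8) defines vpx_mse8x8_c. A small usage sketch (the wrapper name example_variance_call is hypothetical):

/* Sketch: calling the generated reference functions on two 8x8 blocks.
 * vpx_variance8x8_c returns the variance and writes the raw SSE through
 * the last argument; vpx_mse8x8_c returns the SSE itself. */
void example_variance_call(const uint8_t *src, const uint8_t *ref) {
  unsigned int sse;
  unsigned int var = vpx_variance8x8_c(src, 8, ref, 8, &sse);  /* stride 8 */
  unsigned int mse = vpx_mse8x8_c(src, 8, ref, 8, &sse);
  (void)var;
  (void)mse;
}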

void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  int i, j;

  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
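
ROUND_POWER_OF_TWO(tmp, 1) here is the usual rounding average, (a + b + 1) >> 1, so the compound prediction rounds halves up rather than truncating. A one-line worked sketch (the helper name avg_round is illustrative):

/* Sketch: the rounding average used by vpx_comp_avg_pred_c.
 * (3 + 4 + 1) >> 1 == 4, whereas plain truncation (3 + 4) >> 1 == 3. */
static uint8_t avg_round(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}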
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
static void highbd_variance64(const uint8_t *a8, int a_stride,
|
||||
const uint8_t *b8, int b_stride,
|
||||
int w, int h, uint64_t *sse, uint64_t *sum) {
|
||||
int i, j;
|
||||
|
||||
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
|
||||
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
|
||||
*sum = 0;
|
||||
*sse = 0;
|
||||
|
||||
for (i = 0; i < h; i++) {
|
||||
for (j = 0; j < w; j++) {
|
||||
const int diff = a[j] - b[j];
|
||||
*sum += diff;
|
||||
*sse += diff * diff;
|
||||
}
|
||||
a += a_stride;
|
||||
b += b_stride;
|
||||
}
|
||||
}
|
||||
|
||||
static void highbd_8_variance(const uint8_t *a8, int a_stride,
|
||||
const uint8_t *b8, int b_stride,
|
||||
int w, int h, unsigned int *sse, int *sum) {
|
||||
uint64_t sse_long = 0;
|
||||
uint64_t sum_long = 0;
|
||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||
*sse = (unsigned int)sse_long;
|
||||
*sum = (int)sum_long;
|
||||
}
|
||||
|
||||
static void highbd_10_variance(const uint8_t *a8, int a_stride,
|
||||
const uint8_t *b8, int b_stride,
|
||||
int w, int h, unsigned int *sse, int *sum) {
|
||||
uint64_t sse_long = 0;
|
||||
uint64_t sum_long = 0;
|
||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
|
||||
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
|
||||
}
|
||||
|
||||
static void highbd_12_variance(const uint8_t *a8, int a_stride,
|
||||
const uint8_t *b8, int b_stride,
|
||||
int w, int h, unsigned int *sse, int *sum) {
|
||||
uint64_t sse_long = 0;
|
||||
uint64_t sum_long = 0;
|
||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
|
||||
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
|
||||
}

#define HIGHBD_VAR(W, H) \
unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
                                                int a_stride, \
                                                const uint8_t *b, \
                                                int b_stride, \
                                                unsigned int *sse) { \
  int sum; \
  highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
                                                 int a_stride, \
                                                 const uint8_t *b, \
                                                 int b_stride, \
                                                 unsigned int *sse) { \
  int sum; \
  highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
                                                 int a_stride, \
                                                 const uint8_t *b, \
                                                 int b_stride, \
                                                 unsigned int *sse) { \
  int sum; \
  highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
  return *sse - (((int64_t)sum * sum) / (W * H)); \
}

#define HIGHBD_GET_VAR(S) \
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                      const uint8_t *ref, int ref_stride, \
                                      unsigned int *sse, int *sum) { \
  highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
                                       unsigned int *sse, int *sum) { \
  highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
                                       unsigned int *sse, int *sum) { \
  highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}

#define HIGHBD_MSE(W, H) \
unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
                                           int src_stride, \
                                           const uint8_t *ref, \
                                           int ref_stride, \
                                           unsigned int *sse) { \
  int sum; \
  highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
  return *sse; \
} \
\
unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
                                            int src_stride, \
                                            const uint8_t *ref, \
                                            int ref_stride, \
                                            unsigned int *sse) { \
  int sum; \
  highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
  return *sse; \
} \
\
unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
                                            int src_stride, \
                                            const uint8_t *ref, \
                                            int ref_stride, \
                                            unsigned int *sse) { \
  int sum; \
  highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
  return *sse; \
}

HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)

HIGHBD_VAR(64, 64)
HIGHBD_VAR(64, 32)
HIGHBD_VAR(32, 64)
HIGHBD_VAR(32, 32)
HIGHBD_VAR(32, 16)
HIGHBD_VAR(16, 32)
HIGHBD_VAR(16, 16)
HIGHBD_VAR(16, 8)
HIGHBD_VAR(8, 16)
HIGHBD_VAR(8, 8)
HIGHBD_VAR(8, 4)
HIGHBD_VAR(4, 8)
HIGHBD_VAR(4, 4)

void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
                              int width, int height, const uint8_t *ref8,
                              int ref_stride) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

@@ -17,6 +17,7 @@ DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c

DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm

@@ -29,9 +30,28 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm

endif  # CONFIG_VP9_HIGHBITDEPTH
endif  # CONFIG_ENCODERS

ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += variance.c

DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c

DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c

ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
endif  # CONFIG_VP9_HIGHBITDEPTH
endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC

DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)

DSP_SRCS-yes += vpx_dsp_rtcd.c
@@ -392,4 +392,212 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS

if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {

add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x64 sse2 avx2 neon/;

add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x32 sse2 avx2 neon/;

add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x64 sse2 neon/;

add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x32 sse2 avx2 neon/;

add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x16 sse2 avx2/;

add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x32 sse2/;

add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/;

add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x8 mmx sse2 neon/;

add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x16 mmx sse2 neon/;

add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x8 mmx sse2 media neon/;

add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x4 sse2/;

add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x8 sse2/;

add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x4 mmx sse2/;

add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get16x16var sse2 avx2 neon/;

add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get8x8var mmx sse2 neon/;

add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/;

add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse16x8 sse2/;

add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse8x16 sse2/;

add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse8x8 sse2/;

add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
specialize qw/vpx_get_mb_ss mmx sse2/;

add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
specialize qw/vpx_get4x4sse_cs neon/;

add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
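
For readers new to the RTCD configuration above: add_proto declares a prototype, and specialize lists the optimized versions that may be picked at run time from CPU feature flags. A rough C sketch of the kind of dispatch the generated vpx_dsp_rtcd.h header sets up; the setup-function and pointer names here are illustrative, not the real generated code, though the HAS_* flag bits do come from vpx_ports/x86.h:

/* Illustrative dispatch sketch -- the real header is generated from the lines above. */
#include <stdint.h>

unsigned int vpx_variance16x16_c(const uint8_t *, int, const uint8_t *, int, unsigned int *);
unsigned int vpx_variance16x16_mmx(const uint8_t *, int, const uint8_t *, int, unsigned int *);
unsigned int vpx_variance16x16_sse2(const uint8_t *, int, const uint8_t *, int, unsigned int *);
unsigned int vpx_variance16x16_avx2(const uint8_t *, int, const uint8_t *, int, unsigned int *);

static unsigned int (*variance16x16_ptr)(const uint8_t *, int,
                                         const uint8_t *, int, unsigned int *);

static void setup_variance_rtcd_sketch(int flags) {
  variance16x16_ptr = vpx_variance16x16_c;  /* portable fallback */
  if (flags & HAS_MMX) variance16x16_ptr = vpx_variance16x16_mmx;
  if (flags & HAS_SSE2) variance16x16_ptr = vpx_variance16x16_sse2;
  if (flags & HAS_AVX2) variance16x16_ptr = vpx_variance16x16_avx2;
}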

if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x64 sse2/;

add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x32 sse2/;

add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance32x64 sse2/;

add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance32x32 sse2/;

add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance32x16 sse2/;

add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance16x32 sse2/;

add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance16x16 sse2/;

add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance16x8 sse2/;

add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance8x16 sse2/;

add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance8x8 sse2/;

add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance64x64 sse2/;

add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance64x32 sse2/;

add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance32x64 sse2/;

add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance32x32 sse2/;

add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance32x16 sse2/;

add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance16x32 sse2/;

add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance16x16 sse2/;

add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance16x8 sse2/;

add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance8x16 sse2/;

add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance8x8 sse2/;

add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance64x64 sse2/;

add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance64x32 sse2/;

add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance32x64 sse2/;

add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance32x32 sse2/;

add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance32x16 sse2/;

add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance16x32 sse2/;

add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance16x16 sse2/;

add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance16x8 sse2/;

add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance8x16 sse2/;

add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance8x8 sse2/;

add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";

add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";

add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";

add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";

add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_mse16x16 sse2/;

add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_mse8x8 sse2/;

add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_mse16x16 sse2/;

add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_mse8x8 sse2/;

add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_mse16x16 sse2/;

add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_mse8x8 sse2/;

add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC

1;
@@ -11,7 +11,7 @@

%include "vpx_ports/x86_abi_support.asm"

;unsigned int vp9_highbd_calc16x16var_sse2
;unsigned int vpx_highbd_calc16x16var_sse2
;(
;    unsigned char * src_ptr,
;    int source_stride,

@@ -20,8 +20,8 @@
;    unsigned int * SSE,
;    int * Sum
;)
global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
sym(vp9_highbd_calc16x16var_sse2):
global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
sym(vpx_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6

@@ -164,7 +164,7 @@ sym(vp9_highbd_calc16x16var_sse2):
    ret


;unsigned int vp9_highbd_calc8x8var_sse2
;unsigned int vpx_highbd_calc8x8var_sse2
;(
;    unsigned char * src_ptr,
;    int source_stride,

@@ -173,8 +173,8 @@ sym(vp9_highbd_calc16x16var_sse2):
;    unsigned int * SSE,
;    int * Sum
;)
global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
sym(vp9_highbd_calc8x8var_sse2):
global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
sym(vpx_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
@@ -0,0 +1,245 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"

#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"

typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
                                        uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);

static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   int w, int h, uint32_t *sse, int *sum,
                                   high_variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int64_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
}

static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int64_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
}


#define HIGH_GET_VAR(S) \
void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                       const uint8_t *ref8, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
}

HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR

#define VAR_FN(w, h, block_size, shift) \
uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
                         vpx_highbd_calc##block_size##x##block_size##var_sse2, \
                         block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_10_variance_sse2( \
      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_12_variance_sse2( \
      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
}

VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);

#undef VAR_FN
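
The shift argument in the VAR_FN instantiations above is log2(w*h), so (sum * sum) >> shift is the integer form of the sum^2/(w*h) correction used by the C reference code:

/* e.g. 64x64: w*h = 4096 = 2^12 -> shift 12; 16x8: w*h = 128 = 2^7 -> shift 7. */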

unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                        const uint8_t *ref8, int ref_stride,
                                        unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                         sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                      const uint8_t *ref8, int ref_stride,
                                      unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                         sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}
@@ -0,0 +1,93 @@
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_dsp_rtcd.h"

typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum);

static void variance_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(&src[src_stride * i + j], src_stride,
             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}


unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
                sse, &sum, vpx_get16x16var_avx2, 16);
  return *sse - (((unsigned int)sum * sum) >> 8);
}

unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int sum;
  vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse;
}

unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 11);
}
@@ -10,9 +10,9 @@

#include <immintrin.h>  // AVX2

#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"

void vp9_get16x16var_avx2(const unsigned char *src_ptr,
void vpx_get16x16var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,

@@ -123,7 +123,7 @@ void vp9_get16x16var_avx2(const unsigned char *src_ptr,
  }
}

void vp9_get32x32var_avx2(const unsigned char *src_ptr,
void vpx_get32x32var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
@@ -0,0 +1,424 @@
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8
    ; end prolog

    mov         rax, arg(0)             ;src_ptr
    mov         rcx, 16
    pxor        mm4, mm4

.NEXTROW:
    movq        mm0, [rax]
    movq        mm1, [rax+8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]
    pmaddwd     mm0, mm0
    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    pmaddwd     mm3, mm3

    paddd       mm4, mm0
    paddd       mm4, mm1
    paddd       mm4, mm2
    paddd       mm4, mm3

    add         rax, 32
    dec         rcx
    ja          .NEXTROW
    movq        QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd      rax, dword ptr [rsp]
    movsxd      rcx, dword ptr [rsp+4]
    add         rax, rcx


    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
global sym(vpx_get8x8var_mmx) PRIVATE
sym(vpx_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16
    ; end prolog


    pxor        mm5, mm5                ; Blank mm5
    pxor        mm6, mm6                ; Blank mm6
    pxor        mm7, mm7                ; Blank mm7

    mov         rax, arg(0)             ;[src_ptr]  ; Load base addresses
    mov         rbx, arg(2)             ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1)   ;[source_stride]
    movsxd      rdx, dword ptr arg(3)   ;[recon_stride]

    ; Row 1
    movq        mm0, [rax]              ; Copy eight bytes to mm0
    movq        mm1, [rbx]              ; Copy eight bytes to mm1
    movq        mm2, mm0                ; Take copies
    movq        mm3, mm1                ; Take copies

    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    psubsw      mm2, mm3                ; A-B (high order) to MM2

    paddw       mm5, mm0                ; accumulate differences in mm5
    paddw       mm5, mm2                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    pmaddwd     mm2, mm2                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    movq        mm1, [rbx]              ; Copy eight bytes to mm1
    paddd       mm7, mm0                ; accumulate in mm7
    paddd       mm7, mm2                ; accumulate in mm7


    ; Row 2
    movq        mm0, [rax]              ; Copy eight bytes to mm0
    movq        mm2, mm0                ; Take copies
    movq        mm3, mm1                ; Take copies

    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    psubsw      mm2, mm3                ; A-B (high order) to MM2

    paddw       mm5, mm0                ; accumulate differences in mm5
    paddw       mm5, mm2                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    pmaddwd     mm2, mm2                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    movq        mm1, [rbx]              ; Copy eight bytes to mm1
    paddd       mm7, mm0                ; accumulate in mm7
    paddd       mm7, mm2                ; accumulate in mm7

    ; Row 3
    movq        mm0, [rax]              ; Copy eight bytes to mm0
    movq        mm2, mm0                ; Take copies
    movq        mm3, mm1                ; Take copies

    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    psubsw      mm2, mm3                ; A-B (high order) to MM2

    paddw       mm5, mm0                ; accumulate differences in mm5
    paddw       mm5, mm2                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    pmaddwd     mm2, mm2                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    movq        mm1, [rbx]              ; Copy eight bytes to mm1
    paddd       mm7, mm0                ; accumulate in mm7
    paddd       mm7, mm2                ; accumulate in mm7

    ; Row 4
    movq        mm0, [rax]              ; Copy eight bytes to mm0
    movq        mm2, mm0                ; Take copies
    movq        mm3, mm1                ; Take copies

    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    psubsw      mm2, mm3                ; A-B (high order) to MM2

    paddw       mm5, mm0                ; accumulate differences in mm5
    paddw       mm5, mm2                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    pmaddwd     mm2, mm2                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    movq        mm1, [rbx]              ; Copy eight bytes to mm1
    paddd       mm7, mm0                ; accumulate in mm7
    paddd       mm7, mm2                ; accumulate in mm7

    ; Row 5
    movq        mm0, [rax]              ; Copy eight bytes to mm0
    movq        mm2, mm0                ; Take copies
    movq        mm3, mm1                ; Take copies

    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    psubsw      mm2, mm3                ; A-B (high order) to MM2

    paddw       mm5, mm0                ; accumulate differences in mm5
    paddw       mm5, mm2                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    pmaddwd     mm2, mm2                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    movq        mm1, [rbx]              ; Copy eight bytes to mm1
    ;       movq        mm4, [rbx + rdx]
    paddd       mm7, mm0                ; accumulate in mm7
    paddd       mm7, mm2                ; accumulate in mm7

    ; Row 6
    movq        mm0, [rax]              ; Copy eight bytes to mm0
    movq        mm2, mm0                ; Take copies
    movq        mm3, mm1                ; Take copies

    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    psubsw      mm2, mm3                ; A-B (high order) to MM2

    paddw       mm5, mm0                ; accumulate differences in mm5
    paddw       mm5, mm2                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    pmaddwd     mm2, mm2                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    movq        mm1, [rbx]              ; Copy eight bytes to mm1
    paddd       mm7, mm0                ; accumulate in mm7
    paddd       mm7, mm2                ; accumulate in mm7

    ; Row 7
    movq        mm0, [rax]              ; Copy eight bytes to mm0
    movq        mm2, mm0                ; Take copies
    movq        mm3, mm1                ; Take copies

    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    psubsw      mm2, mm3                ; A-B (high order) to MM2

    paddw       mm5, mm0                ; accumulate differences in mm5
    paddw       mm5, mm2                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    pmaddwd     mm2, mm2                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    movq        mm1, [rbx]              ; Copy eight bytes to mm1
    paddd       mm7, mm0                ; accumulate in mm7
    paddd       mm7, mm2                ; accumulate in mm7

    ; Row 8
    movq        mm0, [rax]              ; Copy eight bytes to mm0
    movq        mm2, mm0                ; Take copies
    movq        mm3, mm1                ; Take copies

    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    psubsw      mm2, mm3                ; A-B (high order) to MM2

    paddw       mm5, mm0                ; accumulate differences in mm5
    paddw       mm5, mm2                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    pmaddwd     mm2, mm2                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    paddd       mm7, mm0                ; accumulate in mm7
    paddd       mm7, mm2                ; accumulate in mm7

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5  ; copy back accumulated results into normal memory
    movq        QWORD PTR [rsp], mm7    ; copy back accumulated results into normal memory
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx                ;XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                ;XXSum
    mov         rsi, arg(4)             ;SSE
    mov         rdi, arg(5)             ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void
;vpx_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
global sym(vpx_get4x4var_mmx) PRIVATE
sym(vpx_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16
    ; end prolog


    pxor        mm5, mm5                ; Blank mm5
    pxor        mm6, mm6                ; Blank mm6
    pxor        mm7, mm7                ; Blank mm7

    mov         rax, arg(0)             ;[src_ptr]  ; Load base addresses
    mov         rbx, arg(2)             ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1)   ;[source_stride]
    movsxd      rdx, dword ptr arg(3)   ;[recon_stride]

    ; Row 1
    movd        mm0, [rax]              ; Copy four bytes to mm0
    movd        mm1, [rbx]              ; Copy four bytes to mm1
    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    paddw       mm5, mm0                ; accumulate differences in mm5
    pmaddwd     mm0, mm0                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    movd        mm1, [rbx]              ; Copy four bytes to mm1
    paddd       mm7, mm0                ; accumulate in mm7


    ; Row 2
    movd        mm0, [rax]              ; Copy four bytes to mm0
    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    paddw       mm5, mm0                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    movd        mm1, [rbx]              ; Copy four bytes to mm1
    paddd       mm7, mm0                ; accumulate in mm7

    ; Row 3
    movd        mm0, [rax]              ; Copy four bytes to mm0
    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0
    paddw       mm5, mm0                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    add         rbx, rdx                ; Inc pointer into ref data
    add         rax, rcx                ; Inc pointer into the new data
    movd        mm1, [rbx]              ; Copy four bytes to mm1
    paddd       mm7, mm0                ; accumulate in mm7

    ; Row 4
    movd        mm0, [rax]              ; Copy four bytes to mm0

    punpcklbw   mm0, mm6                ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                ; A-B (low order) to MM0

    paddw       mm5, mm0                ; accumulate differences in mm5

    pmaddwd     mm0, mm0                ; square and accumulate
    paddd       mm7, mm0                ; accumulate in mm7


    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5  ; copy back accumulated results into normal memory
    movq        QWORD PTR [rsp], mm7    ; copy back accumulated results into normal memory
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx                ;XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                ;XXSum
    mov         rsi, arg(4)             ;SSE
    mov         rdi, arg(5)             ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
@@ -0,0 +1,107 @@
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"

extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
                              unsigned int *sse, int *sum);

unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
                                 const unsigned char *b, int b_stride,
                                 unsigned int *sse) {
  unsigned int var;
  int avg;

  vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 4));
}

unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
                                 const unsigned char *b, int b_stride,
                                 unsigned int *sse) {
  unsigned int var;
  int avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
  *sse = var;

  return (var - (((unsigned int)avg * avg) >> 6));
}

unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
                              const unsigned char *b, int b_stride,
                              unsigned int *sse) {
  unsigned int sse0, sse1, sse2, sse3, var;
  int sum0, sum1, sum2, sum3;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
                    b + 8 * b_stride, b_stride, &sse2, &sum2);
  vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
                    b + 8 * b_stride + 8, b_stride, &sse3, &sum3);

  var = sse0 + sse1 + sse2 + sse3;
  *sse = var;
  return var;
}
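
The 16x16 MMX kernels here tile four 8x8 get-var calls and combine the partial results; for the variance flavors below, the correction term avg * avg >> 8 is avg^2/256 for the 256-pixel block. An illustrative check:

/* Constant difference d over all 256 pixels: sse = 256*d*d and
   avg = 256*d, so sse - (avg*avg >> 8) = 0, as expected for a flat offset. */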

unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
                                   const unsigned char *b, int b_stride,
                                   unsigned int *sse) {
  unsigned int sse0, sse1, sse2, sse3, var;
  int sum0, sum1, sum2, sum3, avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
                    b + 8 * b_stride, b_stride, &sse2, &sum2);
  vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
                    b + 8 * b_stride + 8, b_stride, &sse3, &sum3);

  var = sse0 + sse1 + sse2 + sse3;
  avg = sum0 + sum1 + sum2 + sum3;
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 8));
}

unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
                                  const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
  unsigned int sse0, sse1, var;
  int sum0, sum1, avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);

  var = sse0 + sse1;
  avg = sum0 + sum1;
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 7));
}

unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
                                  const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
  unsigned int sse0, sse1, var;
  int sum0, sum1, avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
                    b + 8 * b_stride, b_stride, &sse1, &sum1);

  var = sse0 + sse1;
  avg = sum0 + sum1;
  *sse = var;

  return (var - (((unsigned int)avg * avg) >> 7));
}
@@ -0,0 +1,309 @@
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"

typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                unsigned int *sse, int *sum);

unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}
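
A scalar reading of the SIMD reduction above, assuming (as the 32-iteration, 8-lane loop implies) a 16x16 macroblock of int16_t values, 256 in all; the helper name is made up:

/* Scalar equivalent sketch: sum of squares of 256 int16_t values. */
static unsigned int get_mb_ss_ref(const int16_t *src) {
  unsigned int ss = 0;
  int i;
  for (i = 0; i < 256; ++i) ss += (unsigned int)(src[i] * src[i]);
  return ss;
}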

#define READ64(p, stride, i) \
  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
                    _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
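
READ64 above gathers two 4-pixel rows into one register: each _mm_cvtsi32_si128 loads 4 bytes, and _mm_unpacklo_epi8 interleaves them byte by byte into the low 8 bytes. Byte layout of the low half for rows r0 and r1 (illustrative):

/* r0[0] r1[0] r0[1] r1[1] r0[2] r1[2] r0[3] r1[3]
   The interleave is harmless: src and ref are interleaved identically,
   so the per-pixel differences still line up lane for lane. */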
|
||||
|
||||
static void get4x4var_sse2(const uint8_t *src, int src_stride,
|
||||
const uint8_t *ref, int ref_stride,
|
||||
unsigned int *sse, int *sum) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
|
||||
const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
|
||||
const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
|
||||
const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
|
||||
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
|
||||
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
|
||||
|
||||
// sum
|
||||
__m128i vsum = _mm_add_epi16(diff0, diff1);
|
||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
|
||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
|
||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
|
||||
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
|
||||
|
||||
// sse
|
||||
vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
|
||||
_mm_madd_epi16(diff1, diff1));
|
||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
|
||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
|
||||
*sse = _mm_cvtsi128_si32(vsum);
|
||||
}
|
||||
|
void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride,
                        unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
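Here, unlike in the 16x16 kernel below, the 16-bit sum can be folded all the way down to a single lane. Our annotation, not part of the patch:

/* Each 16-bit lane of vsum holds 8 of the 64 differences (max magnitude
 * 8 * 255 = 2040), so even the fully folded total (64 * 255 = 16320) stays
 * well inside int16 range and one _mm_extract_epi16 suffices. */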
void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
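The sum extraction above deliberately stops one fold short of the 8x8 version. Our annotation of why, not part of the patch:

/* Each 16-bit lane of vsum accumulates 32 of the block's 256 differences
 * (max magnitude 32 * 255 = 8160).  Two folds bring that to 128 per lane
 * (max 32640, still inside int16), but a third fold could reach 65280 and
 * wrap, so the last two lanes are extracted separately and added in 32-bit
 * arithmetic instead. */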
static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          getNxMvar_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}
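variance_sse2 tiles an arbitrary w x h rectangle with a square kernel and accumulates the partial sse/sum pairs, so every fixed-size wrapper below reduces to one call plus a final shift. A hypothetical extra size would follow the same shape; the name and size here are ours, not the patch's:

/* Hypothetical example only: a 16x64 variance via the generic tiler.
 * 16 * 64 = 1024 pixels, so the mean term is sum^2 >> 10; the square is
 * done in int64_t since |sum| can reach 1024 * 255. */
static unsigned int variance16x64_example(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 64,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}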
unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 4);
}

unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 6);
}
unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                sse, &sum, vpx_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                sse, &sum, vpx_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 8);
}
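Up to this point the mean term is squared in plain 32-bit arithmetic; our annotation on why 16x16 is the cutoff, not part of the patch:

/* For 16x16, |sum| <= 256 * 255 = 65280 and 65280^2 = 4,261,478,400, which
 * still fits in a 32-bit unsigned int.  From 32x16 upward the square can
 * exceed 2^32, which is why every wrapper below casts to int64_t. */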
unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}
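The shift amounts above track log2(w * h) throughout; a quick tally (our annotation):

/* 32x32: 1024 = 2^10   32x16, 16x32: 512 = 2^9
 * 64x64: 4096 = 2^12   64x32, 32x64: 2048 = 2^11 */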
unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}
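These MSE wrappers return the raw sum of squared errors: each variance call is used only to populate *sse. Our annotation, not part of the patch:

/* The discarded return value above is sse - sum^2/N; returning *sse instead
 * yields the unnormalized sum of squared differences, which is what these
 * "mse" entry points actually report. */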