Move variance functions to vpx_dsp

subpel functions will be moved in another patch.

Change-Id: Idb2e049bad0b9b32ac42cc7731cd6903de2826ce
Johann 2015-05-15 11:52:03 -07:00
Parent 976f7f42c1
Commit c3bdffb0a5
53 changed files with 3224 additions and 4230 deletions
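Most of this change is mechanical: callers switch from the vp8_-prefixed whole-pixel variance helpers to the vpx_-prefixed ones now provided by vpx_dsp, with unchanged signatures, while the sub-pixel variants keep their vp8_ names until the follow-up patch. A minimal sketch of the new call pattern, mirroring the multiframe_quality_enhance_block hunk further down (the wrapper function and the zeros buffer here are illustrative, not code from the tree):

    #include "./vpx_dsp_rtcd.h"   /* declares the vpx_variance* kernels */

    /* Illustrative: per-block activity, as in vp8/common/mfqe.c, now
     * calling the vpx_dsp whole-pixel variance kernel. */
    static unsigned int activity_16x16(const unsigned char *y, int y_stride)
    {
        static const unsigned char zeros[16] = { 0 };  /* stands in for VP8_ZEROS */
        unsigned int sse;

        /* was: vp8_variance16x16(y, y_stride, zeros, 0, &sse) */
        return (vpx_variance16x16(y, y_stride, zeros, 0, &sse) + 128) >> 8;
    }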

File diff suppressed because it is too large.

View File

@@ -1,154 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance16x16_armv6|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_armv6| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
loop
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r5, [r2, #0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r5, [r2, #4] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r5, [r2, #8] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r5, [r2, #12] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
END
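For reference, the register comments above spell out the standard variance computation: r8 accumulates the signed sum of differences, r11 the sum of squared differences, and the epilogue returns sse minus sum squared divided by 256. A plain-C model of the same result (a sketch, not part of the commit; the ARMv6 code reaches it with packed 8-bit arithmetic on four pixels per word):

    /* Scalar model of vp8_variance16x16_armv6. */
    static unsigned int variance16x16_ref(const unsigned char *src, int src_stride,
                                          const unsigned char *ref, int ref_stride,
                                          unsigned int *sse)
    {
        int r, c, sum = 0;    /* r8 in the assembly  */
        unsigned int sq = 0;  /* r11 in the assembly */

        for (r = 0; r < 16; ++r) {
            for (c = 0; c < 16; ++c) {
                const int diff = src[c] - ref[c];
                sum += diff;
                sq += diff * diff;
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = sq;
        return sq - (((unsigned int)sum * sum) >> 8);  /* sub r0, r11, r0, lsr #8 */
    }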

View File

@@ -1,101 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance8x8_armv6|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_armv6| PROC
push {r4-r10, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #8 ; set loop counter to 8 (=block height)
mov r4, #0 ; initialize sum = 0
mov r5, #0 ; initialize sse = 0
loop
; 1st 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels
ldr r7, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r6, r7 ; calculate difference
pld [r0, r1, lsl #1]
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r0, #0x4] ; load 4 src pixels
ldr r7, [r2, #0x4] ; load 4 ref pixels
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
usub8 r8, r6, r7 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1 ; next row
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r8, [sp, #32] ; get address of sse
mul r1, r4, r4 ; sum * sum
str r5, [r8] ; store sse
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
pop {r4-r10, pc}
ENDP
END
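The 8x8 routine follows the same scheme with a >> 6 normalization for its 64 pixels. The usub8/sel/usad8 sequence is what lets both routines stay in 8-bit registers: positive and negative byte differences are selected separately, usad8 sums each set, and the two partial sums are added to and subtracted from the running total. An illustrative C model of one four-pixel group (a sketch, not code from the tree):

    /* Model of one usub8/sel/usad8 group of four packed pixels. */
    static void accumulate4(const unsigned char *s, const unsigned char *r,
                            int *sum, unsigned int *sse)
    {
        int i;
        for (i = 0; i < 4; ++i) {
            const int pos = s[i] >= r[i] ? s[i] - r[i] : 0;  /* usub8 + sel         */
            const int neg = r[i] >= s[i] ? r[i] - s[i] : 0;  /* reversed operands   */
            *sum += pos - neg;                  /* usad8 sums, then add/sub         */
            *sse += (pos + neg) * (pos + neg);  /* orr + uxtb16 + smlad             */
        }
    }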

View File

@@ -1,320 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vpx_ports/mem.h"
unsigned int vp8_variance16x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) {
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance16x8_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 4; i++) { // variance16x8_neon_loop
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance8x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
uint8x8_t d0u8, d2u8, d4u8, d6u8;
int16x4_t d22s16, d23s16, d24s16, d25s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) { // variance8x16_neon_loop
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d2u8, d6u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance8x8_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 2; i++) { // variance8x8_neon_loop
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d1u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d3u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d5u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d7u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d1u8, d5u8);
q13u16 = vsubl_u8(d2u8, d6u8);
q14u16 = vsubl_u8(d3u8, d7u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
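These NEON kernels accumulate the signed differences in q8s32 (vpadalq_s16) and the squared differences in q9s32/q10s32 (vmlal_s16), then collapse the lanes and apply the same sse - ((sum * sum) >> log2(w*h)) normalization as the scalar code (shifts of 8, 7, 7 and 6 for the four block sizes). A hypothetical spot check against plain scalar arithmetic, useful when touching this file on an ARM target; the harness below is illustrative and not part of the tree:

    #include <stdio.h>
    #include <stdlib.h>

    unsigned int vp8_variance16x16_neon(const unsigned char *src_ptr, int source_stride,
                                        const unsigned char *ref_ptr, int recon_stride,
                                        unsigned int *sse);

    int main(void)
    {
        unsigned char src[256], ref[256];
        unsigned int sse_simd, sse_c = 0;
        int i, sum = 0;

        srand(0);
        for (i = 0; i < 256; ++i) {
            src[i] = (unsigned char)(rand() & 0xff);
            ref[i] = (unsigned char)(rand() & 0xff);
            sum += src[i] - ref[i];
            sse_c += (unsigned int)((src[i] - ref[i]) * (src[i] - ref[i]));
        }
        /* A stride of 16 makes the 16x16 block contiguous in both buffers. */
        if (vp8_variance16x16_neon(src, 16, ref, 16, &sse_simd) !=
                sse_c - (((unsigned int)sum * sum) >> 8) || sse_simd != sse_c) {
            printf("mismatch\n");
            return 1;
        }
        printf("ok\n");
        return 0;
    }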

View File

@@ -9,10 +9,14 @@
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/variance.h"
#include "vp8/common/filter.h"
// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
#if CONFIG_VP8_ENCODER
#if HAVE_MEDIA
#include "vp8/common/arm/bilinearfilter_arm.h"
@@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
8, 8, 8, VFilter);
return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
return vpx_variance8x8_media(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
}
unsigned int vp8_sub_pixel_variance16x16_armv6
@@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
16, 16, 16, VFilter);
var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
}
return var;
}
#endif /* HAVE_MEDIA */
#endif // HAVE_MEDIA
#if HAVE_NEON
@@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon
return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
}
#endif
#endif // HAVE_NEON
#endif // CONFIG_VP8_ENCODER

View File

@@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block
if (blksize == 16)
{
actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
#ifdef USE_SSD
vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 128)>>8;
vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 32)>>6;
vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 32)>>6;
#else
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
@@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block
}
else /* if (blksize == 8) */
{
actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
#ifdef USE_SSD
vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 32)>>6;
vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 8)>>4;
vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 8)>>4;
#else
sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
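The rounding constants in this file are half of each block's pixel count, so every one of these expressions is a round-to-nearest per-pixel mean: 256 pixels for 16x16 (+128, >> 8), 64 for 8x8 (+32, >> 6), 16 for 4x4 (+8, >> 4). A small illustration with made-up values:

    /* Made-up values; shows the rounding used in the hunks above. */
    static void rounding_example(void)
    {
        unsigned int sse16 = 5000;                /* 16x16 block, 256 pixels */
        unsigned int act   = (sse16 + 128) >> 8;  /* 5000/256 = 19.5 -> 20   */
        unsigned int sse8  = 700;                 /* 8x8 block, 64 pixels    */
        unsigned int usad  = (sse8 + 32) >> 6;    /* 700/64 = 10.9 -> 11     */
        (void)act;
        (void)usad;
    }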

View File

@@ -236,31 +236,6 @@ add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch,
specialize qw/vp8_bilinear_predict4x4 mmx media neon/;
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
#
# Whole-pixel Variance
#
add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance4x4 mmx sse2/;
$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance8x8 mmx sse2 media neon/;
$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
$vp8_variance8x8_media=vp8_variance8x8_armv6;
add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance8x16 mmx sse2 neon/;
$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance16x8 mmx sse2 neon/;
$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance16x16 mmx sse2 media neon/;
$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
$vp8_variance16x16_media=vp8_variance16x16_armv6;
#
# Sub-pixel Variance
#
@@ -308,12 +283,6 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
#
if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
#
# Sum of squares (vector)
#
add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
specialize qw/vp8_get_mb_ss mmx sse2/;
#
# SSE (Sum Squared Error)
#
@@ -321,14 +290,6 @@ add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_pt
specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;
add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_mse16x16 mmx sse2 media neon/;
$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
$vp8_mse16x16_media=vp8_mse16x16_armv6;
add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
specialize qw/vp8_get4x4sse_cs mmx neon/;
#
# Block copy
#

View File

@@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)(
const unsigned char *ref_array,
int ref_stride,
unsigned int *sad_array);
typedef void (*vpx_sad_multi_d_fn_t)
(
const unsigned char *src_ptr,
@@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t)
unsigned int *sad_array
);
typedef unsigned int (*vp8_variance_fn_t)
typedef unsigned int (*vpx_variance_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
@@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t)
unsigned int *sse
);
typedef void (*vp8_ssimpf_fn_t)
(
unsigned char *s,
int sp,
unsigned char *r,
int rp,
unsigned long *sum_s,
unsigned long *sum_r,
unsigned long *sum_sq_s,
unsigned long *sum_sq_r,
unsigned long *sum_sxr
);
typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
typedef unsigned int (*vp8_get16x16prederror_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int ref_stride
);
typedef struct variance_vtable
{
vpx_sad_fn_t sdf;
vp8_variance_fn_t vf;
vpx_variance_fn_t vf;
vp8_subpixvariance_fn_t svf;
vp8_variance_fn_t svf_halfpix_h;
vp8_variance_fn_t svf_halfpix_v;
vp8_variance_fn_t svf_halfpix_hv;
vpx_variance_fn_t svf_halfpix_h;
vpx_variance_fn_t svf_halfpix_v;
vpx_variance_fn_t svf_halfpix_hv;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
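This vtable is what the encoder fills with RTCD-selected pointers; after this patch the whole-pixel slot takes a vpx_dsp function while the sub-pixel slots keep their vp8_ implementations. A hypothetical wiring of a 16x16 entry (the field names come from the struct above and the function names appear elsewhere in this diff, but this helper itself is illustrative):

    #include "./vp8_rtcd.h"
    #include "./vpx_dsp_rtcd.h"
    #include "vp8/common/variance.h"

    /* Illustrative: wire one variance_vtable entry after the move. */
    static void setup_16x16_fns(struct variance_vtable *v)
    {
        v->sdf = vpx_sad16x16;                 /* SAD already lives in vpx_dsp */
        v->vf  = vpx_variance16x16;            /* moved by this commit         */
        v->svf = vp8_sub_pixel_variance16x16;  /* still vp8_, next patch       */
    }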

View File

@@ -8,44 +8,34 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "filter.h"
#include "variance.h"
unsigned int vp8_get_mb_ss_c
(
const short *src_ptr
)
{
unsigned int i = 0, sum = 0;
do
{
sum += (src_ptr[i] * src_ptr[i]);
i++;
}
while (i < 256);
return sum;
/* This is a bad idea.
* ctz = count trailing zeros */
static int ctz(int a) {
int b = 0;
while (a != 1) {
a >>= 1;
b++;
}
return b;
}
static void variance(
static unsigned int variance(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
int w,
int h,
unsigned int *sse,
int *sum)
unsigned int *sse)
{
int i, j;
int diff;
int diff, sum;
*sum = 0;
sum = 0;
*sse = 0;
for (i = 0; i < h; i++)
@@ -53,114 +43,17 @@ static void variance(
for (j = 0; j < w; j++)
{
diff = src_ptr[j] - ref_ptr[j];
*sum += diff;
sum += diff;
*sse += diff * diff;
}
src_ptr += source_stride;
ref_ptr += recon_stride;
}
return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
}
unsigned int vp8_variance16x16_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 8));
}
unsigned int vp8_variance8x16_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_variance16x8_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_variance8x8_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vp8_variance4x4_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
}
unsigned int vp8_mse16x16_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
*sse = var;
return var;
}
/****************************************************************************
*
* ROUTINE : filter_block2d_bil_first_pass
@@ -304,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c
/* Now filter Verticaly */
var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
}
@@ -329,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
}
unsigned int vp8_sub_pixel_variance16x16_c
@@ -353,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
}
@@ -429,7 +322,7 @@ unsigned int vp8_sub_pixel_variance16x8_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
}
unsigned int vp8_sub_pixel_variance8x16_c
@@ -455,5 +348,5 @@ unsigned int vp8_sub_pixel_variance8x16_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
}
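The per-size wrappers deleted above are folded into the single static variance() helper shown earlier in this file: the normalization shift is recovered from the block dimensions, since ctz(w) + ctz(h) equals log2(w*h) for the power-of-two sizes involved, matching the hard-coded shifts that were removed. A standalone check of the shift values (illustrative):

    #include <assert.h>

    /* ctz() copied from the new static helper above (power-of-two input only). */
    static int ctz(int a) { int b = 0; while (a != 1) { a >>= 1; b++; } return b; }

    int main(void)
    {
        assert(ctz(16) + ctz(16) == 8);  /* 16x16: sum*sum >> 8, i.e. /256 */
        assert(ctz(16) + ctz(8)  == 7);  /* 16x8 and 8x16:        >> 7     */
        assert(ctz(8)  + ctz(8)  == 6);  /* 8x8:                  >> 6     */
        assert(ctz(4)  + ctz(4)  == 4);  /* 4x4:                  >> 4     */
        return 0;
    }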

View File

@@ -11,504 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
global sym(vp8_get_mb_ss_mmx) PRIVATE
sym(vp8_get_mb_ss_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
sub rsp, 8
; end prolog
mov rax, arg(0) ;src_ptr
mov rcx, 16
pxor mm4, mm4
.NEXTROW:
movq mm0, [rax]
movq mm1, [rax+8]
movq mm2, [rax+16]
movq mm3, [rax+24]
pmaddwd mm0, mm0
pmaddwd mm1, mm1
pmaddwd mm2, mm2
pmaddwd mm3, mm3
paddd mm4, mm0
paddd mm4, mm1
paddd mm4, mm2
paddd mm4, mm3
add rax, 32
dec rcx
ja .NEXTROW
movq QWORD PTR [rsp], mm4
;return sum[0]+sum[1];
movsxd rax, dword ptr [rsp]
movsxd rcx, dword ptr [rsp+4]
add rax, rcx
; begin epilog
add rsp, 8
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_get8x8var_mmx
;(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vp8_get8x8var_mmx) PRIVATE
sym(vp8_get8x8var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
push rsi
push rdi
push rbx
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mmx6
pxor mm6, mm6 ; Blank mmx7
pxor mm7, mm7 ; Blank mmx7
mov rax, arg(0) ;[src_ptr] ; Load base addresses
mov rbx, arg(2) ;[ref_ptr]
movsxd rcx, dword ptr arg(1) ;[source_stride]
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm1, [rbx] ; Copy eight bytes to mm1
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 2
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 3
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 4
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 5
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
; movq mm4, [rbx + rdx]
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 6
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 7
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 8
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher prrcision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
movsx rdx, WORD PTR [rsp+8]
movsx rcx, WORD PTR [rsp+10]
movsx rbx, WORD PTR [rsp+12]
movsx rax, WORD PTR [rsp+14]
add rdx, rcx
add rbx, rax
add rdx, rbx ;XSum
movsxd rax, DWORD PTR [rsp]
movsxd rcx, DWORD PTR [rsp+4]
add rax, rcx ;XXSum
mov rsi, arg(4) ;SSE
mov rdi, arg(5) ;Sum
mov dword ptr [rsi], eax
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;unsigned int
;vp8_get4x4var_mmx
;(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vp8_get4x4var_mmx) PRIVATE
sym(vp8_get4x4var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
push rsi
push rdi
push rbx
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mmx6
pxor mm6, mm6 ; Blank mmx7
pxor mm7, mm7 ; Blank mmx7
mov rax, arg(0) ;[src_ptr] ; Load base addresses
mov rbx, arg(2) ;[ref_ptr]
movsxd rcx, dword ptr arg(1) ;[source_stride]
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movd mm0, [rax] ; Copy four bytes to mm0
movd mm1, [rbx] ; Copy four bytes to mm1
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 3
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 4
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
paddd mm7, mm0 ; accumulate in mm7
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
movsx rdx, WORD PTR [rsp+8]
movsx rcx, WORD PTR [rsp+10]
movsx rbx, WORD PTR [rsp+12]
movsx rax, WORD PTR [rsp+14]
add rdx, rcx
add rbx, rax
add rdx, rbx ;XSum
movsxd rax, DWORD PTR [rsp]
movsxd rcx, DWORD PTR [rsp+4]
add rax, rcx ;XXSum
mov rsi, arg(4) ;SSE
mov rdi, arg(5) ;Sum
mov dword ptr [rsi], eax
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;unsigned int
;vp8_get4x4sse_cs_mmx
;(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride
;)
global sym(vp8_get4x4sse_cs_mmx) PRIVATE
sym(vp8_get4x4sse_cs_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
push rbx
; end prolog
pxor mm6, mm6 ; Blank mmx7
pxor mm7, mm7 ; Blank mmx7
mov rax, arg(0) ;[src_ptr] ; Load base addresses
mov rbx, arg(2) ;[ref_ptr]
movsxd rcx, dword ptr arg(1) ;[source_stride]
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movd mm0, [rax] ; Copy eight bytes to mm0
movd mm1, [rbx] ; Copy eight bytes to mm1
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
movd mm0, [rax] ; Copy eight bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 3
movd mm0, [rax] ; Copy eight bytes to mm0
punpcklbw mm1, mm6
punpcklbw mm0, mm6 ; unpack to higher prrcision
psubsw mm0, mm1 ; A-B (low order) to MM0
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 4
movd mm0, [rax] ; Copy eight bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
pmaddwd mm0, mm0 ; square and accumulate
paddd mm7, mm0 ; accumulate in mm7
movq mm0, mm7 ;
psrlq mm7, 32
paddd mm0, mm7
movq rax, mm0
; begin epilog
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
%define mmx_filter_shift 7
;void vp8_filter_block2d_bil4x4_var_mmx

View File

@@ -13,393 +13,6 @@
%define xmm_filter_shift 7
;unsigned int vp8_get_mb_ss_sse2
;(
; short *src_ptr
;)
global sym(vp8_get_mb_ss_sse2) PRIVATE
sym(vp8_get_mb_ss_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 1
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
mov rax, arg(0) ;[src_ptr]
mov rcx, 8
pxor xmm4, xmm4
.NEXTROW:
movdqa xmm0, [rax]
movdqa xmm1, [rax+16]
movdqa xmm2, [rax+32]
movdqa xmm3, [rax+48]
pmaddwd xmm0, xmm0
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
pmaddwd xmm3, xmm3
paddd xmm0, xmm1
paddd xmm2, xmm3
paddd xmm4, xmm0
paddd xmm4, xmm2
add rax, 0x40
dec rcx
ja .NEXTROW
movdqa xmm3,xmm4
psrldq xmm4,8
paddd xmm4,xmm3
movdqa xmm3,xmm4
psrldq xmm4,4
paddd xmm4,xmm3
movq rax,xmm4
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_get16x16var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
global sym(vp8_get16x16var_sse2) PRIVATE
sym(vp8_get16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
; Prefetch data
lea rcx, [rax+rax*2]
prefetcht0 [rsi]
prefetcht0 [rsi+rax]
prefetcht0 [rsi+rax*2]
prefetcht0 [rsi+rcx]
lea rbx, [rsi+rax*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax*2]
prefetcht0 [rbx+rcx]
lea rcx, [rdx+rdx*2]
prefetcht0 [rdi]
prefetcht0 [rdi+rdx]
prefetcht0 [rdi+rdx*2]
prefetcht0 [rdi+rcx]
lea rbx, [rdi+rdx*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx*2]
prefetcht0 [rbx+rcx]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 16
.var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
prefetcht0 [rsi+rax*8]
prefetcht0 [rdi+rdx*8]
movdqa xmm3, xmm1
movdqa xmm4, xmm2
punpcklbw xmm1, xmm0
punpckhbw xmm3, xmm0
punpcklbw xmm2, xmm0
punpckhbw xmm4, xmm0
psubw xmm1, xmm2
psubw xmm3, xmm4
paddw xmm7, xmm1
pmaddwd xmm1, xmm1
paddw xmm7, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm1
paddd xmm6, xmm3
add rsi, rax
add rdi, rdx
sub rcx, 1
jnz .var16loop
movdqa xmm1, xmm6
pxor xmm6, xmm6
pxor xmm5, xmm5
punpcklwd xmm6, xmm7
punpckhwd xmm5, xmm7
psrad xmm5, 16
psrad xmm6, 16
paddd xmm6, xmm5
movdqa xmm2, xmm1
punpckldq xmm1, xmm0
punpckhdq xmm2, xmm0
movdqa xmm7, xmm6
paddd xmm1, xmm2
punpckldq xmm6, xmm0
punpckhdq xmm7, xmm0
paddd xmm6, xmm7
movdqa xmm2, xmm1
movdqa xmm7, xmm6
psrldq xmm1, 8
psrldq xmm6, 8
paddd xmm7, xmm6
paddd xmm1, xmm2
mov rax, arg(5) ;[Sum]
mov rdi, arg(4) ;[SSE]
movd DWORD PTR [rax], xmm7
movd DWORD PTR [rdi], xmm1
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_get8x8var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
global sym(vp8_get8x8var_sse2) PRIVATE
sym(vp8_get8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
movq xmm1, QWORD PTR [rsi]
movq xmm2, QWORD PTR [rdi]
punpcklbw xmm1, xmm0
punpcklbw xmm2, xmm0
psubsw xmm1, xmm2
paddw xmm7, xmm1
pmaddwd xmm1, xmm1
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
movq xmm2, QWORD PTR[rsi + rax * 2]
movq xmm3, QWORD PTR[rdi + rdx * 2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
lea rsi, [rsi + rax * 2]
lea rdi, [rdi + rdx * 2]
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
movq xmm2, QWORD PTR[rsi + rax *2]
movq xmm3, QWORD PTR[rdi + rdx *2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
lea rsi, [rsi + rax * 2]
lea rdi, [rdi + rdx * 2]
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
movq xmm2, QWORD PTR[rsi + rax *2]
movq xmm3, QWORD PTR[rdi + rdx *2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
lea rsi, [rsi + rax * 2]
lea rdi, [rdi + rdx * 2]
movq xmm2, QWORD PTR[rsi + rax]
movq xmm3, QWORD PTR[rdi + rdx]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
psubsw xmm2, xmm3
paddw xmm7, xmm2
pmaddwd xmm2, xmm2
paddd xmm1, xmm2
movdqa xmm6, xmm7
punpcklwd xmm6, xmm0
punpckhwd xmm7, xmm0
movdqa xmm2, xmm1
paddw xmm6, xmm7
punpckldq xmm1, xmm0
punpckhdq xmm2, xmm0
movdqa xmm7, xmm6
paddd xmm1, xmm2
punpckldq xmm6, xmm0
punpckhdq xmm7, xmm0
paddw xmm6, xmm7
movdqa xmm2, xmm1
movdqa xmm7, xmm6
psrldq xmm1, 8
psrldq xmm6, 8
paddw xmm7, xmm6
paddd xmm1, xmm2
mov rax, arg(5) ;[Sum]
mov rdi, arg(4) ;[SSE]
movq rdx, xmm7
movsx rcx, dx
mov dword ptr [rax], ecx
movd DWORD PTR [rdi], xmm1
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp8_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,

View File

@@ -35,25 +35,6 @@ extern void filter_block1d_v6_mmx
short *filter
);
extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
extern unsigned int vp8_get8x8var_mmx
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern unsigned int vp8_get4x4var_mmx
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
const unsigned char *ref_ptr,
@@ -78,127 +59,6 @@ extern void vp8_filter_block2d_bil_var_mmx
unsigned int *sumsquared
);
unsigned int vp8_variance4x4_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
}
unsigned int vp8_variance8x8_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vp8_mse16x16_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
*sse = var;
return var;
}
unsigned int vp8_variance16x16_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 8));
}
unsigned int vp8_variance16x8_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_variance8x16_mmx(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_sub_pixel_variance4x4_mmx
(
const unsigned char *src_ptr,

View File

@@ -31,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
unsigned int *sumsquared
);
extern unsigned int vp8_get4x4var_mmx
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
unsigned int vp8_get_mb_ss_sse2
(
const short *src_ptr
);
unsigned int vp8_get16x16var_sse2
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
unsigned int vp8_get8x8var_sse2
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
void vp8_filter_block2d_bil_var_sse2
(
const unsigned char *ref_ptr,
@@ -136,115 +104,6 @@ void vp8_half_vert_variance16x_h_sse2
unsigned int *sumsquared
);
unsigned int vp8_variance4x4_wmt(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
}
unsigned int vp8_variance8x8_wmt
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vp8_variance16x16_wmt
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0;
int sum0;
vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
*sse = sse0;
return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
}
unsigned int vp8_mse16x16_wmt(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0;
int sum0;
vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
*sse = sse0;
return sse0;
}
unsigned int vp8_variance16x8_wmt
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_variance8x16_wmt
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,

View File

@@ -13,15 +13,6 @@
#include "vp8/common/variance.h"
#include "vpx_ports/mem.h"
extern unsigned int vp8_get16x16var_sse2
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *SSE,
int *Sum
);
extern void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,

View File

@@ -1,138 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_mse16x16_armv6|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;
;note: Based on vp8_variance16x16_armv6. In this function, sum is never used.
; So, we can remove this part of calculation.
|vp8_mse16x16_armv6| PROC
push {r4-r9, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #16 ; set loop counter to 16 (=block height)
mov r4, #0 ; initialize sse = 0
loop
; 1st 4 pixels
ldr r5, [r0, #0x0] ; load 4 src pixels
ldr r6, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r5, r6 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x4] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r2, #0x4] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x8] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r6, [r2, #0x8] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0xc] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r6, [r2, #0xc] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
subs r12, r12, #1 ; next row
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r1, [sp, #28] ; get address of sse
mov r0, r4 ; return sse
str r4, [r1] ; store sse
pop {r4-r9, pc}
ENDP
END
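As a plain-C companion to the ARMv6 routine above (an illustrative sketch, not code from this change): each loop iteration handles four packed pixels, with USUB8/SEL producing the byte differences of either sign and UXTB16/SMLAD squaring and accumulating them.

#include <stdint.h>

/* Scalar equivalent of one 4-pixel step of the armv6 loop above. */
static uint32_t sse_of_4_pixels(const uint8_t *src, const uint8_t *ref,
                                uint32_t sse_acc) {
  int i;
  for (i = 0; i < 4; ++i) {
    const int diff = src[i] - ref[i];    /* usub8/sel pair, both signs */
    sse_acc += (uint32_t)(diff * diff);  /* uxtb16 + smlad accumulation */
  }
  return sse_acc;
}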


@ -1,131 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
unsigned int vp8_mse16x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
int64x1_t d0s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
q7s32 = vdupq_n_s32(0);
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q10s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
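A scalar reference for the NEON routine above (an illustrative sketch only): widen each 8-bit difference to 16 bits, square into 32-bit accumulators, and reduce; the MSE functions return the raw SSE with no mean correction.

#include <stdint.h>

static unsigned int mse16x16_scalar(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  unsigned int total = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int diff = src[c] - ref[c];
      total += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = total;
  return total;  /* mse returns the SSE directly */
}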
unsigned int vp8_get4x4sse_cs_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride) {
int16x4_t d22s16, d24s16, d26s16, d28s16;
int64x1_t d0s64;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d1u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d5u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d3u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d7u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d1u8, d5u8);
q13u16 = vsubl_u8(d2u8, d6u8);
q14u16 = vsubl_u8(d3u8, d7u8);
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
q7s32 = vmull_s16(d22s16, d22s16);
q8s32 = vmull_s16(d24s16, d24s16);
q9s32 = vmull_s16(d26s16, d26s16);
q10s32 = vmull_s16(d28s16, d28s16);
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q9s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q9s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
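Both NEON functions above end with the same horizontal reduction; a small standalone sketch of that pattern (assumed helper name):

#include <arm_neon.h>
#include <stdint.h>

/* Fold a 4-lane 32-bit accumulator into one 32-bit total: pairwise-widen to
   two 64-bit lanes, add them, and take the low word. */
static uint32_t horizontal_add_u32x4(int32x4_t v) {
  const int64x2_t pairs = vpaddlq_s32(v);
  const int64x1_t total = vadd_s64(vget_low_s64(pairs), vget_high_s64(pairs));
  return vget_lane_u32(vreinterpret_u32_s64(total), 0);
}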


@ -11,6 +11,7 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "encodemb.h"
#include "encodemv.h"
#include "vp8/common/common.h"
@ -90,7 +91,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
* lambda using a non-linear combination (e.g., the smallest, or second
* smallest, etc.).
*/
act = vp8_variance16x16(x->src.y_buffer,
act = vpx_variance16x16(x->src.y_buffer,
x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
act = act<<4;
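The call above measures macroblock activity as the variance of the source against VP8_VAR_OFFS, a constant zero block read with a ref stride of 0; a hedged sketch of that usage (the local zero buffer stands in for VP8_VAR_OFFS):

#include "./vpx_dsp_rtcd.h"  /* declares vpx_variance16x16 */

static unsigned int measure_activity_16x16(const unsigned char *src,
                                           int src_stride) {
  static const unsigned char zeros[16] = { 0 };
  unsigned int sse;
  /* ref stride 0 re-reads the same zero row for all 16 rows */
  return vpx_variance16x16(src, src_stride, zeros, 0, &sse);
}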


@ -11,6 +11,7 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "quantize.h"
#include "vp8/common/reconintra4x4.h"
#include "encodemb.h"
@ -44,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
}
}
intra_pred_var = vp8_get_mb_ss(x->src_diff);
intra_pred_var = vpx_get_mb_ss(x->src_diff);
return intra_pred_var;
}
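For reference, a scalar sketch (an assumption mirroring the C fallback, not code from this change) of what vpx_get_mb_ss() returns here: the sum of squares of the 16x16 macroblock's residual samples.

#include <stdint.h>

static unsigned int mb_sum_of_squares(const int16_t *src_diff) {
  unsigned int sum = 0;
  int i;
  for (i = 0; i < 256; ++i)
    sum += (unsigned int)(src_diff[i] * src_diff[i]);
  return sum;
}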


@ -12,6 +12,7 @@
#include <limits.h>
#include <stdio.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "block.h"
#include "onyx_int.h"
@ -422,14 +423,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
/* Set up pointers for this macro block raw buffer */
raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
+ d->offset);
vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
(unsigned int *)(raw_motion_err));
vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride,
(unsigned int *)(raw_motion_err));
/* Set up pointers for this macro block recon buffer */
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
(unsigned int *)(best_motion_err));
vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
(unsigned int *)(best_motion_err));
}
static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
@ -453,7 +454,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
int new_mv_mode_penalty = 256;
/* override the default variance function to use MSE */
v_fn_ptr.vf = vp8_mse16x16;
v_fn_ptr.vf = vpx_mse16x16;
/* Set up pointers for this macro block recon buffer */
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
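The override above works because the MSE and variance routines share one function-pointer signature, so the first pass can swap in a metric that also charges for DC (mean) mismatch; a hedged sketch (the typedef shape is assumed from vp8/common/variance.h):

#include "./vpx_dsp_rtcd.h"  /* declares vpx_mse16x16 */

typedef unsigned int (*variance_fn_t)(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride, unsigned int *sse);

static void use_mse_for_first_pass(variance_fn_t *vf) {
  *vf = vpx_mse16x16;  /* unlike variance, MSE keeps the DC error */
}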


@ -2131,7 +2131,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
#endif
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
@ -2141,7 +2141,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
@ -2151,7 +2151,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
@ -2161,7 +2161,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
@ -2171,7 +2171,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
@ -2558,7 +2558,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
{
unsigned int sse;
vp8_mse16x16(orig + col, orig_stride,
vpx_mse16x16(orig + col, orig_stride,
recon + col, recon_stride,
&sse);
total_sse += sse;
@ -3384,7 +3384,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
int index = block_index_row + (j >> 4);
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
unsigned int sse;
Total += vp8_mse16x16(src + j,
Total += vpx_mse16x16(src + j,
source->y_stride,
dst + j, dest->y_stride,
&sse);
@ -3448,7 +3448,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
int index = block_index_row + (j >> 4);
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
unsigned int sse;
const unsigned int var = vp8_variance16x16(src + j,
const unsigned int var = vpx_variance16x16(src + j,
ystride,
dst + j,
ystride,
@ -3458,7 +3458,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
// is small (to avoid effects from lighting change).
if ((sse - var) < 128) {
unsigned int sse2;
const unsigned int act = vp8_variance16x16(src + j,
const unsigned int act = vpx_variance16x16(src + j,
ystride,
const_source,
0,
@ -5993,7 +5993,8 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest)
for (j = 0; j < source->y_width; j += 16)
{
unsigned int sse;
Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
Total += vpx_mse16x16(src + j, source->y_stride,
dst + j, dest->y_stride, &sse);
}
src += 16 * source->y_stride;


@ -11,6 +11,7 @@
#include <limits.h>
#include "vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "onyx_int.h"
#include "modecosts.h"
#include "encodeintra.h"
@ -215,33 +216,6 @@ int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
}
unsigned int vp8_get4x4sse_cs_c
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride
)
{
int distortion = 0;
int r, c;
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
int diff = src_ptr[c] - ref_ptr[c];
distortion += diff * diff;
}
src_ptr += source_stride;
ref_ptr += recon_stride;
}
return distortion;
}
static int get_prediction_error(BLOCK *be, BLOCKD *b)
{
unsigned char *sptr;
@ -249,7 +223,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b)
sptr = (*(be->base_src) + be->src);
dptr = b->predictor;
return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
}
@ -1037,7 +1011,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
else
{
rate2 += rate;
distortion2 = vp8_variance16x16(
distortion2 = vpx_variance16x16(
*(b->base_src), b->src_stride,
x->e_mbd.predictor, 16, &sse);
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
@ -1066,7 +1040,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
xd->dst.y_stride,
xd->predictor,
16);
distortion2 = vp8_variance16x16
distortion2 = vpx_variance16x16
(*(b->base_src), b->src_stride,
x->e_mbd.predictor, 16, &sse);
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
@ -1547,7 +1521,7 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
xd->dst.y_stride,
xd->predictor,
16);
distortion = vp8_variance16x16
distortion = vpx_variance16x16
(*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
rate = x->mbmode_cost[xd->frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);


@ -9,6 +9,7 @@
*/
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp8/common/onyxc_int.h"
#include "onyx_int.h"
@ -83,7 +84,7 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
for (j = 0; j < source->y_width; j += 16)
{
unsigned int sse;
Total += vp8_mse16x16(src + j, source->y_stride,
Total += vpx_mse16x16(src + j, source->y_stride,
dst + j, dest->y_stride,
&sse);
}


@ -15,6 +15,7 @@
#include <assert.h>
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "tokenize.h"
#include "treewriter.h"
#include "onyx_int.h"
@ -507,9 +508,9 @@ int VP8_UVSSE(MACROBLOCK *x)
}
else
{
vp8_variance8x8(uptr, pre_stride,
vpx_variance8x8(uptr, pre_stride,
upred_ptr, uv_stride, &sse2);
vp8_variance8x8(vptr, pre_stride,
vpx_variance8x8(vptr, pre_stride,
vpred_ptr, uv_stride, &sse1);
sse2 += sse1;
}
@ -1783,7 +1784,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4],
if(threshold < x->encode_breakout)
threshold = x->encode_breakout;
var = vp8_variance16x16
var = vpx_variance16x16
(*(b->base_src), b->src_stride,
x->e_mbd.predictor, 16, &sse);


@ -145,8 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
@ -168,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))


@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
#File list for media
# encoder
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
@ -27,5 +26,4 @@ VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c


@ -171,13 +171,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
if (bs == BLOCK_16X16) {
vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
} else if (bs == BLOCK_32X32) {
vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
} else /* if (bs == BLOCK_64X64) */ {
vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
}


@ -797,51 +797,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
# variance
add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance4x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance4x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
@ -922,21 +877,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon/;
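For context on what the removed add_proto/specialize lines generate (a hedged, hand-written C-level sketch; the real glue is emitted by the rtcd scripts, and these entries now live in vpx_dsp): each prototype becomes a symbol that runtime CPU detection can point at the best available specialization.

#include <stdint.h>

/* Hypothetical equivalent of the generated dispatch for one prototype. */
unsigned int vpx_variance16x16_c(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 unsigned int *sse);
unsigned int vpx_variance16x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse);

static unsigned int (*variance16x16_ptr)(const uint8_t *, int,
                                         const uint8_t *, int,
                                         unsigned int *) = vpx_variance16x16_c;

static void setup_variance_rtcd(int have_sse2) {
  if (have_sse2) variance16x16_ptr = vpx_variance16x16_sse2;
}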
@ -1141,142 +1081,6 @@ specialize qw/vp9_temporal_filter_apply sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# variance
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x4/;
add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance4x8/;
add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance4x4/;
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x4/;
add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance4x8/;
add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance4x4/;
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x4/;
add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance4x8/;
add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance4x4/;
add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
@ -1511,41 +1315,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x16/;
add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x8/;
add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x16/;
add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse16x8/;
add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x16/;
add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse16x8/;
add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
# ENCODEMB INVOKE


@ -10,6 +10,7 @@
#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx_ports/mem.h"
@ -20,82 +21,6 @@
#include "vp9/encoder/vp9_variance.h"
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
const int32x4_t a = vpaddlq_s16(v_16x8);
const int64x2_t b = vpaddlq_s32(a);
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
vreinterpret_s32_s64(vget_high_s64(b)));
return vget_lane_s32(c, 0);
}
static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
const int64x2_t b = vpaddlq_s32(v_32x4);
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
vreinterpret_s32_s64(vget_high_s64(b)));
return vget_lane_s32(c, 0);
}
// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, uint32_t *sse, int *sum) {
int i, j;
int16x8_t v_sum = vdupq_n_s16(0);
int32x4_t v_sse_lo = vdupq_n_s32(0);
int32x4_t v_sse_hi = vdupq_n_s32(0);
for (i = 0; i < h; ++i) {
for (j = 0; j < w; j += 8) {
const uint8x8_t v_a = vld1_u8(&a[j]);
const uint8x8_t v_b = vld1_u8(&b[j]);
const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
v_sum = vaddq_s16(v_sum, sv_diff);
v_sse_lo = vmlal_s16(v_sse_lo,
vget_low_s16(sv_diff),
vget_low_s16(sv_diff));
v_sse_hi = vmlal_s16(v_sse_hi,
vget_high_s16(sv_diff),
vget_high_s16(sv_diff));
}
a += a_stride;
b += b_stride;
}
*sum = horizontal_add_s16x8(v_sum);
*sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}
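A quick back-of-the-envelope check of the overflow note above (an added illustration, not part of the change): each of the eight int16 lanes of v_sum accumulates at most (w * h / 8) differences of magnitude 255, so up to 1024 pixels per call stay within int16 range, which is why the larger blocks below are processed as 32x32 or 64x16 slices.

#include <assert.h>
#include <stdint.h>

static void check_v_sum_lane_bound(int w, int h) {
  /* worst case: every difference in one lane is +255 (or -255) */
  const int32_t worst_lane_sum = (w * h / 8) * 255;
  assert(worst_lane_sum <= INT16_MAX);  /* holds for w * h <= 1024 */
}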
void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,
8, sse, sum);
}
unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8
}
void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,
16, sse, sum);
}
unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16
}
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
uint8_t *output_ptr,
unsigned int src_pixels_per_line,
@ -162,7 +87,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
8, BILINEAR_FILTERS_2TAP(yoffset));
return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
@ -180,77 +105,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
16, BILINEAR_FILTERS_2TAP(yoffset));
return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,
32, sse, sum);
}
unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32
}
unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
variance_neon_w8(a + (32 * a_stride), a_stride,
b + (32 * b_stride), b_stride, 32, 32,
&sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
}
unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
variance_neon_w8(a + (16 * a_stride), a_stride,
b + (16 * b_stride), b_stride, 64, 16,
&sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
}
unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
variance_neon_w8(a + (16 * a_stride), a_stride,
b + (16 * b_stride), b_stride, 64, 16,
&sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
b + (16 * 2 * b_stride), b_stride,
64, 16, &sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
b + (16 * 3 * b_stride), b_stride,
64, 16, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
@ -268,7 +123,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
32, BILINEAR_FILTERS_2TAP(yoffset));
return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
@ -286,5 +141,5 @@ unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
BILINEAR_FILTERS_2TAP(xoffset));
var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
64, BILINEAR_FILTERS_2TAP(yoffset));
return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}


@ -98,9 +98,9 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
int avg;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride,
CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
&sse, &avg);
highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
&sse, &avg);
sse >>= 2 * (xd->bd - 8);
avg >>= (xd->bd - 8);
} else {


@ -13,6 +13,7 @@
#include <stdio.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx_ports/mem.h"
@ -3672,15 +3673,15 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
if (cm->use_highbitdepth) {
switch (cm->bit_depth) {
case VPX_BITS_8:
vp9_highbd_get16x16var(src, src_stride, last_src, last_stride,
vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
break;
case VPX_BITS_10:
vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
break;
case VPX_BITS_12:
vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
break;
default:
@ -3689,11 +3690,11 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
return -1;
}
} else {
vp9_get16x16var(src, src_stride, last_src, last_stride,
vpx_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
}
#else
vp9_get16x16var(src, src_stride, last_src, last_stride,
vpx_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
#endif // CONFIG_VP9_HIGHBITDEPTH
var16->var = var16->sse -


@ -998,7 +998,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X16,
vpx_highbd_sad32x16_bits8,
vpx_highbd_sad32x16_avg_bits8,
vp9_highbd_variance32x16,
vpx_highbd_8_variance32x16,
vp9_highbd_sub_pixel_variance32x16,
vp9_highbd_sub_pixel_avg_variance32x16,
NULL,
@ -1008,7 +1008,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X32,
vpx_highbd_sad16x32_bits8,
vpx_highbd_sad16x32_avg_bits8,
vp9_highbd_variance16x32,
vpx_highbd_8_variance16x32,
vp9_highbd_sub_pixel_variance16x32,
vp9_highbd_sub_pixel_avg_variance16x32,
NULL,
@ -1018,7 +1018,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X32,
vpx_highbd_sad64x32_bits8,
vpx_highbd_sad64x32_avg_bits8,
vp9_highbd_variance64x32,
vpx_highbd_8_variance64x32,
vp9_highbd_sub_pixel_variance64x32,
vp9_highbd_sub_pixel_avg_variance64x32,
NULL,
@ -1028,7 +1028,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X64,
vpx_highbd_sad32x64_bits8,
vpx_highbd_sad32x64_avg_bits8,
vp9_highbd_variance32x64,
vpx_highbd_8_variance32x64,
vp9_highbd_sub_pixel_variance32x64,
vp9_highbd_sub_pixel_avg_variance32x64,
NULL,
@ -1038,7 +1038,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X32,
vpx_highbd_sad32x32_bits8,
vpx_highbd_sad32x32_avg_bits8,
vp9_highbd_variance32x32,
vpx_highbd_8_variance32x32,
vp9_highbd_sub_pixel_variance32x32,
vp9_highbd_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits8,
@ -1048,7 +1048,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X64,
vpx_highbd_sad64x64_bits8,
vpx_highbd_sad64x64_avg_bits8,
vp9_highbd_variance64x64,
vpx_highbd_8_variance64x64,
vp9_highbd_sub_pixel_variance64x64,
vp9_highbd_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits8,
@ -1058,7 +1058,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X16,
vpx_highbd_sad16x16_bits8,
vpx_highbd_sad16x16_avg_bits8,
vp9_highbd_variance16x16,
vpx_highbd_8_variance16x16,
vp9_highbd_sub_pixel_variance16x16,
vp9_highbd_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits8,
@ -1068,7 +1068,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X8,
vpx_highbd_sad16x8_bits8,
vpx_highbd_sad16x8_avg_bits8,
vp9_highbd_variance16x8,
vpx_highbd_8_variance16x8,
vp9_highbd_sub_pixel_variance16x8,
vp9_highbd_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits8,
@ -1078,7 +1078,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X16,
vpx_highbd_sad8x16_bits8,
vpx_highbd_sad8x16_avg_bits8,
vp9_highbd_variance8x16,
vpx_highbd_8_variance8x16,
vp9_highbd_sub_pixel_variance8x16,
vp9_highbd_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits8,
@ -1088,7 +1088,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X8,
vpx_highbd_sad8x8_bits8,
vpx_highbd_sad8x8_avg_bits8,
vp9_highbd_variance8x8,
vpx_highbd_8_variance8x8,
vp9_highbd_sub_pixel_variance8x8,
vp9_highbd_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits8,
@ -1098,7 +1098,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X4,
vpx_highbd_sad8x4_bits8,
vpx_highbd_sad8x4_avg_bits8,
vp9_highbd_variance8x4,
vpx_highbd_8_variance8x4,
vp9_highbd_sub_pixel_variance8x4,
vp9_highbd_sub_pixel_avg_variance8x4,
NULL,
@ -1108,7 +1108,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X8,
vpx_highbd_sad4x8_bits8,
vpx_highbd_sad4x8_avg_bits8,
vp9_highbd_variance4x8,
vpx_highbd_8_variance4x8,
vp9_highbd_sub_pixel_variance4x8,
vp9_highbd_sub_pixel_avg_variance4x8,
NULL,
@ -1118,7 +1118,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X4,
vpx_highbd_sad4x4_bits8,
vpx_highbd_sad4x4_avg_bits8,
vp9_highbd_variance4x4,
vpx_highbd_8_variance4x4,
vp9_highbd_sub_pixel_variance4x4,
vp9_highbd_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits8,
@ -1130,7 +1130,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X16,
vpx_highbd_sad32x16_bits10,
vpx_highbd_sad32x16_avg_bits10,
vp9_highbd_10_variance32x16,
vpx_highbd_10_variance32x16,
vp9_highbd_10_sub_pixel_variance32x16,
vp9_highbd_10_sub_pixel_avg_variance32x16,
NULL,
@ -1140,7 +1140,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X32,
vpx_highbd_sad16x32_bits10,
vpx_highbd_sad16x32_avg_bits10,
vp9_highbd_10_variance16x32,
vpx_highbd_10_variance16x32,
vp9_highbd_10_sub_pixel_variance16x32,
vp9_highbd_10_sub_pixel_avg_variance16x32,
NULL,
@ -1150,7 +1150,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X32,
vpx_highbd_sad64x32_bits10,
vpx_highbd_sad64x32_avg_bits10,
vp9_highbd_10_variance64x32,
vpx_highbd_10_variance64x32,
vp9_highbd_10_sub_pixel_variance64x32,
vp9_highbd_10_sub_pixel_avg_variance64x32,
NULL,
@ -1160,7 +1160,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X64,
vpx_highbd_sad32x64_bits10,
vpx_highbd_sad32x64_avg_bits10,
vp9_highbd_10_variance32x64,
vpx_highbd_10_variance32x64,
vp9_highbd_10_sub_pixel_variance32x64,
vp9_highbd_10_sub_pixel_avg_variance32x64,
NULL,
@ -1170,7 +1170,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X32,
vpx_highbd_sad32x32_bits10,
vpx_highbd_sad32x32_avg_bits10,
vp9_highbd_10_variance32x32,
vpx_highbd_10_variance32x32,
vp9_highbd_10_sub_pixel_variance32x32,
vp9_highbd_10_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits10,
@ -1180,7 +1180,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X64,
vpx_highbd_sad64x64_bits10,
vpx_highbd_sad64x64_avg_bits10,
vp9_highbd_10_variance64x64,
vpx_highbd_10_variance64x64,
vp9_highbd_10_sub_pixel_variance64x64,
vp9_highbd_10_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits10,
@ -1190,7 +1190,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X16,
vpx_highbd_sad16x16_bits10,
vpx_highbd_sad16x16_avg_bits10,
vp9_highbd_10_variance16x16,
vpx_highbd_10_variance16x16,
vp9_highbd_10_sub_pixel_variance16x16,
vp9_highbd_10_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits10,
@ -1200,7 +1200,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X8,
vpx_highbd_sad16x8_bits10,
vpx_highbd_sad16x8_avg_bits10,
vp9_highbd_10_variance16x8,
vpx_highbd_10_variance16x8,
vp9_highbd_10_sub_pixel_variance16x8,
vp9_highbd_10_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits10,
@ -1210,7 +1210,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X16,
vpx_highbd_sad8x16_bits10,
vpx_highbd_sad8x16_avg_bits10,
vp9_highbd_10_variance8x16,
vpx_highbd_10_variance8x16,
vp9_highbd_10_sub_pixel_variance8x16,
vp9_highbd_10_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits10,
@ -1220,7 +1220,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X8,
vpx_highbd_sad8x8_bits10,
vpx_highbd_sad8x8_avg_bits10,
vp9_highbd_10_variance8x8,
vpx_highbd_10_variance8x8,
vp9_highbd_10_sub_pixel_variance8x8,
vp9_highbd_10_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits10,
@ -1230,7 +1230,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X4,
vpx_highbd_sad8x4_bits10,
vpx_highbd_sad8x4_avg_bits10,
vp9_highbd_10_variance8x4,
vpx_highbd_10_variance8x4,
vp9_highbd_10_sub_pixel_variance8x4,
vp9_highbd_10_sub_pixel_avg_variance8x4,
NULL,
@ -1240,7 +1240,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X8,
vpx_highbd_sad4x8_bits10,
vpx_highbd_sad4x8_avg_bits10,
vp9_highbd_10_variance4x8,
vpx_highbd_10_variance4x8,
vp9_highbd_10_sub_pixel_variance4x8,
vp9_highbd_10_sub_pixel_avg_variance4x8,
NULL,
@ -1250,7 +1250,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X4,
vpx_highbd_sad4x4_bits10,
vpx_highbd_sad4x4_avg_bits10,
vp9_highbd_10_variance4x4,
vpx_highbd_10_variance4x4,
vp9_highbd_10_sub_pixel_variance4x4,
vp9_highbd_10_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits10,
@ -1262,7 +1262,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X16,
vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12,
vp9_highbd_12_variance32x16,
vpx_highbd_12_variance32x16,
vp9_highbd_12_sub_pixel_variance32x16,
vp9_highbd_12_sub_pixel_avg_variance32x16,
NULL,
@ -1272,7 +1272,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X32,
vpx_highbd_sad16x32_bits12,
vpx_highbd_sad16x32_avg_bits12,
vp9_highbd_12_variance16x32,
vpx_highbd_12_variance16x32,
vp9_highbd_12_sub_pixel_variance16x32,
vp9_highbd_12_sub_pixel_avg_variance16x32,
NULL,
@ -1282,7 +1282,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X32,
vpx_highbd_sad64x32_bits12,
vpx_highbd_sad64x32_avg_bits12,
vp9_highbd_12_variance64x32,
vpx_highbd_12_variance64x32,
vp9_highbd_12_sub_pixel_variance64x32,
vp9_highbd_12_sub_pixel_avg_variance64x32,
NULL,
@ -1292,7 +1292,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X64,
vpx_highbd_sad32x64_bits12,
vpx_highbd_sad32x64_avg_bits12,
vp9_highbd_12_variance32x64,
vpx_highbd_12_variance32x64,
vp9_highbd_12_sub_pixel_variance32x64,
vp9_highbd_12_sub_pixel_avg_variance32x64,
NULL,
@ -1302,7 +1302,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_32X32,
vpx_highbd_sad32x32_bits12,
vpx_highbd_sad32x32_avg_bits12,
vp9_highbd_12_variance32x32,
vpx_highbd_12_variance32x32,
vp9_highbd_12_sub_pixel_variance32x32,
vp9_highbd_12_sub_pixel_avg_variance32x32,
vpx_highbd_sad32x32x3_bits12,
@ -1312,7 +1312,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_64X64,
vpx_highbd_sad64x64_bits12,
vpx_highbd_sad64x64_avg_bits12,
vp9_highbd_12_variance64x64,
vpx_highbd_12_variance64x64,
vp9_highbd_12_sub_pixel_variance64x64,
vp9_highbd_12_sub_pixel_avg_variance64x64,
vpx_highbd_sad64x64x3_bits12,
@ -1322,7 +1322,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X16,
vpx_highbd_sad16x16_bits12,
vpx_highbd_sad16x16_avg_bits12,
vp9_highbd_12_variance16x16,
vpx_highbd_12_variance16x16,
vp9_highbd_12_sub_pixel_variance16x16,
vp9_highbd_12_sub_pixel_avg_variance16x16,
vpx_highbd_sad16x16x3_bits12,
@ -1332,7 +1332,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_16X8,
vpx_highbd_sad16x8_bits12,
vpx_highbd_sad16x8_avg_bits12,
vp9_highbd_12_variance16x8,
vpx_highbd_12_variance16x8,
vp9_highbd_12_sub_pixel_variance16x8,
vp9_highbd_12_sub_pixel_avg_variance16x8,
vpx_highbd_sad16x8x3_bits12,
@ -1342,7 +1342,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X16,
vpx_highbd_sad8x16_bits12,
vpx_highbd_sad8x16_avg_bits12,
vp9_highbd_12_variance8x16,
vpx_highbd_12_variance8x16,
vp9_highbd_12_sub_pixel_variance8x16,
vp9_highbd_12_sub_pixel_avg_variance8x16,
vpx_highbd_sad8x16x3_bits12,
@ -1352,7 +1352,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X8,
vpx_highbd_sad8x8_bits12,
vpx_highbd_sad8x8_avg_bits12,
vp9_highbd_12_variance8x8,
vpx_highbd_12_variance8x8,
vp9_highbd_12_sub_pixel_variance8x8,
vp9_highbd_12_sub_pixel_avg_variance8x8,
vpx_highbd_sad8x8x3_bits12,
@ -1362,7 +1362,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_8X4,
vpx_highbd_sad8x4_bits12,
vpx_highbd_sad8x4_avg_bits12,
vp9_highbd_12_variance8x4,
vpx_highbd_12_variance8x4,
vp9_highbd_12_sub_pixel_variance8x4,
vp9_highbd_12_sub_pixel_avg_variance8x4,
NULL,
@ -1372,7 +1372,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X8,
vpx_highbd_sad4x8_bits12,
vpx_highbd_sad4x8_avg_bits12,
vp9_highbd_12_variance4x8,
vpx_highbd_12_variance4x8,
vp9_highbd_12_sub_pixel_variance4x8,
vp9_highbd_12_sub_pixel_avg_variance4x8,
NULL,
@ -1382,7 +1382,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
HIGHBD_BFP(BLOCK_4X4,
vpx_highbd_sad4x4_bits12,
vpx_highbd_sad4x4_avg_bits12,
vp9_highbd_12_variance4x4,
vpx_highbd_12_variance4x4,
vp9_highbd_12_sub_pixel_variance4x4,
vp9_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x3_bits12,
@ -1805,61 +1805,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
vp9_variance32x16, vp9_sub_pixel_variance32x16,
vpx_variance32x16, vp9_sub_pixel_variance32x16,
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
vp9_variance16x32, vp9_sub_pixel_variance16x32,
vpx_variance16x32, vp9_sub_pixel_variance16x32,
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
vp9_variance64x32, vp9_sub_pixel_variance64x32,
vpx_variance64x32, vp9_sub_pixel_variance64x32,
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
vp9_variance32x64, vp9_sub_pixel_variance32x64,
vpx_variance32x64, vp9_sub_pixel_variance32x64,
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
vp9_variance32x32, vp9_sub_pixel_variance32x32,
vpx_variance32x32, vp9_sub_pixel_variance32x32,
vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
vpx_sad32x32x4d)
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
vp9_variance64x64, vp9_sub_pixel_variance64x64,
vpx_variance64x64, vp9_sub_pixel_variance64x64,
vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
vpx_sad64x64x4d)
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
vp9_variance16x16, vp9_sub_pixel_variance16x16,
vpx_variance16x16, vp9_sub_pixel_variance16x16,
vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
vpx_sad16x16x4d)
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
vp9_variance16x8, vp9_sub_pixel_variance16x8,
vpx_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8,
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
vp9_variance8x16, vp9_sub_pixel_variance8x16,
vpx_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16,
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
vp9_variance8x8, vp9_sub_pixel_variance8x8,
vpx_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8,
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
vp9_variance8x4, vp9_sub_pixel_variance8x4,
vpx_variance8x4, vp9_sub_pixel_variance8x4,
vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
vp9_variance4x8, vp9_sub_pixel_variance4x8,
vpx_variance4x8, vp9_sub_pixel_variance4x8,
vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
vp9_variance4x4, vp9_sub_pixel_variance4x4,
vpx_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4,
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
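For context, the BFP macro itself sits above this hunk; judging from the visible cpi->fn_ptr[BT].sdx4df = SDX4DF; line and the vp9_variance_fn_ptr_t declaration later in this patch, it presumably just stores each argument into the per-block-size function table. A hedged sketch (the field names other than sdx4df are assumptions, not confirmed by this diff):
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
  cpi->fn_ptr[BT].sdf = SDF;       /* full-pel SAD                */ \
  cpi->fn_ptr[BT].sdaf = SDAF;     /* SAD against averaged pred   */ \
  cpi->fn_ptr[BT].vf = VF;         /* variance (now in vpx_dsp)   */ \
  cpi->fn_ptr[BT].svf = SVF;       /* sub-pel variance            */ \
  cpi->fn_ptr[BT].svaf = SVAF;     /* sub-pel avg variance        */ \
  cpi->fn_ptr[BT].sdx3f = SDX3F;   /* SAD at 3 offsets            */ \
  cpi->fn_ptr[BT].sdx8f = SDX8F;   /* SAD at 8 offsets            */ \
  cpi->fn_ptr[BT].sdx4df = SDX4DF; /* SAD against 4 references    */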
@ -2079,7 +2079,7 @@ static int64_t get_sse(const uint8_t *a, int a_stride,
const uint8_t *pa = a;
const uint8_t *pb = b;
for (x = 0; x < width / 16; ++x) {
vp9_mse16x16(pa, a_stride, pb, b_stride, &sse);
vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
pa += 16;
@ -2124,21 +2124,21 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
unsigned int sse = 0;
int sum = 0;
if (dw > 0) {
highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
dw, height, &sse, &sum);
highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
dw, height, &sse, &sum);
total_sse += sse;
}
if (dh > 0) {
highbd_variance(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride,
width - dw, dh, &sse, &sum);
highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride,
width - dw, dh, &sse, &sum);
total_sse += sse;
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
for (x = 0; x < width / 16; ++x) {
vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse);
vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
pa += 16;
pb += 16;


@ -12,6 +12,7 @@
#include <math.h>
#include <stdio.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_mem/vpx_mem.h"
@ -267,13 +268,13 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
switch (bsize) {
case BLOCK_8X8:
return vp9_mse8x8;
return vpx_mse8x8;
case BLOCK_16X8:
return vp9_mse16x8;
return vpx_mse16x8;
case BLOCK_8X16:
return vp9_mse8x16;
return vpx_mse8x16;
default:
return vp9_mse16x16;
return vpx_mse16x16;
}
}
@ -293,37 +294,37 @@ static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
default:
switch (bsize) {
case BLOCK_8X8:
return vp9_highbd_mse8x8;
return vpx_highbd_8_mse8x8;
case BLOCK_16X8:
return vp9_highbd_mse16x8;
return vpx_highbd_8_mse16x8;
case BLOCK_8X16:
return vp9_highbd_mse8x16;
return vpx_highbd_8_mse8x16;
default:
return vp9_highbd_mse16x16;
return vpx_highbd_8_mse16x16;
}
break;
case 10:
switch (bsize) {
case BLOCK_8X8:
return vp9_highbd_10_mse8x8;
return vpx_highbd_10_mse8x8;
case BLOCK_16X8:
return vp9_highbd_10_mse16x8;
return vpx_highbd_10_mse16x8;
case BLOCK_8X16:
return vp9_highbd_10_mse8x16;
return vpx_highbd_10_mse8x16;
default:
return vp9_highbd_10_mse16x16;
return vpx_highbd_10_mse16x16;
}
break;
case 12:
switch (bsize) {
case BLOCK_8X8:
return vp9_highbd_12_mse8x8;
return vpx_highbd_12_mse8x8;
case BLOCK_16X8:
return vp9_highbd_12_mse16x8;
return vpx_highbd_12_mse16x8;
case BLOCK_8X16:
return vp9_highbd_12_mse8x16;
return vpx_highbd_12_mse8x16;
default:
return vp9_highbd_12_mse16x16;
return vpx_highbd_12_mse16x16;
}
break;
}
@ -634,7 +635,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
xd->mi[0]->mbmi.tx_size = use_dc_pred ?
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
vp9_encode_intra_block_plane(x, bsize, 0);
this_error = vp9_get_mb_ss(x->plane[0].src_diff);
this_error = vpx_get_mb_ss(x->plane[0].src_diff);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
switch (cm->bit_depth) {


@ -13,6 +13,7 @@
#include <stdio.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@ -303,13 +304,13 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
if (second_pred != NULL) {
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
y_stride);
besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
sse1);
} else {
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
}
} else {
@ -321,7 +322,7 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
(void) xd;
if (second_pred != NULL) {
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
} else {
besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);


@ -14,6 +14,7 @@
#include <stdio.h>
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
@ -215,7 +216,7 @@ static void block_variance(const uint8_t *src, int src_stride,
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
vp9_get8x8var(src + src_stride * i + j, src_stride,
vpx_get8x8var(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride,
&sse8x8[k], &sum8x8[k]);
*sse += sse8x8[k];


@ -9,6 +9,7 @@
*/
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
@ -18,26 +19,6 @@
#include "vp9/encoder/vp9_variance.h"
void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to implement
// first-pass of 2-D separable filter.
@ -100,25 +81,6 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
}
}
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
unsigned int i, sum = 0;
for (i = 0; i < 256; ++i) {
sum += src_ptr[i] * src_ptr[i];
}
return sum;
}
#define VAR(W, H) \
unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
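The identity used here is the integer form of the variance, var = sse - sum^2 / (W * H). As a quick check, take a 4x4 block whose per-pixel difference is a constant +2: sum = 32, sse = 64, and 64 - (32 * 32) / 16 = 0; a pure DC offset between the two blocks contributes nothing to the variance, only to the sum term.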
#define SUBPIX_VAR(W, H) \
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
@ -133,7 +95,7 @@ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
}
#define SUBPIX_AVG_VAR(W, H) \
@ -152,178 +114,51 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
}
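The two filter passes in these macros implement a separable 2-tap bilinear interpolation. Assuming the usual vp9 convention that each BILINEAR_FILTERS_2TAP(offset) pair of taps sums to 128 (FILTER_BITS = 7), every filtered pixel is out = ROUND_POWER_OF_TWO(a * f0 + b * f1, 7), where a and b are the two neighbouring source pixels: offset 0 degenerates to a copy (taps 128, 0) and offset 4, the half-pel position, to a rounded average (taps 64, 64). The filtered W x H block is then passed to the plain vpx_variance##W##x##H##_c, optionally after being averaged with second_pred.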
void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
}
void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
}
unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum);
return *sse;
}
unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum);
return *sse;
}
unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum);
return *sse;
}
unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum);
return *sse;
}
VAR(4, 4)
SUBPIX_VAR(4, 4)
SUBPIX_AVG_VAR(4, 4)
VAR(4, 8)
SUBPIX_VAR(4, 8)
SUBPIX_AVG_VAR(4, 8)
VAR(8, 4)
SUBPIX_VAR(8, 4)
SUBPIX_AVG_VAR(8, 4)
VAR(8, 8)
SUBPIX_VAR(8, 8)
SUBPIX_AVG_VAR(8, 8)
VAR(8, 16)
SUBPIX_VAR(8, 16)
SUBPIX_AVG_VAR(8, 16)
VAR(16, 8)
SUBPIX_VAR(16, 8)
SUBPIX_AVG_VAR(16, 8)
VAR(16, 16)
SUBPIX_VAR(16, 16)
SUBPIX_AVG_VAR(16, 16)
VAR(16, 32)
SUBPIX_VAR(16, 32)
SUBPIX_AVG_VAR(16, 32)
VAR(32, 16)
SUBPIX_VAR(32, 16)
SUBPIX_AVG_VAR(32, 16)
VAR(32, 32)
SUBPIX_VAR(32, 32)
SUBPIX_AVG_VAR(32, 32)
VAR(32, 64)
SUBPIX_VAR(32, 64)
SUBPIX_AVG_VAR(32, 64)
VAR(64, 32)
SUBPIX_VAR(64, 32)
SUBPIX_AVG_VAR(64, 32)
VAR(64, 64)
SUBPIX_VAR(64, 64)
SUBPIX_AVG_VAR(64, 64)
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
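vp9_comp_avg_pred (being replaced by vpx_comp_avg_pred) averages the two predictors with round-to-nearest: ROUND_POWER_OF_TWO(x, 1) expands to (x + 1) >> 1, so for pred = 3 and ref = 4 the compound pixel is (3 + 4 + 1) >> 1 = 4; ties round up instead of truncating.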
#if CONFIG_VP9_HIGHBITDEPTH
void highbd_variance64(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, uint64_t *sse,
uint64_t *sum) {
int i, j;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
void highbd_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse,
int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)sse_long;
*sum = (int)sum_long;
}
void highbd_10_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse,
int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
}
void highbd_12_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse,
int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
}
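The per-bit-depth rounding keeps every variance comparable in the 8-bit domain: a depth-b pixel difference can be 2^(b-8) times larger than its 8-bit counterpart, so the sum scales by 2^(b-8) and the sum of squares by 2^(2*(b-8)). For 10-bit data that factor is 4, hence sum >> 2 and sse >> 4; for 12-bit it is 16, hence sum >> 4 and sse >> 8 (all via ROUND_POWER_OF_TWO, i.e. rounded rather than truncated). The plain highbd_variance path applies no scaling because its input already uses the 8-bit range.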
static void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8,
uint16_t *output_ptr,
@ -374,35 +209,6 @@ static void highbd_var_filter_block2d_bil_second_pass(
}
}
#define HIGHBD_VAR(W, H) \
unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define HIGHBD_SUBPIX_VAR(W, H) \
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
@ -417,7 +223,7 @@ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
dst_stride, sse); \
} \
\
@ -434,7 +240,7 @@ unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
} \
\
@ -451,7 +257,7 @@ unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
}
@ -471,10 +277,10 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
dst_stride, sse); \
} \
\
@ -493,10 +299,10 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
} \
\
@ -515,137 +321,49 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
BILINEAR_FILTERS_2TAP(yoffset)); \
\
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
\
return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
}
#define HIGHBD_GET_VAR(S) \
void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}
#define HIGHBD_MSE(W, H) \
unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
}
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)
HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
HIGHBD_VAR(4, 4)
HIGHBD_SUBPIX_VAR(4, 4)
HIGHBD_SUBPIX_AVG_VAR(4, 4)
HIGHBD_VAR(4, 8)
HIGHBD_SUBPIX_VAR(4, 8)
HIGHBD_SUBPIX_AVG_VAR(4, 8)
HIGHBD_VAR(8, 4)
HIGHBD_SUBPIX_VAR(8, 4)
HIGHBD_SUBPIX_AVG_VAR(8, 4)
HIGHBD_VAR(8, 8)
HIGHBD_SUBPIX_VAR(8, 8)
HIGHBD_SUBPIX_AVG_VAR(8, 8)
HIGHBD_VAR(8, 16)
HIGHBD_SUBPIX_VAR(8, 16)
HIGHBD_SUBPIX_AVG_VAR(8, 16)
HIGHBD_VAR(16, 8)
HIGHBD_SUBPIX_VAR(16, 8)
HIGHBD_SUBPIX_AVG_VAR(16, 8)
HIGHBD_VAR(16, 16)
HIGHBD_SUBPIX_VAR(16, 16)
HIGHBD_SUBPIX_AVG_VAR(16, 16)
HIGHBD_VAR(16, 32)
HIGHBD_SUBPIX_VAR(16, 32)
HIGHBD_SUBPIX_AVG_VAR(16, 32)
HIGHBD_VAR(32, 16)
HIGHBD_SUBPIX_VAR(32, 16)
HIGHBD_SUBPIX_AVG_VAR(32, 16)
HIGHBD_VAR(32, 32)
HIGHBD_SUBPIX_VAR(32, 32)
HIGHBD_SUBPIX_AVG_VAR(32, 32)
HIGHBD_VAR(32, 64)
HIGHBD_SUBPIX_VAR(32, 64)
HIGHBD_SUBPIX_AVG_VAR(32, 64)
HIGHBD_VAR(64, 32)
HIGHBD_SUBPIX_VAR(64, 32)
HIGHBD_SUBPIX_AVG_VAR(64, 32)
HIGHBD_VAR(64, 64)
HIGHBD_SUBPIX_VAR(64, 64)
HIGHBD_SUBPIX_AVG_VAR(64, 64)
void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH


@ -12,31 +12,64 @@
#define VP9_ENCODER_VP9_VARIANCE_H_
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h,
unsigned int *sse, int *sum);
// TODO(johannkoenig): All functions which depend on
// [highbd_][8|10|12_]variance should be refactored or moved to vpx_dsp.
static void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
#if CONFIG_VP9_HIGHBITDEPTH
void highbd_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h,
unsigned int *sse, int *sum);
static void highbd_variance64(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, uint64_t *sse, uint64_t *sum) {
int i, j;
void highbd_10_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h,
unsigned int *sse, int *sum);
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
*sum = 0;
*sse = 0;
void highbd_12_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h,
unsigned int *sse, int *sum);
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
static void highbd_8_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)sse_long;
*sum = (int)sum_long;
}
#endif
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
@ -95,15 +128,6 @@ typedef struct vp9_variance_vtable {
vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred,
int width, int height,
const uint8_t *ref, int ref_stride);
#endif
#ifdef __cplusplus
} // extern "C"
#endif


@ -13,237 +13,6 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
static void highbd_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
}
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
}
#define HIGH_GET_VAR(S) \
void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
} \
\
void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
HIGH_GET_VAR(16);
HIGH_GET_VAR(8);
#undef HIGH_GET_VAR
#define VAR_FN(w, h, block_size, shift) \
uint32_t vp9_highbd_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_10_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_12_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
}
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);
#undef VAR_FN
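In these SSE2 versions the division by the pixel count becomes a right shift, so the last VAR_FN argument is simply log2(w * h): 64 * 64 = 4096 gives shift 12, 32 * 16 = 512 gives 9, 16 * 8 = 128 gives 7, and so on down the list, matching the >> shift in the macro body.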
unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \


@ -13,18 +13,6 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
int x_offset, int y_offset,
const uint8_t *dst, int dst_stride,
@ -42,81 +30,6 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
int height,
unsigned int *sseptr);
static void variance_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
get_var_avx2 var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += 16) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(&src[src_stride * i + j], src_stride,
&ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_get16x16var_avx2, 16);
return *sse - (((unsigned int)sum * sum) >> 8);
}
unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse;
}
unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
sse, &sum, vp9_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
sse, &sum, vp9_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 10);
}
unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
sse, &sum, vp9_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 12);
}
unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
sse, &sum, vp9_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
int src_stride,
int x_offset,


@ -16,299 +16,6 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef void (*variance_fn_t)(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
__m128i vsum = _mm_setzero_si128();
int i;
for (i = 0; i < 32; ++i) {
const __m128i v = _mm_loadu_si128((const __m128i *)src);
vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
src += 8;
}
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
return _mm_cvtsi128_si32(vsum);
}
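vp9_get_mb_ss_sse2 (its C reference was removed earlier in this patch) returns the raw sum of squares of a 16x16 macroblock of int16 residuals: 32 iterations of 8 values cover all 32 * 8 = 256 = 16 * 16 elements, _mm_madd_epi16(v, v) accumulates src[i] * src[i] pairwise into four 32-bit lanes, and the two shift-and-add steps fold those lanes into the scalar result.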
#define READ64(p, stride, i) \
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
static void get4x4var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
// sum
__m128i vsum = _mm_add_epi16(diff0, diff1);
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
// sse
vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
_mm_madd_epi16(diff1, diff1));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
*sse = _mm_cvtsi128_si32(vsum);
}
void vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
__m128i vsse = _mm_setzero_si128();
int i;
for (i = 0; i < 8; i += 2) {
const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(src + i * src_stride)), zero);
const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(ref + i * ref_stride)), zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(src + (i + 1) * src_stride)), zero);
const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(ref + (i + 1) * ref_stride)), zero);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
vsum = _mm_add_epi16(vsum, diff0);
vsum = _mm_add_epi16(vsum, diff1);
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
}
// sum
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
// sse
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
*sse = _mm_cvtsi128_si32(vsse);
}
void vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
__m128i vsse = _mm_setzero_si128();
int i;
for (i = 0; i < 16; ++i) {
const __m128i s = _mm_loadu_si128((const __m128i *)src);
const __m128i r = _mm_loadu_si128((const __m128i *)ref);
const __m128i src0 = _mm_unpacklo_epi8(s, zero);
const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i src1 = _mm_unpackhi_epi8(s, zero);
const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
vsum = _mm_add_epi16(vsum, diff0);
vsum = _mm_add_epi16(vsum, diff1);
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
src += src_stride;
ref += ref_stride;
}
// sum
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
*sum = (int16_t)_mm_extract_epi16(vsum, 0) +
(int16_t)_mm_extract_epi16(vsum, 1);
// sse
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
*sse = _mm_cvtsi128_si32(vsse);
}
static void variance_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
variance_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 4);
}
unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
sse, &sum, get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
sse, &sum, get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 6);
}
unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
sse, &sum, vp9_get8x8var_sse2, 8);
return *sse - (((unsigned int)sum * sum) >> 7);
}
unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
sse, &sum, vp9_get8x8var_sse2, 8);
return *sse - (((unsigned int)sum * sum) >> 7);
}
unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 8);
}
unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 10);
}
unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 12);
}
unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
sse, &sum, vp9_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
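Unlike the variance functions above, these mse wrappers return the uncorrected sum of squared differences: they run the matching variance kernel only for its sse side effect and return *sse without subtracting the (sum * sum) >> shift mean term, so vp9_mse16x16 is the plain sum of (src - ref)^2 over the 16x16 block rather than a per-pixel average.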
// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \


@ -102,13 +102,11 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif


@ -0,0 +1,363 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vpx_variance16x16_media|
EXPORT |vpx_variance8x8_media|
EXPORT |vpx_mse16x16_media|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vpx_variance16x16_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
loop16x16
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r5, [r2, #0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r5, [r2, #4] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r5, [r2, #8] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r5, [r2, #12] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop16x16
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
END
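The final sub r0, r11, r0, lsr #8 returns sse - (sum * sum) / 256, i.e. the integer variance identity for a 16x16 block (16 * 16 = 256 = 2^8); the 8x8 routine below does the same with ASR #6, since 8 * 8 = 64 = 2^6.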
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vpx_variance8x8_media| PROC
push {r4-r10, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #8 ; set loop counter to 8 (=block height)
mov r4, #0 ; initialize sum = 0
mov r5, #0 ; initialize sse = 0
loop8x8
; 1st 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels
ldr r7, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r6, r7 ; calculate difference
pld [r0, r1, lsl #1]
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r0, #0x4] ; load 4 src pixels
ldr r7, [r2, #0x4] ; load 4 ref pixels
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
usub8 r8, r6, r7 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1 ; next row
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
bne loop8x8
; return stuff
ldr r8, [sp, #32] ; get address of sse
mul r1, r4, r4 ; sum * sum
str r5, [r8] ; store sse
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
pop {r4-r10, pc}
ENDP
END
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;
;note: Based on vpx_variance16x16_media. In this function, sum is never used.
; So, we can remove this part of calculation.
|vpx_mse16x16_media| PROC
push {r4-r9, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #16 ; set loop counter to 16 (=block height)
mov r4, #0 ; initialize sse = 0
loopmse
; 1st 4 pixels
ldr r5, [r0, #0x0] ; load 4 src pixels
ldr r6, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r5, r6 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x4] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r2, #0x4] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x8] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r6, [r2, #0x8] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0xc] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r6, [r2, #0xc] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
subs r12, r12, #1 ; next row
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
bne loopmse
; return stuff
ldr r1, [sp, #28] ; get address of sse
mov r0, r4 ; return sse
str r4, [r1] ; store sse
pop {r4-r9, pc}
ENDP
END

vpx_dsp/arm/variance_neon.c (new file, 417 lines)

@ -0,0 +1,417 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
const int32x4_t a = vpaddlq_s16(v_16x8);
const int64x2_t b = vpaddlq_s32(a);
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
vreinterpret_s32_s64(vget_high_s64(b)));
return vget_lane_s32(c, 0);
}
static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
const int64x2_t b = vpaddlq_s32(v_32x4);
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
vreinterpret_s32_s64(vget_high_s64(b)));
return vget_lane_s32(c, 0);
}
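Both helpers reduce a vector to a scalar with widening pairwise adds: eight int16 lanes collapse to four int32 sums, then to two int64 sums, and the two halves are added back as 32-bit values. For example the lanes {1, 2, 3, 4, 5, 6, 7, 8} become {3, 7, 11, 15}, then {10, 26}, and the returned total is 36.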
// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, uint32_t *sse, int *sum) {
int i, j;
int16x8_t v_sum = vdupq_n_s16(0);
int32x4_t v_sse_lo = vdupq_n_s32(0);
int32x4_t v_sse_hi = vdupq_n_s32(0);
for (i = 0; i < h; ++i) {
for (j = 0; j < w; j += 8) {
const uint8x8_t v_a = vld1_u8(&a[j]);
const uint8x8_t v_b = vld1_u8(&b[j]);
const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
v_sum = vaddq_s16(v_sum, sv_diff);
v_sse_lo = vmlal_s16(v_sse_lo,
vget_low_s16(sv_diff),
vget_low_s16(sv_diff));
v_sse_hi = vmlal_s16(v_sse_hi,
vget_high_s16(sv_diff),
vget_high_s16(sv_diff));
}
a += a_stride;
b += b_stride;
}
*sum = horizontal_add_s16x8(v_sum);
*sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}
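The w * h < 2048 bound in the comment above comes from the int16 accumulator: each of the eight v_sum lanes collects w * h / 8 differences, each in [-255, 255], so its worst-case magnitude is (w * h / 8) * 255. At 32x32 that is 128 * 255 = 32640, which still fits in int16, whereas a 64x32 block would need 256 * 255 = 65280 and overflow; this is why the 32x64, 64x32 and 64x64 functions below split the block into 32x32 or 64x16 pieces and add the partial sse/sum results.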
void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse, int *sum) {
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
}
void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse, int *sum) {
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
}
unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8
}
unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16
}
unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32
}
unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
variance_neon_w8(a + (32 * a_stride), a_stride,
b + (32 * b_stride), b_stride, 32, 32,
&sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
}
unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
variance_neon_w8(a + (16 * a_stride), a_stride,
b + (16 * b_stride), b_stride, 64, 16,
&sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 64 * 32
}
unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum1, sum2;
uint32_t sse1, sse2;
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
variance_neon_w8(a + (16 * a_stride), a_stride,
b + (16 * b_stride), b_stride, 64, 16,
&sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
b + (16 * 2 * b_stride), b_stride,
64, 16, &sse2, &sum2);
sse1 += sse2;
sum1 += sum2;
variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
b + (16 * 3 * b_stride), b_stride,
64, 16, &sse2, &sum2);
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
}
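/* vpx_variance16x8_neon and vpx_variance8x16_neon below accumulate the sum of
 * differences in q8 and the sum of squared differences in q9/q10, then form
 * sse - ((sum * sum) >> 7) at the end (16 * 8 = 8 * 16 = 128 = 2^7). */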
unsigned int vpx_variance16x8_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 4; i++) {
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vpx_variance8x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
uint8x8_t d0u8, d2u8, d4u8, d6u8;
int16x4_t d22s16, d23s16, d24s16, d25s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) {
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d2u8, d6u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vpx_mse16x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
int64x1_t d0s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
q7s32 = vdupq_n_s32(0);
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q10s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
unsigned int vpx_get4x4sse_cs_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride) {
int16x4_t d22s16, d24s16, d26s16, d28s16;
int64x1_t d0s64;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d1u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d5u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d3u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d7u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d1u8, d5u8);
q13u16 = vsubl_u8(d2u8, d6u8);
q14u16 = vsubl_u8(d3u8, d7u8);
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
q7s32 = vmull_s16(d22s16, d22s16);
q8s32 = vmull_s16(d24s16, d24s16);
q9s32 = vmull_s16(d26s16, d26s16);
q10s32 = vmull_s16(d28s16, d28s16);
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q9s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q9s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
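All of the NEON wrappers above, and the C reference code later in this patch, finish with the same identity: variance = sse - sum^2 / (w * h). Because w * h is always a power of two here, the division becomes a right shift (16x16 -> >> 8, 32x32 -> >> 10, 64x64 -> >> 12). A minimal scalar sketch of that last step (the helper name is illustrative, not part of the patch):
#include <stdint.h>
static uint32_t variance_from_sums(uint32_t sse, int sum, int w, int h) {
  /* sum and sse are the per-block totals produced by a kernel such as
   * variance_neon_w8(). */
  return sse - (uint32_t)(((int64_t)sum * sum) / (w * h));
}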


@ -33,6 +33,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
return sad;
}
// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
* The function averages every corresponding element of the buffers and stores
* the value in a third buffer, comp_pred.

vpx_dsp/variance.c (new file, 306 lines added)

@ -0,0 +1,306 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride) {
int distortion = 0;
int r, c;
for (r = 0; r < 4; r++) {
for (c = 0; c < 4; c++) {
int diff = a[c] - b[c];
distortion += diff * diff;
}
a += a_stride;
b += b_stride;
}
return distortion;
}
unsigned int vpx_get_mb_ss_c(const int16_t *a) {
unsigned int i, sum = 0;
for (i = 0; i < 256; ++i) {
sum += a[i] * a[i];
}
return sum;
}
static void variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
int i, j;
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
#define VAR(W, H) \
unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value by reference instead of returning
 * sse - sum^2 / (w * h).
 */
#define GET_VAR(W, H) \
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse, int *sum) { \
variance(a, a_stride, b, b_stride, W, H, sse, sum); \
}
/* Identical to the variance call except it does not subtract the
 * sum^2 / (w * h) term; it stores sse through the pointer and also
 * returns it.
 */
#define MSE(W, H) \
unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse; \
}
VAR(64, 64)
VAR(64, 32)
VAR(32, 64)
VAR(32, 32)
VAR(32, 16)
VAR(16, 32)
VAR(16, 16)
VAR(16, 8)
VAR(8, 16)
VAR(8, 8)
VAR(8, 4)
VAR(4, 8)
VAR(4, 4)
GET_VAR(16, 16)
GET_VAR(8, 8)
MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
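/* For reference, VAR(16, 16) above expands (modulo whitespace) to:
 *
 *   unsigned int vpx_variance16x16_c(const uint8_t *a, int a_stride,
 *                                    const uint8_t *b, int b_stride,
 *                                    unsigned int *sse) {
 *     int sum;
 *     variance(a, a_stride, b, b_stride, 16, 16, sse, &sum);
 *     return *sse - (((int64_t)sum * sum) / (16 * 16));
 *   }
 */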
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
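/* Note: ROUND_POWER_OF_TWO(tmp, 1) above computes the rounding average,
 * (pred[j] + ref[j] + 1) >> 1. */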
#if CONFIG_VP9_HIGHBITDEPTH
static void highbd_variance64(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, uint64_t *sse, uint64_t *sum) {
int i, j;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
static void highbd_8_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)sse_long;
*sum = (int)sum_long;
}
static void highbd_10_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}
static void highbd_12_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
int w, int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
#define HIGHBD_VAR(W, H) \
unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
unsigned int *sse) { \
int sum; \
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define HIGHBD_GET_VAR(S) \
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
} \
\
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
unsigned int *sse, int *sum) { \
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
}
#define HIGHBD_MSE(W, H) \
unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
} \
\
unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
unsigned int *sse) { \
int sum; \
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
return *sse; \
}
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)
HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
HIGHBD_VAR(64, 64)
HIGHBD_VAR(64, 32)
HIGHBD_VAR(32, 64)
HIGHBD_VAR(32, 32)
HIGHBD_VAR(32, 16)
HIGHBD_VAR(16, 32)
HIGHBD_VAR(16, 16)
HIGHBD_VAR(16, 8)
HIGHBD_VAR(8, 16)
HIGHBD_VAR(8, 8)
HIGHBD_VAR(8, 4)
HIGHBD_VAR(4, 8)
HIGHBD_VAR(4, 4)
void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
int width, int height, const uint8_t *ref8,
int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
const int tmp = pred[j] + ref[j];
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH


@ -17,6 +17,7 @@ DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
@ -29,9 +30,28 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS
ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += variance.c
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
DSP_SRCS-yes += vpx_dsp_rtcd.c


@ -392,4 +392,212 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS
if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x64 sse2 avx2 neon/;
add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x32 sse2 avx2 neon/;
add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x32 sse2 avx2 neon/;
add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x16 sse2 avx2/;
add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x32 sse2/;
add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/;
add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x8 mmx sse2 neon/;
add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x16 mmx sse2 neon/;
add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x8 mmx sse2 media neon/;
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x4 sse2/;
add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x8 sse2/;
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x4 mmx sse2/;
add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get16x16var sse2 avx2 neon/;
add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get8x8var mmx sse2 neon/;
add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/;
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse16x8 sse2/;
add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse8x16 sse2/;
add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse8x8 sse2/;
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
specialize qw/vpx_get_mb_ss mmx sse2/;
add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
specialize qw/vpx_get4x4sse_cs neon/;
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x64 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance64x32 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance32x64 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance32x32 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance32x16 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance16x32 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance16x8 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance8x16 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_variance8x8 sse2/;
add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance64x64 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance64x32 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance32x64 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance32x32 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance32x16 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance16x32 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance16x8 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance8x16 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_variance8x8 sse2/;
add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance64x64 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance64x32 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance32x64 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance32x32 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance32x16 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance16x32 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance16x8 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance8x16 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_variance8x8 sse2/;
add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_mse16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_8_mse8x8 sse2/;
add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_mse16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_10_mse8x8 sse2/;
add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_mse16x16 sse2/;
add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_highbd_12_mse8x8 sse2/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
1;
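Each add_proto line above declares the generic prototype, and each specialize line lists the optimized implementations the run-time dispatcher may pick from. Callers include the generated header and use only the generic name; a minimal sketch of typical usage (illustrative, not part of the patch):
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
static unsigned int block_variance_16x16(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  /* Resolves to the C, MMX, SSE2, AVX2, media or NEON version depending on
   * the build configuration and the CPU detected at run time. */
  return vpx_variance16x16(src, src_stride, ref, ref_stride, &sse);
}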


@ -11,7 +11,7 @@
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vp9_highbd_calc16x16var_sse2
;unsigned int vpx_highbd_calc16x16var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
@ -20,8 +20,8 @@
; unsigned int * SSE,
; int * Sum
;)
global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
sym(vp9_highbd_calc16x16var_sse2):
global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
sym(vpx_highbd_calc16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
@ -164,7 +164,7 @@ sym(vp9_highbd_calc16x16var_sse2):
ret
;unsigned int vp9_highbd_calc8x8var_sse2
;unsigned int vpx_highbd_calc8x8var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
@ -173,8 +173,8 @@ sym(vp9_highbd_calc16x16var_sse2):
; unsigned int * SSE,
; int * Sum
;)
global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
sym(vp9_highbd_calc8x8var_sse2):
global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
sym(vpx_highbd_calc8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6


@ -0,0 +1,245 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
uint32_t *sse, int *sum);
static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
}
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, uint32_t *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
}
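/* The 10- and 12-bit helpers above rescale the accumulated totals back to
 * 8-bit precision (sum >> 2, sse >> 4 for 10-bit input; sum >> 4, sse >> 8
 * for 12-bit, with rounding), so the block-size shifts used by VAR_FN below
 * stay the same as in the 8-bit path. */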
#define HIGH_GET_VAR(S) \
void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
uint32_t *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
HIGH_GET_VAR(16);
HIGH_GET_VAR(8);
#undef HIGH_GET_VAR
#define VAR_FN(w, h, block_size, shift) \
uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vpx_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_10_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_12_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
}
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);
#undef VAR_FN
unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
return *sse;
}


@ -0,0 +1,93 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum);
static void variance_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
get_var_avx2 var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += 16) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(&src[src_stride * i + j], src_stride,
&ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vpx_get16x16var_avx2, 16);
return *sse - (((unsigned int)sum * sum) >> 8);
}
unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse;
}
unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 10);
}
unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 12);
}
unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
sse, &sum, vpx_get32x32var_avx2, 32);
return *sse - (((int64_t)sum * sum) >> 11);
}


@ -10,9 +10,9 @@
#include <immintrin.h> // AVX2
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
void vp9_get16x16var_avx2(const unsigned char *src_ptr,
void vpx_get16x16var_avx2(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
@ -123,7 +123,7 @@ void vp9_get16x16var_avx2(const unsigned char *src_ptr,
}
}
void vp9_get32x32var_avx2(const unsigned char *src_ptr,
void vpx_get32x32var_avx2(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,


@ -0,0 +1,424 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
sub rsp, 8
; end prolog
mov rax, arg(0) ;src_ptr
mov rcx, 16
pxor mm4, mm4
.NEXTROW:
movq mm0, [rax]
movq mm1, [rax+8]
movq mm2, [rax+16]
movq mm3, [rax+24]
pmaddwd mm0, mm0
pmaddwd mm1, mm1
pmaddwd mm2, mm2
pmaddwd mm3, mm3
paddd mm4, mm0
paddd mm4, mm1
paddd mm4, mm2
paddd mm4, mm3
add rax, 32
dec rcx
ja .NEXTROW
movq QWORD PTR [rsp], mm4
;return sum[0]+sum[1];
movsxd rax, dword ptr [rsp]
movsxd rcx, dword ptr [rsp+4]
add rax, rcx
; begin epilog
add rsp, 8
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vpx_get8x8var_mmx
;(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vpx_get8x8var_mmx) PRIVATE
sym(vpx_get8x8var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
push rsi
push rdi
push rbx
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mm5 (difference accumulator)
pxor mm6, mm6 ; Blank mm6 (zero register for unpacking)
pxor mm7, mm7 ; Blank mm7 (squared-difference accumulator)
mov rax, arg(0) ;[src_ptr] ; Load base addresses
mov rbx, arg(2) ;[ref_ptr]
movsxd rcx, dword ptr arg(1) ;[source_stride]
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm1, [rbx] ; Copy eight bytes to mm1
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 2
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 3
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 4
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 5
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
; movq mm4, [rbx + rdx]
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 6
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 7
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Row 8
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm2, mm0 ; Take copies
movq mm3, mm1 ; Take copies
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
punpckhbw mm2, mm6 ; unpack to higher precision
punpckhbw mm3, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
psubsw mm2, mm3 ; A-B (high order) to MM2
paddw mm5, mm0 ; accumulate differences in mm5
paddw mm5, mm2 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
pmaddwd mm2, mm2 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
paddd mm7, mm0 ; accumulate in mm7
paddd mm7, mm2 ; accumulate in mm7
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
movsx rdx, WORD PTR [rsp+8]
movsx rcx, WORD PTR [rsp+10]
movsx rbx, WORD PTR [rsp+12]
movsx rax, WORD PTR [rsp+14]
add rdx, rcx
add rbx, rax
add rdx, rbx ;XSum
movsxd rax, DWORD PTR [rsp]
movsxd rcx, DWORD PTR [rsp+4]
add rax, rcx ;XXSum
mov rsi, arg(4) ;SSE
mov rdi, arg(5) ;Sum
mov dword ptr [rsi], eax
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void
;vpx_get4x4var_mmx
;(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vpx_get4x4var_mmx) PRIVATE
sym(vpx_get4x4var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
push rsi
push rdi
push rbx
sub rsp, 16
; end prolog
pxor mm5, mm5 ; Blank mm5 (difference accumulator)
pxor mm6, mm6 ; Blank mm6 (zero register for unpacking)
pxor mm7, mm7 ; Blank mm7 (squared-difference accumulator)
mov rax, arg(0) ;[src_ptr] ; Load base addresses
mov rbx, arg(2) ;[ref_ptr]
movsxd rcx, dword ptr arg(1) ;[source_stride]
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movd mm0, [rax] ; Copy four bytes to mm0
movd mm1, [rbx] ; Copy four bytes to mm1
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 3
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 4
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
pmaddwd mm0, mm0 ; square and accumulate
paddd mm7, mm0 ; accumulate in mm7
; Now accumulate the final results.
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
movsx rdx, WORD PTR [rsp+8]
movsx rcx, WORD PTR [rsp+10]
movsx rbx, WORD PTR [rsp+12]
movsx rax, WORD PTR [rsp+14]
add rdx, rcx
add rbx, rax
add rdx, rbx ;XSum
movsxd rax, DWORD PTR [rsp]
movsxd rcx, DWORD PTR [rsp+4]
add rax, rcx ;XXSum
mov rsi, arg(4) ;SSE
mov rdi, arg(5) ;Sum
mov dword ptr [rsi], eax
mov dword ptr [rdi], edx
xor rax, rax ; return 0
; begin epilog
add rsp, 16
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret

vpx_dsp/x86/variance_mmx.c (new file, 107 lines added)

@ -0,0 +1,107 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse, int *sum);
unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
}
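All the wrappers in this file apply the same identity, variance = SSE - sum^2 / N, where N is the pixel count of the block, so the shift is log2(N): 4 for the 4x4 case above, 6 for 8x8, 7 for 16x8 and 8x16, 8 for 16x16. A hypothetical helper expressing the pattern (the name and the 64-bit widening are illustrative; the wrappers here square in 32 bits, which is exact up to 16x16):

/* Hypothetical helper, for illustration only. */
static unsigned int variance_from_sse_sum(unsigned int sse, int sum,
                                          int log2_pixel_count) {
  /* variance = SSE - (sum * sum) / N, with N = 1 << log2_pixel_count */
  return sse - (unsigned int)(((int64_t)sum * sum) >> log2_pixel_count);
}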
unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int var;
int avg;
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3;
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
b + 8 * b_stride, b_stride, &sse2, &sum2);
vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
*sse = var;
return var;
}
unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
b + 8 * b_stride, b_stride, &sse2, &sum2);
vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 8));
}
unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
const unsigned char *b, int b_stride,
unsigned int *sse) {
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
b + 8 * b_stride, b_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}

vpx_dsp/x86/variance_sse2.c (new file)
@@ -0,0 +1,309 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
__m128i vsum = _mm_setzero_si128();
int i;
for (i = 0; i < 32; ++i) {
const __m128i v = _mm_loadu_si128((const __m128i *)src);
vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
src += 8;
}
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
return _mm_cvtsi128_si32(vsum);
}
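vpx_get_mb_ss_sse2 is the sum-of-squares kernel for a full 16x16 macroblock of 16-bit residuals (32 iterations of 8 values = 256 entries). A scalar sketch of the same computation, for illustration only:

/* Illustrative scalar equivalent of vpx_get_mb_ss_sse2 (not in the patch). */
static unsigned int get_mb_ss_scalar(const int16_t *src) {
  unsigned int ss = 0;
  int i;
  for (i = 0; i < 256; ++i)
    ss += (unsigned int)(src[i] * src[i]);
  return ss;
}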
#define READ64(p, stride, i) \
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
static void get4x4var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
// sum
__m128i vsum = _mm_add_epi16(diff0, diff1);
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
// sse
vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
_mm_madd_epi16(diff1, diff1));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
*sse = _mm_cvtsi128_si32(vsum);
}
void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
__m128i vsse = _mm_setzero_si128();
int i;
for (i = 0; i < 8; i += 2) {
const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(src + i * src_stride)), zero);
const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(ref + i * ref_stride)), zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(src + (i + 1) * src_stride)), zero);
const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
(const __m128i *)(ref + (i + 1) * ref_stride)), zero);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
vsum = _mm_add_epi16(vsum, diff0);
vsum = _mm_add_epi16(vsum, diff1);
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
}
// sum
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
// sse
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
*sse = _mm_cvtsi128_si32(vsse);
}
void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
const __m128i zero = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
__m128i vsse = _mm_setzero_si128();
int i;
for (i = 0; i < 16; ++i) {
const __m128i s = _mm_loadu_si128((const __m128i *)src);
const __m128i r = _mm_loadu_si128((const __m128i *)ref);
const __m128i src0 = _mm_unpacklo_epi8(s, zero);
const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
const __m128i src1 = _mm_unpackhi_epi8(s, zero);
const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
vsum = _mm_add_epi16(vsum, diff0);
vsum = _mm_add_epi16(vsum, diff1);
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
src += src_stride;
ref += ref_stride;
}
// sum
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
*sum = (int16_t)_mm_extract_epi16(vsum, 0) +
(int16_t)_mm_extract_epi16(vsum, 1);
// sse
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
*sse = _mm_cvtsi128_si32(vsse);
}
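Note the slightly different tail in vpx_get16x16var_sse2: the halfword reduction stops one step early and lanes 0 and 1 are added in scalar code. After the two srli/add steps each 16-bit lane holds the sum of at most 128 differences (|value| <= 128 * 255 = 32640, still within int16 range), while one further packed 16-bit add could reach 256 * 255 = 65280 and overflow, which is presumably why the final addition is done on the two extracted halves in 32-bit arithmetic instead.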
static void variance_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
getNxMvar_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
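variance_sse2 simply tiles the W x H block with the given N x N kernel and accumulates the partial SSE/sum pairs. As an illustration (not part of the patch), the 32x16 case further below expands to two side-by-side 16x16 calls:

/* Illustration only: manual expansion of
 * variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, sum,
 *               vpx_get16x16var_sse2, 16) for one 32x16 block. */
static void variance32x16_tiles(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
  unsigned int sse0, sse1;
  int sum0, sum1;
  /* one row of tiles (h == block_size), tiles at columns 0 and 16 */
  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, &sse0, &sum0);
  vpx_get16x16var_sse2(src + 16, src_stride, ref + 16, ref_stride, &sse1, &sum1);
  *sse = sse0 + sse1;
  *sum = sum0 + sum1;
}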
unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 4);
}
unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
sse, &sum, get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
sse, &sum, get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 6);
}
unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
sse, &sum, vpx_get8x8var_sse2, 8);
return *sse - (((unsigned int)sum * sum) >> 7);
}
unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
sse, &sum, vpx_get8x8var_sse2, 8);
return *sse - (((unsigned int)sum * sum) >> 7);
}
unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 8);
}
unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 10);
}
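From 32x32 upwards the wrappers widen the sum to int64_t before squaring: the worst-case |sum| for a 32x32 block is 1024 * 255 = 261120, whose square (about 6.8e10) no longer fits in 32 bits, whereas for 16x16 and smaller (|sum| <= 65280) the 32-bit unsigned product is still exact.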
unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 9);
}
unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 12);
}
unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
sse, &sum, vpx_get16x16var_sse2, 16);
return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
return *sse;
}
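A minimal usage sketch for the new vpx_dsp entry points, illustrative only (assumes two 16x16 blocks stored with stride 16; real callers typically go through the RTCD dispatch declared in vpx_dsp_rtcd.h rather than calling the _sse2 symbol directly):

#include <stdio.h>
#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

static void print_variance16x16(const uint8_t *src, const uint8_t *ref) {
  unsigned int sse;
  const unsigned int var = vpx_variance16x16_sse2(src, 16, ref, 16, &sse);
  printf("sse=%u variance=%u\n", sse, var); /* sse is also written through the out-param */
}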