Merge "SSE/SSE2 assembly for 4x4/8x8/16x16/32x32 TM intra prediction."

Ronald S. Bultje 2013-07-10 14:52:19 -07:00 committed by Gerrit Code Review
Parents 865ca76604 8dade638a1
Commit 6a60249071
2 changed files: 157 additions and 6 deletions
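
For context, TM ("TrueMotion") prediction builds each predicted pixel from the reconstructed left column and above row as clip(left[r] + above[c] - above[-1]). Below is a minimal C sketch of that rule, written against the vp9_tm_predictor_* prototypes in the diff; the generic size parameter, the clip_byte helper and the function name are illustrative and not part of this commit:

#include <stdint.h>
#include <stddef.h>

/* Clamp an intermediate sum back into the 8-bit pixel range. */
static uint8_t clip_byte(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* Scalar reference for the arithmetic the SSE/SSE2 routines vectorize.
 * The real vp9_tm_predictor_{4x4,8x8,16x16,32x32} functions are
 * size-specific; size is 4, 8, 16 or 32 here. */
static void tm_predictor_sketch(uint8_t *ypred_ptr, ptrdiff_t y_stride,
                                const uint8_t *yabove_row,
                                const uint8_t *yleft_col, int size) {
  const int top_left = yabove_row[-1];
  for (int r = 0; r < size; ++r) {
    for (int c = 0; c < size; ++c)
      ypred_ptr[c] = clip_byte(yleft_col[r] + yabove_row[c] - top_left);
    ypred_ptr += y_stride;
  }
}

The assembly below exploits the fact that above[c] - above[-1] is the same for every row: it is computed once as 16-bit words, after which each row only needs a broadcast add of left[r] and a saturating pack (packuswb) back to bytes.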

View file

@@ -22,6 +22,8 @@ EOF
}
forward_decls vp9_common_forward_decls
[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
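# Note: sse2_x86_64 stays empty on 32-bit targets, so specializations tagged
# with it (e.g. the 32x32 TM predictor below) are only built for x86_64.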
#
# Dequant
#
@@ -77,7 +79,7 @@ prototype void vp9_v_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint
specialize vp9_v_predictor_4x4 sse
prototype void vp9_tm_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_tm_predictor_4x4
specialize vp9_tm_predictor_4x4 sse
prototype void vp9_dc_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_dc_predictor_4x4 sse
@@ -116,7 +118,7 @@ prototype void vp9_v_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint
specialize vp9_v_predictor_8x8 sse
prototype void vp9_tm_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_tm_predictor_8x8
specialize vp9_tm_predictor_8x8 sse2
prototype void vp9_dc_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_dc_predictor_8x8 sse
@@ -155,7 +157,7 @@ prototype void vp9_v_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, ui
specialize vp9_v_predictor_16x16 sse2
prototype void vp9_tm_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_tm_predictor_16x16
specialize vp9_tm_predictor_16x16 sse2
prototype void vp9_dc_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_dc_predictor_16x16 sse2
@@ -194,7 +196,7 @@ prototype void vp9_v_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, ui
specialize vp9_v_predictor_32x32 sse2
prototype void vp9_tm_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_tm_predictor_32x32
specialize vp9_tm_predictor_32x32 sse2_x86_64
prototype void vp9_dc_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_dc_predictor_32x32 sse2
@@ -362,8 +364,6 @@ if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then
# variance
[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance32x16 sse2

View file

@@ -188,3 +188,154 @@ cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
dec nlines4d
jnz .loop
REP_RET
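; TM prediction: pred[r][c] = clip(left[r] + above[c] - above[-1]).
; Each routine keeps (above[c] - above[-1]) as 16-bit words, adds the broadcast
; left[r] for two rows per loop iteration, and relies on packuswb to saturate
; the sums back to the [0, 255] pixel range.
;
; 4x4: MMX registers hold the four difference words; two rows per iteration.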
INIT_MMX sse
cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
pxor m1, m1
movd m2, [aboveq-1]
movd m0, [aboveq]
punpcklbw m2, m1
punpcklbw m0, m1
pshufw m2, m2, 0x0
DEFINE_ARGS dst, stride, line, left
mov lineq, -2
add leftq, 4
psubw m0, m2
.loop:
movd m2, [leftq+lineq*2]
movd m3, [leftq+lineq*2+1]
punpcklbw m2, m1
punpcklbw m3, m1
pshufw m2, m2, 0x0
pshufw m3, m3, 0x0
paddw m2, m0
paddw m3, m0
packuswb m2, m2
packuswb m3, m3
movd [dstq ], m2
movd [dstq+strideq], m3
lea dstq, [dstq+strideq*2]
inc lineq
jnz .loop
REP_RET
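; 8x8: SSE2; the eight difference words fill one XMM register, and the two rows
; produced per iteration are packed into a single register, so row 0 is stored
; with movq and row 1 with movhps.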
INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
pxor m1, m1
movd m2, [aboveq-1]
movq m0, [aboveq]
punpcklbw m2, m1
punpcklbw m0, m1
pshuflw m2, m2, 0x0
DEFINE_ARGS dst, stride, line, left
mov lineq, -4
punpcklqdq m2, m2
add leftq, 8
psubw m0, m2
.loop:
movd m2, [leftq+lineq*2]
movd m3, [leftq+lineq*2+1]
punpcklbw m2, m1
punpcklbw m3, m1
pshuflw m2, m2, 0x0
pshuflw m3, m3, 0x0
punpcklqdq m2, m2
punpcklqdq m3, m3
paddw m2, m0
paddw m3, m0
packuswb m2, m3
movq [dstq ], m2
movhps [dstq+strideq], m2
lea dstq, [dstq+strideq*2]
inc lineq
jnz .loop
REP_RET
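; 16x16: the sixteen difference words span m0 (columns 0-7) and m4 (columns
; 8-15); m5/m6 hold the two packed output rows per iteration.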
INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
pxor m1, m1
movd m2, [aboveq-1]
mova m0, [aboveq]
punpcklbw m2, m1
punpckhbw m4, m0, m1
punpcklbw m0, m1
pshuflw m2, m2, 0x0
DEFINE_ARGS dst, stride, line, left
mov lineq, -8
punpcklqdq m2, m2
add leftq, 16
psubw m0, m2
psubw m4, m2
.loop:
movd m2, [leftq+lineq*2]
movd m3, [leftq+lineq*2+1]
punpcklbw m2, m1
punpcklbw m3, m1
pshuflw m2, m2, 0x0
pshuflw m3, m3, 0x0
punpcklqdq m2, m2
punpcklqdq m3, m3
paddw m5, m2, m0
paddw m6, m3, m0
paddw m2, m4
paddw m3, m4
packuswb m5, m2
packuswb m6, m3
mova [dstq ], m5
mova [dstq+strideq], m6
lea dstq, [dstq+strideq*2]
inc lineq
jnz .loop
REP_RET
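; 32x32: x86-64 only, since the 32 difference words occupy m0/m3/m4/m5 and the
; per-row sums additionally use xmm8/xmm9, which 32-bit x86 does not provide.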
%if ARCH_X86_64
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
pxor m1, m1
movd m2, [aboveq-1]
mova m0, [aboveq]
mova m4, [aboveq+16]
punpcklbw m2, m1
punpckhbw m3, m0, m1
punpckhbw m5, m4, m1
punpcklbw m0, m1
punpcklbw m4, m1
pshuflw m2, m2, 0x0
DEFINE_ARGS dst, stride, line, left
mov lineq, -16
punpcklqdq m2, m2
add leftq, 32
psubw m0, m2
psubw m3, m2
psubw m4, m2
psubw m5, m2
.loop:
movd m2, [leftq+lineq*2]
movd m6, [leftq+lineq*2+1]
punpcklbw m2, m1
punpcklbw m6, m1
pshuflw m2, m2, 0x0
pshuflw m6, m6, 0x0
punpcklqdq m2, m2
punpcklqdq m6, m6
paddw m7, m2, m0
paddw m8, m2, m3
paddw m9, m2, m4
paddw m2, m5
packuswb m7, m8
packuswb m9, m2
paddw m2, m6, m0
paddw m8, m6, m3
mova [dstq ], m7
paddw m7, m6, m4
paddw m6, m5
mova [dstq +16], m9
packuswb m2, m8
packuswb m7, m6
mova [dstq+strideq ], m2
mova [dstq+strideq+16], m7
lea dstq, [dstq+strideq*2]
inc lineq
jnz .loop
REP_RET
%endif