Merge "Remove usage of predict buffer for decode"

This commit is contained in:
Scott LaVarnway 2011-10-19 10:24:48 -07:00 коммит произвёл Gerrit Code Review
Родитель efa17efced ed9c66f584
Коммит 63a77cbed9
56 изменённых файлов: 1496 добавлений и 2455 удалений

Просмотреть файл

@ -45,7 +45,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_armv6;
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6;
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
@ -64,9 +63,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6;
rtcd->recon.recon = vp8_recon_b_armv6;
rtcd->recon.recon2 = vp8_recon2b_armv6;
rtcd->recon.recon4 = vp8_recon4b_armv6;
}
#endif
@ -82,7 +78,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_neon;
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon;
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
@ -99,10 +94,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_neon;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_neon;
rtcd->recon.recon = vp8_recon_b_neon;
rtcd->recon.recon2 = vp8_recon2b_neon;
rtcd->recon.recon4 = vp8_recon4b_neon;
rtcd->recon.recon_mb = vp8_recon_mb_neon;
rtcd->recon.build_intra_predictors_mby =
vp8_build_intra_predictors_mby_neon;
rtcd->recon.build_intra_predictors_mby_s =

Просмотреть файл

@ -11,25 +11,27 @@
AREA |.text|, CODE, READONLY
;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
; unsigned char *dst_ptr, int pitch, int stride)
;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
; int pred_stride, unsigned char *dst_ptr,
; int dst_stride)
; r0 input_dc
; r1 pred_ptr
; r2 dest_ptr
; r3 pitch
; sp stride
; r2 pred_stride
; r3 dst_ptr
; sp dst_stride
|vp8_dc_only_idct_add_v6| PROC
stmdb sp!, {r4 - r7, lr}
stmdb sp!, {r4 - r7}
add r0, r0, #4 ; input_dc += 4
ldr r12, c0x0000FFFF
ldr r4, [r1], r3
ldr r6, [r1], r3
ldr r4, [r1], r2
and r0, r12, r0, asr #3 ; input_dc >> 3 + mask
ldr lr, [sp, #20]
ldr r6, [r1], r2
orr r0, r0, r0, lsl #16 ; a1 | a1
ldr r12, [sp, #16] ; dst stride
uxtab16 r5, r0, r4 ; a1+2 | a1+0
uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
uxtab16 r7, r0, r6
@ -40,10 +42,10 @@
usat16 r6, #8, r6
orr r5, r5, r4, lsl #8
orr r7, r7, r6, lsl #8
ldr r4, [r1], r3
ldr r4, [r1], r2
str r5, [r3], r12
ldr r6, [r1]
str r5, [r2], lr
str r7, [r2], lr
str r7, [r3], r12
uxtab16 r5, r0, r4
uxtab16 r4, r0, r4, ror #8
@ -55,10 +57,11 @@
usat16 r6, #8, r6
orr r5, r5, r4, lsl #8
orr r7, r7, r6, lsl #8
str r5, [r2], lr
str r7, [r2]
str r5, [r3], r12
str r7, [r3]
ldmia sp!, {r4 - r7, pc}
ldmia sp!, {r4 - r7}
bx lr
ENDP ; |vp8_dc_only_idct_add_v6|

Просмотреть файл

@ -9,337 +9,194 @@
;
; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
EXPORT |vp8_short_idct4x4llm_1_v6|
EXPORT |vp8_short_idct4x4llm_v6|
EXPORT |vp8_short_idct4x4llm_v6_scott|
EXPORT |vp8_short_idct4x4llm_v6_dual|
AREA |.text|, CODE, READONLY
;********************************************************************************
;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench: 3/5
;********************************************************************************
|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
;
ldrsh r0, [r0] ; load input[0] 1, r0 un 2
add r0, r0, #4 ; 1 +4
stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
mov r5, r4 ; expand expand
; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
; unsigned char *dst, int stride)
; r0 short* input
; r1 unsigned char* pred
; r2 int pitch
; r3 unsigned char* dst
; sp int stride
strd r4, [r1], r2 ; *output = r0, post inc 1
strd r4, [r1], r2 ; 1
strd r4, [r1], r2 ; 1
strd r4, [r1] ; 1
;
ldmia sp!, {r4, r5, pc} ; replace vars, return restore
ENDP ; |vp8_short_idct4x4llm_1_v6|
;********************************************************************************
;********************************************************************************
;********************************************************************************
|vp8_short_idct4x4llm_v6_dual| PROC
stmdb sp!, {r4-r11, lr}
;********************************************************************************
;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench:
;********************************************************************************
sub sp, sp, #4
|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
;
stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
;
mov r4, #0x00004E00 ; 1 cst
orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
mov r5, #0x00008A00 ; 1 cst
orr r5, r5, #0x0000008C ; sinpi8sqrt2
;
mov r6, #4 ; i=4 1 i
loop1 ;
ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
add r9, r7, r8 ; a1 = [0] + [8] 1 a1
sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
add r11, r3, r11 ; temp2 1
rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
add r8, r7, r11 ; b1 + c1 1 b+c
strh r8, [r1, r2] ; out[pitch] = b1+c1 1
sub r7, r7, r11 ; b1 - c1 1 b-c
add r10, r12, r10 ; temp1 1
add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
add r10, r9, r3 ; a1 + d1 1 a+d
sub r3, r9, r3 ; a1 - d1 1 a-d
add r8, r2, r2 ; pitch * 2 1 p*2
strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
subs r6, r6, #1 ; i-- 1 --
strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
bne loop1 ; if i>0, continue
;
sub r1, r1, #8 ; set up out for next loop 1 -4
; for this iteration, input=prev output
mov r6, #4 ; i=4 1 i
; b returnfull
loop2 ;
ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
add r7, r0, r3 ; a1 = [0] + [2] 1 a1
sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
add r10, r8, r10 ; temp2 1
rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
add r3, r0, r9 ; b1+c1 1 b+c
add r3, r3, #4 ; b1+c1+4 1 +4
add r10, r11, r10 ; temp1 1
mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
strh r3, [r1, #2] ; out[1] = b1+c1 1
add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
add r3, r7, r10 ; a1+d1 1 a+d
add r3, r3, #4 ; a1+d1+4 1 +4
sub r7, r7, r10 ; a1-d1 1 a-d
add r7, r7, #4 ; a1-d1+4 1 +4
mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
strh r7, [r1, #6] ; out[3] = a1-d1 1
sub r0, r0, r9 ; b1-c1 1 b-c
add r0, r0, #4 ; b1-c1+4 1 +4
subs r6, r6, #1 ; i-- 1 --
mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
strh r0, [r1, #4] ; out[2] = b1-c1 1
strh r3, [r1], r2 ; out[0] = a1+d1 1
; add r1, r1, r2 ; out += pitch 1 ++
bne loop2 ; if i>0, continue
returnfull ;
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP
mov r4, #0x00008A00 ; sin
orr r4, r4, #0x0000008C ; sinpi8sqrt2
;********************************************************************************
;********************************************************************************
;********************************************************************************
mov r5, #0x00004E00 ; cos
orr r5, r5, #0x0000007B ; cospi8sqrt2minus1
orr r5, r5, #1<<31 ; loop counter on top bit
;********************************************************************************
;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench:
;********************************************************************************
|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
; mov r0, #0 ;
; ldr r0, [r0] ;
stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
;
mov r3, #0x00004E00 ; cos
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
mov r4, #0x00008A00 ; sin
orr r4, r4, #0x0000008C ; sinpi8sqrt2
;
mov r5, #0x2 ; i i
;
short_idct4x4llm_v6_scott_loop1 ;
ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
;
smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
;
smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2
smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
;
add r6, r6, r7 ; partial c1 lt1-lt2
add r12, r12, r14 ; partial d1 l2t2+l2t1
;
smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
;
smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
;
add r7, r14, r7 ; partial c1_2 ht1+ht2
sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
;
pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
;
usub16 r6, r6, r10 ; c1_2 | c1_1 c
uadd16 r12, r12, r11 ; d1_2 | d1_1 d
;
ldr r10, [r0, #0] ; i1 | i0 1,0
ldr r11, [r0, #(8*2)] ; i9 | i10 9,10
;
;;;;;; add r0, r0, #0x4 ; +4
;;;;;; add r1, r1, #0x4 ; +4
;
uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
;
uadd16 r7, r8, r12 ; a1 + d1 pair a+d
usub16 r14, r8, r12 ; a1 - d1 pair a-d
;
str r7, [r1] ; op[0] = a1 + d1
str r14, [r1, r2] ; op[pitch*3] = a1 - d1
;
add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
;
subs r5, r5, #0x1 ; --
bne short_idct4x4llm_v6_scott_loop1 ;
;
sub r1, r1, #16 ; reset output ptr
mov r5, #0x4 ;
mov r0, r1 ; input = output
;
short_idct4x4llm_v6_scott_loop2 ;
;
subs r5, r5, #0x1 ;
bne short_idct4x4llm_v6_scott_loop2 ;
;
ldmia sp!, {r4 - r11, pc} ;
ENDP ;
;
;********************************************************************************
;********************************************************************************
;********************************************************************************
;********************************************************************************
;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench:
;********************************************************************************
|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
;
stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
mov r3, #0x00004E00 ; cos
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
mov r4, #0x00008A00 ; sin
orr r4, r4, #0x0000008C ; sinpi8sqrt2
mov r5, #0x2 ; i=2 i
loop1_dual
ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
ldr r6, [r0, #(4*2)] ; i5 | i4
ldr r12, [r0, #(12*2)] ; i13|i12
ldr r14, [r0, #(8*2)] ; i9 | i8
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
pkhtb r7, r9, r7, asr #16 ; 5c | 4c
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
subs r5, r5, #0x1 ; i-- --
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
usub16 r7, r8, r7 ; c c
uadd16 r6, r6, r10 ; d d
uadd16 r10, r11, r14 ; a a
usub16 r8, r11, r14 ; b b
uadd16 r9, r10, r6 ; a+d a+d
usub16 r10, r10, r6 ; a-d a-d
uadd16 r6, r8, r7 ; b+c b+c
usub16 r7, r8, r7 ; b-c b-c
str r6, [r1, r2] ; o5 | o4
add r6, r2, r2 ; pitch * 2 p2
str r7, [r1, r6] ; o9 | o8
add r6, r6, r2 ; pitch * 3 p3
str r10, [r1, r6] ; o13 | o12
str r9, [r1], #0x4 ; o1 | o0 ++
bne loop1_dual ;
mov r5, #0x2 ; i=2 i
sub r0, r1, #8 ; reset input/output i/o
loop2_dual
ldr r6, [r0, r2] ; i5 | i4 5|4
ldr r1, [r0] ; i1 | i0 1|0
ldr r12, [r0, #0x4] ; i3 | i2 3|2
add r14, r2, #0x4 ; pitch + 2 p+2
ldr r14, [r0, r14] ; i7 | i6 7|6
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
uadd16 r10, r11, r9 ; a a
usub16 r9, r11, r9 ; b b
pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
subs r5, r5, #0x1 ; i-- --
smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
subs r5, r5, #1<<31 ; i--
pkhtb r9, r11, r9, asr #16 ; 13c | 12c
ldr r11, [r0] ; i1 | i0
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
usub16 r7, r8, r7 ; c
uadd16 r6, r6, r10 ; d
uadd16 r10, r11, r14 ; a
usub16 r8, r11, r14 ; b
uadd16 r9, r10, r6 ; a+d
usub16 r10, r10, r6 ; a-d
uadd16 r6, r8, r7 ; b+c
usub16 r7, r8, r7 ; b-c
; use input buffer to store intermediate results
str r6, [r0, #(4*2)] ; o5 | o4
str r7, [r0, #(8*2)] ; o9 | o8
str r10,[r0, #(12*2)] ; o13|o12
str r9, [r0], #4 ; o1 | o0
bcs loop1_dual
sub r0, r0, #8 ; reset input/output
str r0, [sp]
loop2_dual
ldr r6, [r0, #(4*2)] ; i5 | i4
ldr r12,[r0, #(2*2)] ; i3 | i2
ldr r14,[r0, #(6*2)] ; i7 | i6
ldr r0, [r0, #(0*2)] ; i1 | i0
smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16
pkhbt r11, r6, r0, lsl #16 ; i0 | i4
pkhtb r7, r7, r9, asr #16 ; 1c | 5c
pkhtb r0, r0, r6, asr #16 ; i1 | i5
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1
uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2
pkhbt r9, r14, r12, lsl #16 ; i2 | i6
uadd16 r10, r11, r9 ; a
usub16 r9, r11, r9 ; b
pkhtb r6, r12, r14, asr #16 ; i3 | i7
subs r5, r5, #1<<31 ; i--
smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
pkhtb r7, r7, r12, asr #16 ; 3c | 7c
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2
usub16 r12, r8, r6 ; c (o1 | o5)
uadd16 r6, r11, r0 ; d (o3 | o7)
uadd16 r7, r10, r6 ; a+d
mov r8, #4 ; set up 4's
orr r8, r8, #0x40000 ; 4|4
usub16 r6, r10, r6 ; a-d
uadd16 r6, r6, r8 ; a-d+4, 3|7
uadd16 r7, r7, r8 ; a+d+4, 0|4
uadd16 r10, r9, r12 ; b+c
usub16 r0, r9, r12 ; b-c
uadd16 r10, r10, r8 ; b+c+4, 1|5
uadd16 r8, r0, r8 ; b-c+4, 2|6
ldr lr, [sp, #40] ; dst stride
ldrb r0, [r1] ; pred p0
ldrb r11, [r1, #1] ; pred p1
ldrb r12, [r1, #2] ; pred p2
add r0, r0, r7, asr #19 ; p0 + o0
add r11, r11, r10, asr #19 ; p1 + o1
add r12, r12, r8, asr #19 ; p2 + o2
usat r0, #8, r0 ; d0 = clip8(p0 + o0)
usat r11, #8, r11 ; d1 = clip8(p1 + o1)
usat r12, #8, r12 ; d2 = clip8(p2 + o2)
add r0, r0, r11, lsl #8 ; |--|--|d1|d0|
ldrb r11, [r1, #3] ; pred p3
add r0, r0, r12, lsl #16 ; |--|d2|d1|d0|
add r11, r11, r6, asr #19 ; p3 + o3
sxth r7, r7 ;
sxth r10, r10 ;
usat r11, #8, r11 ; d3 = clip8(p3 + o3)
sxth r8, r8 ;
sxth r6, r6 ;
add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0|
ldrb r12, [r1, r2]! ; pred p4
str r0, [r3], lr
ldrb r11, [r1, #1] ; pred p5
add r12, r12, r7, asr #3 ; p4 + o4
add r11, r11, r10, asr #3 ; p5 + o5
usat r12, #8, r12 ; d4 = clip8(p4 + o4)
usat r11, #8, r11 ; d5 = clip8(p5 + o5)
ldrb r7, [r1, #2] ; pred p6
ldrb r10, [r1, #3] ; pred p7
add r12, r12, r11, lsl #8 ; |--|--|d5|d4|
add r7, r7, r8, asr #3 ; p6 + o6
add r10, r10, r6, asr #3 ; p7 + o7
ldr r0, [sp] ; load input pointer
usat r7, #8, r7 ; d6 = clip8(p6 + o6)
usat r10, #8, r10 ; d7 = clip8(p7 + o7)
add r12, r12, r7, lsl #16 ; |--|d6|d5|d4|
add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4|
str r12, [r3], lr
add r0, r0, #16
add r1, r1, r2 ; pred + pitch
bcs loop2_dual
add sp, sp, #4 ; idct_output buffer
ldmia sp!, {r4 - r11, pc}
pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
usub16 r12, r8, r6 ; c (o1 | o5) c
uadd16 r6, r11, r1 ; d (o3 | o7) d
uadd16 r7, r10, r6 ; a+d a+d
mov r8, #0x4 ; set up 4's 4
orr r8, r8, #0x40000 ; 4|4
usub16 r6, r10, r6 ; a-d a-d
uadd16 r6, r6, r8 ; a-d+4 3|7
uadd16 r7, r7, r8 ; a+d+4 0|4
uadd16 r10, r9, r12 ; b+c b+c
usub16 r1, r9, r12 ; b-c b-c
uadd16 r10, r10, r8 ; b+c+4 1|5
uadd16 r1, r1, r8 ; b-c+4 2|6
mov r8, r10, asr #19 ; o1 >> 3
strh r8, [r0, #2] ; o1
mov r8, r1, asr #19 ; o2 >> 3
strh r8, [r0, #4] ; o2
mov r8, r6, asr #19 ; o3 >> 3
strh r8, [r0, #6] ; o3
mov r8, r7, asr #19 ; o0 >> 3
strh r8, [r0], r2 ; o0 +p
sxth r10, r10 ;
mov r8, r10, asr #3 ; o5 >> 3
strh r8, [r0, #2] ; o5
sxth r1, r1 ;
mov r8, r1, asr #3 ; o6 >> 3
strh r8, [r0, #4] ; o6
sxth r6, r6 ;
mov r8, r6, asr #3 ; o7 >> 3
strh r8, [r0, #6] ; o7
sxth r7, r7 ;
mov r8, r7, asr #3 ; o4 >> 3
strh r8, [r0], r2 ; o4 +p
;;;;; subs r5, r5, #0x1 ; i-- --
bne loop2_dual ;
;
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP
END

Просмотреть файл

@ -1,281 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon_b_armv6|
EXPORT |vp8_recon2b_armv6|
EXPORT |vp8_recon4b_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
prd RN r0
dif RN r1
dst RN r2
stride RN r3
;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
; R0 char* pred_ptr
; R1 short * dif_ptr
; R2 char * dst_ptr
; R3 int stride
; Description:
; Loop through the block adding the Pred and Diff together. Clamp and then
; store back into the Dst.
; Restrictions :
; all buffers are expected to be 4 byte aligned coming in and
; going out.
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_recon_b_armv6| PROC
; Reconstructs one 4x4 block: for each of 4 rows, adds 4 prediction bytes to
; 4 signed 16-bit residuals, saturates each sum to 0..255 and stores the 4
; result bytes as one word.
; In:    prd (r0)    = prediction bytes; advanced 16 bytes per row
;        dif (r1)    = 16-bit residuals; advanced 32 bytes per row
;        dst (r2)    = destination; advanced by stride per row
;        stride (r3) = destination row pitch in bytes
; Uses:  r4, r6-r9 as temporaries (r4-r9 and lr are saved/restored)
; The four rows are fully unrolled; every row runs the same sequence.
stmdb sp!, {r4 - r9, lr}
; row 0: pixels 0, 1, 2, 3
ldr r4, [prd], #16 ; r4 = pred bytes p3 | p2 | p1 | p0, then prd += 16
ldr r6, [dif, #0] ; r6 = diff halfwords d1 | d0
ldr r7, [dif, #4] ; r7 = diff halfwords d3 | d2
pkhbt r8, r6, r7, lsl #16 ; r8 = d2 | d0 (even lanes)
pkhtb r9, r7, r6, asr #16 ; r9 = d3 | d1 (odd lanes)
uxtab16 r8, r8, r4 ; r8 = (d2 + p2) | (d0 + p0)
uxtab16 r9, r9, r4, ror #8 ; rotate brings p3/p1 into bytes 2/0: r9 = (d3 + p3) | (d1 + p1)
usat16 r8, #8, r8 ; saturate both halfwords to unsigned 8 bits
usat16 r9, #8, r9 ; saturate both halfwords to unsigned 8 bits
add dif, dif, #32 ; next residual row (32 bytes = 16 halfwords further on)
orr r8, r8, r9, lsl #8 ; interleave even/odd lanes back into bytes 3 | 2 | 1 | 0
str r8, [dst], stride ; store the row, then dst += stride
; row 1: same sequence as row 0
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #8] ; 1 | 0
;; ldr r7, [dif, #12] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; (d2 + p2) | (d0 + p0)
uxtab16 r9, r9, r4, ror #8 ; (d3 + p3) | (d1 + p1)
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
; row 2: same sequence as row 0
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #16] ; 1 | 0
;; ldr r7, [dif, #20] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; (d2 + p2) | (d0 + p0)
uxtab16 r9, r9, r4, ror #8 ; (d3 + p3) | (d1 + p1)
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
; row 3: same sequence, except dif is not advanced after the last row
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #24] ; 1 | 0
;; ldr r7, [dif, #28] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; (d2 + p2) | (d0 + p0)
uxtab16 r9, r9, r4, ror #8 ; (d3 + p3) | (d1 + p1)
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst], stride
ldmia sp!, {r4 - r9, pc} ; restore r4-r9 and return (saved lr is popped into pc)
ENDP ; |vp8_recon_b_armv6|
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; R0 char *pred_ptr
; R1 short *dif_ptr
; R2 char *dst_ptr
; R3 int stride
|vp8_recon4b_armv6| PROC
; Reconstructs a 16-pixel-wide, 4-row strip: each output byte is the
; prediction byte plus the matching signed 16-bit residual, saturated to 0..255.
; In:    prd (r0)    = prediction bytes, read contiguously (16 per row)
;        dif (r1)    = 16-bit residuals, 32 bytes (16 halfwords) per row
;        dst (r2)    = destination; advanced by stride per row
;        stride (r3) = destination row pitch in bytes
; Uses:  r4, r6-r9 as temporaries, lr as the row counter
;        (r4-r9 and lr are saved/restored)
stmdb sp!, {r4 - r9, lr}
mov lr, #4 ; lr = rows remaining
recon4b_loop
; pixels 0, 1, 2, 3 of this row
ldr r4, [prd], #4 ; r4 = pred bytes p3 | p2 | p1 | p0, then prd += 4
ldr r6, [dif, #0] ; r6 = diff halfwords d1 | d0
ldr r7, [dif, #4] ; r7 = diff halfwords d3 | d2
pkhbt r8, r6, r7, lsl #16 ; r8 = d2 | d0 (even lanes)
pkhtb r9, r7, r6, asr #16 ; r9 = d3 | d1 (odd lanes)
uxtab16 r8, r8, r4 ; r8 = (d2 + p2) | (d0 + p0)
uxtab16 r9, r9, r4, ror #8 ; r9 = (d3 + p3) | (d1 + p1)
usat16 r8, #8, r8 ; saturate both halfwords to unsigned 8 bits
usat16 r9, #8, r9 ; saturate both halfwords to unsigned 8 bits
orr r8, r8, r9, lsl #8 ; interleave lanes back into bytes 3 | 2 | 1 | 0
str r8, [dst] ; store pixels 0-3
; pixels 4, 5, 6, 7: same sequence on the next 4 pred bytes / 4 residuals
ldr r4, [prd], #4
;; ldr r6, [dif, #32]
;; ldr r7, [dif, #36]
ldr r6, [dif, #8]
ldr r7, [dif, #12]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #4] ; store pixels 4-7
; pixels 8, 9, 10, 11
ldr r4, [prd], #4
;; ldr r6, [dif, #64]
;; ldr r7, [dif, #68]
ldr r6, [dif, #16]
ldr r7, [dif, #20]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #8] ; store pixels 8-11
; pixels 12, 13, 14, 15
ldr r4, [prd], #4
;; ldr r6, [dif, #96]
;; ldr r7, [dif, #100]
ldr r6, [dif, #24]
ldr r7, [dif, #28]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #12] ; store pixels 12-15
add dst, dst, stride ; next destination row
;; add dif, dif, #8
add dif, dif, #32 ; next residual row (16 halfwords)
subs lr, lr, #1 ; one row done
bne recon4b_loop ; loop until all 4 rows are written
ldmia sp!, {r4 - r9, pc} ; restore r4-r9 and return (saved lr is popped into pc)
ENDP ; |vp8_recon4b_armv6|
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; R0 char *pred_ptr
; R1 short *dif_ptr
; R2 char *dst_ptr
; R3 int stride
|vp8_recon2b_armv6| PROC
; Reconstructs an 8-pixel-wide, 4-row strip: each output byte is the
; prediction byte plus the matching signed 16-bit residual, saturated to 0..255.
; In:    prd (r0)    = prediction bytes, read contiguously (8 per row)
;        dif (r1)    = 16-bit residuals, 16 bytes (8 halfwords) per row
;        dst (r2)    = destination; advanced by stride per row
;        stride (r3) = destination row pitch in bytes
; Uses:  r4, r6-r9 as temporaries, lr as the row counter
;        (r4-r9 and lr are saved/restored)
stmdb sp!, {r4 - r9, lr}
mov lr, #4 ; lr = rows remaining
recon2b_loop
; pixels 0, 1, 2, 3 of this row
ldr r4, [prd], #4 ; r4 = pred bytes p3 | p2 | p1 | p0, then prd += 4
ldr r6, [dif, #0] ; r6 = diff halfwords d1 | d0
ldr r7, [dif, #4] ; r7 = diff halfwords d3 | d2
pkhbt r8, r6, r7, lsl #16 ; r8 = d2 | d0 (even lanes)
pkhtb r9, r7, r6, asr #16 ; r9 = d3 | d1 (odd lanes)
uxtab16 r8, r8, r4 ; r8 = (d2 + p2) | (d0 + p0)
uxtab16 r9, r9, r4, ror #8 ; r9 = (d3 + p3) | (d1 + p1)
usat16 r8, #8, r8 ; saturate both halfwords to unsigned 8 bits
usat16 r9, #8, r9 ; saturate both halfwords to unsigned 8 bits
orr r8, r8, r9, lsl #8 ; interleave lanes back into bytes 3 | 2 | 1 | 0
str r8, [dst] ; store pixels 0-3
; pixels 4, 5, 6, 7: same sequence on the next 4 pred bytes / 4 residuals
ldr r4, [prd], #4
;; ldr r6, [dif, #32]
;; ldr r7, [dif, #36]
ldr r6, [dif, #8]
ldr r7, [dif, #12]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #4] ; store pixels 4-7
add dst, dst, stride ; next destination row
;; add dif, dif, #8
add dif, dif, #16 ; next residual row (8 halfwords)
subs lr, lr, #1 ; one row done
bne recon2b_loop ; loop until all 4 rows are written
ldmia sp!, {r4 - r9, pc} ; restore r4-r9 and return (saved lr is popped into pc)
ENDP ; |vp8_recon2b_armv6|
END

Просмотреть файл

@ -13,16 +13,12 @@
#define IDCT_ARM_H
#if HAVE_ARMV6
extern prototype_idct(vp8_short_idct4x4llm_1_v6);
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
@ -38,16 +34,12 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#endif
#if HAVE_ARMV7
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
extern prototype_idct(vp8_short_idct4x4llm_neon);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon

Просмотреть файл

@ -14,22 +14,26 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
; unsigned char *dst_ptr, int pitch, int stride)
;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
; int pred_stride, unsigned char *dst_ptr,
; int dst_stride)
; r0 input_dc
; r1 pred_ptr
; r2 dst_ptr
; r3 pitch
; sp stride
; r2 pred_stride
; r3 dst_ptr
; sp dst_stride
|vp8_dc_only_idct_add_neon| PROC
add r0, r0, #4
asr r0, r0, #3
ldr r12, [sp]
vdup.16 q0, r0
vld1.32 {d2[0]}, [r1], r3
vld1.32 {d2[1]}, [r1], r3
vld1.32 {d4[0]}, [r1], r3
vld1.32 {d2[0]}, [r1], r2
vld1.32 {d2[1]}, [r1], r2
vld1.32 {d4[0]}, [r1], r2
vld1.32 {d4[1]}, [r1]
vaddw.u8 q1, q0, d2
@ -38,12 +42,13 @@
vqmovun.s16 d2, q1
vqmovun.s16 d4, q2
vst1.32 {d2[0]}, [r2], r12
vst1.32 {d2[1]}, [r2], r12
vst1.32 {d4[0]}, [r2], r12
vst1.32 {d4[1]}, [r2]
bx lr
vst1.32 {d2[0]}, [r3], r12
vst1.32 {d2[1]}, [r3], r12
vst1.32 {d4[0]}, [r3], r12
vst1.32 {d4[1]}, [r3]
bx lr
ENDP
END

Просмотреть файл

@ -1,131 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon16x16mb_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int ystride,
; stack unsigned char *udst_ptr,
; stack unsigned char *vdst_ptr
|vp8_recon16x16mb_neon| PROC
    ;-------------------------------------------------------------------
    ; Reconstruct a whole macroblock: dst = clamp_u8(pred + diff) for the
    ; 16x16 Y plane followed by the 8x8 U and 8x8 V planes.
    ; In:    r0      = pred_ptr (unsigned char, read contiguously)
    ;        r1      = diff_ptr (short, read contiguously)
    ;        r2      = dst_ptr  (Y destination)
    ;        r3      = ystride  (Y row pitch; U/V use ystride >> 1)
    ;        [sp]    = udst_ptr
    ;        [sp,#4] = vdst_ptr
    ; Clobb: r0-r3, r12, q0-q3, q8-q15, flags
    ; Only caller-saved NEON registers are touched. The procedure call
    ; standard requires d8-d15 (q4-q7) to survive a call, so the pred
    ; bytes are widened and added in one step with vaddw.u8 instead of
    ; being expanded into q4-q7 first.
    ;-------------------------------------------------------------------
    mov         r12, #4                 ; Y loop: 4 passes of 4 rows each

recon16x16mb_loop_y
    vld1.u8     {q0, q1}, [r0]!         ; 64 pred bytes = 4 rows of 16
    vld1.16     {q8, q9}, [r1]!         ; 64 diff values for the same rows
    vld1.u8     {q2, q3}, [r0]!
    vld1.16     {q10, q11}, [r1]!
    vld1.16     {q12, q13}, [r1]!
    vld1.16     {q14, q15}, [r1]!

    pld         [r0]
    pld         [r1]
    pld         [r1, #64]

    vaddw.u8    q8, q8, d0              ; diff + zero-extended pred
    vaddw.u8    q9, q9, d1
    vaddw.u8    q10, q10, d2
    vaddw.u8    q11, q11, d3
    vaddw.u8    q12, q12, d4
    vaddw.u8    q13, q13, d5
    vaddw.u8    q14, q14, d6
    vaddw.u8    q15, q15, d7

    vqmovun.s16 d0, q8                  ; saturate to 0..255 and narrow
    vqmovun.s16 d1, q9
    vqmovun.s16 d2, q10
    vqmovun.s16 d3, q11
    vqmovun.s16 d4, q12
    vqmovun.s16 d5, q13
    vst1.u8     {q0}, [r2], r3          ; row 0 of this pass
    vqmovun.s16 d6, q14
    vst1.u8     {q1}, [r2], r3          ; row 1
    vqmovun.s16 d7, q15
    vst1.u8     {q2}, [r2], r3          ; row 2

    subs        r12, r12, #1
    moveq       r12, #2                 ; after the last Y pass: 2 chroma planes
    vst1.u8     {q3}, [r2], r3          ; row 3 (NEON store leaves flags intact)
    bne         recon16x16mb_loop_y

    mov         r3, r3, lsr #1          ; uv_stride = ystride >> 1
    ldr         r2, [sp]                ; r2 = udst_ptr

recon16x16mb_loop_uv
    vld1.u8     {q0, q1}, [r0]!         ; 64 pred bytes = 8 rows of 8
    vld1.16     {q8, q9}, [r1]!         ; 64 diff values for the same rows
    vld1.u8     {q2, q3}, [r0]!
    vld1.16     {q10, q11}, [r1]!
    vld1.16     {q12, q13}, [r1]!
    vld1.16     {q14, q15}, [r1]!

    vaddw.u8    q8, q8, d0              ; diff + zero-extended pred
    vaddw.u8    q9, q9, d1
    vaddw.u8    q10, q10, d2
    vaddw.u8    q11, q11, d3
    vaddw.u8    q12, q12, d4
    vaddw.u8    q13, q13, d5
    vaddw.u8    q14, q14, d6
    vaddw.u8    q15, q15, d7

    vqmovun.s16 d0, q8                  ; saturate to 0..255 and narrow
    vqmovun.s16 d1, q9
    vqmovun.s16 d2, q10
    vqmovun.s16 d3, q11
    vqmovun.s16 d4, q12
    vqmovun.s16 d5, q13
    vqmovun.s16 d6, q14
    vqmovun.s16 d7, q15

    vst1.u8     {d0}, [r2], r3          ; 8 rows of 8 bytes
    vst1.u8     {d1}, [r2], r3
    vst1.u8     {d2}, [r2], r3
    vst1.u8     {d3}, [r2], r3
    vst1.u8     {d4}, [r2], r3
    vst1.u8     {d5}, [r2], r3
    vst1.u8     {d6}, [r2], r3
    vst1.u8     {d7}, [r2], r3

    subs        r12, r12, #1
    ldrne       r2, [sp, #4]            ; U done: switch to vdst_ptr
    bne         recon16x16mb_loop_uv

    bx          lr
    ENDP
END

Просмотреть файл

@ -1,54 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon2b_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int stride
|vp8_recon2b_neon| PROC
    ;-------------------------------------------------------------------
    ; Reconstruct an 8-wide, 4-row strip: dst = clamp_u8(pred + diff).
    ; In:    r0 = pred_ptr (32 contiguous bytes, 8 per row)
    ;        r1 = diff_ptr (32 contiguous shorts, 8 per row)
    ;        r2 = dst_ptr
    ;        r3 = stride   (bytes between dst rows)
    ; Clobb: r0-r2, q0-q3, q8-q13
    ; The diff values are held in q10-q13 rather than q4-q7: d8-d15 are
    ; callee-saved under the procedure call standard and this routine
    ; does not save or restore anything.
    ;-------------------------------------------------------------------
    vld1.u8     {q8, q9}, [r0]          ; pred rows 0-3
    vld1.16     {q10, q11}, [r1]!       ; diff rows 0-1

    vmovl.u8    q0, d16                 ; widen pred from 8 to 16 bits
    vld1.16     {q12, q13}, [r1]!       ; diff rows 2-3
    vmovl.u8    q1, d17
    vmovl.u8    q2, d18
    vmovl.u8    q3, d19

    vadd.s16    q0, q0, q10             ; pred + diff
    vadd.s16    q1, q1, q11
    vadd.s16    q2, q2, q12
    vadd.s16    q3, q3, q13

    vqmovun.s16 d0, q0                  ; saturate to 0..255 and narrow
    vqmovun.s16 d1, q1
    vqmovun.s16 d2, q2
    vqmovun.s16 d3, q3

    add         r0, r2, r3              ; r0 = dst + stride
    vst1.u8     {d0}, [r2]              ; row 0
    vst1.u8     {d1}, [r0], r3          ; row 1, r0 -> row 2
    add         r2, r0, r3              ; r2 = row 3
    vst1.u8     {d2}, [r0]              ; row 2
    vst1.u8     {d3}, [r2], r3          ; row 3

    bx          lr
    ENDP
END

Просмотреть файл

@ -1,69 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon4b_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int stride
|vp8_recon4b_neon| PROC
    ;-------------------------------------------------------------------
    ; Reconstruct a 16-wide, 4-row strip: dst = clamp_u8(pred + diff).
    ; In:    r0 = pred_ptr (64 contiguous bytes, 16 per row)
    ;        r1 = diff_ptr (64 contiguous shorts, 16 per row)
    ;        r2 = dst_ptr
    ;        r3 = stride   (bytes between dst rows)
    ; Clobb: r0-r2, q0-q3, q8-q15
    ; Only caller-saved NEON registers are touched. The procedure call
    ; standard requires d8-d15 (q4-q7) to survive a call, so the pred
    ; bytes are widened and added in one step with vaddw.u8 instead of
    ; being expanded into q4-q7 first.
    ;-------------------------------------------------------------------
    vld1.u8     {q0, q1}, [r0]!         ; pred rows 0-1
    vld1.16     {q8, q9}, [r1]!         ; diff row 0
    vld1.u8     {q2, q3}, [r0]          ; pred rows 2-3
    vld1.16     {q10, q11}, [r1]!       ; diff row 1
    vld1.16     {q12, q13}, [r1]!       ; diff row 2
    vld1.16     {q14, q15}, [r1]        ; diff row 3

    vaddw.u8    q8, q8, d0              ; diff + zero-extended pred
    vaddw.u8    q9, q9, d1
    vaddw.u8    q10, q10, d2
    vaddw.u8    q11, q11, d3
    vaddw.u8    q12, q12, d4
    vaddw.u8    q13, q13, d5
    vaddw.u8    q14, q14, d6
    vaddw.u8    q15, q15, d7

    vqmovun.s16 d0, q8                  ; saturate to 0..255 and narrow
    vqmovun.s16 d1, q9
    vqmovun.s16 d2, q10
    vqmovun.s16 d3, q11
    vqmovun.s16 d4, q12
    vqmovun.s16 d5, q13
    vqmovun.s16 d6, q14
    vqmovun.s16 d7, q15

    add         r0, r2, r3              ; r0 = dst + stride
    vst1.u8     {q0}, [r2]              ; row 0
    vst1.u8     {q1}, [r0], r3          ; row 1, r0 -> row 2
    add         r2, r0, r3              ; r2 = row 3
    vst1.u8     {q2}, [r0]              ; row 2
    vst1.u8     {q3}, [r2], r3          ; row 3

    bx          lr
    ENDP
END

Просмотреть файл

@ -1,29 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8/common/recon.h"
#include "vp8/common/blockd.h"
extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
    /*
     * Thin C entry point: unpack the macroblock's predictor, residual and
     * destination plane pointers and hand them to the assembly routine.
     * Only the luma stride is forwarded; rtcd is not used here.
     */
    vp8_recon16x16mb_neon(x->predictor,
                          x->diff,
                          x->dst.y_buffer,
                          x->dst.y_stride,
                          x->dst.u_buffer,
                          x->dst.v_buffer);
}

Просмотреть файл

@ -1,61 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon_b_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int stride
|vp8_recon_b_neon| PROC
; Reconstruct one 4x4 block: dst = saturate_u8(pred + diff).
; Prediction rows are read 16 bytes apart, difference rows 32 bytes
; (16 shorts) apart, and output rows are written r3 bytes apart.
; Registers written: r0-r2, r12, q0-q3, q10-q15.
mov r12, #16 ;byte step between prediction rows
vld1.u8 {d28}, [r0], r12 ;load 8 pred bytes per row (only the first 4 reach dst)
vld1.16 {q10, q11}, [r1]! ;load 32 bytes of diff, r1 += 32; row 0 is in d20
vld1.u8 {d29}, [r0], r12
vld1.16 {q11, q12}, [r1]! ;overwrites q11: diff row 1 is in d22
vld1.u8 {d30}, [r0], r12
vld1.16 {q12, q13}, [r1]! ;overwrites q12: diff row 2 is in d24
vld1.u8 {d31}, [r0], r12
vld1.16 {q13}, [r1] ;overwrites q13: diff row 3 is in d26
vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6
vmovl.u8 q2, d30
vmovl.u8 q3, d31
vadd.s16 d0, d0, d20 ;add Diff data and Pred data together (4 lanes per row)
vadd.s16 d2, d2, d22
vadd.s16 d4, d4, d24
vadd.s16 d6, d6, d26
vqmovun.s16 d0, q0 ;CLAMP() saturation
vqmovun.s16 d1, q1 ;q1 (d2,d3) is read here before d2 is overwritten below
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
add r1, r2, r3 ;r1 = output row 1
vst1.32 {d0[0]}, [r2] ;store result: 4 bytes of row 0
vst1.32 {d1[0]}, [r1], r3 ;row 1, then r1 -> row 2
add r2, r1, r3 ;r2 = output row 3
vst1.32 {d2[0]}, [r1] ;row 2
vst1.32 {d3[0]}, [r2], r3 ;row 3
bx lr
ENDP
END

Просмотреть файл

@ -1,67 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_idct4x4llm_1_neon|
EXPORT |vp8_dc_only_idct_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
; r0 short *input;
; r1 short *output;
; r2 int pitch;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|vp8_short_idct4x4llm_1_neon| PROC
    ; Fill a 4x4 block of shorts with input[0] shifted right by 3 with
    ; rounding. r0 = input, r1 = output, r2 = byte distance between rows.
    vld1.16     {d0[]}, [r0]            ;broadcast input[0] to all four lanes

    add         r3, r1, r2              ;r3  = output row 1
    add         r12, r3, r2             ;r12 = output row 2
    add         r0, r12, r2             ;r0  = output row 3 (input pointer is dead)

    vrshr.s16   d0, d0, #3              ;rounding shift right by 3

    vst1.16     {d0}, [r1]              ;row 0
    vst1.16     {d0}, [r3]              ;row 1
    vst1.16     {d0}, [r12]             ;row 2
    vst1.16     {d0}, [r0]              ;row 3

    bx          lr
    ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
; r0 short input_dc;
; r1 short *output;
; r2 int pitch;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|vp8_dc_only_idct_neon| PROC
    ; Fill a 4x4 block of shorts with input_dc shifted right by 3 with
    ; rounding. r0 = input_dc (value), r1 = output, r2 = byte distance
    ; between output rows.
    vdup.16     d0, r0                  ;replicate the DC value into four lanes

    add         r3, r1, r2              ;r3  = output row 1
    add         r12, r3, r2             ;r12 = output row 2
    add         r0, r12, r2             ;r0  = output row 3 (DC already copied)

    vrshr.s16   d0, d0, #3              ;rounding shift right by 3

    vst1.16     {d0}, [r1]              ;row 0
    vst1.16     {d0}, [r3]              ;row 1
    vst1.16     {d0}, [r12]             ;row 2
    vst1.16     {d0}, [r0]              ;row 3

    bx          lr
    ENDP
END

Просмотреть файл

@ -17,18 +17,24 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
;*************************************************************
;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
; unsigned char *dst, int stride)
;r0 short * input
;r1 short * output
;r1 unsigned char * pred
;r2 int pitch
;r3 unsigned char * dst
;sp int stride
;*************************************************************
;static const int cospi8sqrt2minus1=20091;
;static const int sinpi8sqrt2 =35468;
;static const int rounding = 0;
;Optimization note: The resulted data from dequantization are signed 13-bit data that is
;in the range of [-4096, 4095]. This allows to use "vqdmulh"(neon) instruction since
;it won't go out of range (13+16+1=30bits<32bits). This instruction gives the high half
;result of the multiplication that is needed in IDCT.
; static const int cospi8sqrt2minus1=20091;
; static const int sinpi8sqrt2 =35468;
; static const int rounding = 0;
; Optimization note: The resulted data from dequantization are signed
; 13-bit data that is in the range of [-4096, 4095]. This allows to
; use "vqdmulh"(neon) instruction since it won't go out of range
; (13+16+1=30bits<32bits). This instruction gives the high half
; result of the multiplication that is needed in IDCT.
|vp8_short_idct4x4llm_neon| PROC
adr r12, idct_coeff
@ -36,6 +42,7 @@
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
ldr r0, [sp] ; stride
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
@ -94,21 +101,31 @@
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
add r3, r1, r2
add r12, r3, r2
add r0, r12, r2
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vst1.16 {d2}, [r1]
vst1.16 {d3}, [r3]
vst1.16 {d4}, [r12]
vst1.16 {d5}, [r0]
; load prediction data
vld1.32 d6[0], [r1], r2
vld1.32 d6[1], [r1], r2
vld1.32 d7[0], [r1], r2
vld1.32 d7[1], [r1], r2
bx lr
; add prediction and residual
vaddw.u8 q1, q1, d6
vaddw.u8 q2, q2, d7
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
; store to destination
vst1.32 d1[0], [r3], r0
vst1.32 d1[1], [r3], r0
vst1.32 d2[0], [r3], r0
vst1.32 d2[1], [r3], r0
bx lr
ENDP

Просмотреть файл

@ -13,24 +13,12 @@
#define RECON_ARM_H
#if HAVE_ARMV6
extern prototype_recon_block(vp8_recon_b_armv6);
extern prototype_recon_block(vp8_recon2b_armv6);
extern prototype_recon_block(vp8_recon4b_armv6);
extern prototype_copy_block(vp8_copy_mem8x8_v6);
extern prototype_copy_block(vp8_copy_mem8x4_v6);
extern prototype_copy_block(vp8_copy_mem16x16_v6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp8_recon_b_armv6
#undef vp8_recon_recon2
#define vp8_recon_recon2 vp8_recon2b_armv6
#undef vp8_recon_recon4
#define vp8_recon_recon4 vp8_recon4b_armv6
#undef vp8_recon_copy8x8
#define vp8_recon_copy8x8 vp8_copy_mem8x8_v6
@ -43,29 +31,15 @@ extern prototype_copy_block(vp8_copy_mem16x16_v6);
#endif
#if HAVE_ARMV7
extern prototype_recon_block(vp8_recon_b_neon);
extern prototype_recon_block(vp8_recon2b_neon);
extern prototype_recon_block(vp8_recon4b_neon);
extern prototype_copy_block(vp8_copy_mem8x8_neon);
extern prototype_copy_block(vp8_copy_mem8x4_neon);
extern prototype_copy_block(vp8_copy_mem16x16_neon);
extern prototype_recon_macroblock(vp8_recon_mb_neon);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp8_recon_b_neon
#undef vp8_recon_recon2
#define vp8_recon_recon2 vp8_recon2b_neon
#undef vp8_recon_recon4
#define vp8_recon_recon4 vp8_recon4b_neon
#undef vp8_recon_copy8x8
#define vp8_recon_copy8x8 vp8_copy_mem8x8_neon
@ -75,9 +49,6 @@ extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_neon
#undef vp8_recon_recon_mb
#define vp8_recon_recon_mb vp8_recon_mb_neon
#undef vp8_recon_build_intra_predictors_mby
#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon

Просмотреть файл

@ -70,7 +70,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
#if CONFIG_RUNTIME_CPU_DETECT
VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c;
rtcd->idct.idct16 = vp8_short_idct4x4llm_c;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c;
@ -79,11 +78,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->recon.copy16x16 = vp8_copy_mem16x16_c;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_c;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_c;
rtcd->recon.recon = vp8_recon_b_c;
rtcd->recon.recon2 = vp8_recon2b_c;
rtcd->recon.recon4 = vp8_recon4b_c;
rtcd->recon.recon_mb = vp8_recon_mb_c;
rtcd->recon.recon_mby = vp8_recon_mby_c;
rtcd->recon.build_intra_predictors_mby =
vp8_build_intra_predictors_mby;
rtcd->recon.build_intra_predictors_mby_s =

Просмотреть файл

@ -16,12 +16,14 @@
void sym(short *input, short *output)
#define prototype_idct(sym) \
void sym(short *input, short *output, int pitch)
void sym(short *input, unsigned char *pred, int pitch, unsigned char *dst, \
int dst_stride)
#define prototype_idct_scalar_add(sym) \
void sym(short input, \
unsigned char *pred, unsigned char *output, \
int pitch, int stride)
unsigned char *pred, int pred_stride, \
unsigned char *dst, \
int dst_stride)
#if ARCH_X86 || ARCH_X86_64
#include "x86/idct_x86.h"
@ -31,11 +33,6 @@
#include "arm/idct_arm.h"
#endif
#ifndef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_c
#endif
extern prototype_idct(vp8_idct_idct1);
#ifndef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_c
#endif
@ -63,7 +60,6 @@ typedef prototype_second_order((*vp8_second_order_fn_t));
typedef struct
{
vp8_idct_fn_t idct1;
vp8_idct_fn_t idct16;
vp8_idct_scalar_add_fn_t idct1_scalar_add;

Просмотреть файл

@ -24,28 +24,31 @@
**************************************************************************/
static const int cospi8sqrt2minus1 = 20091;
static const int sinpi8sqrt2 = 35468;
static const int rounding = 0;
void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
int pred_stride, unsigned char *dst_ptr,
int dst_stride)
{
int i;
int r, c;
int a1, b1, c1, d1;
short output[16];
short *ip = input;
short *op = output;
int temp1, temp2;
int shortpitch = pitch >> 1;
int shortpitch = 4;
for (i = 0; i < 4; i++)
{
a1 = ip[0] + ip[8];
b1 = ip[0] - ip[8];
temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
temp1 = (ip[4] * sinpi8sqrt2) >> 16;
temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
c1 = temp1 - temp2;
temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
temp2 = (ip[12] * sinpi8sqrt2) >> 16;
d1 = temp1 + temp2;
op[shortpitch*0] = a1 + d1;
@ -66,12 +69,12 @@ void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
a1 = ip[0] + ip[2];
b1 = ip[0] - ip[2];
temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
temp1 = (ip[1] * sinpi8sqrt2) >> 16;
temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
c1 = temp1 - temp2;
temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
temp2 = (ip[3] * sinpi8sqrt2) >> 16;
d1 = temp1 + temp2;
@ -84,27 +87,31 @@ void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
ip += shortpitch;
op += shortpitch;
}
}
void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch)
{
int i;
int a1;
short *op = output;
int shortpitch = pitch >> 1;
a1 = ((input[0] + 4) >> 3);
for (i = 0; i < 4; i++)
ip = output;
for (r = 0; r < 4; r++)
{
op[0] = a1;
op[1] = a1;
op[2] = a1;
op[3] = a1;
op += shortpitch;
for (c = 0; c < 4; c++)
{
int a = ip[c] + pred_ptr[c] ;
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dst_ptr[c] = (unsigned char) a ;
}
ip += 4;
dst_ptr += dst_stride;
pred_ptr += pred_stride;
}
}
void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
int pred_stride, unsigned char *dst_ptr,
int dst_stride)
{
int a1 = ((input_dc + 4) >> 3);
int r, c;
@ -124,8 +131,8 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned ch
dst_ptr[c] = (unsigned char) a ;
}
dst_ptr += stride;
pred_ptr += pitch;
dst_ptr += dst_stride;
pred_ptr += pred_stride;
}
}

Просмотреть файл

@ -12,6 +12,21 @@
#include "invtrans.h"
void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
                             int pitch)
{
    /*
     * Inverse-transform one block and add it to its prediction, writing
     * the result at the block's destination address. `pitch` is passed
     * through as the prediction stride.
     */
    unsigned char *dst = *(b->base_dst) + b->dst;

    if (b->eob <= 1)
    {
        /* Only dqcoeff[0] is handed on: use the DC-only add routine. */
        IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch,
                                            dst, b->dst_stride);
        return;
    }

    /* General case: full 4x4 inverse transform plus add. */
    IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch,
                              dst, b->dst_stride);
}
static void recon_dcblock(MACROBLOCKD *x)
{
@ -25,15 +40,6 @@ static void recon_dcblock(MACROBLOCKD *x)
}
void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch)
{
    /*
     * Inverse-transform one block's dequantized coefficients into its
     * diff buffer, picking the routine from the end-of-block position.
     */
    if (b->eob <= 1)
    {
        IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
        return;
    }

    IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
}
void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
int i;
@ -45,7 +51,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *
for (i = 0; i < 16; i++)
{
vp8_inverse_transform_b(rtcd, &x->block[i], 32);
vp8_inverse_transform_b(rtcd, &x->block[i], 16);
}
}
@ -55,34 +61,10 @@ void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD
for (i = 16; i < 24; i++)
{
vp8_inverse_transform_b(rtcd, &x->block[i], 16);
vp8_inverse_transform_b(rtcd, &x->block[i], 8);
}
}
void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
    int blk;

    /*
     * Every mode other than B_PRED and SPLITMV first runs the inverse
     * Walsh transform on block 24 and then recon_dcblock().
     */
    if (!(x->mode_info_context->mbmi.mode == B_PRED ||
          x->mode_info_context->mbmi.mode == SPLITMV))
    {
        /* do 2nd order transform on the dc block */
        IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
        recon_dcblock(x);
    }

    /* Blocks 0-15 are transformed with pitch 32, blocks 16-23 with pitch 16. */
    for (blk = 0; blk < 24; blk++)
    {
        const int pitch = (blk < 16) ? 32 : 16;

        vp8_inverse_transform_b(rtcd, &x->block[blk], pitch);
    }
}

Просмотреть файл

@ -18,7 +18,7 @@
void sym(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch)
#define prototype_recon_block(sym) \
void sym(unsigned char *pred, short *diff, unsigned char *dst, int pitch)
void sym(unsigned char *pred, short *diff, int diff_stride, unsigned char *dst, int pitch)
#define prototype_recon_macroblock(sym) \
void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x)
@ -27,7 +27,7 @@
void sym(MACROBLOCKD *x)
#define prototype_intra4x4_predict(sym) \
void sym(BLOCKD *x, int b_mode, unsigned char *predictor)
void sym(BLOCKD *x, int b_mode, unsigned char *predictor, int stride)
struct vp8_recon_rtcd_vtable;
@ -54,31 +54,6 @@ extern prototype_copy_block(vp8_recon_copy8x8);
#endif
extern prototype_copy_block(vp8_recon_copy8x4);
#ifndef vp8_recon_recon
#define vp8_recon_recon vp8_recon_b_c
#endif
extern prototype_recon_block(vp8_recon_recon);
#ifndef vp8_recon_recon2
#define vp8_recon_recon2 vp8_recon2b_c
#endif
extern prototype_recon_block(vp8_recon_recon2);
#ifndef vp8_recon_recon4
#define vp8_recon_recon4 vp8_recon4b_c
#endif
extern prototype_recon_block(vp8_recon_recon4);
#ifndef vp8_recon_recon_mb
#define vp8_recon_recon_mb vp8_recon_mb_c
#endif
extern prototype_recon_macroblock(vp8_recon_recon_mb);
#ifndef vp8_recon_recon_mby
#define vp8_recon_recon_mby vp8_recon_mby_c
#endif
extern prototype_recon_macroblock(vp8_recon_recon_mby);
#ifndef vp8_recon_build_intra_predictors_mby
#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby
#endif
@ -111,8 +86,6 @@ extern prototype_intra4x4_predict\
typedef prototype_copy_block((*vp8_copy_block_fn_t));
typedef prototype_recon_block((*vp8_recon_fn_t));
typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t));
typedef struct vp8_recon_rtcd_vtable
@ -120,11 +93,7 @@ typedef struct vp8_recon_rtcd_vtable
vp8_copy_block_fn_t copy16x16;
vp8_copy_block_fn_t copy8x8;
vp8_copy_block_fn_t copy8x4;
vp8_recon_fn_t recon;
vp8_recon_fn_t recon2;
vp8_recon_fn_t recon4;
vp8_recon_mb_fn_t recon_mb;
vp8_recon_mb_fn_t recon_mby;
vp8_build_intra_pred_fn_t build_intra_predictors_mby_s;
vp8_build_intra_pred_fn_t build_intra_predictors_mby;
vp8_build_intra_pred_fn_t build_intra_predictors_mbuv_s;
@ -138,5 +107,4 @@ typedef struct vp8_recon_rtcd_vtable
#define RECON_INVOKE(ctx,fn) vp8_recon_##fn
#endif
void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
#endif

Просмотреть файл

@ -123,7 +123,6 @@ void vp8_copy_mem8x4_c(
}
void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
{
int r;
@ -159,41 +158,73 @@ void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
}
}
static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride)
{
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d->predictor;
ptr_base = *(d->base_pre);
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
}
else
{
RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, pred_ptr, pitch);
RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst, dst_stride);
}
}
static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride)
{
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d->predictor;
ptr_base = *(d->base_pre);
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
}
else
{
RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, pred_ptr, pitch);
RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, dst, dst_stride);
}
}
static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, vp8_subpix_fn_t sppf)
{
    /*
     * Write the 4x4 inter prediction for block `d` to `dst`.
     * A motion vector with a fractional part (low three bits of either
     * component set) goes through `sppf`; otherwise four rows of four
     * bytes are copied straight from the reference.
     */
    const int mv_row = d->bmi.mv.as_mv.row;
    const int mv_col = d->bmi.mv.as_mv.col;
    unsigned char *src = *(d->base_pre) + d->pre
                         + (mv_row >> 3) * d->pre_stride + (mv_col >> 3);
    int rows_left;

    if ((mv_row & 7) || (mv_col & 7))
    {
        sppf(src, d->pre_stride, mv_col & 7, mv_row & 7, dst, dst_stride);
        return;
    }

    for (rows_left = 4; rows_left > 0; rows_left--)
    {
#if !(CONFIG_FAST_UNALIGNED)
        dst[0] = src[0];
        dst[1] = src[1];
        dst[2] = src[2];
        dst[3] = src[3];
#else
        /* one 32-bit access per row when unaligned access is cheap */
        *(uint32_t *)dst = *(uint32_t *)src ;
#endif
        dst += dst_stride;
        src += d->pre_stride;
    }
}
@ -292,7 +323,7 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, 8);
build_inter_predictors2b(x, d0, d0->predictor, 8);
else
{
vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
@ -435,6 +466,9 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
if (x->mode_info_context->mbmi.partitioning < 3)
{
BLOCKD *b;
int dst_stride = x->block[ 0].dst_stride;
x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
@ -447,10 +481,14 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x);
}
build_inter_predictors4b(x, &x->block[ 0], 16);
build_inter_predictors4b(x, &x->block[ 2], 16);
build_inter_predictors4b(x, &x->block[ 8], 16);
build_inter_predictors4b(x, &x->block[10], 16);
b = &x->block[ 0];
build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
b = &x->block[ 2];
build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
b = &x->block[ 8];
build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
b = &x->block[10];
build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
}
else
{
@ -458,6 +496,7 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
int dst_stride = x->block[ 0].dst_stride;
x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
@ -468,11 +507,11 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
}
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, 16);
build_inter_predictors2b(x, d0, *(d0->base_dst) + d0->dst, dst_stride);
else
{
vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
build_inter_predictors_b(d0, *(d0->base_dst) + d0->dst, dst_stride, x->subpixel_predict);
build_inter_predictors_b(d1, *(d1->base_dst) + d1->dst, dst_stride, x->subpixel_predict);
}
}
@ -483,15 +522,16 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
int dst_stride = x->block[ 16].dst_stride;
/* Note: uv mvs already clamped in build_4x4uvmvs() */
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, 8);
build_inter_predictors2b(x, d0, *(d0->base_dst) + d0->dst, dst_stride);
else
{
vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
build_inter_predictors_b(d0, *(d0->base_dst) + d0->dst, dst_stride, x->subpixel_predict);
build_inter_predictors_b(d1, *(d1->base_dst) + d1->dst, dst_stride, x->subpixel_predict);
}
}
}
@ -542,17 +582,83 @@ void build_4x4uvmvs(MACROBLOCKD *x)
}
}
void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
{
if (x->mode_info_context->mbmi.mode != SPLITMV)
if (xd->mode_info_context->mbmi.mode != SPLITMV)
{
vp8_build_inter16x16_predictors_mb(x, x->predictor, &x->predictor[256],
&x->predictor[320], 16, 8);
vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.y_stride, xd->dst.uv_stride);
}
else
{
build_4x4uvmvs(x);
build_inter4x4_predictors_mb(x);
build_4x4uvmvs(xd);
build_inter4x4_predictors_mb(xd);
}
}
/* encoder only*/
static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x)
{
    /*
     * Build the split-mode inter prediction into the per-block predictor
     * buffers, using stride 16 for blocks 0-15 and stride 8 for 16-23.
     */
    int i;

    if (x->mode_info_context->mbmi.partitioning >= 3)
    {
        /* Blocks 0-15 in pairs: one 8x4 call when both share a vector. */
        for (i = 0; i < 16; i += 2)
        {
            BLOCKD *first = &x->block[i];
            BLOCKD *second = &x->block[i + 1];

            first->bmi = x->mode_info_context->bmi[i];
            second->bmi = x->mode_info_context->bmi[i + 1];

            if (first->bmi.mv.as_int != second->bmi.mv.as_int)
            {
                build_inter_predictors_b(first, first->predictor, 16, x->subpixel_predict);
                build_inter_predictors_b(second, second->predictor, 16, x->subpixel_predict);
            }
            else
            {
                build_inter_predictors2b(x, first, first->predictor, 16);
            }
        }
    }
    else
    {
        /* Four 8x8 predictions anchored at blocks 0, 2, 8 and 10. */
        static const int anchor[4] = { 0, 2, 8, 10 };
        int k;

        for (k = 0; k < 4; k++)
            x->block[anchor[k]].bmi = x->mode_info_context->bmi[anchor[k]];

        for (k = 0; k < 4; k++)
        {
            BLOCKD *blk = &x->block[anchor[k]];

            build_inter_predictors4b(x, blk, blk->predictor, 16);
        }
    }

    /* Blocks 16-23 in pairs; their bmi entries are not refreshed here. */
    for (i = 16; i < 24; i += 2)
    {
        BLOCKD *first = &x->block[i];
        BLOCKD *second = &x->block[i + 1];

        if (first->bmi.mv.as_int != second->bmi.mv.as_int)
        {
            build_inter_predictors_b(first, first->predictor, 8, x->subpixel_predict);
            build_inter_predictors_b(second, second->predictor, 8, x->subpixel_predict);
        }
        else
        {
            build_inter_predictors2b(x, first, first->predictor, 8);
        }
    }
}
void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd)
{
    /*
     * Build the inter prediction for a macroblock into xd->predictor.
     * SPLITMV goes through the per-block path; every other mode uses the
     * 16x16 builder with three sub-buffers at offsets 0, 256 and 320
     * (strides 16 and 8).
     */
    if (xd->mode_info_context->mbmi.mode == SPLITMV)
    {
        build_4x4uvmvs(xd);
        build_inter4x4_predictors_mb_e(xd);
        return;
    }

    vp8_build_inter16x16_predictors_mb(xd,
                                       xd->predictor,
                                       xd->predictor + 256,
                                       xd->predictor + 320,
                                       16, 8);
}

Просмотреть файл

@ -26,5 +26,6 @@ extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t s
extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd);
#endif

Просмотреть файл

@ -17,16 +17,6 @@
/* For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
* vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
*/
void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
    /*
     * Walk blocks 16, 18, 20 and 22 and invoke the recon2 routine once
     * per visited block, writing to that block's destination address.
     */
    int idx = 16;

    while (idx < 24)
    {
        BLOCKD *blk = &x->block[idx];
        unsigned char *out = *(blk->base_dst) + blk->dst;

        RECON_INVOKE(rtcd, recon2)(blk->predictor, blk->diff, out, blk->dst_stride);
        idx += 2;
    }
}
void vp8_build_intra_predictors_mby(MACROBLOCKD *x)
{

Просмотреть файл

@ -16,7 +16,7 @@
void vp8_intra4x4_predict(BLOCKD *x,
int b_mode,
unsigned char *predictor)
unsigned char *predictor, int stride)
{
int i, r, c;
@ -50,7 +50,7 @@ void vp8_intra4x4_predict(BLOCKD *x,
predictor[c] = expected_dc;
}
predictor += 16;
predictor += stride;
}
}
break;
@ -72,7 +72,7 @@ void vp8_intra4x4_predict(BLOCKD *x,
predictor[c] = pred;
}
predictor += 16;
predictor += stride;
}
}
break;
@ -94,7 +94,7 @@ void vp8_intra4x4_predict(BLOCKD *x,
predictor[c] = ap[c];
}
predictor += 16;
predictor += stride;
}
}
@ -117,29 +117,29 @@ void vp8_intra4x4_predict(BLOCKD *x,
predictor[c] = lp[r];
}
predictor += 16;
predictor += stride;
}
}
break;
case B_LD_PRED:
{
unsigned char *ptr = Above;
predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
predictor[0 * 16 + 1] =
predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
predictor[0 * 16 + 2] =
predictor[1 * 16 + 1] =
predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
predictor[0 * 16 + 3] =
predictor[1 * 16 + 2] =
predictor[2 * 16 + 1] =
predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
predictor[1 * 16 + 3] =
predictor[2 * 16 + 2] =
predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
predictor[2 * 16 + 3] =
predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
predictor[0 * stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
predictor[0 * stride + 1] =
predictor[1 * stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
predictor[0 * stride + 2] =
predictor[1 * stride + 1] =
predictor[2 * stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
predictor[0 * stride + 3] =
predictor[1 * stride + 2] =
predictor[2 * stride + 1] =
predictor[3 * stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
predictor[1 * stride + 3] =
predictor[2 * stride + 2] =
predictor[3 * stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
predictor[2 * stride + 3] =
predictor[3 * stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
predictor[3 * stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
}
break;
@ -158,22 +158,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
pp[7] = Above[2];
pp[8] = Above[3];
predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[3 * 16 + 1] =
predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[3 * 16 + 2] =
predictor[2 * 16 + 1] =
predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[3 * 16 + 3] =
predictor[2 * 16 + 2] =
predictor[1 * 16 + 1] =
predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * 16 + 3] =
predictor[1 * 16 + 2] =
predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[1 * 16 + 3] =
predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
predictor[3 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[3 * stride + 1] =
predictor[2 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[3 * stride + 2] =
predictor[2 * stride + 1] =
predictor[1 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[3 * stride + 3] =
predictor[2 * stride + 2] =
predictor[1 * stride + 1] =
predictor[0 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * stride + 3] =
predictor[1 * stride + 2] =
predictor[0 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[1 * stride + 3] =
predictor[0 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[0 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
}
break;
@ -193,22 +193,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
pp[8] = Above[3];
predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[3 * 16 + 1] =
predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * 16 + 1] =
predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
predictor[3 * 16 + 2] =
predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[2 * 16 + 2] =
predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
predictor[3 * 16 + 3] =
predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[2 * 16 + 3] =
predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[3 * stride + 1] =
predictor[1 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * stride + 1] =
predictor[0 * stride + 0] = (pp[4] + pp[5] + 1) >> 1;
predictor[3 * stride + 2] =
predictor[1 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[2 * stride + 2] =
predictor[0 * stride + 1] = (pp[5] + pp[6] + 1) >> 1;
predictor[3 * stride + 3] =
predictor[1 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[2 * stride + 3] =
predictor[0 * stride + 2] = (pp[6] + pp[7] + 1) >> 1;
predictor[1 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
predictor[0 * stride + 3] = (pp[7] + pp[8] + 1) >> 1;
}
break;
@ -217,22 +217,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
unsigned char *pp = Above;
predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[2 * 16 + 0] =
predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
predictor[1 * 16 + 1] =
predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * 16 + 1] =
predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
predictor[3 * 16 + 1] =
predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[0 * 16 + 3] =
predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
predictor[1 * 16 + 3] =
predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[1 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[2 * stride + 0] =
predictor[0 * stride + 1] = (pp[1] + pp[2] + 1) >> 1;
predictor[1 * stride + 1] =
predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * stride + 1] =
predictor[0 * stride + 2] = (pp[2] + pp[3] + 1) >> 1;
predictor[3 * stride + 1] =
predictor[1 * stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[0 * stride + 3] =
predictor[2 * stride + 2] = (pp[3] + pp[4] + 1) >> 1;
predictor[1 * stride + 3] =
predictor[3 * stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[3 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
}
break;
@ -250,22 +250,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
pp[8] = Above[3];
predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[2 * 16 + 0] =
predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
predictor[2 * 16 + 1] =
predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * 16 + 2] =
predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
predictor[2 * 16 + 3] =
predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[1 * 16 + 2] =
predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
predictor[1 * 16 + 3] =
predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[3 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[3 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[2 * stride + 0] =
predictor[3 * stride + 2] = (pp[1] + pp[2] + 1) >> 1;
predictor[2 * stride + 1] =
predictor[3 * stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * stride + 2] =
predictor[1 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
predictor[2 * stride + 3] =
predictor[1 * stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[1 * stride + 2] =
predictor[0 * stride + 0] = (pp[3] + pp[4] + 1) >> 1;
predictor[1 * stride + 3] =
predictor[0 * stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[0 * stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[0 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
}
break;
@ -273,28 +273,33 @@ void vp8_intra4x4_predict(BLOCKD *x,
case B_HU_PRED:
{
unsigned char *pp = Left;
predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[0 * 16 + 2] =
predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
predictor[0 * 16 + 3] =
predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[1 * 16 + 2] =
predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
predictor[1 * 16 + 3] =
predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
predictor[2 * 16 + 2] =
predictor[2 * 16 + 3] =
predictor[3 * 16 + 0] =
predictor[3 * 16 + 1] =
predictor[3 * 16 + 2] =
predictor[3 * 16 + 3] = pp[3];
predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[0 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[0 * stride + 2] =
predictor[1 * stride + 0] = (pp[1] + pp[2] + 1) >> 1;
predictor[0 * stride + 3] =
predictor[1 * stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[1 * stride + 2] =
predictor[2 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
predictor[1 * stride + 3] =
predictor[2 * stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
predictor[2 * stride + 2] =
predictor[2 * stride + 3] =
predictor[3 * stride + 0] =
predictor[3 * stride + 1] =
predictor[3 * stride + 2] =
predictor[3 * stride + 3] = pp[3];
}
break;
}
}
/* Copy the 4 above-right bytes down so that the 4x4 prediction modes which use pixels above and
* to the right have filled-in pixels to use.
*/

Просмотреть файл

@ -20,7 +20,6 @@
*/
#if HAVE_MMX
extern prototype_idct(vp8_short_idct4x4llm_1_mmx);
extern prototype_idct(vp8_short_idct4x4llm_mmx);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
@ -28,9 +27,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_mmx
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_mmx

Просмотреть файл

@ -32,197 +32,10 @@
; **************************************************************************/
;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
;int pitch, unsigned char *dest,int stride)
; 4x4 inverse transform (MMX). As written, this body reads three arguments:
;   arg(0) input  - 16 words read as four 8-byte rows at [input+0/8/16/24]
;   arg(1) output - receives four 8-byte rows (four 16-bit results each)
;   arg(2) pitch  - byte distance between output rows
; Two 1-D butterfly passes are separated by a 4x4 word transpose; the second
; pass adds the `fours` constant and shifts right by 3 before the final
; transpose and store.
; NOTE(review): the 5-argument prototype comment directly above (pred/dest/
; stride) does not match this body, which uses only arg(0)..arg(2) and stores
; 16-bit values -- confirm which version of the routine is current.
global sym(vp8_short_idct4x4llm_mmx)
sym(vp8_short_idct4x4llm_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
GET_GOT rbx
; end prolog
mov rax, arg(0) ;input
mov rdx, arg(1) ;output
; load the four input rows, one row of four words per register
movq mm0, [rax ]
movq mm1, [rax+ 8]
movq mm2, [rax+16]
movq mm3, [rax+24]
; rax is reused from here on: input pointer is dead, rax = pitch
movsxd rax, dword ptr arg(2) ;pitch
; ---- first 1-D pass: element-wise butterflies across the row registers ----
psubw mm0, mm2 ; b1= 0-2
; mm2 = 2*ip2 + (ip0 - ip2) = ip0 + ip2
paddw mm2, mm2 ;
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
pmulhw mm5, [GLOBAL(x_s1sqr2)] ;
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
pmulhw mm7, [GLOBAL(x_c1sqr2less1)] ;
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
movq mm5, mm1
movq mm4, mm3
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
; combine: row0 = a1+d1, row1 = b1+c1, row2 = b1-c1, row3 = a1-d1
paddw mm2, mm3 ;0
paddw mm4, mm7 ;1
psubw mm0, mm7 ;2
psubw mm6, mm3 ;3
; ---- transpose the 4x4 block of words ----
movq mm1, mm2 ; 03 02 01 00
movq mm3, mm4 ; 23 22 21 20
punpcklwd mm1, mm0 ; 11 01 10 00
punpckhwd mm2, mm0 ; 13 03 12 02
punpcklwd mm3, mm6 ; 31 21 30 20
punpckhwd mm4, mm6 ; 33 23 32 22
movq mm0, mm1 ; 11 01 10 00
movq mm5, mm2 ; 13 03 12 02
punpckldq mm0, mm3 ; 30 20 10 00
punpckhdq mm1, mm3 ; 31 21 11 01
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
movq mm3, mm5 ; 33 23 13 03
; ---- second 1-D pass: same butterflies on the transposed data ----
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
pmulhw mm5, [GLOBAL(x_s1sqr2)] ;
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
pmulhw mm7, [GLOBAL(x_c1sqr2less1)] ;
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
movq mm5, mm1
movq mm4, mm3
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
; add the `fours` constant to a1 and b1 only: every output is a1+/-d1 or
; b1+/-c1, so each result picks up the constant exactly once before the >>3
paddw mm0, [GLOBAL(fours)]
paddw mm2, [GLOBAL(fours)]
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
paddw mm2, mm3 ;0
paddw mm4, mm7 ;1
psubw mm0, mm7 ;2
psubw mm6, mm3 ;3
; arithmetic shift right by 3 on all four result rows
psraw mm2, 3
psraw mm0, 3
psraw mm4, 3
psraw mm6, 3
; ---- transpose back ----
movq mm1, mm2 ; 03 02 01 00
movq mm3, mm4 ; 23 22 21 20
punpcklwd mm1, mm0 ; 11 01 10 00
punpckhwd mm2, mm0 ; 13 03 12 02
punpcklwd mm3, mm6 ; 31 21 30 20
punpckhwd mm4, mm6 ; 33 23 32 22
movq mm0, mm1 ; 11 01 10 00
movq mm5, mm2 ; 13 03 12 02
punpckldq mm0, mm3 ; 30 20 10 00
punpckhdq mm1, mm3 ; 31 21 11 01
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
; store the four 8-byte output rows at output + {0,1,2,3} * pitch
movq [rdx], mm0
movq [rdx+rax], mm1
movq [rdx+rax*2], mm2
; advance by one pitch so that [rdx+rax*2] addresses row 3
add rdx, rax
movq [rdx+rax*2], mm5
; begin epilog
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
;
; DC-only inverse transform (MMX). Takes the first input word, adds the
; `fours` constant, shifts right arithmetically by 3, and writes the result
; to every word of four 8-byte output rows spaced `pitch` bytes apart.
; `fours` is defined outside this view -- presumably 4 in each word, which
; would make this (input[0] + 4) >> 3; confirm against the RODATA section.
global sym(vp8_short_idct4x4llm_1_mmx)
sym(vp8_short_idct4x4llm_1_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
GET_GOT rbx
; end prolog
mov rax, arg(0) ;input
; movd loads 32 bits (input[0] and input[1]); only the low word survives the
; broadcast below
movd mm0, [rax]
paddw mm0, [GLOBAL(fours)]
mov rdx, arg(1) ;output
psraw mm0, 3
; rax is reused: input pointer is dead, rax = pitch
movsxd rax, dword ptr arg(2) ;pitch
; broadcast the low word of mm0 into all four words
punpcklwd mm0, mm0
punpckldq mm0, mm0
; store the same 8-byte row at output + {0,1,2,3} * pitch
movq [rdx], mm0
movq [rdx+rax], mm0
movq [rdx+rax*2], mm0
; advance by one pitch so that [rdx+rax*2] addresses row 3
add rdx, rax
movq [rdx+rax*2], mm0
; begin epilog
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
global sym(vp8_dc_only_idct_add_mmx)
sym(vp8_dc_only_idct_add_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@ -231,46 +44,171 @@ sym(vp8_dc_only_idct_add_mmx):
push rdi
; end prolog
mov rsi, arg(1) ;s -- prediction
mov rdi, arg(2) ;d -- destination
movsxd rax, dword ptr arg(4) ;stride
movsxd rdx, dword ptr arg(3) ;pitch
pxor mm0, mm0
mov rax, arg(0) ;input
mov rsi, arg(1) ;pred
movd mm5, arg(0) ;input_dc
movq mm0, [rax ]
movq mm1, [rax+ 8]
movq mm2, [rax+16]
movq mm3, [rax+24]
paddw mm5, [GLOBAL(fours)]
%if 0
pxor mm7, mm7
movq [rax], mm7
movq [rax+8], mm7
movq [rax+16],mm7
movq [rax+24],mm7
%endif
movsxd rax, dword ptr arg(2) ;pitch
mov rdx, arg(3) ;dest
movsxd rdi, dword ptr arg(4) ;stride
psraw mm5, 3
punpcklwd mm5, mm5
punpckldq mm5, mm5
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
movd mm1, [rsi]
punpcklbw mm1, mm0
paddsw mm1, mm5
packuswb mm1, mm0 ; pack and unpack to saturate
movd [rdi], mm1
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
movd mm2, [rsi+rdx]
punpcklbw mm2, mm0
paddsw mm2, mm5
packuswb mm2, mm0 ; pack and unpack to saturate
movd [rdi+rax], mm2
pmulhw mm5, [GLOBAL(x_s1sqr2)];
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movd mm3, [rsi+2*rdx]
punpcklbw mm3, mm0
paddsw mm3, mm5
packuswb mm3, mm0 ; pack and unpack to saturate
movd [rdi+2*rax], mm3
movq mm7, mm3 ;
pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
add rdi, rax
add rsi, rdx
movd mm4, [rsi+2*rdx]
punpcklbw mm4, mm0
paddsw mm4, mm5
packuswb mm4, mm0 ; pack and unpack to saturate
movd [rdi+2*rax], mm4
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
movq mm5, mm1
movq mm4, mm3
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
paddw mm2, mm3 ;0
paddw mm4, mm7 ;1
psubw mm0, mm7 ;2
psubw mm6, mm3 ;3
movq mm1, mm2 ; 03 02 01 00
movq mm3, mm4 ; 23 22 21 20
punpcklwd mm1, mm0 ; 11 01 10 00
punpckhwd mm2, mm0 ; 13 03 12 02
punpcklwd mm3, mm6 ; 31 21 30 20
punpckhwd mm4, mm6 ; 33 23 32 22
movq mm0, mm1 ; 11 01 10 00
movq mm5, mm2 ; 13 03 12 02
punpckldq mm0, mm3 ; 30 20 10 00
punpckhdq mm1, mm3 ; 31 21 11 01
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
movq mm3, mm5 ; 33 23 13 03
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
movq mm5, mm1
paddw mm2, mm0 ; a1 =0+2
pmulhw mm5, [GLOBAL(x_s1sqr2)];
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
movq mm7, mm3 ;
pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
psubw mm7, mm5 ; c1
movq mm5, mm1
movq mm4, mm3
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
paddw mm5, mm1
pmulhw mm3, [GLOBAL(x_s1sqr2)]
paddw mm3, mm4
paddw mm3, mm5 ; d1
paddw mm0, [GLOBAL(fours)]
paddw mm2, [GLOBAL(fours)]
movq mm6, mm2 ; a1
movq mm4, mm0 ; b1
paddw mm2, mm3 ;0
paddw mm4, mm7 ;1
psubw mm0, mm7 ;2
psubw mm6, mm3 ;3
psraw mm2, 3
psraw mm0, 3
psraw mm4, 3
psraw mm6, 3
movq mm1, mm2 ; 03 02 01 00
movq mm3, mm4 ; 23 22 21 20
punpcklwd mm1, mm0 ; 11 01 10 00
punpckhwd mm2, mm0 ; 13 03 12 02
punpcklwd mm3, mm6 ; 31 21 30 20
punpckhwd mm4, mm6 ; 33 23 32 22
movq mm0, mm1 ; 11 01 10 00
movq mm5, mm2 ; 13 03 12 02
punpckldq mm0, mm3 ; 30 20 10 00
punpckhdq mm1, mm3 ; 31 21 11 01
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
pxor mm7, mm7
movd mm4, [rsi]
punpcklbw mm4, mm7
paddsw mm0, mm4
packuswb mm0, mm7
movd [rdx], mm0
movd mm4, [rsi+rax]
punpcklbw mm4, mm7
paddsw mm1, mm4
packuswb mm1, mm7
movd [rdx+rdi], mm1
movd mm4, [rsi+2*rax]
punpcklbw mm4, mm7
paddsw mm2, mm4
packuswb mm2, mm7
movd [rdx+rdi*2], mm2
add rdx, rdi
add rsi, rax
movd mm4, [rsi+2*rax]
punpcklbw mm4, mm7
paddsw mm5, mm4
packuswb mm5, mm7
movd [rdx+rdi*2], mm5
; begin epilog
pop rdi
@ -280,6 +218,71 @@ sym(vp8_dc_only_idct_add_mmx):
pop rbp
ret
;void vp8_dc_only_idct_add_mmx(
;short input_dc,
;unsigned char *pred_ptr,
;int pred_stride,
;unsigned char *dst_ptr,
;int stride)
;
; Adds a single DC-derived value to a 4x4 block of predictor bytes and stores
; the result, saturated to unsigned bytes, at dst_ptr. The value added is
; input_dc plus the `fours` constant, shifted right arithmetically by 3
; (`fours` is defined outside this view -- presumably 4 per word; confirm).
; All four predictor rows are loaded before the first store to dst_ptr.
; rax, rdx and rcx are used as scratch and are not saved here.
global sym(vp8_dc_only_idct_add_mmx)
sym(vp8_dc_only_idct_add_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
; end prolog
movd mm5, arg(0) ;input_dc
mov rax, arg(1) ;pred_ptr
movsxd rdx, dword ptr arg(2) ;pred_stride
; mm0 = 0: used to widen bytes to words and as the high half when packing
pxor mm0, mm0
paddw mm5, [GLOBAL(fours)]
; rcx = 3 * pred_stride, so row 3 can be addressed without moving rax
lea rcx, [rdx + rdx*2]
psraw mm5, 3
; broadcast the low word of mm5 into all four words
punpcklwd mm5, mm5
punpckldq mm5, mm5
; load four predictor rows of 4 bytes each
movd mm1, [rax]
movd mm2, [rax+rdx]
movd mm3, [rax+2*rdx]
movd mm4, [rax+rcx]
; rax/rdx are reused for the destination from here on
mov rax, arg(3) ;d -- destination
movsxd rdx, dword ptr arg(4) ;dst_stride
punpcklbw mm1, mm0
paddsw mm1, mm5
packuswb mm1, mm0 ; pack and unpack to saturate
; rcx = 3 * dst_stride
lea rcx, [rdx + rdx*2]
punpcklbw mm2, mm0
paddsw mm2, mm5
packuswb mm2, mm0 ; pack and unpack to saturate
punpcklbw mm3, mm0
paddsw mm3, mm5
packuswb mm3, mm0 ; pack and unpack to saturate
punpcklbw mm4, mm0
paddsw mm4, mm5
packuswb mm4, mm0 ; pack and unpack to saturate
; store four rows of 4 bytes each at dst + {0,1,2,3} * dst_stride
movd [rax], mm1
movd [rax+rdx], mm2
movd [rax+2*rdx], mm3
movd [rax+rcx], mm4
; begin epilog
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
x_s1sqr2:

Просмотреть файл

@ -15,17 +15,15 @@
; (
; short *qcoeff - 0
; short *dequant - 1
; unsigned char *pre - 2
; unsigned char *dst - 3
; int dst_stride - 4
; int blk_stride - 5
; unsigned char *dst - 2
; int dst_stride - 3
; )
global sym(vp8_idct_dequant_0_2x_sse2)
sym(vp8_idct_dequant_0_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SHADOW_ARGS_TO_STACK 4
GET_GOT rbx
; end prolog
@ -47,19 +45,20 @@ sym(vp8_idct_dequant_0_2x_sse2):
movd [rax], xmm5
movd [rax+32], xmm5
;pshufb
mov rax, arg(2) ; dst
movsxd rdx, dword ptr arg(3) ; dst_stride
pshuflw xmm4, xmm4, 00000000b
pshufhw xmm4, xmm4, 00000000b
mov rax, arg(2) ; pre
lea rcx, [rdx + rdx*2]
paddw xmm4, [GLOBAL(fours)]
movsxd rcx, dword ptr arg(5) ; blk_stride
psraw xmm4, 3
movq xmm0, [rax]
movq xmm1, [rax+rcx]
movq xmm2, [rax+2*rcx]
lea rcx, [3*rcx]
movq xmm1, [rax+rdx]
movq xmm2, [rax+2*rdx]
movq xmm3, [rax+rcx]
punpcklbw xmm0, xmm5
@ -67,8 +66,6 @@ sym(vp8_idct_dequant_0_2x_sse2):
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
mov rax, arg(3) ; dst
movsxd rdx, dword ptr arg(4) ; dst_stride
; Add to predict buffer
paddw xmm0, xmm4
@ -97,11 +94,18 @@ sym(vp8_idct_dequant_0_2x_sse2):
pop rbp
ret
;void vp8_idct_dequant_full_2x_sse2
; (
; short *qcoeff - 0
; short *dequant - 1
; unsigned char *dst - 2
; int dst_stride - 3
; )
global sym(vp8_idct_dequant_full_2x_sse2)
sym(vp8_idct_dequant_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SHADOW_ARGS_TO_STACK 4
SAVE_XMM 7
GET_GOT rbx
push rsi
@ -111,14 +115,13 @@ sym(vp8_idct_dequant_full_2x_sse2):
; special case when 2 blocks have 0 or 1 coeffs
; dc is set as first coeff, so no need to load qcoeff
mov rax, arg(0) ; qcoeff
mov rsi, arg(2) ; pre
mov rdi, arg(3) ; dst
movsxd rcx, dword ptr arg(5) ; blk_stride
mov rdx, arg(1) ; dequant
mov rdi, arg(2) ; dst
; Zero out xmm7, for use unpacking
pxor xmm7, xmm7
mov rdx, arg(1) ; dequant
; note the transpose of xmm1 and xmm2, necessary for shuffle
; to spit out sensible data
@ -138,6 +141,7 @@ sym(vp8_idct_dequant_full_2x_sse2):
pmullw xmm2, [rdx+16]
pmullw xmm1, [rdx]
pmullw xmm3, [rdx+16]
movsxd rdx, dword ptr arg(3) ; dst_stride
; repack so block 0 row x and block 1 row x are together
movdqa xmm4, xmm0
@ -162,6 +166,7 @@ sym(vp8_idct_dequant_full_2x_sse2):
paddw xmm2, xmm0 ; a1 = 0+2
pmulhw xmm5, [GLOBAL(x_s1sqr2)]
lea rcx, [rdx + rdx*2] ;dst_stride * 3
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
movdqa xmm7, xmm3
@ -304,8 +309,8 @@ sym(vp8_idct_dequant_full_2x_sse2):
pxor xmm7, xmm7
; Load up predict blocks
movq xmm4, [rsi]
movq xmm5, [rsi+rcx]
movq xmm4, [rdi]
movq xmm5, [rdi+rdx]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
@ -313,9 +318,8 @@ sym(vp8_idct_dequant_full_2x_sse2):
paddw xmm0, xmm4
paddw xmm1, xmm5
movq xmm4, [rsi+2*rcx]
lea rcx, [3*rcx]
movq xmm5, [rsi+rcx]
movq xmm4, [rdi+2*rdx]
movq xmm5, [rdi+rcx]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
@ -331,18 +335,11 @@ sym(vp8_idct_dequant_full_2x_sse2):
packuswb xmm2, xmm7
packuswb xmm3, xmm7
; Load destination stride before writing out,
; doesn't need to persist
movsxd rdx, dword ptr arg(4) ; dst_stride
; store blocks back out
movq [rdi], xmm0
movq [rdi + rdx], xmm1
lea rdi, [rdi + 2*rdx]
movq [rdi], xmm2
movq [rdi + rdx], xmm3
movq [rdi + rdx*2], xmm2
movq [rdi + rcx], xmm3
; begin epilog
pop rdi
@ -357,27 +354,25 @@ sym(vp8_idct_dequant_full_2x_sse2):
; (
; short *qcoeff - 0
; short *dequant - 1
; unsigned char *pre - 2
; unsigned char *dst - 3
; int dst_stride - 4
; short *dc - 5
; unsigned char *dst - 2
; int dst_stride - 3
; short *dc - 4
; )
global sym(vp8_idct_dequant_dc_0_2x_sse2)
sym(vp8_idct_dequant_dc_0_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
; special case when 2 blocks have 0 or 1 coeffs
; dc is set as first coeff, so no need to load qcoeff
mov rax, arg(0) ; qcoeff
mov rsi, arg(2) ; pre
mov rdi, arg(3) ; dst
mov rdx, arg(5) ; dc
mov rdi, arg(2) ; dst
mov rdx, arg(4) ; dc
; Zero out xmm5, for use unpacking
pxor xmm5, xmm5
@ -385,11 +380,13 @@ sym(vp8_idct_dequant_dc_0_2x_sse2):
; load up 2 dc words here == 2*16 = doubleword
movd xmm4, [rdx]
movsxd rdx, dword ptr arg(3) ; dst_stride
lea rcx, [rdx + rdx*2]
; Load up predict blocks
movq xmm0, [rsi]
movq xmm1, [rsi+16]
movq xmm2, [rsi+32]
movq xmm3, [rsi+48]
movq xmm0, [rdi]
movq xmm1, [rdi+rdx*1]
movq xmm2, [rdi+rdx*2]
movq xmm3, [rdi+rcx]
; Duplicate and expand dc across
punpcklwd xmm4, xmm4
@ -417,48 +414,46 @@ sym(vp8_idct_dequant_dc_0_2x_sse2):
packuswb xmm2, xmm5
packuswb xmm3, xmm5
; Load destination stride before writing out,
; doesn't need to persist
movsxd rdx, dword ptr arg(4) ; dst_stride
; store blocks back out
movq [rdi], xmm0
movq [rdi + rdx], xmm1
lea rdi, [rdi + 2*rdx]
movq [rdi], xmm2
movq [rdi + rdx], xmm3
movq [rdi + rdx*2], xmm2
movq [rdi + rcx], xmm3
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_idct_dequant_dc_full_2x_sse2
; (
; short *qcoeff - 0
; short *dequant - 1
; unsigned char *dst - 2
; int dst_stride - 3
; short *dc - 4
; )
global sym(vp8_idct_dequant_dc_full_2x_sse2)
sym(vp8_idct_dequant_dc_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
; special case when 2 blocks have 0 or 1 coeffs
; dc is set as first coeff, so no need to load qcoeff
mov rax, arg(0) ; qcoeff
mov rsi, arg(2) ; pre
mov rdi, arg(3) ; dst
mov rdx, arg(1) ; dequant
mov rdi, arg(2) ; dst
; Zero out xmm7, for use unpacking
pxor xmm7, xmm7
mov rdx, arg(1) ; dequant
; note the transpose of xmm1 and xmm2, necessary for shuffle
; to spit out sensible data
@ -480,7 +475,7 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
pmullw xmm3, [rdx+16]
; DC component
mov rdx, arg(5)
mov rdx, arg(4)
; repack so block 0 row x and block 1 row x are together
movdqa xmm4, xmm0
@ -651,8 +646,10 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
pxor xmm7, xmm7
; Load up predict blocks
movq xmm4, [rsi]
movq xmm5, [rsi+16]
movsxd rdx, dword ptr arg(3) ; dst_stride
movq xmm4, [rdi]
movq xmm5, [rdi+rdx]
lea rcx, [rdx + rdx*2]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
@ -660,8 +657,8 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
paddw xmm0, xmm4
paddw xmm1, xmm5
movq xmm4, [rsi+32]
movq xmm5, [rsi+48]
movq xmm4, [rdi+rdx*2]
movq xmm5, [rdi+rcx]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
@ -679,7 +676,7 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
; Load destination stride before writing out,
; doesn't need to persist
movsxd rdx, dword ptr arg(4) ; dst_stride
movsxd rdx, dword ptr arg(3) ; dst_stride
; store blocks back out
movq [rdi], xmm0
@ -693,7 +690,6 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS

Просмотреть файл

@ -10,53 +10,6 @@
%include "vpx_ports/x86_abi_support.asm"
;void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Reconstructs one 4x4 block (MMX): for each of four rows, widens 4 bytes
; from s to words, adds 4 words from q with signed saturation, packs back to
; unsigned bytes with saturation, and stores 4 bytes to d.
; Row spacing is fixed in the code: s rows are 16 bytes apart, q rows are
; 32 bytes apart, and d rows are `stride` bytes apart.
global sym(vp8_recon_b_mmx)
sym(vp8_recon_b_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;s
mov rdi, arg(2) ;d
mov rdx, arg(1) ;q
movsxd rax, dword ptr arg(3) ;stride
; mm0 = 0: used to widen bytes to words and as the high half when packing
pxor mm0, mm0
; row 0
movd mm1, [rsi]
punpcklbw mm1, mm0
paddsw mm1, [rdx]
packuswb mm1, mm0 ; pack and unpack to saturate
movd [rdi], mm1
; row 1
movd mm2, [rsi+16]
punpcklbw mm2, mm0
paddsw mm2, [rdx+32]
packuswb mm2, mm0 ; pack and unpack to saturate
movd [rdi+rax], mm2
; row 2
movd mm3, [rsi+32]
punpcklbw mm3, mm0
paddsw mm3, [rdx+64]
packuswb mm3, mm0 ; pack and unpack to saturate
movd [rdi+2*rax], mm3
; advance d by one stride so that [rdi+2*rax] addresses row 3
add rdi, rax
; row 3
movd mm4, [rsi+48]
punpcklbw mm4, mm0
paddsw mm4, [rdx+96]
packuswb mm4, mm0 ; pack and unpack to saturate
movd [rdi+2*rax], mm4
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void copy_mem8x8_mmx(

Просмотреть файл

@ -10,121 +10,6 @@
%include "vpx_ports/x86_abi_support.asm"
;void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Reconstructs an 8-pixel-wide, 4-row area (SSE2): for each row, widens
; 8 bytes from s to words, adds 8 words from q with signed saturation, packs
; back to unsigned bytes with saturation, and stores 8 bytes to d.
; Row spacing is fixed in the code: s rows are 8 bytes apart, q rows are
; 16 bytes apart, and d rows are `stride` bytes apart.
; q is used as a 16-byte memory operand of paddsw, which requires each q row
; to be 16-byte aligned.
global sym(vp8_recon2b_sse2)
sym(vp8_recon2b_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;s
mov rdi, arg(2) ;d
mov rdx, arg(1) ;q
movsxd rax, dword ptr arg(3) ;stride
; xmm0 = 0: used to widen bytes to words and as the high half when packing
pxor xmm0, xmm0
; row 0
movq xmm1, MMWORD PTR [rsi]
punpcklbw xmm1, xmm0
paddsw xmm1, XMMWORD PTR [rdx]
packuswb xmm1, xmm0 ; pack and unpack to saturate
movq MMWORD PTR [rdi], xmm1
; row 1
movq xmm2, MMWORD PTR [rsi+8]
punpcklbw xmm2, xmm0
paddsw xmm2, XMMWORD PTR [rdx+16]
packuswb xmm2, xmm0 ; pack and unpack to saturate
movq MMWORD PTR [rdi+rax], xmm2
; row 2
movq xmm3, MMWORD PTR [rsi+16]
punpcklbw xmm3, xmm0
paddsw xmm3, XMMWORD PTR [rdx+32]
packuswb xmm3, xmm0 ; pack and unpack to saturate
movq MMWORD PTR [rdi+rax*2], xmm3
; advance d by one stride so that [rdi+rax*2] addresses row 3
add rdi, rax
; row 3
movq xmm4, MMWORD PTR [rsi+24]
punpcklbw xmm4, xmm0
paddsw xmm4, XMMWORD PTR [rdx+48]
packuswb xmm4, xmm0 ; pack and unpack to saturate
movq MMWORD PTR [rdi+rax*2], xmm4
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Reconstructs a 16-pixel-wide, 4-row area (SSE2): for each row, loads
; 16 bytes from s, widens the low and high halves to words, adds 16 words
; from q with signed saturation, packs both halves back to unsigned bytes
; with saturation, and stores 16 bytes to d.
; Row spacing is fixed in the code: s rows are 16 bytes apart, q rows are
; 32 bytes apart (two 16-byte halves), and d rows are `stride` bytes apart.
; s and d are accessed with movdqa and q as a 16-byte paddsw operand, so all
; three row addresses must be 16-byte aligned.
global sym(vp8_recon4b_sse2)
sym(vp8_recon4b_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
SAVE_XMM 7
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;s
mov rdi, arg(2) ;d
mov rdx, arg(1) ;q
movsxd rax, dword ptr arg(3) ;stride
; xmm0 = 0: used to widen bytes to words
pxor xmm0, xmm0
; row 0: low 8 pixels in xmm1, high 8 pixels in xmm5
movdqa xmm1, XMMWORD PTR [rsi]
movdqa xmm5, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm5, xmm0
paddsw xmm1, XMMWORD PTR [rdx]
paddsw xmm5, XMMWORD PTR [rdx+16]
packuswb xmm1, xmm5 ; pack and unpack to saturate
movdqa XMMWORD PTR [rdi], xmm1
; row 1: low half in xmm2, high half in xmm6
movdqa xmm2, XMMWORD PTR [rsi+16]
movdqa xmm6, xmm2
punpcklbw xmm2, xmm0
punpckhbw xmm6, xmm0
paddsw xmm2, XMMWORD PTR [rdx+32]
paddsw xmm6, XMMWORD PTR [rdx+48]
packuswb xmm2, xmm6 ; pack and unpack to saturate
movdqa XMMWORD PTR [rdi+rax], xmm2
; row 2: low half in xmm3, high half in xmm7
movdqa xmm3, XMMWORD PTR [rsi+32]
movdqa xmm7, xmm3
punpcklbw xmm3, xmm0
punpckhbw xmm7, xmm0
paddsw xmm3, XMMWORD PTR [rdx+64]
paddsw xmm7, XMMWORD PTR [rdx+80]
packuswb xmm3, xmm7 ; pack and unpack to saturate
movdqa XMMWORD PTR [rdi+rax*2], xmm3
; advance d by one stride so that [rdi+rax*2] addresses row 3
add rdi, rax
; row 3: low half in xmm4, high half in xmm5 (xmm5 is free again here)
movdqa xmm4, XMMWORD PTR [rsi+48]
movdqa xmm5, xmm4
punpcklbw xmm4, xmm0
punpckhbw xmm5, xmm0
paddsw xmm4, XMMWORD PTR [rdx+96]
paddsw xmm5, XMMWORD PTR [rdx+112]
packuswb xmm4, xmm5 ; pack and unpack to saturate
movdqa XMMWORD PTR [rdi+rax*2], xmm4
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void copy_mem16x16_sse2(
; unsigned char *src,

Просмотреть файл

@ -20,16 +20,12 @@
*/
#if HAVE_MMX
extern prototype_recon_block(vp8_recon_b_mmx);
extern prototype_copy_block(vp8_copy_mem8x8_mmx);
extern prototype_copy_block(vp8_copy_mem8x4_mmx);
extern prototype_copy_block(vp8_copy_mem16x16_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp8_recon_b_mmx
#undef vp8_recon_copy8x8
#define vp8_recon_copy8x8 vp8_copy_mem8x8_mmx
@ -43,19 +39,11 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx);
#endif
#if HAVE_SSE2
extern prototype_recon_block(vp8_recon2b_sse2);
extern prototype_recon_block(vp8_recon4b_sse2);
extern prototype_copy_block(vp8_copy_mem16x16_sse2);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon2
#define vp8_recon_recon2 vp8_recon2b_sse2
#undef vp8_recon_recon4
#define vp8_recon_recon4 vp8_recon4b_sse2
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2

Просмотреть файл

@ -37,7 +37,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
if (flags & HAS_MMX)
{
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_mmx;
rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
@ -45,7 +44,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->recon.recon = vp8_recon_b_mmx;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_mmx;
@ -81,8 +79,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
if (flags & HAS_SSE2)
{
rtcd->recon.recon2 = vp8_recon2b_sse2;
rtcd->recon.recon4 = vp8_recon4b_sse2;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_sse2;
rtcd->recon.build_intra_predictors_mbuv =
vp8_build_intra_predictors_mbuv_sse2;

Просмотреть файл

@ -12,21 +12,19 @@
AREA |.text|, CODE, READONLY
;void vp8_dequant_dc_idct_v6(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride, int Dc)
;void vp8_dequant_dc_idct_add_v6(short *input, short *dq,
; unsigned char *dest, int stride, int Dc)
; r0 = input
; r1 = dq
; r2 = pred
; r3 = dest
; sp + 36 = pitch ; +4 = 40
; sp + 40 = stride ; +4 = 44
; sp + 44 = Dc ; +4 = 48
; r2 = dst
; r3 = stride
; sp + 36 = Dc
|vp8_dequant_dc_idct_add_v6| PROC
stmdb sp!, {r4-r11, lr}
ldr r6, [sp, #44]
ldr r6, [sp, #36]
ldr r4, [r0] ;input
ldr r5, [r1], #4 ;dq
@ -149,7 +147,7 @@ vp8_dequant_dc_idct_loop2_v6
usub16 r1, r12, r8
uadd16 r8, r11, r6
ldr r9, c0x00040004
ldr r12, [sp, #40]
ldr r12, [sp] ; get stride from stack
uadd16 r6, r10, r8
usub16 r7, r10, r8
uadd16 r7, r7, r9
@ -158,7 +156,7 @@ vp8_dequant_dc_idct_loop2_v6
usub16 r1, r14, r1
uadd16 r10, r10, r9
uadd16 r1, r1, r9
ldr r11, [r2], r12
ldr r11, [r2] ; load input from dst
mov r8, r7, asr #3
pkhtb r9, r8, r10, asr #19
mov r8, r1, asr #3
@ -170,9 +168,7 @@ vp8_dequant_dc_idct_loop2_v6
usat16 r9, #8, r9
usat16 r8, #8, r8
orr r9, r8, r9, lsl #8
ldr r11, [r2], r12
ldr lr, [sp]
ldr r12, [sp, #44]
ldr r11, [r2, r12] ; load input from dst
mov r7, r7, lsl #16
mov r1, r1, lsl #16
mov r10, r10, lsl #16
@ -188,9 +184,8 @@ vp8_dequant_dc_idct_loop2_v6
usat16 r7, #8, r7
usat16 r1, #8, r1
orr r1, r1, r7, lsl #8
str r9, [lr], r12
str r1, [lr], r12
str lr, [sp]
str r9, [r2], r12 ; store output to dst
str r1, [r2], r12 ; store output to dst
bne vp8_dequant_dc_idct_loop2_v6
; vpx_memset

Просмотреть файл

@ -10,15 +10,12 @@
EXPORT |vp8_dequant_idct_add_v6|
AREA |.text|, CODE, READONLY
;void vp8_dequant_idct_v6(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride)
; r0 = input
;void vp8_dequant_idct_add_v6(short *input, short *dq,
; unsigned char *dest, int stride)
; r0 = q
; r1 = dq
; r2 = pred
; r3 = dest
; sp + 36 = pitch ; +4 = 40
; sp + 40 = stride ; +4 = 44
; r2 = dst
; r3 = stride
|vp8_dequant_idct_add_v6| PROC
stmdb sp!, {r4-r11, lr}
@ -127,7 +124,7 @@ vp8_dequant_idct_loop2_v6
usub16 r1, r12, r8
uadd16 r8, r11, r6
ldr r9, c0x00040004
ldr r12, [sp, #40]
ldr r12, [sp] ; get stride from stack
uadd16 r6, r10, r8
usub16 r7, r10, r8
uadd16 r7, r7, r9
@ -136,7 +133,7 @@ vp8_dequant_idct_loop2_v6
usub16 r1, r14, r1
uadd16 r10, r10, r9
uadd16 r1, r1, r9
ldr r11, [r2], r12
ldr r11, [r2] ; load input from dst
mov r8, r7, asr #3
pkhtb r9, r8, r10, asr #19
mov r8, r1, asr #3
@ -148,9 +145,7 @@ vp8_dequant_idct_loop2_v6
usat16 r9, #8, r9
usat16 r8, #8, r8
orr r9, r8, r9, lsl #8
ldr r11, [r2], r12
ldr lr, [sp]
ldr r12, [sp, #44]
ldr r11, [r2, r12] ; load input from dst
mov r7, r7, lsl #16
mov r1, r1, lsl #16
mov r10, r10, lsl #16
@ -166,9 +161,8 @@ vp8_dequant_idct_loop2_v6
usat16 r7, #8, r7
usat16 r1, #8, r1
orr r1, r1, r7, lsl #8
str r9, [lr], r12
str r1, [lr], r12
str lr, [sp]
str r9, [r2], r12 ; store output to dst
str r1, [r2], r12 ; store output to dst
bne vp8_dequant_idct_loop2_v6
; vpx_memset

Просмотреть файл

@ -12,115 +12,121 @@
#include "vp8/common/idct.h"
#include "vp8/decoder/dequantize.h"
void vp8_dequant_dc_idct_add_y_block_v6
(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride, char *eobs, short *dc)
void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,
unsigned char *dst, int stride,
char *eobs, short *dc)
{
int i;
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
vp8_dequant_dc_idct_add_v6 (q, dq, pre, dst, 16, stride, dc[0]);
else
vp8_dc_only_idct_add_v6 (dc[0], pre, dst, 16, stride);
vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]);
else if (eobs[0] == 1)
vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride);
if (eobs[1] > 1)
vp8_dequant_dc_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
else
vp8_dc_only_idct_add_v6 (dc[1], pre+4, dst+4, 16, stride);
{
vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]);
}
else if (eobs[1] == 1)
vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride);
if (eobs[2] > 1)
vp8_dequant_dc_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
else
vp8_dc_only_idct_add_v6 (dc[2], pre+8, dst+8, 16, stride);
{
vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]);
}
else if (eobs[2] == 1)
vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride);
if (eobs[3] > 1)
vp8_dequant_dc_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
else
vp8_dc_only_idct_add_v6 (dc[3], pre+12, dst+12, 16, stride);
{
vp8_dequant_dc_idct_add_v6 (q+48, dq, dst+12, stride, dc[3]);
}
else if (eobs[3] == 1)
vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride);
q += 64;
dc += 4;
pre += 64;
dst += 4*stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_y_block_v6
(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride, char *eobs)
void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
unsigned char *dst,
int stride, char *eobs)
{
int i;
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_v6 (q, dq, pre, dst, 16, stride);
else
vp8_dequant_idct_add_v6 (q, dq, dst, stride);
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dst, 16, stride);
vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride);
else
vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride);
else if (eobs[1] == 1)
{
vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dst+4, 16, stride);
vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride);
((int *)(q+16))[0] = 0;
}
if (eobs[2] > 1)
vp8_dequant_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride);
else
vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride);
else if (eobs[2] == 1)
{
vp8_dc_only_idct_add_v6 (q[32]*dq[0], pre+8, dst+8, 16, stride);
vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride);
((int *)(q+32))[0] = 0;
}
if (eobs[3] > 1)
vp8_dequant_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride);
else
vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride);
else if (eobs[3] == 1)
{
vp8_dc_only_idct_add_v6 (q[48]*dq[0], pre+12, dst+12, 16, stride);
vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride);
((int *)(q+48))[0] = 0;
}
q += 64;
pre += 64;
dst += 4*stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_uv_block_v6
(short *q, short *dq, unsigned char *pre,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq,
unsigned char *dstu,
unsigned char *dstv,
int stride, char *eobs)
{
int i;
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_v6 (q, dq, pre, dstu, 8, stride);
else
vp8_dequant_idct_add_v6 (q, dq, dstu, stride);
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstu, 8, stride);
vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstu+4, 8, stride);
else
vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride);
else if (eobs[1] == 1)
{
vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstu+4, 8, stride);
vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride,
dstu+4, stride);
((int *)(q+16))[0] = 0;
}
q += 32;
pre += 32;
dstu += 4*stride;
eobs += 2;
}
@ -128,23 +134,23 @@ void vp8_dequant_idct_add_uv_block_v6
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_v6 (q, dq, pre, dstv, 8, stride);
else
vp8_dequant_idct_add_v6 (q, dq, dstv, stride);
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstv, 8, stride);
vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstv+4, 8, stride);
else
vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride);
else if (eobs[1] == 1)
{
vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstv+4, 8, stride);
vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride,
dstv+4, stride);
((int *)(q+16))[0] = 0;
}
q += 32;
pre += 32;
dstv += 4*stride;
eobs += 2;
}

Просмотреть файл

@ -49,6 +49,7 @@ extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neo
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_neon
@ -68,6 +69,7 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
#undef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
#endif
#endif
#endif

Просмотреть файл

@ -15,25 +15,24 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dequant_idct_neon(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride)
;void vp8_dequant_idct_add_neon(short *input, short *dq,
; unsigned char *dest, int stride)
; r0 short *input,
; r1 short *dq,
; r2 unsigned char *pred
; r3 unsigned char *dest
; sp int pitch
; sp+4 int stride
; r2 unsigned char *dest
; r3 int stride
|vp8_dequant_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
ldr r1, [sp] ; pitch
vld1.32 {d14[0]}, [r2], r1
vld1.32 {d14[1]}, [r2], r1
vld1.32 {d15[0]}, [r2], r1
vld1.32 {d15[1]}, [r2]
ldr r1, [sp, #4] ; stride
add r1, r2, r3 ; r1 = dest + stride
lsl r3, #1 ; 2x stride
vld1.32 {d14[0]}, [r2], r3
vld1.32 {d14[1]}, [r1], r3
vld1.32 {d15[0]}, [r2]
vld1.32 {d15[1]}, [r1]
adr r12, cospi8sqrt2minus1 ; pointer to the first constant
@ -110,13 +109,16 @@
vaddw.u8 q1, q1, d14
vaddw.u8 q2, q2, d15
sub r2, r2, r3
sub r1, r1, r3
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r3], r1
vst1.32 {d0[1]}, [r3], r1
vst1.32 {d1[0]}, [r3], r1
vst1.32 {d1[1]}, [r3]
vst1.32 {d0[0]}, [r2], r3
vst1.32 {d0[1]}, [r1], r3
vst1.32 {d1[0]}, [r2]
vst1.32 {d1[1]}, [r1]
bx lr

Просмотреть файл

@ -15,101 +15,118 @@
/* place these declarations here because we don't want to maintain them
* outside of this scope
*/
void idct_dequant_dc_full_2x_neon
(short *input, short *dq, unsigned char *pre, unsigned char *dst,
int stride, short *dc);
void idct_dequant_dc_0_2x_neon
(short *dc, unsigned char *pre, unsigned char *dst, int stride);
void idct_dequant_full_2x_neon
(short *q, short *dq, unsigned char *pre, unsigned char *dst,
int pitch, int stride);
void idct_dequant_0_2x_neon
(short *q, short dq, unsigned char *pre, int pitch,
unsigned char *dst, int stride);
void idct_dequant_dc_full_2x_neon(short *input, short *dq,
unsigned char *dst,
int stride, short *dc);
void idct_dequant_dc_0_2x_neon(short *input, short *dq,
unsigned char *dst,
int stride, short *dc);
void idct_dequant_full_2x_neon(short *q, short *dq,
unsigned char *dst, int stride);
void idct_dequant_0_2x_neon(short *q, short dq,
unsigned char *dst, int stride);
/* Dequantize + inverse-transform + add for a block of coefficients whose DC
 * terms are supplied separately through dc[].
 *
 * NOTE: this listing is a rendered diff, so two revisions of the function are
 * interleaved below: the earlier one takes a separate predictor pointer
 * (pre), the later one passes only dst to the helpers.
 *
 * The loop runs 4 times. Each iteration issues two helper calls; the second
 * call of an iteration works at q+32 / dst+8 / dc+2, and the iteration then
 * advances q by 64, dc by 4, dst by 4*stride and eobs by 4.
 *
 * ((short *)eobs)[k] reads two consecutive eob bytes in one access.
 * Masking that value with 0xfefe is non-zero when either byte has any bit
 * other than bit 0 set, i.e. when either eob is something other than 0 or 1;
 * that selects the "full" helper, otherwise the "0" helper is used.
 *
 * NOTE(review): reading eobs through a short* assumes eobs is suitably
 * aligned for 16-bit access - confirm against the callers.
 */
void vp8_dequant_dc_idct_add_y_block_neon
(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride, char *eobs, short *dc)
void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq,
unsigned char *dst,
int stride, char *eobs, short *dc)
{
int i;
for (i = 0; i < 4; i++)
{
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc);
else
idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2);
else
idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride);
/* Later revision: a pair whose two eob bytes are both zero is skipped
 * entirely; neither helper is called for it. */
if (((short *)(eobs))[0])
{
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc);
else
idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc);
}
if (((short *)(eobs))[1])
{
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2);
else
idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2);
}
/* Step every cursor to the next iteration's data. */
q += 64;
dc += 4;
pre += 64;
dst += 4*stride;
eobs += 4;
}
}
/* Dequantize + inverse-transform + add, without a separate DC array.
 *
 * NOTE: this listing is a rendered diff, so two revisions of the function are
 * interleaved below: the earlier one passes a predictor pointer (pre) with a
 * pitch of 16 to the helpers, the later one passes only dst and stride.
 *
 * The loop runs 4 times. Each iteration issues two helper calls; the second
 * works at q+32 / dst+8. The iteration then advances q by 64, dst by
 * 4*stride and eobs by 4.
 *
 * ((short *)eobs)[k] reads two consecutive eob bytes in one access; the
 * 0xfefe mask is non-zero when either byte is something other than 0 or 1,
 * which selects idct_dequant_full_2x_neon. Otherwise idct_dequant_0_2x_neon
 * is called with only dq[0], the first dequantization factor.
 *
 * NOTE(review): reading eobs through a short* assumes eobs is suitably
 * aligned for 16-bit access - confirm against the callers.
 */
void vp8_dequant_idct_add_y_block_neon
(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride, char *eobs)
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
unsigned char *dst,
int stride, char *eobs)
{
int i;
for (i = 0; i < 4; i++)
{
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride);
else
idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride);
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride);
else
idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride);
/* Later revision: a pair whose two eob bytes are both zero is skipped
 * entirely; neither helper is called for it. */
if (((short *)(eobs))[0])
{
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, dst, stride);
else
idct_dequant_0_2x_neon (q, dq[0], dst, stride);
}
if (((short *)(eobs))[1])
{
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
else
idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
}
/* Step every cursor to the next iteration's data. */
q += 64;
pre += 64;
dst += 4*stride;
eobs += 4;
}
}
/* Dequantize + inverse-transform + add for two destination buffers, dstu
 * and dstv, fully unrolled into four steps.
 *
 * NOTE: this listing is a rendered diff, so two revisions of the function are
 * interleaved below: the earlier one passes a predictor pointer (pre) with a
 * pitch of 8 to the helpers, the later one passes only the destination and
 * stride.
 *
 * Steps 0 and 1 write to dstu (dstu is moved down by 4*stride between them);
 * steps 2 and 3 write to dstv (dstv is moved down by 4*stride between them).
 * q advances by 32 after each of the first three steps. Step k is selected
 * by ((short *)eobs)[k], which reads two consecutive eob bytes in one access:
 * the 0xfefe mask is non-zero when either byte is something other than
 * 0 or 1, which selects idct_dequant_full_2x_neon; otherwise
 * idct_dequant_0_2x_neon is called with only dq[0].
 *
 * NOTE(review): reading eobs through a short* assumes eobs is suitably
 * aligned for 16-bit access - confirm against the callers.
 */
void vp8_dequant_idct_add_uv_block_neon
(short *q, short *dq, unsigned char *pre,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
unsigned char *dstu,
unsigned char *dstv,
int stride, char *eobs)
{
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
else
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
/* Later revision: a pair whose two eob bytes are both zero is skipped
 * entirely; the same guard wraps each of the four steps. */
if (((short *)(eobs))[0])
{
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, dstu, stride);
else
idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
}
q += 32;
pre += 32;
dstu += 4*stride;
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
else
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
if (((short *)(eobs))[1])
{
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, dstu, stride);
else
idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
}
/* From here on the output goes to dstv instead of dstu. */
q += 32;
pre += 32;
if (((short *)eobs)[2] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
else
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
if (((short *)(eobs))[2])
{
if (((short *)eobs)[2] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, dstv, stride);
else
idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
}
q += 32;
pre += 32;
dstv += 4*stride;
if (((short *)eobs)[3] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
else
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
if (((short *)(eobs))[3])
{
if (((short *)eobs)[3] & 0xfefe)
idct_dequant_full_2x_neon (q, dq, dstv, stride);
else
idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
}
}

Просмотреть файл

@ -14,38 +14,38 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
; int pitch, unsigned char *dst, int stride);
;void idct_dequant_0_2x_neon(short *q, short dq,
; unsigned char *dst, int stride);
; r0 *q
; r1 dq
; r2 *pre
; r3 pitch
; sp *dst
; sp+4 stride
; r2 *dst
; r3 stride
|idct_dequant_0_2x_neon| PROC
push {r4, r5}
add r12, r2, #4
vld1.32 {d2[0]}, [r2], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d4[0]}, [r2], r3
vld1.32 {d4[1]}, [r2]
vld1.32 {d8[0]}, [r12], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d8[1]}, [r12], r3
vld1.32 {d4[0]}, [r2], r3
vld1.32 {d10[0]}, [r12], r3
vld1.32 {d10[1]}, [r12]
vld1.32 {d4[1]}, [r2], r3
vld1.32 {d10[1]}, [r12], r3
ldrh r12, [r0] ; lo q
ldrh r2, [r0, #32] ; hi q
mov r3, #0
strh r3, [r0]
strh r3, [r0, #32]
ldrh r4, [r0, #32] ; hi q
mov r5, #0
strh r5, [r0]
strh r5, [r0, #32]
sxth r12, r12 ; lo
mul r0, r12, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q0, r0
sxth r2, r2 ; hi
mul r0, r2, r1
sxth r4, r4 ; hi
mul r0, r4, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q3, r0
@ -55,25 +55,25 @@
vaddw.u8 q4, q3, d8 ; hi
vaddw.u8 q5, q3, d10
ldr r2, [sp] ; dst
ldr r3, [sp, #4] ; stride
sub r2, r2, r3, lsl #2 ; dst - 4*stride
add r0, r2, #4
vqmovun.s16 d2, q1 ; lo
vqmovun.s16 d4, q2
vqmovun.s16 d8, q4 ; hi
vqmovun.s16 d10, q5
add r0, r2, #4
vst1.32 {d2[0]}, [r2], r3 ; lo
vst1.32 {d2[1]}, [r2], r3
vst1.32 {d4[0]}, [r2], r3
vst1.32 {d4[1]}, [r2]
vst1.32 {d8[0]}, [r0], r3 ; hi
vst1.32 {d2[1]}, [r2], r3
vst1.32 {d8[1]}, [r0], r3
vst1.32 {d4[0]}, [r2], r3
vst1.32 {d10[0]}, [r0], r3
vst1.32 {d4[1]}, [r2]
vst1.32 {d10[1]}, [r0]
bx lr
pop {r4, r5}
bx lr
ENDP ; |idct_dequant_0_2x_neon|
ENDP ; |idct_dequant_0_2x_neon|
END

Просмотреть файл

@ -14,25 +14,29 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
; unsigned char *dst, int stride);
; r0 *dc
; r1 *pre
; r2 *dst
; r3 stride
|idct_dequant_dc_0_2x_neon| PROC
ldr r0, [r0] ; *dc
mov r12, #16
vld1.32 {d2[0]}, [r1], r12 ; lo
vld1.32 {d2[1]}, [r1], r12
vld1.32 {d4[0]}, [r1], r12
vld1.32 {d4[1]}, [r1]
sub r1, r1, #44
vld1.32 {d8[0]}, [r1], r12 ; hi
vld1.32 {d8[1]}, [r1], r12
vld1.32 {d10[0]}, [r1], r12
vld1.32 {d10[1]}, [r1]
;void idct_dequant_dc_0_2x_neon(short *q, short *dq,
; unsigned char *dst, int stride, short *dc);
; r0 *q,
; r1 *dq,
; r2 *dst
; r3 stride
; sp *dc
|idct_dequant_dc_0_2x_neon| PROC
; no q- or dq-coeffs, so r0 and r1 are free to use
ldr r1, [sp] ; *dc
add r12, r2, #4
ldr r0, [r1]
vld1.32 {d2[0]}, [r2], r3 ; lo
vld1.32 {d8[0]}, [r12], r3 ; hi
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d8[1]}, [r12], r3
vld1.32 {d4[0]}, [r2], r3
vld1.32 {d10[0]}, [r12], r3
vld1.32 {d4[1]}, [r2], r3
vld1.32 {d10[1]}, [r12]
sxth r1, r0 ; lo *dc
add r1, r1, #4
@ -53,14 +57,16 @@
vqmovun.s16 d8, q4 ; hi
vqmovun.s16 d10, q5
sub r2, r2, r3, lsl #2 ; dst - 4*stride
add r0, r2, #4
vst1.32 {d2[0]}, [r2], r3 ; lo
vst1.32 {d2[1]}, [r2], r3
vst1.32 {d4[0]}, [r2], r3
vst1.32 {d4[1]}, [r2]
vst1.32 {d8[0]}, [r0], r3 ; hi
vst1.32 {d2[1]}, [r2], r3
vst1.32 {d8[1]}, [r0], r3
vst1.32 {d4[0]}, [r2], r3
vst1.32 {d10[0]}, [r0], r3
vst1.32 {d4[1]}, [r2]
vst1.32 {d10[1]}, [r0]
bx lr

Просмотреть файл

@ -15,33 +15,34 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
;void idct_dequant_dc_full_2x_neon(short *q, short *dq,
; unsigned char *dst, int stride, short *dc);
; r0 *q,
; r1 *dq,
; r2 *pre
; r3 *dst
; sp stride
; sp+4 *dc
; r2 *dst
; r3 stride
; sp *dc
|idct_dequant_dc_full_2x_neon| PROC
push {r4}
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
mov r1, #16 ; pitch
add r0, r0, #32
vld1.16 {q4, q5}, [r0] ; r q
add r12, r2, #4
; interleave the predictors
vld1.32 {d28[0]}, [r2], r1 ; l pre
vld1.32 {d28[1]}, [r12], r1 ; r pre
vld1.32 {d29[0]}, [r2], r1
vld1.32 {d29[1]}, [r12], r1
vld1.32 {d30[0]}, [r2], r1
vld1.32 {d30[1]}, [r12], r1
vld1.32 {d31[0]}, [r2]
ldr r1, [sp, #4]
vld1.32 {d28[0]}, [r2], r3 ; l pre
vld1.32 {d28[1]}, [r12], r3 ; r pre
vld1.32 {d29[0]}, [r2], r3
vld1.32 {d29[1]}, [r12], r3
vld1.32 {d30[0]}, [r2], r3
vld1.32 {d30[1]}, [r12], r3
vld1.32 {d31[0]}, [r2], r3
ldr r1, [sp, #4] ; *dc
vld1.32 {d31[1]}, [r12]
adr r2, cospi8sqrt2minus1 ; pointer to the first constant
adr r4, cospi8sqrt2minus1 ; pointer to the first constant
ldrh r12, [r1], #2 ; lo *dc
ldrh r1, [r1] ; hi *dc
@ -56,7 +57,7 @@
vmov.16 d4[0], r12
vmov.16 d8[0], r1
vld1.16 {d0}, [r2]
vld1.16 {d0}, [r4]
; q2: l0r0 q3: l8r8
; q4: l4r4 q5: l12r12
@ -176,26 +177,28 @@
sub r0, r0, #32
vst1.16 {q14, q15}, [r0] ; write over low input
sub r2, r2, r3, lsl #2 ; dst - 4*stride
add r1, r2, #4 ; hi
;saturate and narrow
vqmovun.s16 d0, q4 ; lo
vqmovun.s16 d1, q5
vqmovun.s16 d2, q6 ; hi
vqmovun.s16 d3, q7
ldr r1, [sp] ; stride
add r2, r3, #4 ; hi
vst1.32 {d0[0]}, [r3], r1 ; lo
vst1.32 {d0[1]}, [r2], r1 ; hi
vst1.32 {d1[0]}, [r3], r1
vst1.32 {d1[1]}, [r2], r1
vst1.32 {d2[0]}, [r3], r1
vst1.32 {d2[1]}, [r2], r1
vst1.32 {d3[0]}, [r3]
vst1.32 {d3[1]}, [r2]
vst1.32 {d0[0]}, [r2], r3 ; lo
vst1.32 {d0[1]}, [r1], r3 ; hi
vst1.32 {d1[0]}, [r2], r3
vst1.32 {d1[1]}, [r1], r3
vst1.32 {d2[0]}, [r2], r3
vst1.32 {d2[1]}, [r1], r3
vst1.32 {d3[0]}, [r2]
vst1.32 {d3[1]}, [r1]
bx lr
pop {r4}
bx lr
ENDP ; |idct_dequant_dc_full_2x_neon|
ENDP ; |idct_dequant_dc_full_2x_neon|
; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b

Просмотреть файл

@ -15,32 +15,30 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
; unsigned char *dst, int pitch, int stride);
;void idct_dequant_full_2x_neon(short *q, short *dq,
; unsigned char *dst, int stride);
; r0 *q,
; r1 *dq,
; r2 *pre
; r3 *dst
; sp pitch
; sp+4 stride
; r2 *dst
; r3 stride
|idct_dequant_full_2x_neon| PROC
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
ldr r1, [sp] ; pitch
add r0, r0, #32
vld1.16 {q4, q5}, [r0] ; r q
add r12, r2, #4
; interleave the predictors
vld1.32 {d28[0]}, [r2], r1 ; l pre
vld1.32 {d28[1]}, [r12], r1 ; r pre
vld1.32 {d29[0]}, [r2], r1
vld1.32 {d29[1]}, [r12], r1
vld1.32 {d30[0]}, [r2], r1
vld1.32 {d30[1]}, [r12], r1
vld1.32 {d31[0]}, [r2]
vld1.32 {d28[0]}, [r2], r3 ; l pre
vld1.32 {d28[1]}, [r12], r3 ; r pre
vld1.32 {d29[0]}, [r2], r3
vld1.32 {d29[1]}, [r12], r3
vld1.32 {d30[0]}, [r2], r3
vld1.32 {d30[1]}, [r12], r3
vld1.32 {d31[0]}, [r2], r3
vld1.32 {d31[1]}, [r12]
adr r2, cospi8sqrt2minus1 ; pointer to the first constant
adr r1, cospi8sqrt2minus1 ; pointer to the first constant
; dequant: q[i] = q[i] * dq[i]
vmul.i16 q2, q2, q0
@ -48,7 +46,7 @@
vmul.i16 q4, q4, q0
vmul.i16 q5, q5, q1
vld1.16 {d0}, [r2]
vld1.16 {d0}, [r1]
; q2: l0r0 q3: l8r8
; q4: l4r4 q5: l12r12
@ -168,22 +166,23 @@
sub r0, r0, #32
vst1.16 {q14, q15}, [r0] ; write over low input
sub r2, r2, r3, lsl #2 ; dst - 4*stride
add r1, r2, #4 ; hi
;saturate and narrow
vqmovun.s16 d0, q4 ; lo
vqmovun.s16 d1, q5
vqmovun.s16 d2, q6 ; hi
vqmovun.s16 d3, q7
ldr r1, [sp, #4] ; stride
add r2, r3, #4 ; hi
vst1.32 {d0[0]}, [r3], r1 ; lo
vst1.32 {d0[1]}, [r2], r1 ; hi
vst1.32 {d1[0]}, [r3], r1
vst1.32 {d1[1]}, [r2], r1
vst1.32 {d2[0]}, [r3], r1
vst1.32 {d2[1]}, [r2], r1
vst1.32 {d3[0]}, [r3]
vst1.32 {d3[1]}, [r2]
vst1.32 {d0[0]}, [r2], r3 ; lo
vst1.32 {d0[1]}, [r1], r3 ; hi
vst1.32 {d1[0]}, [r2], r3
vst1.32 {d1[1]}, [r1], r3
vst1.32 {d2[0]}, [r2], r3
vst1.32 {d2[1]}, [r1], r3
vst1.32 {d3[0]}, [r2]
vst1.32 {d3[1]}, [r1]
bx lr

Просмотреть файл

@ -167,12 +167,12 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* do prediction */
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
{
RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv)(xd);
RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
if (mode != B_PRED)
{
RECON_INVOKE(&pbi->common.rtcd.recon,
build_intra_predictors_mby)(xd);
build_intra_predictors_mby_s)(xd);
} else {
vp8_intra_prediction_down_copy(xd);
}
@ -211,20 +211,24 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
int b_mode = xd->mode_info_context->bmi[i].as_mode;
RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
(b, b_mode, b->predictor);
(b, b_mode, *(b->base_dst) + b->dst, b->dst_stride);
if (xd->eobs[i] > 1)
if (xd->eobs[i] )
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)
(b->qcoeff, b->dequant, b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
(b->qcoeff[0] * b->dequant[0], b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
if (xd->eobs[i] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)
(b->qcoeff, b->dequant,
*(b->base_dst) + b->dst, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
(b->qcoeff[0] * b->dequant[0],
*(b->base_dst) + b->dst, b->dst_stride,
*(b->base_dst) + b->dst, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
}
}
@ -233,18 +237,18 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
{
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
xd->predictor, xd->dst.y_buffer,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
else
{
BLOCKD *b = &xd->block[24];
DEQUANT_INVOKE(&pbi->dequant, block)(b);
/* do 2nd order transform on the dc block */
if (xd->eobs[24] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, block)(b);
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
((int *)b->qcoeff)[0] = 0;
((int *)b->qcoeff)[1] = 0;
@ -257,19 +261,20 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
}
else
{
b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
((int *)b->qcoeff)[0] = 0;
}
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
xd->predictor, xd->dst.y_buffer,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->block[16].dequant,
xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}

Просмотреть файл

@ -14,10 +14,6 @@
#include "vp8/common/idct.h"
#include "vpx_mem/vpx_mem.h"
extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) ;
extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
void vp8_dequantize_b_c(BLOCKD *d)
{
int i;
@ -31,12 +27,9 @@ void vp8_dequantize_b_c(BLOCKD *d)
}
}
void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride)
void vp8_dequant_idct_add_c(short *input, short *dq,
unsigned char *dest, int stride)
{
short output[16];
short *diff_ptr = output;
int r, c;
int i;
for (i = 0; i < 16; i++)
@ -44,40 +37,17 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
input[i] = dq[i] * input[i];
}
/* the idct halves ( >> 1) the pitch */
vp8_short_idct4x4llm_c(input, output, 4 << 1);
vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
vpx_memset(input, 0, 32);
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 4;
pred += pitch;
}
}
void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride,
void vp8_dequant_dc_idct_add_c(short *input, short *dq,
unsigned char *dest, int stride,
int Dc)
{
int i;
short output[16];
short *diff_ptr = output;
int r, c;
input[0] = (short)Dc;
@ -86,28 +56,8 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
input[i] = dq[i] * input[i];
}
/* the idct halves ( >> 1) the pitch */
vp8_short_idct4x4llm_c(input, output, 4 << 1);
vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
vpx_memset(input, 0, 32);
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
int a = diff_ptr[c] + pred[c];
if (a < 0)
a = 0;
if (a > 255)
a = 255;
dest[c] = (unsigned char) a;
}
dest += stride;
diff_ptr += 4;
pred += pitch;
}
}

Просмотреть файл

@ -18,28 +18,28 @@
#define prototype_dequant_idct_add(sym) \
void sym(short *input, short *dq, \
unsigned char *pred, unsigned char *output, \
int pitch, int stride)
unsigned char *output, \
int stride)
#define prototype_dequant_dc_idct_add(sym) \
void sym(short *input, short *dq, \
unsigned char *pred, unsigned char *output, \
int pitch, int stride, \
unsigned char *dst, \
int stride, \
int dc)
#define prototype_dequant_dc_idct_add_y_block(sym) \
void sym(short *q, short *dq, \
unsigned char *pre, unsigned char *dst, \
unsigned char *dst, \
int stride, char *eobs, short *dc)
#define prototype_dequant_idct_add_y_block(sym) \
void sym(short *q, short *dq, \
unsigned char *pre, unsigned char *dst, \
unsigned char *dst, \
int stride, char *eobs)
#define prototype_dequant_idct_add_uv_block(sym) \
void sym(short *q, short *dq, \
unsigned char *pre, unsigned char *dst_u, \
unsigned char *dst_u, \
unsigned char *dst_v, int stride, char *eobs)
#if ARCH_X86 || ARCH_X86_64

Просмотреть файл

@ -621,9 +621,8 @@ void vp8_conceal_corrupt_mb(MACROBLOCKD *xd)
{
/* This macroblock has corrupt residual, use the motion compensated
image (predictor) for concealment */
vp8_recon_copy16x16(xd->predictor, 16, xd->dst.y_buffer, xd->dst.y_stride);
vp8_recon_copy8x8(xd->predictor + 256, 8,
xd->dst.u_buffer, xd->dst.uv_stride);
vp8_recon_copy8x8(xd->predictor + 320, 8,
xd->dst.v_buffer, xd->dst.uv_stride);
/* The build predictor functions now output directly into the dst buffer,
* so the copies are no longer necessary */
}

Просмотреть файл

@ -12,16 +12,17 @@
#include "vp8/common/idct.h"
#include "dequantize.h"
void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride,
void vp8_dequant_dc_idct_add_c(short *input, short *dq,
unsigned char *dest, int stride,
int Dc);
void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride);
void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
unsigned char *dst_ptr, int pitch, int stride);
void vp8_dequant_idct_add_c(short *input, short *dq,
unsigned char *dest, int stride);
void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
int pred_stride, unsigned char *dst_ptr,
int dst_stride);
void vp8_dequant_dc_idct_add_y_block_c
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs, short *dc)
{
int i, j;
@ -31,23 +32,21 @@ void vp8_dequant_dc_idct_add_y_block_c
for (j = 0; j < 4; j++)
{
if (*eobs++ > 1)
vp8_dequant_dc_idct_add_c (q, dq, pre, dst, 16, stride, dc[0]);
vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]);
else
vp8_dc_only_idct_add_c (dc[0], pre, dst, 16, stride);
vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, stride);
q += 16;
pre += 4;
dst += 4;
dc ++;
}
pre += 64 - 16;
dst += 4*stride - 16;
}
}
void vp8_dequant_idct_add_y_block_c
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
{
int i, j;
@ -57,25 +56,23 @@ void vp8_dequant_idct_add_y_block_c
for (j = 0; j < 4; j++)
{
if (*eobs++ > 1)
vp8_dequant_idct_add_c (q, dq, pre, dst, 16, stride);
vp8_dequant_idct_add_c (q, dq, dst, stride);
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dst, 16, stride);
vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
((int *)q)[0] = 0;
}
q += 16;
pre += 4;
dst += 4;
}
pre += 64 - 16;
dst += 4*stride - 16;
}
}
void vp8_dequant_idct_add_uv_block_c
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
int i, j;
@ -85,19 +82,17 @@ void vp8_dequant_idct_add_uv_block_c
for (j = 0; j < 2; j++)
{
if (*eobs++ > 1)
vp8_dequant_idct_add_c (q, dq, pre, dstu, 8, stride);
vp8_dequant_idct_add_c (q, dq, dstu, stride);
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstu, 8, stride);
vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
((int *)q)[0] = 0;
}
q += 16;
pre += 4;
dstu += 4;
}
pre += 32 - 8;
dstu += 4*stride - 8;
}
@ -106,19 +101,17 @@ void vp8_dequant_idct_add_uv_block_c
for (j = 0; j < 2; j++)
{
if (*eobs++ > 1)
vp8_dequant_idct_add_c (q, dq, pre, dstv, 8, stride);
vp8_dequant_idct_add_c (q, dq, dstv, stride);
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstv, 8, stride);
vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
((int *)q)[0] = 0;
}
q += 16;
pre += 4;
dstv += 4;
}
pre += 32 - 8;
dstv += 4*stride - 8;
}
}

Просмотреть файл

@ -606,6 +606,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
MACROBLOCKD *xd,
int b_mode,
unsigned char *predictor,
int stride,
int mb_row,
int mb_col,
int num)
@ -662,7 +663,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
predictor[c] = expected_dc;
}
predictor += 16;
predictor += stride;
}
}
break;
@ -684,7 +685,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
predictor[c] = pred;
}
predictor += 16;
predictor += stride;
}
}
break;
@ -706,7 +707,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
predictor[c] = ap[c];
}
predictor += 16;
predictor += stride;
}
}
@ -729,29 +730,29 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
predictor[c] = lp[r];
}
predictor += 16;
predictor += stride;
}
}
break;
case B_LD_PRED:
{
unsigned char *ptr = Above;
predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
predictor[0 * 16 + 1] =
predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
predictor[0 * 16 + 2] =
predictor[1 * 16 + 1] =
predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
predictor[0 * 16 + 3] =
predictor[1 * 16 + 2] =
predictor[2 * 16 + 1] =
predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
predictor[1 * 16 + 3] =
predictor[2 * 16 + 2] =
predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
predictor[2 * 16 + 3] =
predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
predictor[0 * stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
predictor[0 * stride + 1] =
predictor[1 * stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
predictor[0 * stride + 2] =
predictor[1 * stride + 1] =
predictor[2 * stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
predictor[0 * stride + 3] =
predictor[1 * stride + 2] =
predictor[2 * stride + 1] =
predictor[3 * stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
predictor[1 * stride + 3] =
predictor[2 * stride + 2] =
predictor[3 * stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
predictor[2 * stride + 3] =
predictor[3 * stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
predictor[3 * stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
}
break;
@ -770,22 +771,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
pp[7] = Above[2];
pp[8] = Above[3];
predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[3 * 16 + 1] =
predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[3 * 16 + 2] =
predictor[2 * 16 + 1] =
predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[3 * 16 + 3] =
predictor[2 * 16 + 2] =
predictor[1 * 16 + 1] =
predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * 16 + 3] =
predictor[1 * 16 + 2] =
predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[1 * 16 + 3] =
predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
predictor[3 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[3 * stride + 1] =
predictor[2 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[3 * stride + 2] =
predictor[2 * stride + 1] =
predictor[1 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[3 * stride + 3] =
predictor[2 * stride + 2] =
predictor[1 * stride + 1] =
predictor[0 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * stride + 3] =
predictor[1 * stride + 2] =
predictor[0 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[1 * stride + 3] =
predictor[0 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[0 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
}
break;
@ -805,22 +806,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
pp[8] = Above[3];
predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[3 * 16 + 1] =
predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * 16 + 1] =
predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
predictor[3 * 16 + 2] =
predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[2 * 16 + 2] =
predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
predictor[3 * 16 + 3] =
predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[2 * 16 + 3] =
predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[3 * stride + 1] =
predictor[1 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * stride + 1] =
predictor[0 * stride + 0] = (pp[4] + pp[5] + 1) >> 1;
predictor[3 * stride + 2] =
predictor[1 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[2 * stride + 2] =
predictor[0 * stride + 1] = (pp[5] + pp[6] + 1) >> 1;
predictor[3 * stride + 3] =
predictor[1 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[2 * stride + 3] =
predictor[0 * stride + 2] = (pp[6] + pp[7] + 1) >> 1;
predictor[1 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
predictor[0 * stride + 3] = (pp[7] + pp[8] + 1) >> 1;
}
break;
@ -829,22 +830,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
unsigned char *pp = Above;
predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[2 * 16 + 0] =
predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
predictor[1 * 16 + 1] =
predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * 16 + 1] =
predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
predictor[3 * 16 + 1] =
predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[0 * 16 + 3] =
predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
predictor[1 * 16 + 3] =
predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[1 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[2 * stride + 0] =
predictor[0 * stride + 1] = (pp[1] + pp[2] + 1) >> 1;
predictor[1 * stride + 1] =
predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * stride + 1] =
predictor[0 * stride + 2] = (pp[2] + pp[3] + 1) >> 1;
predictor[3 * stride + 1] =
predictor[1 * stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[0 * stride + 3] =
predictor[2 * stride + 2] = (pp[3] + pp[4] + 1) >> 1;
predictor[1 * stride + 3] =
predictor[3 * stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[2 * stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[3 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
}
break;
@ -862,22 +863,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
pp[8] = Above[3];
predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[2 * 16 + 0] =
predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
predictor[2 * 16 + 1] =
predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * 16 + 2] =
predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
predictor[2 * 16 + 3] =
predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[1 * 16 + 2] =
predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
predictor[1 * 16 + 3] =
predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
predictor[3 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[3 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[2 * stride + 0] =
predictor[3 * stride + 2] = (pp[1] + pp[2] + 1) >> 1;
predictor[2 * stride + 1] =
predictor[3 * stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[2 * stride + 2] =
predictor[1 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
predictor[2 * stride + 3] =
predictor[1 * stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
predictor[1 * stride + 2] =
predictor[0 * stride + 0] = (pp[3] + pp[4] + 1) >> 1;
predictor[1 * stride + 3] =
predictor[0 * stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
predictor[0 * stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
predictor[0 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
}
break;
@ -885,22 +886,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
case B_HU_PRED:
{
unsigned char *pp = Left;
predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[0 * 16 + 2] =
predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
predictor[0 * 16 + 3] =
predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[1 * 16 + 2] =
predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
predictor[1 * 16 + 3] =
predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
predictor[2 * 16 + 2] =
predictor[2 * 16 + 3] =
predictor[3 * 16 + 0] =
predictor[3 * 16 + 1] =
predictor[3 * 16 + 2] =
predictor[3 * 16 + 3] = pp[3];
predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
predictor[0 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
predictor[0 * stride + 2] =
predictor[1 * stride + 0] = (pp[1] + pp[2] + 1) >> 1;
predictor[0 * stride + 3] =
predictor[1 * stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
predictor[1 * stride + 2] =
predictor[2 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
predictor[1 * stride + 3] =
predictor[2 * stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
predictor[2 * stride + 2] =
predictor[2 * stride + 3] =
predictor[3 * stride + 0] =
predictor[3 * stride + 1] =
predictor[3 * stride + 2] =
predictor[3 * stride + 3] = pp[3];
}
break;

Просмотреть файл

@ -19,7 +19,7 @@ extern void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, i
extern void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
extern void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int mb_row, int mb_col, int num);
extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int stride, int mb_row, int mb_col, int num);
extern void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
#endif

Просмотреть файл

@ -138,11 +138,11 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
/* do prediction */
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
{
vp8mt_build_intra_predictors_mbuv(pbi, xd, mb_row, mb_col);
vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col);
if (xd->mode_info_context->mbmi.mode != B_PRED)
{
vp8mt_build_intra_predictors_mby(pbi, xd, mb_row, mb_col);
vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col);
} else {
vp8mt_intra_prediction_down_copy(pbi, xd, mb_row, mb_col);
}
@ -201,7 +201,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
xd->predictor, xd->dst.y_buffer,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
}
else if (xd->mode_info_context->mbmi.mode == B_PRED)
@ -211,19 +211,21 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
BLOCKD *b = &xd->block[i];
int b_mode = xd->mode_info_context->bmi[i].as_mode;
vp8mt_predict_intra4x4(pbi, xd, b_mode, b->predictor, mb_row, mb_col, i);
vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst,
b->dst_stride, mb_row, mb_col, i);
if (xd->eobs[i] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)
(b->qcoeff, b->dequant, b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride);
(b->qcoeff, b->dequant,
*(b->base_dst) + b->dst, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
(b->qcoeff[0] * b->dequant[0], b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride);
(b->qcoeff[0] * b->dequant[0],
*(b->base_dst) + b->dst, b->dst_stride,
*(b->base_dst) + b->dst, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
}
@ -232,13 +234,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
{
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
xd->predictor, xd->dst.y_buffer,
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->block[16].dequant,
xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}

Просмотреть файл

@ -50,14 +50,17 @@ sym(vp8_dequantize_b_impl_mmx):
ret
;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
;void dequant_idct_add_mmx(
;short *input, 0
;short *dq, 1
;unsigned char *dest, 2
;int stride) 3
global sym(vp8_dequant_idct_add_mmx)
sym(vp8_dequant_idct_add_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SHADOW_ARGS_TO_STACK 4
GET_GOT rbx
push rsi
push rdi
; end prolog
@ -77,8 +80,8 @@ sym(vp8_dequant_idct_add_mmx):
movq mm3, [rax+24]
pmullw mm3, [rdx+24]
mov rdx, arg(3) ;dest
mov rsi, arg(2) ;pred
mov rdx, arg(2) ;dest
pxor mm7, mm7
@ -89,8 +92,7 @@ sym(vp8_dequant_idct_add_mmx):
movq [rax+24],mm7
movsxd rax, dword ptr arg(4) ;pitch
movsxd rdi, dword ptr arg(5) ;stride
movsxd rdi, dword ptr arg(3) ;stride
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
@ -211,28 +213,27 @@ sym(vp8_dequant_idct_add_mmx):
pxor mm7, mm7
movd mm4, [rsi]
movd mm4, [rdx]
punpcklbw mm4, mm7
paddsw mm0, mm4
packuswb mm0, mm7
movd [rdx], mm0
movd mm4, [rsi+rax]
movd mm4, [rdx+rdi]
punpcklbw mm4, mm7
paddsw mm1, mm4
packuswb mm1, mm7
movd [rdx+rdi], mm1
movd mm4, [rsi+2*rax]
movd mm4, [rdx+2*rdi]
punpcklbw mm4, mm7
paddsw mm2, mm4
packuswb mm2, mm7
movd [rdx+rdi*2], mm2
add rdx, rdi
add rsi, rax
movd mm4, [rsi+2*rax]
movd mm4, [rdx+2*rdi]
punpcklbw mm4, mm7
paddsw mm5, mm4
packuswb mm5, mm7
@ -240,22 +241,24 @@ sym(vp8_dequant_idct_add_mmx):
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
;void dequant_dc_idct_add_mmx(
;short *input, 0
;short *dq, 1
;unsigned char *dest, 2
;int stride, 3
;int Dc) 4
global sym(vp8_dequant_dc_idct_add_mmx)
sym(vp8_dequant_dc_idct_add_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rax, arg(0) ;input
@ -273,8 +276,7 @@ sym(vp8_dequant_dc_idct_add_mmx):
movq mm3, [rax+24]
pmullw mm3, [rdx+24]
mov rdx, arg(3) ;dest
mov rsi, arg(2) ;pred
mov rdx, arg(2) ;dest
pxor mm7, mm7
@ -286,13 +288,12 @@ sym(vp8_dequant_dc_idct_add_mmx):
; move lower word of Dc to lower word of mm0
psrlq mm0, 16
movzx rcx, word ptr arg(6) ;Dc
movzx rcx, word ptr arg(4) ;Dc
psllq mm0, 16
movq mm7, rcx
por mm0, mm7
movsxd rax, dword ptr arg(4) ;pitch
movsxd rdi, dword ptr arg(5) ;stride
movsxd rax, dword ptr arg(3) ;stride
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
@ -413,36 +414,33 @@ sym(vp8_dequant_dc_idct_add_mmx):
pxor mm7, mm7
movd mm4, [rsi]
movd mm4, [rdx]
punpcklbw mm4, mm7
paddsw mm0, mm4
packuswb mm0, mm7
movd [rdx], mm0
movd mm4, [rsi+rax]
movd mm4, [rdx+rax]
punpcklbw mm4, mm7
paddsw mm1, mm4
packuswb mm1, mm7
movd [rdx+rdi], mm1
movd [rdx+rax], mm1
movd mm4, [rsi+2*rax]
movd mm4, [rdx+2*rax]
punpcklbw mm4, mm7
paddsw mm2, mm4
packuswb mm2, mm7
movd [rdx+rdi*2], mm2
movd [rdx+rax*2], mm2
add rdx, rdi
add rsi, rax
add rdx, rax
movd mm4, [rsi+2*rax]
movd mm4, [rdx+2*rax]
punpcklbw mm4, mm7
paddsw mm5, mm4
packuswb mm5, mm7
movd [rdx+rdi*2], mm5
movd [rdx+rax*2], mm5
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp

Просмотреть файл

@ -13,7 +13,7 @@
#include "vp8/decoder/dequantize.h"
void vp8_dequant_dc_idct_add_y_block_mmx
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs, short *dc)
{
int i;
@ -21,35 +21,34 @@ void vp8_dequant_dc_idct_add_y_block_mmx
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
vp8_dequant_dc_idct_add_mmx (q, dq, pre, dst, 16, stride, dc[0]);
else
vp8_dc_only_idct_add_mmx (dc[0], pre, dst, 16, stride);
vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]);
else if (eobs[0] == 1)
vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride);
if (eobs[1] > 1)
vp8_dequant_dc_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
else
vp8_dc_only_idct_add_mmx (dc[1], pre+4, dst+4, 16, stride);
vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]);
else if (eobs[1] == 1)
vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride);
if (eobs[2] > 1)
vp8_dequant_dc_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
else
vp8_dc_only_idct_add_mmx (dc[2], pre+8, dst+8, 16, stride);
vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]);
else if (eobs[2] == 1)
vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride);
if (eobs[3] > 1)
vp8_dequant_dc_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
else
vp8_dc_only_idct_add_mmx (dc[3], pre+12, dst+12, 16, stride);
vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]);
else if (eobs[3] == 1)
vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride);
q += 64;
dc += 4;
pre += 64;
dst += 4*stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_y_block_mmx
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
{
int i;
@ -57,46 +56,48 @@ void vp8_dequant_idct_add_y_block_mmx
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_mmx (q, dq, pre, dst, 16, stride);
else
vp8_dequant_idct_add_mmx (q, dq, dst, stride);
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dst, 16, stride);
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride);
else
vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);
else if (eobs[1] == 1)
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dst+4, 16, stride);
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
dst+4, stride);
((int *)(q+16))[0] = 0;
}
if (eobs[2] > 1)
vp8_dequant_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride);
else
vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);
else if (eobs[2] == 1)
{
vp8_dc_only_idct_add_mmx (q[32]*dq[0], pre+8, dst+8, 16, stride);
vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
dst+8, stride);
((int *)(q+32))[0] = 0;
}
if (eobs[3] > 1)
vp8_dequant_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride);
else
vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);
else if (eobs[3] == 1)
{
vp8_dc_only_idct_add_mmx (q[48]*dq[0], pre+12, dst+12, 16, stride);
vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
dst+12, stride);
((int *)(q+48))[0] = 0;
}
q += 64;
pre += 64;
dst += 4*stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_uv_block_mmx
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
int i;
@ -104,23 +105,23 @@ void vp8_dequant_idct_add_uv_block_mmx
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_mmx (q, dq, pre, dstu, 8, stride);
else
vp8_dequant_idct_add_mmx (q, dq, dstu, stride);
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstu, 8, stride);
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstu+4, 8, stride);
else
vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);
else if (eobs[1] == 1)
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstu+4, 8, stride);
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
dstu+4, stride);
((int *)(q+16))[0] = 0;
}
q += 32;
pre += 32;
dstu += 4*stride;
eobs += 2;
}
@ -128,23 +129,23 @@ void vp8_dequant_idct_add_uv_block_mmx
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
vp8_dequant_idct_add_mmx (q, dq, pre, dstv, 8, stride);
else
vp8_dequant_idct_add_mmx (q, dq, dstv, stride);
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstv, 8, stride);
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstv+4, 8, stride);
else
vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);
else if (eobs[1] == 1)
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstv+4, 8, stride);
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
dstv+4, stride);
((int *)(q+16))[0] = 0;
}
q += 32;
pre += 32;
dstv += 4*stride;
eobs += 2;
}

Просмотреть файл

@ -13,102 +13,115 @@
#include "vp8/decoder/dequantize.h"
void vp8_idct_dequant_dc_0_2x_sse2
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dst, int dst_stride, short *dc);
void vp8_idct_dequant_dc_full_2x_sse2
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dst, int dst_stride, short *dc);
void vp8_idct_dequant_0_2x_sse2
(short *q, short *dq ,unsigned char *pre,
unsigned char *dst, int dst_stride, int blk_stride);
(short *q, short *dq ,
unsigned char *dst, int dst_stride);
void vp8_idct_dequant_full_2x_sse2
(short *q, short *dq ,unsigned char *pre,
unsigned char *dst, int dst_stride, int blk_stride);
(short *q, short *dq ,
unsigned char *dst, int dst_stride);
void vp8_dequant_dc_idct_add_y_block_sse2
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs, short *dc)
{
int i;
for (i = 0; i < 4; i++)
{
if (((short *)(eobs))[0] & 0xfefe)
vp8_idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc);
else
vp8_idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc);
if (((short *)(eobs))[1] & 0xfefe)
vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
else
vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
if (((short *)(eobs))[0])
{
if (((short *)(eobs))[0] & 0xfefe)
vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc);
else
vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc);
}
if (((short *)(eobs))[1])
{
if (((short *)(eobs))[1] & 0xfefe)
vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
else
vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
}
q += 64;
dc += 4;
pre += 64;
dst += stride*4;
eobs += 4;
}
}
void vp8_dequant_idct_add_y_block_sse2
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
{
int i;
for (i = 0; i < 4; i++)
{
if (((short *)(eobs))[0] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16);
if (((short *)(eobs))[1] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
else
vp8_idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
if (((short *)(eobs))[0])
{
if (((short *)(eobs))[0] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
}
if (((short *)(eobs))[1])
{
if (((short *)(eobs))[1] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);
else
vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
}
q += 64;
pre += 64;
dst += stride*4;
eobs += 4;
}
}
void vp8_dequant_idct_add_uv_block_sse2
(short *q, short *dq, unsigned char *pre,
(short *q, short *dq,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
if (((short *)(eobs))[0] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
if (((short *)(eobs))[0])
{
if (((short *)(eobs))[0] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
}
q += 32;
pre += 32;
dstu += stride*4;
if (((short *)(eobs))[1] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
if (((short *)(eobs))[1])
{
if (((short *)(eobs))[1] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
}
q += 32;
pre += 32;
if (((short *)(eobs))[2] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
if (((short *)(eobs))[2])
{
if (((short *)(eobs))[2] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
}
q += 32;
pre += 32;
dstv += stride*4;
if (((short *)(eobs))[3] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
if (((short *)(eobs))[3])
{
if (((short *)(eobs))[3] & 0xfefe)
vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
else
vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
}
}

Просмотреть файл

@ -64,7 +64,7 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
BLOCK *be = &x->block[ib];
RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
(b, b->bmi.as_mode, b->predictor);
(b, b->bmi.as_mode, b->predictor, 16);
ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
@ -72,9 +72,8 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
x->quantize_b(be, b);
vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32);
vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16);
RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
}
void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
@ -106,9 +105,6 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
RECON_INVOKE(&rtcd->common->recon, recon_mby)
(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}
void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
@ -126,5 +122,4 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}

Просмотреть файл

@ -577,9 +577,70 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
}
}
/* Distribute the 16 values held in block 24's diff buffer to the first
 * (index 0) dequantized coefficient of blocks 0..15, one value per block.
 */
static void recon_dcblock(MACROBLOCKD *x)
{
    BLOCKD *dc_block = &x->block[24];
    int blk = 0;

    while (blk < 16)
    {
        x->block[blk].dqcoeff[0] = dc_block->diff[blk];
        blk++;
    }
}
/* Inverse-transform all 24 coefficient blocks of a macroblock, adding each
 * result to the block's predictor and writing it to the block's destination
 * (*(b->base_dst) + b->dst, using b->dst_stride).
 *
 * rtcd  - IDCT function table used through IDCT_INVOKE.
 * x     - macroblock descriptor whose blocks are processed in index order.
 *
 * Blocks 0..15 read their predictor with a pitch of 16, blocks 16..23 with a
 * pitch of 8 (presumably luma vs. chroma layout of the predictor buffer --
 * confirm against the BLOCKD setup code).
 */
static void inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd,
                                 MACROBLOCKD *x)
{
    const int mb_mode = x->mode_info_context->mbmi.mode;
    int blk;

    if (mb_mode != B_PRED && mb_mode != SPLITMV)
    {
        /* do 2nd order transform on the dc block */
        IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);

        /* hand each resulting dc value to its owning block */
        recon_dcblock(x);
    }

    for (blk = 0; blk < 24; blk++)
    {
        BLOCKD *b = &x->block[blk];
        /* predictor pitch: 16 for the first sixteen blocks, 8 for the rest */
        const int pred_pitch = (blk < 16) ? 16 : 8;

        if (b->eob > 1)
        {
            /* more than one coefficient: run the full 4x4 inverse transform */
            IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pred_pitch,
                                      *(b->base_dst) + b->dst, b->dst_stride);
        }
        else
        {
            /* at most the dc coefficient: use the dc-only add path */
            IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor,
                                                pred_pitch,
                                                *(b->base_dst) + b->dst,
                                                b->dst_stride);
        }
    }
}
void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
vp8_build_inter_predictors_mb(&x->e_mbd);
vp8_build_inter_predictors_mb_e(&x->e_mbd);
vp8_subtract_mb(rtcd, x);
@ -590,10 +651,8 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
if (x->optimize)
optimize_mb(x, rtcd);
vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
RECON_INVOKE(&rtcd->common->recon, recon_mb)
(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}
@ -612,6 +671,4 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
RECON_INVOKE(&rtcd->common->recon, recon_mby)
(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}

Просмотреть файл

@ -157,7 +157,7 @@ static int pick_intra4x4block(
rate = mode_costs[mode];
RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
(b, mode, b->predictor);
(b, mode, b->predictor, 16);
distortion = get_prediction_error(be, b, &rtcd->variance);
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);

Просмотреть файл

@ -631,7 +631,7 @@ static int rd_pick_intra4x4block(
rate = bmode_costs[mode];
RECON_INVOKE(&cpi->rtcd.common->recon, intra4x4_predict)
(b, mode, b->predictor);
(b, mode, b->predictor, 16);
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16);
x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
x->quantize_b(be, b);
@ -660,8 +660,8 @@ static int rd_pick_intra4x4block(
}
b->bmi.as_mode = (B_PREDICTION_MODE)(*best_mode);
IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff, 32);
RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff,
best_predictor, 16, *(b->base_dst) + b->dst, b->dst_stride);
return best_rd;
}

Просмотреть файл

@ -64,7 +64,6 @@ VP8_COMMON_SRCS-yes += common/mbpitch.c
VP8_COMMON_SRCS-yes += common/modecont.c
VP8_COMMON_SRCS-yes += common/modecontext.c
VP8_COMMON_SRCS-yes += common/quant_common.c
VP8_COMMON_SRCS-yes += common/recon.c
VP8_COMMON_SRCS-yes += common/reconinter.c
VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
@ -125,7 +124,6 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/recon_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM)
@ -143,16 +141,10 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilter_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilter_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon2b_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon4b_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/reconb_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_1_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict4x4_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x4_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon16x16mb_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon_neon.c