Rework idct calling structure.
Moving the eob structure allows for a non-struct based function to handle decoding an entire mb of idct/dequant/recon data. This allows for SIMD functions to idct/dequant/recon multiple blocks at once. SSE2 implementation gives 3% gain on Atom. Change-Id: I8a8f3efd546ea4e0535f517d94f347cfb737c9c2
This commit is contained in:
Родитель
b0660457fe
Коммит
93c32a55c2
|
@ -218,6 +218,7 @@ typedef struct
|
|||
//not used DECLARE_ALIGNED(16, short, reference[384]);
|
||||
DECLARE_ALIGNED(16, short, qcoeff[400]);
|
||||
DECLARE_ALIGNED(16, short, dqcoeff[400]);
|
||||
DECLARE_ALIGNED(16, char, eobs[25]);
|
||||
|
||||
// 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries.
|
||||
BLOCKD block[25];
|
||||
|
|
|
@ -0,0 +1,708 @@
|
|||
;
|
||||
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;void idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   int blk_stride      - 5
; )
;
; DC-only dequantize + reconstruct for two horizontally adjacent 4x4 blocks.
; The DC of the first block is qcoeff[0]; the DC of the second block is read
; 32 bytes (16 shorts) further on.  Both are multiplied by dequant[0], rounded
; with (x + 4) >> 3, added to four 8-byte predictor rows (rows are blk_stride
; bytes apart) and stored as four 8-byte rows to dst (rows dst_stride apart).
; The first 4 bytes of each block's coefficients are zeroed on the way out.
;
; Only xmm0-xmm5 are used: xmm6-xmm15 are callee-saved under the Microsoft
; x64 convention, so the zero register lives in xmm5 (free once the dequant
; multiply is done) rather than in xmm7.
global sym(idct_dequant_0_2x_sse2)
sym(idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    ; end prolog

    mov         rdx,        arg(1)              ; dequant
    mov         rax,        arg(0)              ; qcoeff

    ; word 0 <- DC of block 0, word 4 <- DC of block 1 (movd clears the rest;
    ; word 1 picks up qcoeff[1]/dequant[1] but is discarded by the shuffles)
    movd        xmm4,       [rax]
    movd        xmm5,       [rdx]

    pinsrw      xmm4,       [rax+32],   4
    pinsrw      xmm5,       [rdx],      4

    pmullw      xmm4,       xmm5                ; dequantized DCs in words 0 and 4

    ; xmm5 is no longer needed as dequant: turn it into the zero register
    pxor        xmm5,       xmm5

    ; clear coeffs (4 bytes at the head of each block)
    movd        [rax],      xmm5
    movd        [rax+32],   xmm5

    ; broadcast: low four words <- DC0, high four words <- DC1
    pshuflw     xmm4,       xmm4,       00000000b
    pshufhw     xmm4,       xmm4,       00000000b

    mov         rax,        arg(2)              ; pre
    paddw       xmm4,       [fours GLOBAL]      ; round ...

    movsxd      rcx,        dword ptr arg(5)    ; blk_stride
    psraw       xmm4,       3                   ; ... and scale: (dc + 4) >> 3

    ; four predictor rows, 8 bytes each (4 pixels per block)
    movq        xmm0,       [rax]
    movq        xmm1,       [rax+rcx]
    movq        xmm2,       [rax+2*rcx]
    lea         rcx,        [3*rcx]
    movq        xmm3,       [rax+rcx]

    ; widen predictor bytes to words
    punpcklbw   xmm0,       xmm5
    punpcklbw   xmm1,       xmm5
    punpcklbw   xmm2,       xmm5
    punpcklbw   xmm3,       xmm5

    mov         rax,        arg(3)              ; dst
    movsxd      rdx,        dword ptr arg(4)    ; dst_stride

    ; Add to predict buffer
    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm4
    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm4

    ; pack back to bytes with unsigned saturation before storing
    packuswb    xmm0,       xmm5
    packuswb    xmm1,       xmm5
    packuswb    xmm2,       xmm5
    packuswb    xmm3,       xmm5

    ; store blocks back out
    movq        [rax],          xmm0
    movq        [rax + rdx],    xmm1

    lea         rax,        [rax + 2*rdx]

    movq        [rax],          xmm2
    movq        [rax + rdx],    xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
; idct_dequant_full_2x_sse2
;
; Arguments as referenced below (types presumed to match
; idct_dequant_0_2x_sse2 -- TODO confirm against the C prototype):
;   arg(0) qcoeff      two consecutive blocks of 16 words each
;   arg(1) dequant     16 words, applied to both blocks
;   arg(2) pre         predictor, rows blk_stride bytes apart
;   arg(3) dst         destination, rows dst_stride bytes apart
;   arg(4) dst_stride
;   arg(5) blk_stride
;
; Dequantizes both blocks, clears the 64 bytes of coefficients, runs the
; two-pass 4x4 inverse transform on both blocks in parallel (block 0 in the
; low four words of each register, block 1 in the high four), adds the
; predictor and stores four 8-byte rows.
;
; qcoeff and dequant are accessed with movdqa / pmullw memory operands, so
; both must be 16-byte aligned.
;
; NOTE(review): only arg(0)..arg(5) are referenced although 7 is passed to
; SHADOW_ARGS_TO_STACK -- confirm against the macro definition.
; NOTE(review): xmm6 and xmm7 are written here and are callee-saved under the
; Microsoft x64 convention; no save/restore is visible in this routine --
; confirm whether that target is affected.
global sym(idct_dequant_full_2x_sse2)
sym(idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)              ; qcoeff
    mov         rsi,        arg(2)              ; pre
    mov         rdi,        arg(3)              ; dst
    movsxd      rcx,        dword ptr arg(5)    ; blk_stride

    ; xmm7 = 0, used to wipe the coefficient buffer
    pxor        xmm7,       xmm7

    mov         rdx,        arg(1)              ; dequant

    ; note the swapped use of xmm1 and xmm2: it is what lets the shuffles
    ; below produce sensible data
    movdqa      xmm0,       [rax]               ; block 0, coeffs 0-7
    movdqa      xmm2,       [rax+16]            ; block 0, coeffs 8-15
    movdqa      xmm1,       [rax+32]            ; block 1, coeffs 0-7
    movdqa      xmm3,       [rax+48]            ; block 1, coeffs 8-15

    ; Clear out coeffs
    movdqa      [rax],      xmm7
    movdqa      [rax+16],   xmm7
    movdqa      [rax+32],   xmm7
    movdqa      [rax+48],   xmm7

    ; dequantize qcoeff buffer
    pmullw      xmm0,       [rdx]
    pmullw      xmm2,       [rdx+16]
    pmullw      xmm1,       [rdx]
    pmullw      xmm3,       [rdx+16]

    ; repack so block 0 row x and block 1 row x are together
    ; (afterwards xmm0..xmm3 hold rows 0..3, block 0 low / block 1 high)
    movdqa      xmm4,       xmm0
    punpckldq   xmm0,       xmm1
    punpckhdq   xmm4,       xmm1

    pshufd      xmm0,       xmm0,       11011000b
    pshufd      xmm1,       xmm4,       11011000b

    movdqa      xmm4,       xmm2
    punpckldq   xmm2,       xmm3
    punpckhdq   xmm4,       xmm3

    pshufd      xmm2,       xmm2,       11011000b
    pshufd      xmm3,       xmm4,       11011000b

    ; ---- first pass ----
    ; x_s1sqr2 (0x8A8C) is negative as a signed word, so pmulhw yields
    ; ((x * 35468) >> 16) - x; the paddw that follows adds x back.
    ; x_c1sqr2less1 holds the factor minus one, so pmulhw + paddw gives
    ; x + ((x * 20091) >> 16).
    psubw       xmm0,       xmm2                ; b1 = 0-2
    paddw       xmm2,       xmm2                ;

    movdqa      xmm5,       xmm1
    paddw       xmm2,       xmm0                ; a1 = 0+2

    pmulhw      xmm5,       [x_s1sqr2 GLOBAL]
    paddw       xmm5,       xmm1                ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,       xmm3
    pmulhw      xmm7,       [x_c1sqr2less1 GLOBAL]

    paddw       xmm7,       xmm3                ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,       xmm5                ; xmm7 = ip3 term - ip1 term

    movdqa      xmm5,       xmm1
    movdqa      xmm4,       xmm3

    pmulhw      xmm5,       [x_c1sqr2less1 GLOBAL]
    paddw       xmm5,       xmm1                ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,       [x_s1sqr2 GLOBAL]
    paddw       xmm3,       xmm4                ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,       xmm5                ; d1
    movdqa      xmm6,       xmm2                ; a1

    movdqa      xmm4,       xmm0                ; b1
    paddw       xmm2,       xmm3                ; a1 + d1

    paddw       xmm4,       xmm7                ; b1 + xmm7
    psubw       xmm0,       xmm7                ; b1 - xmm7

    psubw       xmm6,       xmm3                ; a1 - d1

    ; ---- transpose for the second pass ----
    ; lane comments list words high -> low; 0xx = block 0, 1xx = block 1
    movdqa      xmm7,       xmm2                ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,       xmm0                ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,       xmm0                ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,       xmm4                ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,       xmm6                ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,       xmm6                ; 115 111 114 110 113 109 112 108

    movdqa      xmm1,       xmm2                ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,       xmm4                ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,       xmm4                ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,       xmm7                ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,       xmm5                ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,       xmm5                ; 115 111 107 103 114 110 106 102

    movdqa      xmm5,       xmm2                ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,       xmm7                ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,       xmm7                ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,       xmm1                ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,       xmm6                ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,       xmm6                ; 115 111 015 011 107 103 007 003

    ; regroup each register as block 0 (low half) / block 1 (high half)
    pshufd      xmm0,       xmm2,       11011000b
    pshufd      xmm2,       xmm1,       11011000b

    pshufd      xmm1,       xmm5,       11011000b
    pshufd      xmm3,       xmm7,       11011000b

    ; ---- second pass ----
    psubw       xmm0,       xmm2                ; b1 = 0-2
    paddw       xmm2,       xmm2

    movdqa      xmm5,       xmm1
    paddw       xmm2,       xmm0                ; a1 = 0+2

    pmulhw      xmm5,       [x_s1sqr2 GLOBAL]
    paddw       xmm5,       xmm1                ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,       xmm3
    pmulhw      xmm7,       [x_c1sqr2less1 GLOBAL]

    paddw       xmm7,       xmm3                ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,       xmm5                ; xmm7 = ip3 term - ip1 term

    movdqa      xmm5,       xmm1
    movdqa      xmm4,       xmm3

    pmulhw      xmm5,       [x_c1sqr2less1 GLOBAL]
    paddw       xmm5,       xmm1                ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,       [x_s1sqr2 GLOBAL]
    paddw       xmm3,       xmm4                ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,       xmm5                ; d1

    ; rounding bias goes into a1 and b1, so all four outputs receive it
    paddw       xmm0,       [fours GLOBAL]

    paddw       xmm2,       [fours GLOBAL]
    movdqa      xmm6,       xmm2                ; a1

    movdqa      xmm4,       xmm0                ; b1
    paddw       xmm2,       xmm3                ; a1 + d1

    paddw       xmm4,       xmm7                ; b1 + xmm7
    psubw       xmm0,       xmm7                ; b1 - xmm7

    psubw       xmm6,       xmm3                ; a1 - d1
    psraw       xmm2,       3

    psraw       xmm0,       3
    psraw       xmm4,       3

    psraw       xmm6,       3

    ; ---- transpose to save ----
    movdqa      xmm7,       xmm2                ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,       xmm0                ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,       xmm0                ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,       xmm4                ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,       xmm6                ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,       xmm6                ; 115 111 114 110 113 109 112 108

    movdqa      xmm1,       xmm2                ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,       xmm4                ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,       xmm4                ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,       xmm7                ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,       xmm5                ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,       xmm5                ; 115 111 107 103 114 110 106 102

    movdqa      xmm5,       xmm2                ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,       xmm7                ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,       xmm7                ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,       xmm1                ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,       xmm6                ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,       xmm6                ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,       xmm2,       11011000b
    pshufd      xmm2,       xmm1,       11011000b

    pshufd      xmm1,       xmm5,       11011000b
    pshufd      xmm3,       xmm7,       11011000b

    ; xmm7 was used as scratch above: re-zero it for the byte<->word steps
    pxor        xmm7,       xmm7

    ; Load up predict blocks (8 bytes per row = 4 pixels of each block)
    movq        xmm4,       [rsi]
    movq        xmm5,       [rsi+rcx]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm5

    movq        xmm4,       [rsi+2*rcx]
    lea         rcx,        [3*rcx]
    movq        xmm5,       [rsi+rcx]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm5

.finish:

    ; pack to bytes with unsigned saturation before storing
    packuswb    xmm0,       xmm7
    packuswb    xmm1,       xmm7
    packuswb    xmm2,       xmm7
    packuswb    xmm3,       xmm7

    ; Load destination stride before writing out,
    ; doesn't need to persist
    movsxd      rdx,        dword ptr arg(4)    ; dst_stride

    ; store blocks back out
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1

    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],          xmm2
    movq        [rdi + rdx],    xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
;void idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   short *dc           - 5
; )
;
; DC-only reconstruct for two horizontally adjacent 4x4 blocks whose DC
; values are supplied directly in dc[0] and dc[1].  Each DC is rounded with
; (x + 4) >> 3, added to four 8-byte predictor rows (read at pre, pre+16,
; pre+32 and pre+48) and stored as four 8-byte rows to dst.
; qcoeff and dequant are not read by this routine.
;
; Only xmm0-xmm5 are used: xmm6-xmm15 are callee-saved under the Microsoft
; x64 convention, so the zero register lives in xmm5 rather than in xmm7.
;
; NOTE(review): only arg(2)..arg(5) are referenced although 7 is passed to
; SHADOW_ARGS_TO_STACK -- confirm against the macro definition.
global sym(idct_dequant_dc_0_2x_sse2)
sym(idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is set as first coeff, so no need to load qcoeff
    mov         rsi,        arg(2)              ; pre
    mov         rdi,        arg(3)              ; dst
    mov         rdx,        arg(5)              ; dc

    ; Zero out xmm5, for use unpacking
    pxor        xmm5,       xmm5

    ; load up 2 dc words here == 2*16 = doubleword
    movd        xmm4,       [rdx]

    ; Load up predict blocks
    movq        xmm0,       [rsi]
    movq        xmm1,       [rsi+16]
    movq        xmm2,       [rsi+32]
    movq        xmm3,       [rsi+48]

    ; Duplicate and expand dc across:
    ; low four words <- dc[0], high four words <- dc[1]
    punpcklwd   xmm4,       xmm4
    punpckldq   xmm4,       xmm4

    ; Rounding and downshift: (dc + 4) >> 3
    paddw       xmm4,       [fours GLOBAL]
    psraw       xmm4,       3

    ; Predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0,       xmm5
    punpcklbw   xmm1,       xmm5
    punpcklbw   xmm2,       xmm5
    punpcklbw   xmm3,       xmm5

    ; Add to predict buffer
    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm4
    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm4

    ; pack to bytes with unsigned saturation before storing
    packuswb    xmm0,       xmm5
    packuswb    xmm1,       xmm5
    packuswb    xmm2,       xmm5
    packuswb    xmm3,       xmm5

    ; Load destination stride before writing out,
    ; doesn't need to persist
    movsxd      rdx,        dword ptr arg(4)    ; dst_stride

    ; store blocks back out
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1

    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],          xmm2
    movq        [rdi + rdx],    xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
; idct_dequant_dc_full_2x_sse2
;
; Arguments as referenced below (types presumed to match
; idct_dequant_dc_0_2x_sse2 -- TODO confirm against the C prototype):
;   arg(0) qcoeff      two consecutive blocks of 16 words each
;   arg(1) dequant     16 words, applied to both blocks
;   arg(2) pre         predictor, rows read at +0, +16, +32, +48
;   arg(3) dst         destination, rows dst_stride bytes apart
;   arg(4) dst_stride
;   arg(5) dc          two words: replacement DC for block 0 and block 1
;
; Dequantizes both blocks, clears the 64 bytes of coefficients, overwrites
; coefficient 0 of each block with the supplied DC, runs the two-pass 4x4
; inverse transform on both blocks in parallel (block 0 in the low four words
; of each register, block 1 in the high four), adds the predictor and stores
; four 8-byte rows.
;
; qcoeff and dequant are accessed with movdqa / pmullw memory operands, so
; both must be 16-byte aligned.
;
; NOTE(review): only arg(0)..arg(5) are referenced although 7 is passed to
; SHADOW_ARGS_TO_STACK -- confirm against the macro definition.
; NOTE(review): xmm6 and xmm7 are written here and are callee-saved under the
; Microsoft x64 convention; no save/restore is visible in this routine --
; confirm whether that target is affected.
global sym(idct_dequant_dc_full_2x_sse2)
sym(idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)              ; qcoeff
    mov         rsi,        arg(2)              ; pre
    mov         rdi,        arg(3)              ; dst

    ; xmm7 = 0, used to wipe the coefficient buffer
    pxor        xmm7,       xmm7

    mov         rdx,        arg(1)              ; dequant

    ; note the swapped use of xmm1 and xmm2: it is what lets the shuffles
    ; below produce sensible data
    movdqa      xmm0,       [rax]               ; block 0, coeffs 0-7
    movdqa      xmm2,       [rax+16]            ; block 0, coeffs 8-15
    movdqa      xmm1,       [rax+32]            ; block 1, coeffs 0-7
    movdqa      xmm3,       [rax+48]            ; block 1, coeffs 8-15

    ; Clear out coeffs
    movdqa      [rax],      xmm7
    movdqa      [rax+16],   xmm7
    movdqa      [rax+32],   xmm7
    movdqa      [rax+48],   xmm7

    ; dequantize qcoeff buffer
    pmullw      xmm0,       [rdx]
    pmullw      xmm2,       [rdx+16]
    pmullw      xmm1,       [rdx]
    pmullw      xmm3,       [rdx+16]

    ; DC component (rdx is free again once dequant has been applied)
    mov         rdx,        arg(5)

    ; repack so block 0 row x and block 1 row x are together
    ; (afterwards xmm0..xmm3 hold rows 0..3, block 0 low / block 1 high)
    movdqa      xmm4,       xmm0
    punpckldq   xmm0,       xmm1
    punpckhdq   xmm4,       xmm1

    pshufd      xmm0,       xmm0,       11011000b
    pshufd      xmm1,       xmm4,       11011000b

    movdqa      xmm4,       xmm2
    punpckldq   xmm2,       xmm3
    punpckhdq   xmm4,       xmm3

    pshufd      xmm2,       xmm2,       11011000b
    pshufd      xmm3,       xmm4,       11011000b

    ; insert DC component: word 0 is coefficient 0 of block 0,
    ; word 4 is coefficient 0 of block 1
    pinsrw      xmm0,       [rdx],      0
    pinsrw      xmm0,       [rdx+2],    4

    ; ---- first pass ----
    ; x_s1sqr2 (0x8A8C) is negative as a signed word, so pmulhw yields
    ; ((x * 35468) >> 16) - x; the paddw that follows adds x back.
    ; x_c1sqr2less1 holds the factor minus one, so pmulhw + paddw gives
    ; x + ((x * 20091) >> 16).
    psubw       xmm0,       xmm2                ; b1 = 0-2
    paddw       xmm2,       xmm2                ;

    movdqa      xmm5,       xmm1
    paddw       xmm2,       xmm0                ; a1 = 0+2

    pmulhw      xmm5,       [x_s1sqr2 GLOBAL]
    paddw       xmm5,       xmm1                ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,       xmm3
    pmulhw      xmm7,       [x_c1sqr2less1 GLOBAL]

    paddw       xmm7,       xmm3                ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,       xmm5                ; xmm7 = ip3 term - ip1 term

    movdqa      xmm5,       xmm1
    movdqa      xmm4,       xmm3

    pmulhw      xmm5,       [x_c1sqr2less1 GLOBAL]
    paddw       xmm5,       xmm1                ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,       [x_s1sqr2 GLOBAL]
    paddw       xmm3,       xmm4                ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,       xmm5                ; d1
    movdqa      xmm6,       xmm2                ; a1

    movdqa      xmm4,       xmm0                ; b1
    paddw       xmm2,       xmm3                ; a1 + d1

    paddw       xmm4,       xmm7                ; b1 + xmm7
    psubw       xmm0,       xmm7                ; b1 - xmm7

    psubw       xmm6,       xmm3                ; a1 - d1

    ; ---- transpose for the second pass ----
    ; lane comments list words high -> low; 0xx = block 0, 1xx = block 1
    movdqa      xmm7,       xmm2                ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,       xmm0                ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,       xmm0                ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,       xmm4                ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,       xmm6                ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,       xmm6                ; 115 111 114 110 113 109 112 108

    movdqa      xmm1,       xmm2                ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,       xmm4                ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,       xmm4                ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,       xmm7                ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,       xmm5                ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,       xmm5                ; 115 111 107 103 114 110 106 102

    movdqa      xmm5,       xmm2                ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,       xmm7                ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,       xmm7                ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,       xmm1                ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,       xmm6                ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,       xmm6                ; 115 111 015 011 107 103 007 003

    ; regroup each register as block 0 (low half) / block 1 (high half)
    pshufd      xmm0,       xmm2,       11011000b
    pshufd      xmm2,       xmm1,       11011000b

    pshufd      xmm1,       xmm5,       11011000b
    pshufd      xmm3,       xmm7,       11011000b

    ; ---- second pass ----
    psubw       xmm0,       xmm2                ; b1 = 0-2
    paddw       xmm2,       xmm2

    movdqa      xmm5,       xmm1
    paddw       xmm2,       xmm0                ; a1 = 0+2

    pmulhw      xmm5,       [x_s1sqr2 GLOBAL]
    paddw       xmm5,       xmm1                ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,       xmm3
    pmulhw      xmm7,       [x_c1sqr2less1 GLOBAL]

    paddw       xmm7,       xmm3                ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,       xmm5                ; xmm7 = ip3 term - ip1 term

    movdqa      xmm5,       xmm1
    movdqa      xmm4,       xmm3

    pmulhw      xmm5,       [x_c1sqr2less1 GLOBAL]
    paddw       xmm5,       xmm1                ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,       [x_s1sqr2 GLOBAL]
    paddw       xmm3,       xmm4                ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,       xmm5                ; d1

    ; rounding bias goes into a1 and b1, so all four outputs receive it
    paddw       xmm0,       [fours GLOBAL]

    paddw       xmm2,       [fours GLOBAL]
    movdqa      xmm6,       xmm2                ; a1

    movdqa      xmm4,       xmm0                ; b1
    paddw       xmm2,       xmm3                ; a1 + d1

    paddw       xmm4,       xmm7                ; b1 + xmm7
    psubw       xmm0,       xmm7                ; b1 - xmm7

    psubw       xmm6,       xmm3                ; a1 - d1
    psraw       xmm2,       3

    psraw       xmm0,       3
    psraw       xmm4,       3

    psraw       xmm6,       3

    ; ---- transpose to save ----
    movdqa      xmm7,       xmm2                ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,       xmm0                ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,       xmm0                ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,       xmm4                ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,       xmm6                ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,       xmm6                ; 115 111 114 110 113 109 112 108

    movdqa      xmm1,       xmm2                ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,       xmm4                ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,       xmm4                ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,       xmm7                ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,       xmm5                ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,       xmm5                ; 115 111 107 103 114 110 106 102

    movdqa      xmm5,       xmm2                ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,       xmm7                ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,       xmm7                ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,       xmm1                ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,       xmm6                ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,       xmm6                ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,       xmm2,       11011000b
    pshufd      xmm2,       xmm1,       11011000b

    pshufd      xmm1,       xmm5,       11011000b
    pshufd      xmm3,       xmm7,       11011000b

    ; xmm7 was used as scratch above: re-zero it for the byte<->word steps
    pxor        xmm7,       xmm7

    ; Load up predict blocks (8 bytes per row = 4 pixels of each block)
    movq        xmm4,       [rsi]
    movq        xmm5,       [rsi+16]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm5

    movq        xmm4,       [rsi+32]
    movq        xmm5,       [rsi+48]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm5

.finish:

    ; pack to bytes with unsigned saturation before storing
    packuswb    xmm0,       xmm7
    packuswb    xmm1,       xmm7
    packuswb    xmm2,       xmm7
    packuswb    xmm3,       xmm7

    ; Load destination stride before writing out,
    ; doesn't need to persist
    movsxd      rdx,        dword ptr arg(4)    ; dst_stride

    ; store blocks back out
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1

    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],          xmm2
    movq        [rdi + rdx],    xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
SECTION_RODATA

; Each table is one 16-byte vector: the same word repeated in all 8 lanes.

; Rounding bias added before the arithmetic shift right by 3.
align 16
fours:
    times 8 dw 4

; 0x8A8C = 35468, approximately sin(pi/8) * sqrt(2) * 65536.  As a signed
; word it is negative, which is why every pmulhw against it is followed by
; a paddw of the original operand.
align 16
x_s1sqr2:
    times 8 dw 0x8A8C

; 0x4E7B = 20091, approximately (cos(pi/8) * sqrt(2) - 1) * 65536.
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B
|
|
@ -0,0 +1,151 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "idct.h"
|
||||
#include "dequantize.h"
|
||||
|
||||
/*
 * Reconstruct the 16 Y blocks of a macroblock, 4 rows of 4 blocks, when the
 * DC of every block is supplied separately in dc[].
 *
 *   q      quantized coefficients, 16 shorts per block, blocks consecutive
 *   dq     dequantization factors, handed unchanged to the per-block routine
 *   pre    predictor; 64 bytes per row of blocks, 4 bytes per block
 *   dst    destination; rows are stride bytes apart
 *   stride destination stride
 *   eobs   one entry per block; a value > 1 selects the full
 *          dequant + IDCT path, anything else the DC-only path
 *   dc     one DC value per block
 */
void vp8_dequant_dc_idct_add_y_block_v6
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        /* the four blocks of this row sit 4 pixels apart */
        for (col = 0; col < 4; col++)
        {
            short         *blk_q   = q   + 16 * col;
            unsigned char *blk_pre = pre + 4 * col;
            unsigned char *blk_dst = dst + 4 * col;

            if (eobs[col] > 1)
                vp8_dequant_dc_idct_add_v6 (blk_q, dq, blk_pre, blk_dst, 16, stride, dc[col]);
            else
                vp8_dc_only_idct_add_v6 (dc[col], blk_pre, blk_dst, 16, stride);
        }

        /* step to the next row of blocks */
        q    += 64;
        dc   += 4;
        pre  += 64;
        dst  += 4*stride;
        eobs += 4;
    }
}
|
||||
|
||||
/*
 * Reconstruct the 16 Y blocks of a macroblock, 4 rows of 4 blocks, taking
 * each block's DC from its own coefficient 0.
 *
 *   q      quantized coefficients, 16 shorts per block, blocks consecutive
 *   dq     dequantization factors; dq[0] scales the DC on the DC-only path
 *   pre    predictor; 64 bytes per row of blocks, 4 bytes per block
 *   dst    destination; rows are stride bytes apart
 *   stride destination stride
 *   eobs   one entry per block; a value > 1 selects the full
 *          dequant + IDCT path, anything else the DC-only path
 *
 * On the DC-only path the first two coefficients of the block are zeroed
 * afterwards.  They are cleared with two short stores rather than through an
 * (int *) cast, which accessed short objects through an int lvalue.
 */
void vp8_dequant_idct_add_y_block_v6
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        /* the four blocks of this row sit 4 pixels apart */
        for (col = 0; col < 4; col++)
        {
            short         *blk_q   = q   + 16 * col;
            unsigned char *blk_pre = pre + 4 * col;
            unsigned char *blk_dst = dst + 4 * col;

            if (eobs[col] > 1)
                vp8_dequant_idct_add_v6 (blk_q, dq, blk_pre, blk_dst, 16, stride);
            else
            {
                vp8_dc_only_idct_add_v6 (blk_q[0]*dq[0], blk_pre, blk_dst, 16, stride);
                /* same four bytes the old int store cleared */
                blk_q[0] = 0;
                blk_q[1] = 0;
            }
        }

        /* step to the next row of blocks */
        q    += 64;
        pre  += 64;
        dst  += 4*stride;
        eobs += 4;
    }
}
|
||||
|
||||
/*
 * Reconstruct the 4 U blocks and then the 4 V blocks of a macroblock, each
 * plane as 2 rows of 2 blocks.
 *
 *   q      quantized coefficients, 16 shorts per block; the U blocks come
 *          first, the V blocks follow directly
 *   dq     dequantization factors; dq[0] scales the DC on the DC-only path
 *   pre    predictor; 32 bytes per row of blocks, 4 bytes per block, V
 *          following U
 *   dstu   U destination; rows are stride bytes apart
 *   dstv   V destination; rows are stride bytes apart
 *   stride destination stride, shared by both planes
 *   eobs   one entry per block, U entries first; a value > 1 selects the
 *          full dequant + IDCT path, anything else the DC-only path
 *
 * On the DC-only path the first two coefficients of the block are zeroed
 * afterwards.  They are cleared with two short stores rather than through an
 * (int *) cast, which accessed short objects through an int lvalue.
 */
void vp8_dequant_idct_add_uv_block_v6
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
    unsigned char *plane_dst[2];
    int plane, row, col;

    plane_dst[0] = dstu;
    plane_dst[1] = dstv;

    /* q, pre and eobs simply keep advancing from the U data into the V data */
    for (plane = 0; plane < 2; plane++)
    {
        unsigned char *dst = plane_dst[plane];

        for (row = 0; row < 2; row++)
        {
            for (col = 0; col < 2; col++)
            {
                short         *blk_q   = q   + 16 * col;
                unsigned char *blk_pre = pre + 4 * col;
                unsigned char *blk_dst = dst + 4 * col;

                if (eobs[col] > 1)
                    vp8_dequant_idct_add_v6 (blk_q, dq, blk_pre, blk_dst, 8, stride);
                else
                {
                    vp8_dc_only_idct_add_v6 (blk_q[0]*dq[0], blk_pre, blk_dst, 8, stride);
                    /* same four bytes the old int store cleared */
                    blk_q[0] = 0;
                    blk_q[1] = 0;
                }
            }

            /* step to the next row of blocks */
            q    += 32;
            pre  += 32;
            dst  += 4*stride;
            eobs += 2;
        }
    }
}
|
|
@ -16,6 +16,9 @@
|
|||
extern prototype_dequant_block(vp8_dequantize_b_v6);
|
||||
extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
|
||||
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
|
||||
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6);
|
||||
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
|
||||
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
|
||||
|
||||
#undef vp8_dequant_block
|
||||
#define vp8_dequant_block vp8_dequantize_b_v6
|
||||
|
@ -25,12 +28,24 @@ extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
|
|||
|
||||
#undef vp8_dequant_dc_idct_add
|
||||
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
|
||||
|
||||
#undef vp8_dequant_dc_idct_add_y_block
|
||||
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6
|
||||
|
||||
#undef vp8_dequant_idct_add_y_block
|
||||
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
|
||||
|
||||
#undef vp8_dequant_idct_add_uv_block
|
||||
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern prototype_dequant_block(vp8_dequantize_b_neon);
|
||||
extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
|
||||
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
|
||||
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon);
|
||||
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
|
||||
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
|
||||
|
||||
#undef vp8_dequant_block
|
||||
#define vp8_dequant_block vp8_dequantize_b_neon
|
||||
|
@ -40,6 +55,15 @@ extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
|
|||
|
||||
#undef vp8_dequant_dc_idct_add
|
||||
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
|
||||
|
||||
#undef vp8_dequant_dc_idct_add_y_block
|
||||
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon
|
||||
|
||||
#undef vp8_dequant_idct_add_y_block
|
||||
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
|
||||
|
||||
#undef vp8_dequant_idct_add_uv_block
|
||||
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,151 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "idct.h"
|
||||
#include "dequantize.h"
|
||||
|
||||
/*
 * Reconstruct the 16 Y blocks of a macroblock, 4 rows of 4 blocks, when the
 * DC of every block is supplied separately in dc[].
 *
 *   q      quantized coefficients, 16 shorts per block, blocks consecutive
 *   dq     dequantization factors, handed unchanged to the per-block routine
 *   pre    predictor; 64 bytes per row of blocks, 4 bytes per block
 *   dst    destination; rows are stride bytes apart
 *   stride destination stride
 *   eobs   one entry per block; a value > 1 selects the full
 *          dequant + IDCT path, anything else the DC-only path
 *   dc     one DC value per block
 */
void vp8_dequant_dc_idct_add_y_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        /* the four blocks of this row sit 4 pixels apart */
        for (col = 0; col < 4; col++)
        {
            short         *blk_q   = q   + 16 * col;
            unsigned char *blk_pre = pre + 4 * col;
            unsigned char *blk_dst = dst + 4 * col;

            if (eobs[col] > 1)
                vp8_dequant_dc_idct_add_neon (blk_q, dq, blk_pre, blk_dst, 16, stride, dc[col]);
            else
                vp8_dc_only_idct_add_neon (dc[col], blk_pre, blk_dst, 16, stride);
        }

        /* step to the next row of blocks */
        q    += 64;
        dc   += 4;
        pre  += 64;
        dst  += 4*stride;
        eobs += 4;
    }
}
|
||||
|
||||
/*
 * Reconstruct the 16 Y blocks of a macroblock, 4 rows of 4 blocks, taking
 * each block's DC from its own coefficient 0.
 *
 *   q      quantized coefficients, 16 shorts per block, blocks consecutive
 *   dq     dequantization factors; dq[0] scales the DC on the DC-only path
 *   pre    predictor; 64 bytes per row of blocks, 4 bytes per block
 *   dst    destination; rows are stride bytes apart
 *   stride destination stride
 *   eobs   one entry per block; a value > 1 selects the full
 *          dequant + IDCT path, anything else the DC-only path
 *
 * On the DC-only path the first two coefficients of the block are zeroed
 * afterwards.  They are cleared with two short stores rather than through an
 * (int *) cast, which accessed short objects through an int lvalue.
 */
void vp8_dequant_idct_add_y_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        /* the four blocks of this row sit 4 pixels apart */
        for (col = 0; col < 4; col++)
        {
            short         *blk_q   = q   + 16 * col;
            unsigned char *blk_pre = pre + 4 * col;
            unsigned char *blk_dst = dst + 4 * col;

            if (eobs[col] > 1)
                vp8_dequant_idct_add_neon (blk_q, dq, blk_pre, blk_dst, 16, stride);
            else
            {
                vp8_dc_only_idct_add_neon (blk_q[0]*dq[0], blk_pre, blk_dst, 16, stride);
                /* same four bytes the old int store cleared */
                blk_q[0] = 0;
                blk_q[1] = 0;
            }
        }

        /* step to the next row of blocks */
        q    += 64;
        pre  += 64;
        dst  += 4*stride;
        eobs += 4;
    }
}
|
||||
|
||||
/*
 * Reconstruct the 4 U blocks and then the 4 V blocks of a macroblock, each
 * plane as 2 rows of 2 blocks.
 *
 *   q      quantized coefficients, 16 shorts per block; the U blocks come
 *          first, the V blocks follow directly
 *   dq     dequantization factors; dq[0] scales the DC on the DC-only path
 *   pre    predictor; 32 bytes per row of blocks, 4 bytes per block, V
 *          following U
 *   dstu   U destination; rows are stride bytes apart
 *   dstv   V destination; rows are stride bytes apart
 *   stride destination stride, shared by both planes
 *   eobs   one entry per block, U entries first; a value > 1 selects the
 *          full dequant + IDCT path, anything else the DC-only path
 *
 * On the DC-only path the first two coefficients of the block are zeroed
 * afterwards.  They are cleared with two short stores rather than through an
 * (int *) cast, which accessed short objects through an int lvalue.
 */
void vp8_dequant_idct_add_uv_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
    unsigned char *plane_dst[2];
    int plane, row, col;

    plane_dst[0] = dstu;
    plane_dst[1] = dstv;

    /* q, pre and eobs simply keep advancing from the U data into the V data */
    for (plane = 0; plane < 2; plane++)
    {
        unsigned char *dst = plane_dst[plane];

        for (row = 0; row < 2; row++)
        {
            for (col = 0; col < 2; col++)
            {
                short         *blk_q   = q   + 16 * col;
                unsigned char *blk_pre = pre + 4 * col;
                unsigned char *blk_dst = dst + 4 * col;

                if (eobs[col] > 1)
                    vp8_dequant_idct_add_neon (blk_q, dq, blk_pre, blk_dst, 8, stride);
                else
                {
                    vp8_dc_only_idct_add_neon (blk_q[0]*dq[0], blk_pre, blk_dst, 8, stride);
                    /* same four bytes the old int store cleared */
                    blk_q[0] = 0;
                    blk_q[1] = 0;
                }
            }

            /* step to the next row of blocks */
            q    += 32;
            pre  += 32;
            dst  += 4*stride;
            eobs += 2;
        }
    }
}
|
|
@ -237,7 +237,7 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
|
|||
DEQUANT_INVOKE(&pbi->dequant, block)(b);
|
||||
|
||||
// do 2nd order transform on the dc block
|
||||
if (b->eob > 1)
|
||||
if (xd->eobs[24] > 1)
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
|
@ -255,24 +255,10 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
|
|||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
|
||||
|
||||
for (i = 0; i < 16; i++)
|
||||
{
|
||||
|
||||
b = &xd->block[i];
|
||||
|
||||
if (b->eob > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, dc_idct_add)
|
||||
(b->qcoeff, &b->dequant[0][0], b->predictor,
|
||||
*(b->base_dst) + b->dst, 16, b->dst_stride,
|
||||
xd->block[24].diff[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(xd->block[24].diff[i], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
}
|
||||
}
|
||||
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
|
||||
(xd->qcoeff, &xd->block[0].dequant[0][0],
|
||||
xd->predictor, xd->dst.y_buffer,
|
||||
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
|
||||
}
|
||||
else if ((xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
|
||||
{
|
||||
|
@ -282,13 +268,17 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
|
|||
BLOCKD *b = &xd->block[i];
|
||||
vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
|
||||
|
||||
if (b->eob > 1)
|
||||
if (xd->eobs[i] > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
DEQUANT_INVOKE(&pbi->dequant, idct_add)
|
||||
(b->qcoeff, &b->dequant[0][0], b->predictor,
|
||||
*(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
|
||||
(b->qcoeff[0] * b->dequant[0][0], b->predictor,
|
||||
*(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
}
|
||||
|
@ -296,37 +286,16 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
|
|||
}
|
||||
else
|
||||
{
|
||||
for (i = 0; i < 16; i++)
|
||||
{
|
||||
BLOCKD *b = &xd->block[i];
|
||||
|
||||
if (b->eob > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
}
|
||||
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
|
||||
(xd->qcoeff, &xd->block[0].dequant[0][0],
|
||||
xd->predictor, xd->dst.y_buffer,
|
||||
xd->dst.y_stride, xd->eobs);
|
||||
}
|
||||
|
||||
for (i = 16; i < 24; i++)
|
||||
{
|
||||
|
||||
BLOCKD *b = &xd->block[i];
|
||||
|
||||
if (b->eob > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
}
|
||||
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
|
||||
(xd->qcoeff+16*16, &xd->block[16].dequant[0][0],
|
||||
xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
|
||||
xd->dst.uv_stride, xd->eobs+16);
|
||||
}
|
||||
|
||||
static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
|
||||
|
|
|
@ -27,6 +27,21 @@
|
|||
int pitch, int stride, \
|
||||
int dc)
|
||||
|
||||
#define prototype_dequant_dc_idct_add_y_block(sym) \
|
||||
void sym(short *q, short *dq, \
|
||||
unsigned char *pre, unsigned char *dst, \
|
||||
int stride, char *eobs, short *dc)
|
||||
|
||||
#define prototype_dequant_idct_add_y_block(sym) \
|
||||
void sym(short *q, short *dq, \
|
||||
unsigned char *pre, unsigned char *dst, \
|
||||
int stride, char *eobs)
|
||||
|
||||
#define prototype_dequant_idct_add_uv_block(sym) \
|
||||
void sym(short *q, short *dq, \
|
||||
unsigned char *pre, unsigned char *dst_u, \
|
||||
unsigned char *dst_v, int stride, char *eobs)
|
||||
|
||||
#if ARCH_X86 || ARCH_X86_64
|
||||
#include "x86/dequantize_x86.h"
|
||||
#endif
|
||||
|
@ -50,16 +65,42 @@ extern prototype_dequant_idct_add(vp8_dequant_idct_add);
|
|||
#endif
|
||||
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
|
||||
|
||||
#ifndef vp8_dequant_dc_idct_add_y_block
|
||||
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c
|
||||
#endif
|
||||
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block);
|
||||
|
||||
#ifndef vp8_dequant_idct_add_y_block
|
||||
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
|
||||
#endif
|
||||
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block);
|
||||
|
||||
#ifndef vp8_dequant_idct_add_uv_block
|
||||
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
|
||||
#endif
|
||||
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block);
|
||||
|
||||
|
||||
typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
|
||||
|
||||
typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
|
||||
|
||||
typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
|
||||
|
||||
typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t));
|
||||
|
||||
typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));
|
||||
|
||||
typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
|
||||
|
||||
typedef struct
|
||||
{
|
||||
vp8_dequant_block_fn_t block;
|
||||
vp8_dequant_idct_add_fn_t idct_add;
|
||||
vp8_dequant_dc_idct_add_fn_t dc_idct_add;
|
||||
vp8_dequant_block_fn_t block;
|
||||
vp8_dequant_idct_add_fn_t idct_add;
|
||||
vp8_dequant_dc_idct_add_fn_t dc_idct_add;
|
||||
vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
|
||||
vp8_dequant_idct_add_y_block_fn_t idct_add_y_block;
|
||||
vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block;
|
||||
} vp8_dequant_rtcd_vtable_t;
|
||||
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
|
|
|
@ -266,6 +266,8 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
|
|||
|
||||
BOOL_DECODER *bc = x->current_bc;
|
||||
|
||||
char *eobs = x->eobs;
|
||||
|
||||
ENTROPY_CONTEXT *a;
|
||||
ENTROPY_CONTEXT *l;
|
||||
int i;
|
||||
|
@ -416,8 +418,8 @@ ONE_CONTEXT_NODE_0_:
|
|||
|
||||
qcoeff_ptr [ scan[15] ] = (INT16) v;
|
||||
BLOCK_FINISHED:
|
||||
t = ((x->block[i].eob = c) != !type); // any nonzero data?
|
||||
eobtotal += x->block[i].eob;
|
||||
t = ((eobs[i] = c) != !type); // any nonzero data?
|
||||
eobtotal += c;
|
||||
*a = *l = t;
|
||||
qcoeff_ptr += 16;
|
||||
|
||||
|
|
|
@ -19,12 +19,15 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
|
|||
{
|
||||
// Pure C:
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
pbi->mb.rtcd = &pbi->common.rtcd;
|
||||
pbi->dequant.block = vp8_dequantize_b_c;
|
||||
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
|
||||
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c;
|
||||
pbi->dboolhuff.start = vp8dx_start_decode_c;
|
||||
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
|
||||
pbi->mb.rtcd = &pbi->common.rtcd;
|
||||
pbi->dequant.block = vp8_dequantize_b_c;
|
||||
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
|
||||
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c;
|
||||
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
|
||||
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c;
|
||||
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
|
||||
pbi->dboolhuff.start = vp8dx_start_decode_c;
|
||||
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
|
||||
#if 0 //For use with RTCD, when implemented
|
||||
pbi->dboolhuff.debool = vp8dx_decode_bool_c;
|
||||
pbi->dboolhuff.devalue = vp8dx_decode_value_c;
|
||||
|
|
|
@ -0,0 +1,116 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "idct.h"
|
||||
#include "dequantize.h"
|
||||
|
||||
/*
 * Dequantize, inverse-transform and reconstruct the 16 luma 4x4 blocks of a
 * macroblock, taking each block's DC term from the dc[] array instead of
 * from its own coefficients.
 *
 *   q      - coefficients, 16 shorts per block, blocks stored back to back
 *   dq     - dequantization factors shared by all 16 blocks
 *   pre    - predictor buffer, addressed with a pitch of 16 bytes
 *   dst    - top-left of the destination area, addressed with `stride`
 *   stride - destination pitch in bytes
 *   eobs   - one end-of-block value per block, in raster order
 *   dc     - one DC value per block, in raster order
 */
void vp8_dequant_dc_idct_add_y_block_c
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
{
    int blk;

    /* Walk the 4x4 grid of blocks in raster order. */
    for (blk = 0; blk < 16; blk++)
    {
        const int row = blk >> 2;
        const int col = blk & 3;
        short *coeffs = q + 16 * blk;
        /* Each row of blocks spans 4 predictor lines of 16 bytes (64). */
        unsigned char *pred = pre + 64 * row + 4 * col;
        unsigned char *out = dst + 4 * stride * row + 4 * col;

        if (eobs[blk] > 1)
        {
            /* Block has coefficients beyond the first: full transform. */
            vp8_dequant_dc_idct_add_c(coeffs, dq, pred, out, 16, stride,
                                      dc[blk]);
        }
        else
        {
            /* Only the externally supplied DC contributes. */
            vp8_dc_only_idct_add_c(dc[blk], pred, out, 16, stride);
        }
    }
}
|
||||
|
||||
/*
 * Dequantize, inverse-transform and reconstruct the 16 luma 4x4 blocks of a
 * macroblock, using each block's own first coefficient as its DC term.
 *
 *   q      - coefficients, 16 shorts per block, blocks stored back to back
 *   dq     - dequantization factors shared by all 16 blocks
 *   pre    - predictor buffer, addressed with a pitch of 16 bytes
 *   dst    - top-left of the destination area, addressed with `stride`
 *   stride - destination pitch in bytes
 *   eobs   - one end-of-block value per block, in raster order
 */
void vp8_dequant_idct_add_y_block_c
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
{
    int blk;

    /* Walk the 4x4 grid of blocks in raster order. */
    for (blk = 0; blk < 16; blk++)
    {
        const int row = blk >> 2;
        const int col = blk & 3;
        short *coeffs = q + 16 * blk;
        /* Each row of blocks spans 4 predictor lines of 16 bytes (64). */
        unsigned char *pred = pre + 64 * row + 4 * col;
        unsigned char *out = dst + 4 * stride * row + 4 * col;

        if (eobs[blk] > 1)
        {
            vp8_dequant_idct_add_c(coeffs, dq, pred, out, 16, stride);
        }
        else
        {
            /* DC-only shortcut: dequantize the first coefficient here. */
            vp8_dc_only_idct_add_c(coeffs[0] * dq[0], pred, out, 16, stride);

            /* One int-sized store clears the start of the coefficient
             * block (coeffs[0] and coeffs[1] where int is two shorts wide). */
            ((int *)coeffs)[0] = 0;
        }
    }
}
|
||||
|
||||
/*
 * Dequantize, inverse-transform and reconstruct the eight chroma 4x4 blocks
 * of a macroblock: four written to dstu, then four written to dstv.
 *
 *   q      - coefficients, 16 shorts per block, U blocks first then V
 *   dq     - dequantization factors shared by all eight blocks
 *   pre    - predictor buffer, addressed with a pitch of 8 bytes; the V
 *            predictor follows directly after the U predictor
 *   dstu   - top-left of the U destination area
 *   dstv   - top-left of the V destination area
 *   stride - destination pitch in bytes (used for both planes)
 *   eobs   - one end-of-block value per block, U blocks first then V
 */
void vp8_dequant_idct_add_uv_block_c
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
    int blk;

    for (blk = 0; blk < 8; blk++)
    {
        /* Blocks 0-3 go to the U plane, 4-7 to the V plane. */
        unsigned char *plane = (blk < 4) ? dstu : dstv;
        const int row = (blk >> 1) & 1;     /* block row inside the plane */
        const int col = blk & 1;            /* block column inside the plane */
        short *coeffs = q + 16 * blk;
        /* Each row of blocks spans 4 predictor lines of 8 bytes (32), and
         * the predictor runs on unbroken from the U rows into the V rows. */
        unsigned char *pred = pre + 32 * (blk >> 1) + 4 * col;
        unsigned char *out = plane + 4 * stride * row + 4 * col;

        if (eobs[blk] > 1)
        {
            vp8_dequant_idct_add_c(coeffs, dq, pred, out, 8, stride);
        }
        else
        {
            /* DC-only shortcut: dequantize the first coefficient here. */
            vp8_dc_only_idct_add_c(coeffs[0] * dq[0], pred, out, 8, stride);

            /* One int-sized store clears the start of the coefficient
             * block (coeffs[0] and coeffs[1] where int is two shorts wide). */
            ((int *)coeffs)[0] = 0;
        }
    }
}
|
|
@ -23,7 +23,9 @@
|
|||
extern prototype_dequant_block(vp8_dequantize_b_mmx);
|
||||
extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
|
||||
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
|
||||
|
||||
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx);
|
||||
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);
|
||||
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_dequant_block
|
||||
|
@ -35,6 +37,33 @@ extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
|
|||
#undef vp8_dequant_dc_idct_add
|
||||
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx
|
||||
|
||||
#undef vp8_dequant_dc_idct_add_y_block
|
||||
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx
|
||||
|
||||
#undef vp8_dequant_idct_add_y_block
|
||||
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx
|
||||
|
||||
#undef vp8_dequant_idct_add_uv_block
|
||||
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_mmx
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if HAVE_SSE2
|
||||
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2);
|
||||
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);
|
||||
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_dequant_dc_idct_add_y_block
|
||||
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2
|
||||
|
||||
#undef vp8_dequant_idct_add_y_block
|
||||
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
|
||||
|
||||
#undef vp8_dequant_idct_add_uv_block
|
||||
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
@ -0,0 +1,151 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "idct.h"
|
||||
#include "dequantize.h"
|
||||
|
||||
/*
 * MMX-backed variant: dequantize, inverse-transform and reconstruct the 16
 * luma 4x4 blocks of a macroblock, taking each block's DC term from dc[].
 *
 *   q      - coefficients, 16 shorts per block, blocks stored back to back
 *   dq     - dequantization factors shared by all 16 blocks
 *   pre    - predictor buffer, addressed with a pitch of 16 bytes
 *   dst    - top-left of the destination area, addressed with `stride`
 *   stride - destination pitch in bytes
 *   eobs   - one end-of-block value per block, in raster order
 *   dc     - one DC value per block, in raster order
 */
void vp8_dequant_dc_idct_add_y_block_mmx
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        /* Four side-by-side blocks in this row of the macroblock. */
        for (col = 0; col < 4; col++)
        {
            short *coeffs = q + 16 * col;
            unsigned char *pred = pre + 4 * col;
            unsigned char *out = dst + 4 * col;

            if (eobs[col] > 1)
                vp8_dequant_dc_idct_add_mmx(coeffs, dq, pred, out, 16, stride,
                                            dc[col]);
            else
                vp8_dc_only_idct_add_mmx(dc[col], pred, out, 16, stride);
        }

        /* Step every cursor down to the next row of four blocks. */
        q    += 64;
        dc   += 4;
        pre  += 64;
        dst  += 4 * stride;
        eobs += 4;
    }
}
|
||||
|
||||
/*
 * MMX-backed variant: dequantize, inverse-transform and reconstruct the 16
 * luma 4x4 blocks of a macroblock, using each block's own first coefficient
 * as its DC term.
 *
 *   q      - coefficients, 16 shorts per block, blocks stored back to back
 *   dq     - dequantization factors shared by all 16 blocks
 *   pre    - predictor buffer, addressed with a pitch of 16 bytes
 *   dst    - top-left of the destination area, addressed with `stride`
 *   stride - destination pitch in bytes
 *   eobs   - one end-of-block value per block, in raster order
 */
void vp8_dequant_idct_add_y_block_mmx
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        /* Four side-by-side blocks in this row of the macroblock. */
        for (col = 0; col < 4; col++)
        {
            short *coeffs = q + 16 * col;
            unsigned char *pred = pre + 4 * col;
            unsigned char *out = dst + 4 * col;

            if (eobs[col] > 1)
            {
                vp8_dequant_idct_add_mmx(coeffs, dq, pred, out, 16, stride);
            }
            else
            {
                /* DC-only shortcut: dequantize the first coefficient here. */
                vp8_dc_only_idct_add_mmx(coeffs[0] * dq[0], pred, out, 16,
                                         stride);

                /* One int-sized store clears the start of the block
                 * (coeffs[0] and coeffs[1] where int is two shorts wide). */
                ((int *)coeffs)[0] = 0;
            }
        }

        /* Step every cursor down to the next row of four blocks. */
        q    += 64;
        pre  += 64;
        dst  += 4 * stride;
        eobs += 4;
    }
}
|
||||
|
||||
/*
 * MMX-backed variant: dequantize, inverse-transform and reconstruct the
 * eight chroma 4x4 blocks of a macroblock, four into dstu then four into
 * dstv.
 *
 *   q      - coefficients, 16 shorts per block, U blocks first then V
 *   dq     - dequantization factors shared by all eight blocks
 *   pre    - predictor buffer, addressed with a pitch of 8 bytes; the V
 *            predictor follows directly after the U predictor
 *   dstu   - top-left of the U destination area
 *   dstv   - top-left of the V destination area
 *   stride - destination pitch in bytes (used for both planes)
 *   eobs   - one end-of-block value per block, U blocks first then V
 */
void vp8_dequant_idct_add_uv_block_mmx
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
    unsigned char *planes[2];
    int plane, row, col;

    planes[0] = dstu;
    planes[1] = dstv;

    /* q, pre and eobs keep advancing across the U -> V boundary; only the
     * destination pointer is reset per plane. */
    for (plane = 0; plane < 2; plane++)
    {
        unsigned char *out_row = planes[plane];

        for (row = 0; row < 2; row++)
        {
            /* Two side-by-side blocks in this row of the plane. */
            for (col = 0; col < 2; col++)
            {
                short *coeffs = q + 16 * col;
                unsigned char *pred = pre + 4 * col;
                unsigned char *out = out_row + 4 * col;

                if (eobs[col] > 1)
                {
                    vp8_dequant_idct_add_mmx(coeffs, dq, pred, out, 8, stride);
                }
                else
                {
                    /* DC-only shortcut: dequantize the first coefficient. */
                    vp8_dc_only_idct_add_mmx(coeffs[0] * dq[0], pred, out, 8,
                                             stride);

                    /* One int-sized store clears the start of the block
                     * (coeffs[0] and coeffs[1] where int is two shorts). */
                    ((int *)coeffs)[0] = 0;
                }
            }

            /* Step down to the next row of two blocks. */
            q       += 32;
            pre     += 32;
            out_row += 4 * stride;
            eobs    += 2;
        }
    }
}
|
|
@ -0,0 +1,114 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "idct.h"
|
||||
#include "dequantize.h"
|
||||
|
||||
/*
 * SSE2 helpers declared here for the block-level wrappers in this file.
 * The wrappers invoke each helper once per pair of horizontally adjacent
 * 4x4 blocks (coefficients advance by 32 shorts, pixels by 8 bytes per
 * call).  The "_0_" variants are called when neither end-of-block value of
 * the pair has a bit other than bit 0 set; the "_full_" variants otherwise.
 */

/* Variants that take per-block DC values through the `dc` pointer. */
void idct_dequant_dc_0_2x_sse2(short *q, short *dq, unsigned char *pre,
                               unsigned char *dst, int dst_stride,
                               short *dc);

void idct_dequant_dc_full_2x_sse2(short *q, short *dq, unsigned char *pre,
                                  unsigned char *dst, int dst_stride,
                                  short *dc);

/* Variants without `dc`; blk_stride is passed as 16 for luma and 8 for
 * chroma by the callers in this file. */
void idct_dequant_0_2x_sse2(short *q, short *dq, unsigned char *pre,
                            unsigned char *dst, int dst_stride,
                            int blk_stride);

void idct_dequant_full_2x_sse2(short *q, short *dq, unsigned char *pre,
                               unsigned char *dst, int dst_stride,
                               int blk_stride);
|
||||
|
||||
/*
 * SSE2-backed variant: dequantize, inverse-transform and reconstruct the 16
 * luma 4x4 blocks of a macroblock two blocks at a time, taking DC terms
 * from dc[].
 *
 *   q      - coefficients, 16 shorts per block, blocks stored back to back
 *   dq     - dequantization factors shared by all blocks
 *   pre    - predictor buffer (64 bytes per row of four blocks)
 *   dst    - top-left of the destination area, addressed with `stride`
 *   stride - destination pitch in bytes
 *   eobs   - one end-of-block byte per block, in raster order
 *   dc     - one DC value per block, in raster order
 */
void vp8_dequant_dc_idct_add_y_block_sse2
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
{
    int row, pair;

    for (row = 0; row < 4; row++)
    {
        /* View this row's four eob bytes as two 16-bit words so both
         * blocks of a pair can be tested with one mask. */
        short *eob_pairs = (short *)eobs;

        for (pair = 0; pair < 2; pair++)
        {
            short *coeffs = q + 32 * pair;
            unsigned char *pred = pre + 8 * pair;
            unsigned char *out = dst + 8 * pair;
            short *pair_dc = dc + 2 * pair;

            /* 0xfefe is non-zero when either byte has a bit other than
             * bit 0 set, i.e. when either eob is something besides 0 or 1. */
            if (eob_pairs[pair] & 0xfefe)
                idct_dequant_dc_full_2x_sse2(coeffs, dq, pred, out, stride,
                                             pair_dc);
            else
                idct_dequant_dc_0_2x_sse2(coeffs, dq, pred, out, stride,
                                          pair_dc);
        }

        /* Step every cursor down to the next row of four blocks. */
        q    += 64;
        dc   += 4;
        pre  += 64;
        dst  += stride * 4;
        eobs += 4;
    }
}
|
||||
|
||||
/*
 * SSE2-backed variant: dequantize, inverse-transform and reconstruct the 16
 * luma 4x4 blocks of a macroblock two blocks at a time.
 *
 *   q      - coefficients, 16 shorts per block, blocks stored back to back
 *   dq     - dequantization factors shared by all blocks
 *   pre    - predictor buffer (64 bytes per row of four blocks)
 *   dst    - top-left of the destination area, addressed with `stride`
 *   stride - destination pitch in bytes
 *   eobs   - one end-of-block byte per block, in raster order
 */
void vp8_dequant_idct_add_y_block_sse2
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
{
    int row, pair;

    for (row = 0; row < 4; row++)
    {
        /* View this row's four eob bytes as two 16-bit words so both
         * blocks of a pair can be tested with one mask. */
        short *eob_pairs = (short *)eobs;

        for (pair = 0; pair < 2; pair++)
        {
            short *coeffs = q + 32 * pair;
            unsigned char *pred = pre + 8 * pair;
            unsigned char *out = dst + 8 * pair;

            /* 0xfefe is non-zero when either byte has a bit other than
             * bit 0 set, i.e. when either eob is something besides 0 or 1. */
            if (eob_pairs[pair] & 0xfefe)
                idct_dequant_full_2x_sse2(coeffs, dq, pred, out, stride, 16);
            else
                idct_dequant_0_2x_sse2(coeffs, dq, pred, out, stride, 16);
        }

        /* Step every cursor down to the next row of four blocks. */
        q    += 64;
        pre  += 64;
        dst  += stride * 4;
        eobs += 4;
    }
}
|
||||
|
||||
/*
 * SSE2-backed variant: dequantize, inverse-transform and reconstruct the
 * eight chroma 4x4 blocks of a macroblock two blocks at a time -- two pairs
 * into dstu, then two pairs into dstv.
 *
 *   q      - coefficients, 16 shorts per block, U blocks first then V
 *   dq     - dequantization factors shared by all blocks
 *   pre    - predictor buffer (32 bytes per pair of blocks), U then V
 *   dstu   - top-left of the U destination area
 *   dstv   - top-left of the V destination area
 *   stride - destination pitch in bytes (used for both planes)
 *   eobs   - one end-of-block byte per block, U blocks first then V
 */
void vp8_dequant_idct_add_uv_block_sse2
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
    /* View the eight eob bytes as four 16-bit words, one per block pair. */
    short *eob_pairs = (short *)eobs;
    int pair;

    for (pair = 0; pair < 4; pair++)
    {
        short *coeffs = q + 32 * pair;
        unsigned char *pred = pre + 32 * pair;
        /* Pairs 0,1 are the upper/lower U rows; pairs 2,3 the V rows. */
        unsigned char *out = (pair < 2) ? dstu : dstv;

        if (pair & 1)
            out += stride * 4;

        /* 0xfefe is non-zero when either byte has a bit other than bit 0
         * set, i.e. when either eob is something besides 0 or 1. */
        if (eob_pairs[pair] & 0xfefe)
            idct_dequant_full_2x_sse2(coeffs, dq, pred, out, stride, 8);
        else
            idct_dequant_0_2x_sse2(coeffs, dq, pred, out, stride, 8);
    }
}
|
|
@ -39,14 +39,24 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
|
|||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
/* Override default functions with fastest ones for this CPU. */
|
||||
#if HAVE_MMX
|
||||
|
||||
if (flags & HAS_MMX)
|
||||
{
|
||||
pbi->dequant.block = vp8_dequantize_b_mmx;
|
||||
pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
|
||||
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx;
|
||||
pbi->dequant.block = vp8_dequantize_b_mmx;
|
||||
pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
|
||||
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx;
|
||||
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;
|
||||
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
|
||||
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
|
||||
}
|
||||
#endif
|
||||
#if HAVE_SSE2
|
||||
if (flags & HAS_SSE2)
|
||||
{
|
||||
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2;
|
||||
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
|
||||
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -103,6 +103,7 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
|
|||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
|
||||
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
|
||||
|
|
|
@ -68,9 +68,12 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h
|
|||
VP8_DX_SRCS-yes += decoder/treereader.h
|
||||
VP8_DX_SRCS-yes += decoder/onyxd_if.c
|
||||
VP8_DX_SRCS-yes += decoder/threading.c
|
||||
VP8_DX_SRCS-yes += decoder/idct_blk.c
|
||||
|
||||
VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
|
||||
|
||||
VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/dequantize_x86.h
|
||||
VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c
|
||||
VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm
|
||||
VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c
|
||||
VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c
|
||||
|
|
|
@ -15,14 +15,17 @@ VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dequantize_arm.c
|
|||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dsystemdependent.c
|
||||
VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/generic/dsystemdependent.c
|
||||
VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/dequantize.c
|
||||
VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/idct_blk.c
|
||||
VP8_DX_SRCS-$(CONFIG_ARM_ASM_DETOK) += decoder/arm/detokenize$(ASM)
|
||||
|
||||
#File list for armv6
|
||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c
|
||||
|
||||
#File list for neon
|
||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c
|
||||
|
|
Загрузка…
Ссылка в новой задаче