SIMD-accelerated slow integer forward DCT and quantize routines for MIPS DSPr2

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1054 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC 2013-09-30 18:13:27 +00:00
Родитель ecc9560822
Коммит a6b7fbd352
4 изменённых файлов: 299 добавлений и 3 удалений

Просмотреть файл

@ -6,8 +6,7 @@ line padding (previously, it only supported 4-byte padding, which was
compatible with X Video.) Also, the decompress-to-YUV function has been
extended to support image scaling.
[2] Added SIMD acceleration for performing color conversion, downsampling,
upsampling, and IDCT scaling on DSPr2-capable MIPS platforms. This speeds up
[2] Added SIMD acceleration for DSPr2-capable MIPS platforms. This speeds up
the compression of full-color JPEGs by 6-21% on such platforms and
decompression by 6-17%.

Просмотреть файл

@ -674,6 +674,8 @@ EXTERN(void) jsimd_fdct_ifast_sse2 JPP((DCTELEM * data));
EXTERN(void) jsimd_fdct_ifast_neon JPP((DCTELEM * data));
EXTERN(void) jsimd_fdct_islow_mips_dspr2 JPP((DCTELEM * data));
EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
extern const int jconst_fdct_float_sse[];
@ -692,6 +694,10 @@ EXTERN(void) jsimd_quantize_neon JPP((JCOEFPTR coef_block,
DCTELEM * divisors,
DCTELEM * workspace));
EXTERN(void) jsimd_quantize_mips_dspr2 JPP((JCOEFPTR coef_block,
DCTELEM * divisors,
DCTELEM * workspace));
EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
FAST_FLOAT * divisors,
FAST_FLOAT * workspace));
@ -742,7 +748,6 @@ EXTERN(void) jsimd_idct_4x4_mips_dspr2 JPP((void * dct_table,
JSAMPARRAY output_buf,
JDIMENSION output_col,
int * workspace));
EXTERN(void) jsimd_idct_6x6_mips_dspr2 JPP((void * dct_table,
JCOEFPTR coef_block,
JSAMPARRAY output_buf,

Просмотреть файл

@ -471,6 +471,17 @@ jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
GLOBAL(int)
jsimd_can_fdct_islow (void)
{
  init_simd();

  /* The DSPr2 routine is written for DCTSIZE == 8 and 2-byte DCTELEMs only;
   * any other build configuration must not use it. */
  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
    return 0;

  /* Report 1 only when the DSPr2 bit is set in simd_support. */
  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
}
@ -489,6 +500,8 @@ jsimd_can_fdct_float (void)
GLOBAL(void)
jsimd_fdct_islow (DCTELEM * data)
{
  /* Nothing is done here unless the DSPr2 bit is set in simd_support. */
  if (!(simd_support & JSIMD_MIPS_DSPR2))
    return;

  jsimd_fdct_islow_mips_dspr2(data);
}
GLOBAL(void)
@ -504,6 +517,19 @@ jsimd_fdct_float (FAST_FLOAT * data)
GLOBAL(int)
jsimd_can_quantize (void)
{
  init_simd();

  /* The DSPr2 routine is written for DCTSIZE == 8 with 2-byte JCOEF and
   * 2-byte DCTELEM only; any other build configuration must not use it. */
  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
    return 0;

  /* Report 1 only when the DSPr2 bit is set in simd_support. */
  return (simd_support & JSIMD_MIPS_DSPR2) ? 1 : 0;
}
@ -517,6 +543,8 @@ GLOBAL(void)
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
DCTELEM * workspace)
{
/* Forward all three arguments unchanged to the MIPS DSPr2 routine when
 * simd_support has JSIMD_MIPS_DSPR2 set; otherwise nothing is done here. */
if (simd_support & JSIMD_MIPS_DSPR2)
jsimd_quantize_mips_dspr2(coef_block, divisors, workspace);
}
GLOBAL(void)
@ -638,6 +666,7 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
output_buf, output_col, workspace);
}
}
GLOBAL(void)
jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,

Просмотреть файл

@ -963,6 +963,269 @@ LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
j ra
nop
END(jsimd_h2v2_upsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
/*
 * In-place integer forward DCT over an 8x8 block of 16-bit elements.
 *
 * a0 - data
 *
 * The work is done in two passes of eight iterations each:
 *   pass 1 (label 1) advances a1 by 16 bytes per iteration and handles the
 *          eight consecutive halfwords of one row with paired-halfword
 *          DSP instructions;
 *   pass 2 (label 2) advances a0 by 2 bytes per iteration and handles one
 *          column, whose eight elements are 16 bytes apart.
 *
 * Registers written besides the saved s0-s8: t0-t9, a1, a2, a3, v0, v1 and
 * the accumulators $ac0-$ac3.  a0 itself is advanced by 16 bytes in pass 2.
 *
 * NOTE(review): instructions placed directly after bgtz/jr are presumably
 * meant to run in the branch delay slot (.set noreorder is not visible in
 * this section) -- confirm against the file header.
 */
/* Macro defined outside this section; presumably spills s0-s8 into a 40-byte stack frame. */
SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
/*
 * Pass 1 coefficients.  Each register packs two signed 16-bit values as
 * high|low so that one dpa.w.ph can multiply both halves of a packed
 * operand and add the two products into an accumulator.
 */
lui t0, 6437
ori t0, 2260 // t0 = 6437 | 2260
lui t1, 9633
ori t1, 11363 // t1 = 9633 | 11363
lui t2, 0xd39e
ori t2, 0xe6dc // t2 = -11362 | -6436
lui t3, 0xf72d
ori t3, 9633 // t3 = -2259 | 9633
lui t4, 2261
ori t4, 9633 // t4 = 2261 | 9633
lui t5, 0xd39e
ori t5, 6437 // t5 = -11362 | 6437
lui t6, 9633
ori t6, 0xd39d // t6 = 9633 | -11363
lui t7, 0xe6dc
ori t7, 2260 // t7 = -6436 | 2260
lui t8, 4433
ori t8, 10703 // t8 = 4433 | 10703
lui t9, 0xd630
ori t9, 4433 // t9 = -10704 | 4433
li s8, 8 // pass 1 iteration counter
move a1, a0 // a1 = pass 1 cursor; a0 stays at the block start for pass 2
/* ---- Pass 1: one row (eight halfwords at a1) per iteration ---- */
1:
lw s0, 0(a1) // tmp0 = 1|0
lw s1, 4(a1) // tmp1 = 3|2
lw s2, 8(a1) // tmp2 = 5|4
lw s3, 12(a1) // tmp3 = 7|6
packrl.ph s1, s1, s1 // tmp1 = 2|3
packrl.ph s3, s3, s3 // tmp3 = 6|7
subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
/* Four dot products over the differences t4..t7, one per accumulator. */
mult $0, $0 // ac0 = 0
dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
mult $ac1, $0, $0 // ac1 = 0
dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
mult $ac2, $0, $0 // ac2 = 0
dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
mult $ac3, $0, $0 // ac3 = 0
dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
/* Odd row elements: only the low halfword of each result is stored. */
sh s0, 2(a1) // row element 1
sh s1, 6(a1) // row element 3
sh s2, 10(a1) // row element 5
sh s3, 14(a1) // row element 7
mult $0, $0 // ac0 = 0
dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
mult $ac1, $0, $0 // ac1 = 0
dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
sra s4, s5, 16 // tmp4 = t11
addiu a1, a1, 16 // advance to the next row; stores below use negative offsets
addiu s8, s8, -1 // one row done
extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
/* s5 still holds t11|t10 packed; only the low halfword of the sums below is stored. */
addu s2, s5, s4 // tmp2 = t10 + t11
subu s3, s5, s4 // tmp3 = t10 - t11
sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
sh s2, -16(a1) // row element 0
sh s3, -8(a1) // row element 4
sh s0, -12(a1) // row element 2
bgtz s8, 1b // loop while rows remain
sh s1, -4(a1) // row element 6 (instruction following the branch)
/*
 * Pass 2 coefficients, one scalar per register (named c0..c10 in the
 * comments below): c0 = t0, c1 = t1, ... c9 = t9, c10 = a1.
 */
li t0, 2260
li t1, 11363
li t2, 9633
li t3, 6436
li t4, 6437
li t5, 2261
li t6, 11362
li t7, 2259
li t8, 4433
li t9, 10703
li a1, 10704 // a1 is reused as a constant now that pass 1 is finished
li s8, 8 // pass 2 iteration counter
/* ---- Pass 2: one column per iteration; trailing numbers are element indexes relative to a0 ---- */
2:
lh a2, 0(a0) // 0
lh a3, 16(a0) // 8
lh v0, 32(a0) // 16
lh v1, 48(a0) // 24
lh s4, 64(a0) // 32
lh s5, 80(a0) // 40
lh s6, 96(a0) // 48
lh s7, 112(a0) // 56
addu s2, v0, s5 // tmp2 = 16 + 40
subu s5, v0, s5 // tmp5 = 16 - 40
addu s3, v1, s4 // tmp3 = 24 + 32
subu s4, v1, s4 // tmp4 = 24 - 32
addu s0, a2, s7 // tmp0 = 0 + 56
subu s7, a2, s7 // tmp7 = 0 - 56
addu s1, a3, s6 // tmp1 = 8 + 48
subu s6, a3, s6 // tmp6 = 8 - 48
addu a2, s0, s3 // tmp10 = tmp0 + tmp3
subu v1, s0, s3 // tmp13 = tmp0 - tmp3
addu a3, s1, s2 // tmp11 = tmp1 + tmp2
subu v0, s1, s2 // tmp12 = tmp1 - tmp2
/* Four scalar dot products over tmp4..tmp7, one per accumulator. */
mult s7, t1 // ac0 = tmp7 * c1
madd s4, t0 // ac0 += tmp4 * c0
madd s5, t4 // ac0 += tmp5 * c4
madd s6, t2 // ac0 += tmp6 * c2
mult $ac1, s7, t2 // ac1 = tmp7 * c2
msub $ac1, s4, t3 // ac1 -= tmp4 * c3
msub $ac1, s5, t6 // ac1 -= tmp5 * c6
msub $ac1, s6, t7 // ac1 -= tmp6 * c7
mult $ac2, s7, t4 // ac2 = tmp7 * c4
madd $ac2, s4, t2 // ac2 += tmp4 * c2
madd $ac2, s5, t5 // ac2 += tmp5 * c5
msub $ac2, s6, t6 // ac2 -= tmp6 * c6
mult $ac3, s7, t0 // ac3 = tmp7 * c0
msub $ac3, s4, t1 // ac3 -= tmp4 * c1
madd $ac3, s5, t2 // ac3 += tmp5 * c2
msub $ac3, s6, t3 // ac3 -= tmp6 * c3
extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
addiu s8, s8, -1 // one column done
addu s4, a2, a3 // tmp4 = tmp10 + tmp11
subu s5, a2, a3 // tmp5 = tmp10 - tmp11
sh s0, 16(a0) // column element 8
sh s1, 48(a0) // column element 24
sh s2, 80(a0) // column element 40
sh s3, 112(a0) // column element 56
mult v0, t8 // ac0 = tmp12 * c8
madd v1, t9 // ac0 += tmp13 * c9
mult $ac1, v1, t8 // ac1 = tmp13 * c8
msub $ac1, v0, a1 // ac1 -= tmp12 * c10
addiu a0, a0, 2 // advance to the next column; offsets below are relative to the new a0
extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
sh s4, -2(a0) // column element 0
sh s5, 62(a0) // column element 32
sh s6, 30(a0) // column element 16
bgtz s8, 2b // loop while columns remain
sh s7, 94(a0) // column element 48 (instruction following the branch)
/* Counterpart of SAVE_REGS_ON_STACK above; macro defined outside this section. */
RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
jr ra // return to caller
nop // fills the slot after the jump
END(jsimd_fdct_islow_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
/*
 * Quantizes 64 halfword values read from workspace and stores the results
 * as halfwords in coef_block.  Two elements are handled per loop iteration
 * (31 iterations), and the last pair is handled by a straight-line tail.
 *
 * a0 - coef_block
 * a1 - divisors
 * a2 - workspace
 *
 * For element i, as read from the instructions below:
 *   sign = (workspace[i] >> 15) * 2 + 1              (+1 or -1)
 *   mag  = workspace[i] * sign
 *   q    = (((mag + divisors[i + 64]) & 0xffff) * (divisors[i] & 0xffff))
 *            >> (divisors[i + 192] + 16)
 *   coef_block[i] = low halfword of (q * sign)
 * (divisors is indexed in halfwords: byte offsets 128 and 384 from a1.)
 *
 * The loop is software-pipelined: operands of the next pair are loaded
 * before the branch, so the prologue loads the first pair up front.
 *
 * Register roles:
 *   even element: t0 = mag, t3 = sign, t1 = divisors[i],
 *                 t2 = divisors[i + 64], t4 = divisors[i + 192]
 *   odd element:  t6 = value then mag, s0 = sign, t7 = divisors[i + 1],
 *                 t5 = divisors[i + 65], t8 = divisors[i + 193]
 *   v0 = loop end marker, v1/s2 = results, t9/s1 = scratch
 *
 * NOTE(review): srav is an arithmetic shift applied to the low 32 bits of
 * a product of two zero-extended 16-bit values; the result differs from a
 * logical shift whenever bit 31 of that product is set -- confirm that the
 * operand ranges rule this out.
 */
.set at // let the assembler use $at within this routine
/* Macro defined outside this section; presumably spills s0-s2 into a 16-byte stack frame. */
SAVE_REGS_ON_STACK 16, s0, s1, s2
addiu v0, a2, 124 // v0 = workspace_end
/* Prologue: fetch the operands of the first pair (elements 0 and 1). */
lh t0, 0(a2) // even value
lh t1, 0(a1) // divisors[0]
lh t2, 128(a1) // divisors[64]
sra t3, t0, 15 // t3 = 0 or -1 depending on the sign of t0
sll t3, t3, 1 // t3 = 0 or -2
addiu t3, t3, 1 // t3 = +1 or -1
mul t0, t0, t3 // t0 = magnitude of the even value
lh t4, 384(a1) // divisors[192]
lh t5, 130(a1) // divisors[65]
lh t6, 2(a2) // odd value
lh t7, 2(a1) // divisors[1]
lh t8, 386(a1) // divisors[193]
/* ---- Main loop: finish the loaded pair, then load the next one ---- */
1:
andi t1, 0xffff // zero-extend the even multiplier
add t9, t0, t2 // even: mag + divisors[i + 64]
andi t9, 0xffff // keep the low 16 bits of the sum
mul v1, t9, t1 // even: product (low 32 bits)
sra s0, t6, 15 // odd: s0 = 0 or -1
sll s0, s0, 1 // odd: s0 = 0 or -2
addiu s0, s0, 1 // odd: s0 = +1 or -1
addiu t9, t4, 16 // even: shift amount = divisors[i + 192] + 16
srav v1, v1, t9 // even: shift the product right
mul v1, v1, t3 // even: reapply the sign
mul t6, t6, s0 // odd: t6 = magnitude
andi t7, 0xffff // zero-extend the odd multiplier
addiu a2, a2, 4 // workspace += 2 elements
addiu a1, a1, 4 // divisors += 2 elements; t5/t8 already hold the odd operands
add s1, t6, t5 // odd: mag + divisors[i + 65]
andi s1, 0xffff // keep the low 16 bits of the sum
sh v1, 0(a0) // store the even result
mul s2, s1, t7 // odd: product (low 32 bits)
addiu s1, t8, 16 // odd: shift amount = divisors[i + 193] + 16
srav s2, s2, s1 // odd: shift the product right
mul s2,s2, s0 // odd: reapply the sign
/* Load the operands of the next pair through the advanced a1/a2. */
lh t0, 0(a2)
lh t1, 0(a1)
sra t3, t0, 15
sll t3, t3, 1
addiu t3, t3, 1
mul t0, t0, t3 // next even magnitude
lh t2, 128(a1)
lh t4, 384(a1)
lh t5, 130(a1)
lh t8, 386(a1)
lh t6, 2(a2)
lh t7, 2(a1)
sh s2, 2(a0) // store the odd result of the current pair
/* NOTE(review): the five instructions below repeat the t0/t3 computation
 * done just above, and nothing in between reads t0 or t3 -- this looks
 * redundant unless the reload after the store is intentional; confirm. */
lh t0, 0(a2)
sra t3, t0, 15
sll t3, t3, 1
addiu t3, t3, 1
mul t0, t0,t3
bne a2, v0, 1b // loop until a2 reaches workspace + 124
addiu a0, a0, 4 // coef_block += 2 elements (instruction following the branch)
/* ---- Tail: quantize the final pair, whose operands are already loaded ---- */
andi t1, 0xffff
add t9, t0, t2
andi t9, 0xffff
mul v1, t9, t1
sra s0, t6, 15
sll s0, s0, 1
addiu s0, s0, 1
addiu t9, t4, 16
srav v1, v1, t9
mul v1, v1, t3
mul t6, t6, s0
andi t7, 0xffff
sh v1, 0(a0) // store the last even result
add s1, t6, t5
andi s1, 0xffff
mul s2, s1, t7
addiu s1, t8, 16
addiu a2, a2, 4 // a2 is not read again before the return
addiu a1, a1, 4 // a1 is not read again before the return
srav s2, s2, s1
mul s2, s2, s0
sh s2, 2(a0) // store the last odd result
/* Counterpart of SAVE_REGS_ON_STACK above; macro defined outside this section. */
RESTORE_REGS_FROM_STACK 16, s0, s1, s2
j ra // return to caller
nop // fills the slot after the jump
END(jsimd_quantize_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)