зеркало из https://github.com/mozilla/mozjpeg.git
SIMD-accelerated slow integer forward DCT and quantize routines for MIPS DSPr2
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1054 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
Родитель
ecc9560822
Коммит
a6b7fbd352
|
@ -6,8 +6,7 @@ line padding (previously, it only supported 4-byte padding, which was
|
|||
compatible with X Video.) Also, the decompress-to-YUV function has been
|
||||
extended to support image scaling.
|
||||
|
||||
[2] Added SIMD acceleration for performing color conversion, downsampling,
|
||||
upsampling, and IDCT scaling on DSPr2-capable MIPS platforms. This speeds up
|
||||
[2] Added SIMD acceleration for DSPr2-capable MIPS platforms. This speeds up
|
||||
the compression of full-color JPEGs by 6-21% on such platforms and
|
||||
decompression by 6-17%.
|
||||
|
||||
|
|
|
@ -674,6 +674,8 @@ EXTERN(void) jsimd_fdct_ifast_sse2 JPP((DCTELEM * data));
|
|||
|
||||
EXTERN(void) jsimd_fdct_ifast_neon JPP((DCTELEM * data));
|
||||
|
||||
EXTERN(void) jsimd_fdct_islow_mips_dspr2 JPP((DCTELEM * data));
|
||||
|
||||
EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
|
||||
|
||||
extern const int jconst_fdct_float_sse[];
|
||||
|
@ -692,6 +694,10 @@ EXTERN(void) jsimd_quantize_neon JPP((JCOEFPTR coef_block,
|
|||
DCTELEM * divisors,
|
||||
DCTELEM * workspace));
|
||||
|
||||
EXTERN(void) jsimd_quantize_mips_dspr2 JPP((JCOEFPTR coef_block,
|
||||
DCTELEM * divisors,
|
||||
DCTELEM * workspace));
|
||||
|
||||
EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
|
||||
FAST_FLOAT * divisors,
|
||||
FAST_FLOAT * workspace));
|
||||
|
@ -742,7 +748,6 @@ EXTERN(void) jsimd_idct_4x4_mips_dspr2 JPP((void * dct_table,
|
|||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col,
|
||||
int * workspace));
|
||||
|
||||
EXTERN(void) jsimd_idct_6x6_mips_dspr2 JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
|
|
|
@ -471,6 +471,17 @@ jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
|
|||
/* Capability check for the slow-integer forward DCT: calls init_simd(), then returns 1 only when DCTSIZE is 8, DCTELEM is 2 bytes wide and simd_support has the JSIMD_MIPS_DSPR2 bit set; returns 0 otherwise. */
GLOBAL(int)
|
||||
jsimd_can_fdct_islow (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
/* All layout requirements hold; the answer now depends only on the DSPr2 support bit. */
if (simd_support & JSIMD_MIPS_DSPR2)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -489,6 +500,8 @@ jsimd_can_fdct_float (void)
|
|||
/* Dispatches the slow-integer forward DCT on `data` to the MIPS DSPr2 routine. Does nothing when the JSIMD_MIPS_DSPR2 bit is clear in simd_support, and does not call init_simd() itself; presumably callers consult jsimd_can_fdct_islow() first -- verify against the call sites. */
GLOBAL(void)
|
||||
jsimd_fdct_islow (DCTELEM * data)
|
||||
{
|
||||
if (simd_support & JSIMD_MIPS_DSPR2)
|
||||
jsimd_fdct_islow_mips_dspr2(data);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
|
@ -504,6 +517,19 @@ jsimd_fdct_float (FAST_FLOAT * data)
|
|||
/* Capability check for integer quantization: calls init_simd(), then returns 1 only when DCTSIZE is 8, both JCOEF and DCTELEM are 2 bytes wide and simd_support has the JSIMD_MIPS_DSPR2 bit set; returns 0 otherwise. */
GLOBAL(int)
|
||||
jsimd_can_quantize (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
/* All layout requirements hold; the answer now depends only on the DSPr2 support bit. */
if (simd_support & JSIMD_MIPS_DSPR2)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -517,6 +543,8 @@ GLOBAL(void)
|
|||
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
|
||||
DCTELEM * workspace)
|
||||
{
|
||||
/* Forwards all three arguments unchanged to the MIPS DSPr2 quantizer. Does nothing when the JSIMD_MIPS_DSPR2 bit is clear in simd_support; presumably callers consult jsimd_can_quantize() first -- verify against the call sites. */
if (simd_support & JSIMD_MIPS_DSPR2)
|
||||
jsimd_quantize_mips_dspr2(coef_block, divisors, workspace);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
|
@ -638,6 +666,7 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
|||
output_buf, output_col, workspace);
|
||||
}
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
|
|
|
@ -963,6 +963,269 @@ LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
|
|||
j ra
|
||||
nop
|
||||
END(jsimd_h2v2_upsample_mips_dspr2)
|
||||
/*****************************************************************************/
|
||||
LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
|
||||
/*
|
||||
* Forward DCT over a block of 64 halfwords, computed in place in two passes
* of 8 iterations each: pass 1 walks consecutive 16-byte groups, pass 2
* walks halfwords spaced 16 bytes apart.
*
* a0 - data
|
||||
*/
|
||||
|
||||
SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
|
||||
|
||||
// Pass-1 multipliers: each of t0-t9 holds two 16-bit factors (upper half
// from lui, lower half from ori) that dpa.w.ph applies to a packed pair.
// The hex values are negative factors (see the per-instruction comments).
lui t0, 6437
|
||||
ori t0, 2260
|
||||
lui t1, 9633
|
||||
ori t1, 11363
|
||||
lui t2, 0xd39e
|
||||
ori t2, 0xe6dc
|
||||
lui t3, 0xf72d
|
||||
ori t3, 9633
|
||||
lui t4, 2261
|
||||
ori t4, 9633
|
||||
lui t5, 0xd39e
|
||||
ori t5, 6437
|
||||
lui t6, 9633
|
||||
ori t6, 0xd39d
|
||||
lui t7, 0xe6dc
|
||||
ori t7, 2260
|
||||
lui t8, 4433
|
||||
ori t8, 10703
|
||||
lui t9, 0xd630
|
||||
ori t9, 4433
|
||||
// s8 = pass-1 iteration counter; a1 = running pointer (a0 is kept for pass 2).
li s8, 8
|
||||
move a1, a0
|
||||
// Pass 1: one iteration per 16-byte group, results overwrite the inputs.
1:
|
||||
lw s0, 0(a1) // tmp0 = 1|0
|
||||
lw s1, 4(a1) // tmp1 = 3|2
|
||||
lw s2, 8(a1) // tmp2 = 5|4
|
||||
lw s3, 12(a1) // tmp3 = 7|6
|
||||
packrl.ph s1, s1, s1 // tmp1 = 2|3
|
||||
packrl.ph s3, s3, s3 // tmp3 = 6|7
|
||||
subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
|
||||
subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
|
||||
mult $0, $0 // ac0 = 0
|
||||
dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
|
||||
dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
|
||||
mult $ac1, $0, $0 // ac1 = 0
|
||||
dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
|
||||
dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
|
||||
mult $ac2, $0, $0 // ac2 = 0
|
||||
dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
|
||||
dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
|
||||
mult $ac3, $0, $0 // ac3 = 0
|
||||
dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
|
||||
dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
|
||||
addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
|
||||
addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
|
||||
extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
|
||||
extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
|
||||
extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
|
||||
extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
|
||||
addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
|
||||
subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
|
||||
// Odd-indexed outputs of this group (halfwords 1, 3, 5, 7).
sh s0, 2(a1)
|
||||
sh s1, 6(a1)
|
||||
sh s2, 10(a1)
|
||||
sh s3, 14(a1)
|
||||
mult $0, $0 // ac0 = 0
|
||||
dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
|
||||
mult $ac1, $0, $0 // ac1 = 0
|
||||
dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
|
||||
sra s4, s5, 16 // tmp4 = t11
|
||||
// a1 advances here, so the remaining stores use negative offsets.
addiu a1, a1, 16
|
||||
addiu s8, s8, -1
|
||||
extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
|
||||
extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
|
||||
// Only the low halfword of s2/s3 is stored, so the upper half of s5 does not matter.
addu s2, s5, s4 // tmp2 = t10 + t11
|
||||
subu s3, s5, s4 // tmp3 = t10 - t11
|
||||
sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
|
||||
sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
|
||||
sh s2, -16(a1)
|
||||
sh s3, -8(a1)
|
||||
sh s0, -12(a1)
|
||||
// NOTE(review): the store after the branch presumably sits in the branch
// delay slot and so runs on every iteration -- confirm that the
// LEAF_MIPS_DSPR2 macro selects noreorder mode.
bgtz s8, 1b
|
||||
sh s1, -4(a1)
|
||||
// Pass-2 multipliers: the same magnitudes as separate positive scalars;
// negative factors are applied with msub instead of a negative constant.
li t0, 2260
|
||||
li t1, 11363
|
||||
li t2, 9633
|
||||
li t3, 6436
|
||||
li t4, 6437
|
||||
li t5, 2261
|
||||
li t6, 11362
|
||||
li t7, 2259
|
||||
li t8, 4433
|
||||
li t9, 10703
|
||||
li a1, 10704
|
||||
li s8, 8
|
||||
|
||||
// Pass 2: one iteration per halfword position, reading 8 halfwords spaced
// 16 bytes apart and overwriting them in place; a0 advances by 2 bytes.
2:
|
||||
lh a2, 0(a0) // 0
|
||||
lh a3, 16(a0) // 8
|
||||
lh v0, 32(a0) // 16
|
||||
lh v1, 48(a0) // 24
|
||||
lh s4, 64(a0) // 32
|
||||
lh s5, 80(a0) // 40
|
||||
lh s6, 96(a0) // 48
|
||||
lh s7, 112(a0) // 56
|
||||
addu s2, v0, s5 // tmp2 = 16 + 40
|
||||
subu s5, v0, s5 // tmp5 = 16 - 40
|
||||
addu s3, v1, s4 // tmp3 = 24 + 32
|
||||
subu s4, v1, s4 // tmp4 = 24 - 32
|
||||
addu s0, a2, s7 // tmp0 = 0 + 56
|
||||
subu s7, a2, s7 // tmp7 = 0 - 56
|
||||
addu s1, a3, s6 // tmp1 = 8 + 48
|
||||
subu s6, a3, s6 // tmp6 = 8 - 48
|
||||
addu a2, s0, s3 // tmp10 = tmp0 + tmp3
|
||||
subu v1, s0, s3 // tmp13 = tmp0 - tmp3
|
||||
addu a3, s1, s2 // tmp11 = tmp1 + tmp2
|
||||
subu v0, s1, s2 // tmp12 = tmp1 - tmp2
|
||||
mult s7, t1 // ac0 = tmp7 * c1
|
||||
madd s4, t0 // ac0 += tmp4 * c0
|
||||
madd s5, t4 // ac0 += tmp5 * c4
|
||||
madd s6, t2 // ac0 += tmp6 * c2
|
||||
mult $ac1, s7, t2 // ac1 = tmp7 * c2
|
||||
msub $ac1, s4, t3 // ac1 -= tmp4 * c3
|
||||
msub $ac1, s5, t6 // ac1 -= tmp5 * c6
|
||||
msub $ac1, s6, t7 // ac1 -= tmp6 * c7
|
||||
mult $ac2, s7, t4 // ac2 = tmp7 * c4
|
||||
madd $ac2, s4, t2 // ac2 += tmp4 * c2
|
||||
madd $ac2, s5, t5 // ac2 += tmp5 * c5
|
||||
msub $ac2, s6, t6 // ac2 -= tmp6 * c6
|
||||
mult $ac3, s7, t0 // ac3 = tmp7 * c0
|
||||
msub $ac3, s4, t1 // ac3 -= tmp4 * c1
|
||||
madd $ac3, s5, t2 // ac3 += tmp5 * c2
|
||||
msub $ac3, s6, t3 // ac3 -= tmp6 * c3
|
||||
extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
|
||||
extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
|
||||
extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
|
||||
extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
|
||||
addiu s8, s8, -1
|
||||
addu s4, a2, a3 // tmp4 = tmp10 + tmp11
|
||||
subu s5, a2, a3 // tmp5 = tmp10 - tmp11
|
||||
// Odd-position outputs (byte offsets 16, 48, 80, 112 from the current a0).
sh s0, 16(a0)
|
||||
sh s1, 48(a0)
|
||||
sh s2, 80(a0)
|
||||
sh s3, 112(a0)
|
||||
mult v0, t8 // ac0 = tmp12 * c8
|
||||
madd v1, t9 // ac0 += tmp13 * c9
|
||||
mult $ac1, v1, t8 // ac1 = tmp13 * c8
|
||||
msub $ac1, v0, a1 // ac1 -= tmp12 * c10
|
||||
// a0 advances here, so the even-position stores below are offset by -2
// (i.e. they land at byte offsets 0, 64, 32 and 96 of this iteration).
addiu a0, a0, 2
|
||||
extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
|
||||
extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
|
||||
shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
|
||||
shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
|
||||
sh s4, -2(a0)
|
||||
sh s5, 62(a0)
|
||||
sh s6, 30(a0)
|
||||
// NOTE(review): as in pass 1, the store after the branch presumably runs
// in the branch delay slot on every iteration -- confirm.
bgtz s8, 2b
|
||||
sh s7, 94(a0)
|
||||
|
||||
RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
|
||||
|
||||
jr ra
|
||||
nop
|
||||
|
||||
END(jsimd_fdct_islow_mips_dspr2)
|
||||
|
||||
/*****************************************************************************/
|
||||
LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
|
||||
/*
|
||||
* Quantizes 64 halfwords from workspace into coef_block, two per iteration
* (31 loop iterations plus an unrolled tail). For each input w with sign
* s = +1/-1 the stored value is
*   s * ((((w * s + C) & 0xffff) * (R & 0xffff)) >> (S + 16))
* where R, C and S are the halfwords at byte offsets 0, 128 and 384 from
* the running divisors pointer (presumably the reciprocal, correction and
* shift sub-tables -- confirm against the divisor table layout).
*
* a0 - coef_block
|
||||
* a1 - divisors
|
||||
* a2 - workspace
|
||||
*/
|
||||
|
||||
.set at
|
||||
|
||||
SAVE_REGS_ON_STACK 16, s0, s1, s2
|
||||
|
||||
// The loop exits when a2 reaches workspace + 124, leaving the final two
// halfwords for the tail after the loop.
addiu v0, a2, 124 // v0 = workspace_end
|
||||
// Prologue: preload the first pair of inputs and their table entries.
lh t0, 0(a2)
|
||||
lh t1, 0(a1)
|
||||
lh t2, 128(a1)
|
||||
// t3 = ((t0 >> 15) << 1) + 1, i.e. -1 for negative t0 and +1 otherwise,
// so t0 * t3 is the magnitude of t0.
sra t3, t0, 15
|
||||
sll t3, t3, 1
|
||||
addiu t3, t3, 1
|
||||
mul t0, t0, t3
|
||||
lh t4, 384(a1)
|
||||
lh t5, 130(a1)
|
||||
lh t6, 2(a2)
|
||||
lh t7, 2(a1)
|
||||
lh t8, 386(a1)
|
||||
|
||||
// Loop body: finishes the pair loaded previously (first element in
// t0-t4, second in t5-t8) while loading the next pair.
1:
|
||||
andi t1, 0xffff
|
||||
add t9, t0, t2
|
||||
andi t9, 0xffff
|
||||
mul v1, t9, t1
|
||||
// s0 = sign (+1/-1) of the second element, computed like t3.
sra s0, t6, 15
|
||||
sll s0, s0, 1
|
||||
addiu s0, s0, 1
|
||||
addiu t9, t4, 16
|
||||
srav v1, v1, t9
|
||||
mul v1, v1, t3
|
||||
mul t6, t6, s0
|
||||
andi t7, 0xffff
|
||||
addiu a2, a2, 4
|
||||
addiu a1, a1, 4
|
||||
add s1, t6, t5
|
||||
andi s1, 0xffff
|
||||
sh v1, 0(a0)
|
||||
|
||||
mul s2, s1, t7
|
||||
addiu s1, t8, 16
|
||||
srav s2, s2, s1
|
||||
mul s2,s2, s0
|
||||
// Load the next pair (a1/a2 were already advanced above).
lh t0, 0(a2)
|
||||
lh t1, 0(a1)
|
||||
sra t3, t0, 15
|
||||
sll t3, t3, 1
|
||||
addiu t3, t3, 1
|
||||
mul t0, t0, t3
|
||||
lh t2, 128(a1)
|
||||
lh t4, 384(a1)
|
||||
lh t5, 130(a1)
|
||||
lh t8, 386(a1)
|
||||
lh t6, 2(a2)
|
||||
lh t7, 2(a1)
|
||||
sh s2, 2(a0)
|
||||
// NOTE(review): t0 and t3 were already loaded from 0(a2) and computed a
// few instructions above, and a2 has not changed since; the five
// instructions below look redundant unless coef_block can alias
// workspace -- confirm before removing.
lh t0, 0(a2)
|
||||
sra t3, t0, 15
|
||||
sll t3, t3, 1
|
||||
addiu t3, t3, 1
|
||||
mul t0, t0,t3
|
||||
// NOTE(review): the a0 increment after the branch presumably runs in the
// branch delay slot on every iteration, including the last -- confirm.
bne a2, v0, 1b
|
||||
addiu a0, a0, 4
|
||||
|
||||
// Tail: finish the last pair, which was loaded by the final loop iteration.
andi t1, 0xffff
|
||||
add t9, t0, t2
|
||||
andi t9, 0xffff
|
||||
mul v1, t9, t1
|
||||
sra s0, t6, 15
|
||||
sll s0, s0, 1
|
||||
addiu s0, s0, 1
|
||||
addiu t9, t4, 16
|
||||
srav v1, v1, t9
|
||||
mul v1, v1, t3
|
||||
mul t6, t6, s0
|
||||
andi t7, 0xffff
|
||||
sh v1, 0(a0)
|
||||
add s1, t6, t5
|
||||
andi s1, 0xffff
|
||||
mul s2, s1, t7
|
||||
addiu s1, t8, 16
|
||||
addiu a2, a2, 4
|
||||
addiu a1, a1, 4
|
||||
srav s2, s2, s1
|
||||
mul s2, s2, s0
|
||||
sh s2, 2(a0)
|
||||
|
||||
RESTORE_REGS_FROM_STACK 16, s0, s1, s2
|
||||
|
||||
j ra
|
||||
nop
|
||||
|
||||
END(jsimd_quantize_mips_dspr2)
|
||||
|
||||
/*****************************************************************************/
|
||||
LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
|
||||
|
|
Загрузка…
Ссылка в новой задаче