зеркало из https://github.com/mozilla/mozjpeg.git
64-bit SIMD acceleration
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@45 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
Родитель
3a63184474
Коммит
cdc8ac3eb1
11
acinclude.m4
11
acinclude.m4
|
@ -24,7 +24,14 @@ case "$host_os" in
|
|||
objfmt='a.out'
|
||||
;;
|
||||
linux*)
|
||||
objfmt='ELF'
|
||||
case "$host_cpu" in
|
||||
x86_64)
|
||||
objfmt='ELF64'
|
||||
;;
|
||||
*)
|
||||
objfmt='ELF'
|
||||
;;
|
||||
esac
|
||||
;;
|
||||
freebsd* | netbsd* | openbsd*)
|
||||
if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
|
||||
|
@ -58,6 +65,7 @@ case "$objfmt" in
|
|||
a.out) NAFLAGS='-faout -DAOUT';;
|
||||
BSD-a.out) NAFLAGS='-faoutb -DAOUT';;
|
||||
ELF) NAFLAGS='-felf -DELF';;
|
||||
ELF64) NAFLAGS='-felf64 -DELF -D__x86_64__';;
|
||||
RDF) NAFLAGS='-frdf -DRDF';;
|
||||
Mach-O) NAFLAGS='-fmacho -DMACHO';;
|
||||
esac
|
||||
|
@ -68,7 +76,6 @@ AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works])
|
|||
cat > conftest.asm <<EOF
|
||||
[%line __oline__ "configure"
|
||||
section .text
|
||||
bits 32
|
||||
global _main,main
|
||||
_main:
|
||||
main: xor eax,eax
|
||||
|
|
|
@ -89,10 +89,17 @@ if test "x${with_simd}" != "xno"; then
|
|||
# Check if we're on a supported CPU
|
||||
AC_MSG_CHECKING([if host cpu type is i386 or compatible])
|
||||
case "$host_cpu" in
|
||||
x86_64)
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_PROG_NASM
|
||||
AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.])
|
||||
AC_DEFINE([WITH_SIMD64], [1], [Use x86-64 accelerated SIMD routines.])
|
||||
;;
|
||||
i*86 | x86 | ia32)
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_PROG_NASM
|
||||
AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.])
|
||||
SIMDDIR=simd
|
||||
;;
|
||||
*)
|
||||
AC_MSG_RESULT([no ("$host_cpu")])
|
||||
|
@ -100,6 +107,7 @@ if test "x${with_simd}" != "xno"; then
|
|||
;;
|
||||
esac
|
||||
fi
|
||||
# WITH_SIMD64 selects the x86-64 assembly source list in the SIMD Makefile.
# No visible code assigns with_simd64, so testing that variable would make
# the conditional true on every host (including i386).  Derive it from the
# same facts the x86_64 branch of the host_cpu case relies on: SIMD is not
# disabled and the host CPU is x86_64.
AM_CONDITIONAL([WITH_SIMD64], [test "x$with_simd" != "xno" && test "x$host_cpu" = "xx86_64"])
|
||||
AM_CONDITIONAL([WITH_SIMD], [test "x$with_simd" != "xno"])
|
||||
|
||||
# jconfig.h is the file we use, but we have another before that to
|
||||
|
|
150
jsimd.c
150
jsimd.c
|
@ -49,10 +49,12 @@ init_simd (void)
|
|||
|
||||
#ifdef WITH_SIMD
|
||||
simd_support = jpeg_simd_cpu_support();
|
||||
#ifndef __x86_64__
|
||||
if((env=getenv("JSIMD_FORCEMMX"))!=NULL && !strcmp(env, "1"))
|
||||
simd_support = JSIMD_MMX;
|
||||
else if((env=getenv("JSIMD_FORCESSE2"))!=NULL && !strcmp(env, "1"))
|
||||
simd_support = JSIMD_SSE2;
|
||||
#endif
|
||||
#else
|
||||
simd_support = JSIMD_NONE;
|
||||
#endif
|
||||
|
@ -109,45 +111,63 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
|
|||
{
|
||||
#ifdef WITH_SIMD
|
||||
void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||
#ifndef __x86_64__
|
||||
void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
|
||||
#endif
|
||||
switch(cinfo->in_color_space)
|
||||
{
|
||||
case JCS_EXT_RGB:
|
||||
sse2fct=jsimd_extrgb_ycc_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_extrgb_ycc_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
sse2fct=jsimd_extrgbx_ycc_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_extrgbx_ycc_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
sse2fct=jsimd_extbgr_ycc_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_extbgr_ycc_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
sse2fct=jsimd_extbgrx_ycc_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_extbgrx_ycc_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
sse2fct=jsimd_extxbgr_ycc_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_extxbgr_ycc_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
sse2fct=jsimd_extxrgb_ycc_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_extxrgb_ycc_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
default:
|
||||
sse2fct=jsimd_rgb_ycc_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_rgb_ycc_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
|
||||
sse2fct(cinfo->image_width, input_buf,
|
||||
output_buf, output_row, num_rows);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
mmxfct(cinfo->image_width, input_buf,
|
||||
output_buf, output_row, num_rows);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -158,45 +178,63 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
|
|||
{
|
||||
#ifdef WITH_SIMD
|
||||
void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
|
||||
#ifndef __x86_64__
|
||||
void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
|
||||
#endif
|
||||
switch(cinfo->out_color_space)
|
||||
{
|
||||
case JCS_EXT_RGB:
|
||||
sse2fct=jsimd_ycc_extrgb_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_ycc_extrgb_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
sse2fct=jsimd_ycc_extrgbx_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_ycc_extrgbx_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
sse2fct=jsimd_ycc_extbgr_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_ycc_extbgr_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
sse2fct=jsimd_ycc_extbgrx_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_ycc_extbgrx_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
sse2fct=jsimd_ycc_extxbgr_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_ycc_extxbgr_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
sse2fct=jsimd_ycc_extxrgb_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_ycc_extxrgb_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
default:
|
||||
sse2fct=jsimd_ycc_rgb_convert_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_ycc_rgb_convert_mmx;
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
|
||||
sse2fct(cinfo->output_width, input_buf,
|
||||
input_row, output_buf, num_rows);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
mmxfct(cinfo->output_width, input_buf,
|
||||
input_row, output_buf, num_rows);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -213,8 +251,10 @@ jsimd_can_h2v2_downsample (void)
|
|||
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -232,8 +272,10 @@ jsimd_can_h2v1_downsample (void)
|
|||
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -247,10 +289,12 @@ jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
|
|||
jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
|
||||
compptr->v_samp_factor, compptr->width_in_blocks,
|
||||
input_data, output_data);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
|
||||
compptr->v_samp_factor, compptr->width_in_blocks,
|
||||
input_data, output_data);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -263,10 +307,12 @@ jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
|
|||
jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
|
||||
compptr->v_samp_factor, compptr->width_in_blocks,
|
||||
input_data, output_data);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
|
||||
compptr->v_samp_factor, compptr->width_in_blocks,
|
||||
input_data, output_data);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -283,8 +329,10 @@ jsimd_can_h2v2_upsample (void)
|
|||
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -302,8 +350,10 @@ jsimd_can_h2v1_upsample (void)
|
|||
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -318,9 +368,11 @@ jsimd_h2v2_upsample (j_decompress_ptr cinfo,
|
|||
if (simd_support & JSIMD_SSE2)
|
||||
jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
|
||||
cinfo->output_width, input_data, output_data_ptr);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor,
|
||||
cinfo->output_width, input_data, output_data_ptr);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -334,9 +386,11 @@ jsimd_h2v1_upsample (j_decompress_ptr cinfo,
|
|||
if (simd_support & JSIMD_SSE2)
|
||||
jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
|
||||
cinfo->output_width, input_data, output_data_ptr);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor,
|
||||
cinfo->output_width, input_data, output_data_ptr);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -354,8 +408,10 @@ jsimd_can_h2v2_fancy_upsample (void)
|
|||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -374,8 +430,10 @@ jsimd_can_h2v1_fancy_upsample (void)
|
|||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -391,9 +449,11 @@ jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
|
|||
IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
|
||||
/* This is the h2v2 entry point: dispatch to the h2v2 SSE2 routine, in line
 * with the h2v2 MMX fallback that follows (the h2v1 routine was being called
 * here, apparently a copy/paste slip from jsimd_h2v1_fancy_upsample). */
jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
    compptr->downsampled_width, input_data, output_data_ptr);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data, output_data_ptr);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -408,9 +468,11 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
|
|||
IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
|
||||
jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data, output_data_ptr);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
|
||||
compptr->downsampled_width, input_data, output_data_ptr);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -428,8 +490,10 @@ jsimd_can_h2v2_merged_upsample (void)
|
|||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -448,8 +512,10 @@ jsimd_can_h2v1_merged_upsample (void)
|
|||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -462,45 +528,63 @@ jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
|
|||
{
|
||||
#ifdef WITH_SIMD
|
||||
void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
|
||||
#ifndef __x86_64__
|
||||
void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
|
||||
#endif
|
||||
switch(cinfo->out_color_space)
|
||||
{
|
||||
case JCS_EXT_RGB:
|
||||
sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v2_extrgbx_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v2_extbgr_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v2_extbgrx_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v2_extxbgr_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v2_extxrgb_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
default:
|
||||
sse2fct=jsimd_h2v2_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v2_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
|
||||
sse2fct(cinfo->output_width, input_buf,
|
||||
in_row_group_ctr, output_buf);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
mmxfct(cinfo->output_width, input_buf,
|
||||
in_row_group_ctr, output_buf);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -512,45 +596,63 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
|
|||
{
|
||||
#ifdef WITH_SIMD
|
||||
void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
|
||||
#ifndef __x86_64__
|
||||
void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
|
||||
#endif
|
||||
switch(cinfo->out_color_space)
|
||||
{
|
||||
case JCS_EXT_RGB:
|
||||
sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v1_extrgbx_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v1_extbgr_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v1_extbgrx_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v1_extxbgr_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v1_extxrgb_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
default:
|
||||
sse2fct=jsimd_h2v1_merged_upsample_sse2;
|
||||
#ifndef __x86_64__
|
||||
mmxfct=jsimd_h2v1_merged_upsample_mmx;
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
|
||||
sse2fct(cinfo->output_width, input_buf,
|
||||
in_row_group_ctr, output_buf);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
mmxfct(cinfo->output_width, input_buf,
|
||||
in_row_group_ctr, output_buf);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -571,8 +673,10 @@ jsimd_can_convsamp (void)
|
|||
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -596,8 +700,10 @@ jsimd_can_convsamp_float (void)
|
|||
return 1;
|
||||
if (simd_support & JSIMD_SSE)
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_3DNOW)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -609,8 +715,10 @@ jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
|
|||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
jsimd_convsamp_sse2(sample_data, start_col, workspace);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_convsamp_mmx(sample_data, start_col, workspace);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -621,10 +729,12 @@ jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
|
|||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_SSE)
|
||||
jsimd_convsamp_float_sse(sample_data, start_col, workspace);
|
||||
else if (simd_support & JSIMD_3DNOW)
|
||||
jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -641,8 +751,10 @@ jsimd_can_fdct_islow (void)
|
|||
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -660,8 +772,10 @@ jsimd_can_fdct_ifast (void)
|
|||
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -679,8 +793,10 @@ jsimd_can_fdct_float (void)
|
|||
|
||||
if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_3DNOW)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -691,8 +807,10 @@ jsimd_fdct_islow (DCTELEM * data)
|
|||
#ifdef WITH_SIMD
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
|
||||
jsimd_fdct_islow_sse2(data);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_fdct_islow_mmx(data);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -702,8 +820,10 @@ jsimd_fdct_ifast (DCTELEM * data)
|
|||
#ifdef WITH_SIMD
|
||||
/* Guard on the ifast constant table, the same one jsimd_can_fdct_ifast()
 * checks.  Testing the islow table here could disagree with that capability
 * query, and on x86-64 (no MMX fallback) the DCT would then be silently
 * skipped. */
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
  jsimd_fdct_ifast_sse2(data);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_fdct_ifast_mmx(data);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -713,8 +833,10 @@ jsimd_fdct_float (FAST_FLOAT * data)
|
|||
#ifdef WITH_SIMD
|
||||
if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
|
||||
jsimd_fdct_float_sse(data);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_3DNOW)
|
||||
jsimd_fdct_float_3dnow(data);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -733,8 +855,10 @@ jsimd_can_quantize (void)
|
|||
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -756,8 +880,10 @@ jsimd_can_quantize_float (void)
|
|||
return 1;
|
||||
if (simd_support & JSIMD_SSE)
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_3DNOW)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -769,8 +895,10 @@ jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
|
|||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
jsimd_quantize_sse2(coef_block, divisors, workspace);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_quantize_mmx(coef_block, divisors, workspace);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -781,10 +909,12 @@ jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
|
|||
#ifdef WITH_SIMD
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
jsimd_quantize_float_sse2(coef_block, divisors, workspace);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_SSE)
|
||||
jsimd_quantize_float_sse(coef_block, divisors, workspace);
|
||||
else if (simd_support & JSIMD_3DNOW)
|
||||
jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -807,8 +937,10 @@ jsimd_can_idct_2x2 (void)
|
|||
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -832,8 +964,10 @@ jsimd_can_idct_4x4 (void)
|
|||
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -846,8 +980,10 @@ jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
|||
#if WITH_SIMD
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
|
||||
jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -859,8 +995,10 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
|||
#if WITH_SIMD
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
|
||||
jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -883,8 +1021,10 @@ jsimd_can_idct_islow (void)
|
|||
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -910,8 +1050,10 @@ jsimd_can_idct_ifast (void)
|
|||
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if (simd_support & JSIMD_MMX)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -936,10 +1078,12 @@ jsimd_can_idct_float (void)
|
|||
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
|
||||
return 1;
|
||||
#ifndef __x86_64__
|
||||
if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
|
||||
return 1;
|
||||
if (simd_support & JSIMD_3DNOW)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -952,8 +1096,10 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
|||
#if WITH_SIMD
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
|
||||
jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -965,8 +1111,10 @@ jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
|||
#if WITH_SIMD
|
||||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
|
||||
jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#ifndef __x86_64__
|
||||
else if (simd_support & JSIMD_MMX)
|
||||
jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -979,12 +1127,14 @@ jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
|||
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
|
||||
jsimd_idct_float_sse2(compptr->dct_table, coef_block,
|
||||
output_buf, output_col);
|
||||
#ifndef __x86_64__
|
||||
else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
|
||||
jsimd_idct_float_sse(compptr->dct_table, coef_block,
|
||||
output_buf, output_col);
|
||||
else if (simd_support & JSIMD_3DNOW)
|
||||
jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
|
||||
output_buf, output_col);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -4,6 +4,23 @@ BUILT_SOURCES = jsimdcfg.inc
|
|||
|
||||
EXTRA_DIST = nasm_lt.sh
|
||||
|
||||
if WITH_SIMD64
|
||||
|
||||
libsimd_la_SOURCES = jsimd.h jsimdcfg.inc.h \
|
||||
jsimdext.inc jcolsamp.inc jdct.inc \
|
||||
jsimdcpu-64.asm jfsseflt-64.asm \
|
||||
jccolss2-64.asm jdcolss2-64.asm \
|
||||
jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \
|
||||
jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \
|
||||
jiss2red-64.asm jiss2int-64.asm jiss2fst-64.asm \
|
||||
jcqnts2f-64.asm jiss2flt-64.asm
|
||||
|
||||
jccolss2-64.lo: jcclrss2-64.asm
|
||||
jdcolss2-64.lo: jdclrss2-64.asm
|
||||
jdmerss2-64.lo: jdmrgss2-64.asm
|
||||
|
||||
else
|
||||
|
||||
libsimd_la_SOURCES = jsimd.h jsimdcfg.inc.h \
|
||||
jsimdext.inc jcolsamp.inc jdct.inc \
|
||||
jsimdcpu.asm \
|
||||
|
@ -26,6 +43,8 @@ jdcolss2.lo: jdclrss2.asm
|
|||
jdmermmx.lo: jdmrgmmx.asm
|
||||
jdmerss2.lo: jdmrgss2.asm
|
||||
|
||||
endif
|
||||
|
||||
.asm.lo:
|
||||
$(LIBTOOL) --mode=compile --tag NASM ./nasm_lt.sh $(NASM) $(NAFLAGS) $< -o $@
|
||||
|
||||
|
|
|
@ -0,0 +1,484 @@
|
|||
;
|
||||
; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; Copyright (C) 2009, D. R. Commander.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
|
||||
; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
; JDIMENSION output_row, int num_rows);
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION img_width
|
||||
; r11 = JSAMPARRAY input_buf
|
||||
; r12 = JSAMPIMAGE output_buf
|
||||
; r13 = JDIMENSION output_row
|
||||
; r14 = int num_rows
|
||||
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 8
|
||||
|
||||
align 16
|
||||
|
||||
global EXTN(jsimd_rgb_ycc_convert_sse2)
|
||||
|
||||
EXTN(jsimd_rgb_ycc_convert_sse2):
|
||||
push rbp
|
||||
mov rax,rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp],rax
|
||||
mov rbp,rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
push rbx
|
||||
collect_args
|
||||
|
||||
mov rcx, r10
|
||||
test rcx,rcx
|
||||
jz near .return
|
||||
|
||||
push rcx
|
||||
|
||||
mov rsi, r12
|
||||
mov rcx, r13
|
||||
mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
|
||||
mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
|
||||
mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
|
||||
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
|
||||
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
|
||||
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop rcx
|
||||
|
||||
mov rsi, r11
|
||||
mov rax, r14
|
||||
test rax,rax
|
||||
jle near .return
|
||||
.rowloop:
|
||||
push rdx
|
||||
push rbx
|
||||
push rdi
|
||||
push rsi
|
||||
push rcx ; col
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr
|
||||
mov rdi, JSAMPROW [rdi] ; outptr0
|
||||
mov rbx, JSAMPROW [rbx] ; outptr1
|
||||
mov rdx, JSAMPROW [rdx] ; outptr2
|
||||
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jae near .columnloop
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
.column_ld1:
|
||||
push rax
|
||||
push rdx
|
||||
lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
|
||||
test cl, SIZEOF_BYTE
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_BYTE
|
||||
movzx rax, BYTE [rsi+rcx]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_WORD
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
movzx rdx, WORD [rsi+rcx]
|
||||
shl rax, WORD_BIT
|
||||
or rax,rdx
|
||||
.column_ld4:
|
||||
movd xmmA,eax
|
||||
pop rdx
|
||||
pop rax
|
||||
test cl, SIZEOF_DWORD
|
||||
jz short .column_ld8
|
||||
sub rcx, byte SIZEOF_DWORD
|
||||
movd xmmF, XMM_DWORD [rsi+rcx]
|
||||
pslldq xmmA, SIZEOF_DWORD
|
||||
por xmmA,xmmF
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_MMWORD
|
||||
jz short .column_ld16
|
||||
sub rcx, byte SIZEOF_MMWORD
|
||||
movq xmmB, XMM_MMWORD [rsi+rcx]
|
||||
pslldq xmmA, SIZEOF_MMWORD
|
||||
por xmmA,xmmB
|
||||
.column_ld16:
|
||||
test cl, SIZEOF_XMMWORD
|
||||
jz short .column_ld32
|
||||
movdqa xmmF,xmmA
|
||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
mov rcx, SIZEOF_XMMWORD
|
||||
jmp short .rgb_ycc_cnv
|
||||
.column_ld32:
|
||||
test cl, 2*SIZEOF_XMMWORD
|
||||
mov rcx, SIZEOF_XMMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
movdqa xmmB,xmmA
|
||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
|
||||
.columnloop:
|
||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
|
||||
|
||||
.rgb_ycc_cnv:
|
||||
; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
||||
; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
||||
; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
||||
|
||||
movdqa xmmG,xmmA
|
||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
|
||||
psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
|
||||
pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
|
||||
|
||||
punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
|
||||
punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
|
||||
|
||||
movdqa xmmD,xmmA
|
||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
|
||||
psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
|
||||
pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
|
||||
|
||||
punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
|
||||
punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
|
||||
|
||||
movdqa xmmE,xmmA
|
||||
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
|
||||
psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
|
||||
|
||||
punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||
pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
|
||||
|
||||
punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
|
||||
punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
|
||||
|
||||
pxor xmmH,xmmH
|
||||
|
||||
movdqa xmmC,xmmA
|
||||
punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||
punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||
|
||||
movdqa xmmB,xmmE
|
||||
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||
punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||
|
||||
movdqa xmmF,xmmD
|
||||
punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||
punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
.column_ld1:
|
||||
test cl, SIZEOF_XMMWORD/16
|
||||
jz short .column_ld2
|
||||
sub rcx, byte SIZEOF_XMMWORD/16
|
||||
movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
.column_ld2:
|
||||
test cl, SIZEOF_XMMWORD/8
|
||||
jz short .column_ld4
|
||||
sub rcx, byte SIZEOF_XMMWORD/8
|
||||
movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
pslldq xmmA, SIZEOF_MMWORD
|
||||
por xmmA,xmmE
|
||||
.column_ld4:
|
||||
test cl, SIZEOF_XMMWORD/4
|
||||
jz short .column_ld8
|
||||
sub rcx, byte SIZEOF_XMMWORD/4
|
||||
movdqa xmmE,xmmA
|
||||
movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
|
||||
.column_ld8:
|
||||
test cl, SIZEOF_XMMWORD/2
|
||||
mov rcx, SIZEOF_XMMWORD
|
||||
jz short .rgb_ycc_cnv
|
||||
movdqa xmmF,xmmA
|
||||
movdqa xmmH,xmmE
|
||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
jmp short .rgb_ycc_cnv
|
||||
|
||||
.columnloop:
|
||||
movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
|
||||
movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
|
||||
movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
|
||||
movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
|
||||
|
||||
.rgb_ycc_cnv:
|
||||
; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
||||
; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
||||
; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
||||
; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
||||
|
||||
movdqa xmmD,xmmA
|
||||
punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
|
||||
punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
|
||||
|
||||
movdqa xmmC,xmmF
|
||||
punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
|
||||
punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
|
||||
|
||||
movdqa xmmB,xmmA
|
||||
punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
|
||||
punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
|
||||
|
||||
movdqa xmmG,xmmD
|
||||
punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
|
||||
punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
|
||||
|
||||
movdqa xmmE,xmmA
|
||||
punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
|
||||
punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
|
||||
|
||||
movdqa xmmH,xmmB
|
||||
punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
|
||||
punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
|
||||
|
||||
pxor xmmF,xmmF
|
||||
|
||||
movdqa xmmC,xmmA
|
||||
punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
|
||||
punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
|
||||
|
||||
movdqa xmmD,xmmB
|
||||
punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
|
||||
punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
|
||||
|
||||
movdqa xmmG,xmmE
|
||||
punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
|
||||
punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
|
||||
|
||||
punpcklbw xmmF,xmmH
|
||||
punpckhbw xmmH,xmmH
|
||||
psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
|
||||
psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
|
||||
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
|
||||
|
||||
; (Original)
|
||||
; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
;
|
||||
; (This implementation)
|
||||
; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
|
||||
; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
|
||||
; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
|
||||
movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
|
||||
movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
|
||||
movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
|
||||
|
||||
movdqa xmm6,xmm1
|
||||
punpcklwd xmm1,xmm3
|
||||
punpckhwd xmm6,xmm3
|
||||
movdqa xmm7,xmm1
|
||||
movdqa xmm4,xmm6
|
||||
pmaddwd xmm1,[PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
pmaddwd xmm6,[PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
pmaddwd xmm7,[PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
|
||||
pmaddwd xmm4,[PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
|
||||
movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
|
||||
|
||||
pxor xmm1,xmm1
|
||||
pxor xmm6,xmm6
|
||||
punpcklwd xmm1,xmm5 ; xmm1=BOL
|
||||
punpckhwd xmm6,xmm5 ; xmm6=BOH
|
||||
psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
|
||||
psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
|
||||
|
||||
movdqa xmm5,[PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm7,xmm1
|
||||
paddd xmm4,xmm6
|
||||
paddd xmm7,xmm5
|
||||
paddd xmm4,xmm5
|
||||
psrld xmm7,SCALEBITS ; xmm7=CbOL
|
||||
psrld xmm4,SCALEBITS ; xmm4=CbOH
|
||||
packssdw xmm7,xmm4 ; xmm7=CbO
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
|
||||
|
||||
movdqa xmm6,xmm0
|
||||
punpcklwd xmm0,xmm2
|
||||
punpckhwd xmm6,xmm2
|
||||
movdqa xmm5,xmm0
|
||||
movdqa xmm4,xmm6
|
||||
pmaddwd xmm0,[PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
pmaddwd xmm6,[PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
pmaddwd xmm5,[PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
|
||||
pmaddwd xmm4,[PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
|
||||
|
||||
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
|
||||
movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
|
||||
|
||||
pxor xmm0,xmm0
|
||||
pxor xmm6,xmm6
|
||||
punpcklwd xmm0,xmm1 ; xmm0=BEL
|
||||
punpckhwd xmm6,xmm1 ; xmm6=BEH
|
||||
psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
|
||||
psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
|
||||
|
||||
movdqa xmm1,[PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm5,xmm0
|
||||
paddd xmm4,xmm6
|
||||
paddd xmm5,xmm1
|
||||
paddd xmm4,xmm1
|
||||
psrld xmm5,SCALEBITS ; xmm5=CbEL
|
||||
psrld xmm4,SCALEBITS ; xmm4=CbEH
|
||||
packssdw xmm5,xmm4 ; xmm5=CbE
|
||||
|
||||
psllw xmm7,BYTE_BIT
|
||||
por xmm5,xmm7 ; xmm5=Cb
|
||||
; outptr1 is held in the full 64-bit rbx (it is advanced with "add rbx, ..."
; at the bottom of the column loop).  Addressing through ebx would use only
; the low 32 bits of the pointer and store Cb to a truncated address.
movdqa XMMWORD [rbx], xmm5 ; Save Cb
|
||||
|
||||
movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
|
||||
movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
|
||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
|
||||
|
||||
movdqa xmm4,xmm0
|
||||
punpcklwd xmm0,xmm3
|
||||
punpckhwd xmm4,xmm3
|
||||
movdqa xmm7,xmm0
|
||||
movdqa xmm5,xmm4
|
||||
pmaddwd xmm0,[PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
|
||||
pmaddwd xmm4,[PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
|
||||
pmaddwd xmm7,[PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
|
||||
pmaddwd xmm5,[PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
|
||||
|
||||
movdqa xmm3,[PD_ONEHALF] ; xmm3=[PD_ONEHALF]
|
||||
|
||||
paddd xmm0, XMMWORD [wk(4)]
|
||||
paddd xmm4, XMMWORD [wk(5)]
|
||||
paddd xmm0,xmm3
|
||||
paddd xmm4,xmm3
|
||||
psrld xmm0,SCALEBITS ; xmm0=YOL
|
||||
psrld xmm4,SCALEBITS ; xmm4=YOH
|
||||
packssdw xmm0,xmm4 ; xmm0=YO
|
||||
|
||||
pxor xmm3,xmm3
|
||||
pxor xmm4,xmm4
|
||||
punpcklwd xmm3,xmm1 ; xmm3=ROL
|
||||
punpckhwd xmm4,xmm1 ; xmm4=ROH
|
||||
psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
|
||||
psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
|
||||
|
||||
movdqa xmm1,[PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm7,xmm3
|
||||
paddd xmm5,xmm4
|
||||
paddd xmm7,xmm1
|
||||
paddd xmm5,xmm1
|
||||
psrld xmm7,SCALEBITS ; xmm7=CrOL
|
||||
psrld xmm5,SCALEBITS ; xmm5=CrOH
|
||||
packssdw xmm7,xmm5 ; xmm7=CrO
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
|
||||
|
||||
movdqa xmm4,xmm6
|
||||
punpcklwd xmm6,xmm2
|
||||
punpckhwd xmm4,xmm2
|
||||
movdqa xmm1,xmm6
|
||||
movdqa xmm5,xmm4
|
||||
pmaddwd xmm6,[PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
|
||||
pmaddwd xmm4,[PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
|
||||
pmaddwd xmm1,[PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
|
||||
pmaddwd xmm5,[PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
|
||||
|
||||
movdqa xmm2,[PD_ONEHALF] ; xmm2=[PD_ONEHALF]
|
||||
|
||||
paddd xmm6, XMMWORD [wk(6)]
|
||||
paddd xmm4, XMMWORD [wk(7)]
|
||||
paddd xmm6,xmm2
|
||||
paddd xmm4,xmm2
|
||||
psrld xmm6,SCALEBITS ; xmm6=YEL
|
||||
psrld xmm4,SCALEBITS ; xmm4=YEH
|
||||
packssdw xmm6,xmm4 ; xmm6=YE
|
||||
|
||||
psllw xmm0,BYTE_BIT
|
||||
por xmm6,xmm0 ; xmm6=Y
|
||||
movdqa XMMWORD [rdi], xmm6 ; Save Y
|
||||
|
||||
pxor xmm2,xmm2
|
||||
pxor xmm4,xmm4
|
||||
punpcklwd xmm2,xmm3 ; xmm2=REL
|
||||
punpckhwd xmm4,xmm3 ; xmm4=REH
|
||||
psrld xmm2,1 ; xmm2=REL*FIX(0.500)
|
||||
psrld xmm4,1 ; xmm4=REH*FIX(0.500)
|
||||
|
||||
movdqa xmm0,[PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
|
||||
|
||||
paddd xmm1,xmm2
|
||||
paddd xmm5,xmm4
|
||||
paddd xmm1,xmm0
|
||||
paddd xmm5,xmm0
|
||||
psrld xmm1,SCALEBITS ; xmm1=CrEL
|
||||
psrld xmm5,SCALEBITS ; xmm5=CrEH
|
||||
packssdw xmm1,xmm5 ; xmm1=CrE
|
||||
|
||||
psllw xmm7,BYTE_BIT
|
||||
por xmm1,xmm7 ; xmm1=Cr
|
||||
movdqa XMMWORD [rdx], xmm1 ; Save Cr
|
||||
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr0
|
||||
add rbx, byte SIZEOF_XMMWORD ; outptr1
|
||||
add rdx, byte SIZEOF_XMMWORD ; outptr2
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jae near .columnloop
|
||||
test rcx,rcx
|
||||
jnz near .column_ld1
|
||||
|
||||
pop rcx ; col
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rbx
|
||||
pop rdx
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW ; input_buf
|
||||
add rdi, byte SIZEOF_JSAMPROW
|
||||
add rbx, byte SIZEOF_JSAMPROW
|
||||
add rdx, byte SIZEOF_JSAMPROW
|
||||
dec rax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
uncollect_args
|
||||
pop rbx
|
||||
mov rsp,rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
|
@ -0,0 +1,117 @@
|
|||
;
|
||||
; jccolss2.asm - colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; Copyright (C) 2009, D. R. Commander.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Number of fractional bits in the fixed-point coefficients below:
; FIX(x) is x * 2^SCALEBITS rounded to the nearest integer.
%define SCALEBITS 16
|
||||
|
||||
F_0_081 equ 5329 ; FIX(0.08131)
|
||||
F_0_114 equ 7471 ; FIX(0.11400)
|
||||
F_0_168 equ 11059 ; FIX(0.16874)
|
||||
F_0_250 equ 16384 ; FIX(0.25000)
|
||||
F_0_299 equ 19595 ; FIX(0.29900)
|
||||
F_0_331 equ 21709 ; FIX(0.33126)
|
||||
F_0_418 equ 27439 ; FIX(0.41869)
|
||||
F_0_587 equ 38470 ; FIX(0.58700)
|
||||
; The green weight is split as 0.337 + 0.250 (see the "This implementation"
; formula in the conversion routine).  NOTE(review): presumably because
; 38470 does not fit in a signed 16-bit word as consumed by pmaddwd.
F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_rgb_ycc_convert_sse2)
|
||||
|
||||
; Exported label marking the start of the constant table.
EXTN(jconst_rgb_ycc_convert_sse2):
|
||||
|
||||
; Each PW_* entry is four (a,b) word pairs = one 16-byte vector, used as the
; memory operand of pmaddwd on interleaved word pairs.
PW_F0299_F0337 times 4 dw F_0_299, F_0_337
|
||||
PW_F0114_F0250 times 4 dw F_0_114, F_0_250
|
||||
PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331
|
||||
PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418
|
||||
; Four dwords each: (one half - 1) plus CENTERJSAMPLE in the integer part
; (added before the SCALEBITS shift of Cb/Cr), and a plain one half
; (added before the SCALEBITS shift of Y).
PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
|
||||
PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
%include "jcclrss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 0
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 2
|
||||
%define RGB_PIXELSIZE 3
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
|
||||
%include "jcclrss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 0
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 2
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
|
||||
%include "jcclrss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 2
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 0
|
||||
%define RGB_PIXELSIZE 3
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
|
||||
%include "jcclrss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 2
|
||||
%define RGB_GREEN 1
|
||||
%define RGB_BLUE 0
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
|
||||
%include "jcclrss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 3
|
||||
%define RGB_GREEN 2
|
||||
%define RGB_BLUE 1
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
|
||||
%include "jcclrss2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED 1
|
||||
%define RGB_GREEN 2
|
||||
%define RGB_BLUE 3
|
||||
%define RGB_PIXELSIZE 4
|
||||
%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
|
||||
%include "jcclrss2-64.asm"
|
|
@ -0,0 +1,152 @@
|
|||
;
|
||||
; jcqnts2f.asm - sample data conversion and quantization (64-bit SSE & SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Load data into workspace, applying unsigned->signed conversion
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
; FAST_FLOAT * workspace);
|
||||
;
|
||||
|
||||
; Register roles after collect_args (macro from jsimdext.inc; presumed to
; copy the incoming argument registers into r10..r15 -- confirm there):
;   r10  = JSAMPARRAY sample_data
;   r11d = JDIMENSION start_col   (32-bit value; the upper half of r11 is
;                                  not guaranteed to be zero by the caller)
;   r12  = FAST_FLOAT * workspace
;
; Scratch: rax, rbx (saved/restored), rcx, rdx, rsi, rdi, xmm0-xmm3, xmm7.

        align   16
        global  EXTN(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
        push    rbp
        mov     rbp,rsp
        push    rbx
        collect_args

        pcmpeqw  xmm7,xmm7
        psllw    xmm7,7
        packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

        mov     rsi, r10                ; rsi = sample_data (row pointer array)
        mov     eax, r11d               ; rax = start_col, zero-extended: it is
                                        ; used below as a 64-bit index
        mov     rdi, r12                ; rdi = workspace
        mov     rcx, DCTSIZE/2          ; two rows are converted per iteration
.convloop:
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

        psubb   xmm0,xmm7               ; xmm0=(01234567)
        psubb   xmm1,xmm7               ; xmm1=(89ABCDEF)

        punpcklbw xmm0,xmm0             ; xmm0=(*0*1*2*3*4*5*6*7)
        punpcklbw xmm1,xmm1             ; xmm1=(*8*9*A*B*C*D*E*F)

        ; Whatever xmm2/xmm3 held before ends up in the low word of each
        ; dword and is discarded by the arithmetic shifts below.
        punpcklwd xmm2,xmm0             ; xmm2=(***0***1***2***3)
        punpckhwd xmm0,xmm0             ; xmm0=(***4***5***6***7)
        punpcklwd xmm3,xmm1             ; xmm3=(***8***9***A***B)
        punpckhwd xmm1,xmm1             ; xmm1=(***C***D***E***F)

        psrad     xmm2,(DWORD_BIT-BYTE_BIT)     ; xmm2=(0123)
        psrad     xmm0,(DWORD_BIT-BYTE_BIT)     ; xmm0=(4567)
        cvtdq2ps  xmm2,xmm2                     ; xmm2=(0123)
        cvtdq2ps  xmm0,xmm0                     ; xmm0=(4567)
        psrad     xmm3,(DWORD_BIT-BYTE_BIT)     ; xmm3=(89AB)
        psrad     xmm1,(DWORD_BIT-BYTE_BIT)     ; xmm1=(CDEF)
        cvtdq2ps  xmm3,xmm3                     ; xmm3=(89AB)
        cvtdq2ps  xmm1,xmm1                     ; xmm1=(CDEF)

        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

        add     rsi, byte 2*SIZEOF_JSAMPROW
        add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        dec     rcx
        jnz     short .convloop

        uncollect_args
        pop     rbx
        pop     rbp
        ret
|
||||
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Quantize/descale the coefficients, and store into coef_block
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
|
||||
; FAST_FLOAT * workspace);
|
||||
;
|
||||
|
||||
; Argument registers as seen after collect_args:
;   r10 = JCOEFPTR coef_block     (output, 16-bit coefficients)
;   r11 = FAST_FLOAT * divisors   (multiplied into the workspace values)
;   r12 = FAST_FLOAT * workspace  (input)
;
; Each pass of the loop handles 16 floats (two rows of the block):
; multiply by the matching divisor entries, convert to int32, pack to
; int16 with signed saturation and store.

        align   16
        global  EXTN(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
        push    rbp
        mov     rbp,rsp
        collect_args

        mov     rax, DCTSIZE2/16        ; number of 16-element groups
        mov     rdi, r10                ; rdi = coef_block
        mov     rdx, r11                ; rdx = divisors
        mov     rsi, r12                ; rsi = workspace
.next_group:
        ; fetch two rows of workspace data
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]

        ; scale by the divisor table entries
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]

        ; float -> int32 (current MXCSR rounding mode), then -> int16
        cvtps2dq xmm0,xmm0
        cvtps2dq xmm1,xmm1
        packssdw xmm0,xmm1
        cvtps2dq xmm2,xmm2
        cvtps2dq xmm3,xmm3
        packssdw xmm2,xmm3

        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

        ; advance all three cursors by 16 elements
        add     rsi, byte 16*SIZEOF_FAST_FLOAT
        add     rdx, byte 16*SIZEOF_FAST_FLOAT
        add     rdi, byte 16*SIZEOF_JCOEF
        dec     rax
        jnz     short .next_group

        uncollect_args
        pop     rbp
        ret
|
|
@ -0,0 +1,181 @@
|
|||
;
|
||||
; jcqnts2i.asm - sample data conversion and quantization (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Load data into workspace, applying unsigned->signed conversion
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
; DCTELEM * workspace);
|
||||
;
|
||||
|
||||
; Register roles after collect_args (macro from jsimdext.inc; presumed to
; copy the incoming argument registers into r10..r15 -- confirm there):
;   r10  = JSAMPARRAY sample_data
;   r11d = JDIMENSION start_col   (32-bit value; the upper half of r11 is
;                                  not guaranteed to be zero by the caller)
;   r12  = DCTELEM * workspace
;
; Scratch: rax, rbx (saved/restored), rcx, rdx, rsi, rdi, xmm0-xmm3,
;          xmm6, xmm7.

        align   16
        global  EXTN(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
        push    rbp
        mov     rbp,rsp
        push    rbx
        collect_args

        pxor    xmm6,xmm6               ; xmm6=(all 0's)
        pcmpeqw xmm7,xmm7
        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

        mov     rsi, r10                ; rsi = sample_data (row pointer array)
        mov     eax, r11d               ; rax = start_col, zero-extended: it is
                                        ; used below as a 64-bit index
        mov     rdi, r12                ; rdi = workspace
        mov     rcx, DCTSIZE/4          ; four rows are converted per iteration
.convloop:
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)

        mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
        movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)

        ; Widen bytes to words, then add 0xFF80 (= -128) to each word to
        ; turn the unsigned samples into signed ones.
        punpcklbw xmm0,xmm6             ; xmm0=(01234567)
        punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
        paddw     xmm0,xmm7
        paddw     xmm1,xmm7
        punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
        punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
        paddw     xmm2,xmm7
        paddw     xmm3,xmm7

        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

        add     rsi, byte 4*SIZEOF_JSAMPROW
        add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
        dec     rcx
        jnz     short .convloop

        uncollect_args
        pop     rbx
        pop     rbp
        ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Quantize/descale the coefficients, and store into coef_block
|
||||
;
|
||||
; This implementation is based on an algorithm described in
|
||||
; "How to optimize for the Pentium family of microprocessors"
|
||||
; (http://www.agner.org/assem/).
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
|
||||
; DCTELEM * workspace);
|
||||
;
|
||||
|
||||
%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
|
||||
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
|
||||
%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
|
||||
|
||||
; Argument registers as seen after collect_args:
;   r10 = JCOEFPTR coef_block   (output)
;   r11 = DCTELEM * divisors    (RECIPROCAL / CORRECTION / SCALE tables)
;   r12 = DCTELEM * workspace   (input)
;
; Each pass of the loop quantizes 32 coefficients (four rows):
;   sign = x >> 15;  a = |x|
;   a = ((a + correction) * reciprocal) >> 16
;   a = (a * scale) >> 16
;   result = a with the original sign re-applied
; All loads from the workspace are done before any store to coef_block.

        align   16
        global  EXTN(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
        push    rbp
        mov     rbp,rsp
        collect_args

        mov     rax, DCTSIZE2/32        ; number of 32-element groups
        mov     rdi, r10                ; rdi = coef_block
        mov     rdx, r11                ; rdx = divisors
        mov     rsi, r12                ; rsi = workspace
.next_group:
        ; xmm4..xmm7 <- four rows of input; they become the sign masks
        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]

        ; ---- row 0: xmm0 = value, xmm4 = sign mask ----
        movdqa  xmm0,xmm4
        psraw   xmm4,(WORD_BIT-1)
        pxor    xmm0,xmm4
        psubw   xmm0,xmm4               ; xmm0 = abs(row 0)
        paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]    ; correction + roundfactor
        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]    ; reciprocal
        pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]         ; scale
        pxor    xmm0,xmm4
        psubw   xmm0,xmm4               ; restore sign

        ; ---- row 1: xmm1 = value, xmm5 = sign mask ----
        movdqa  xmm1,xmm5
        psraw   xmm5,(WORD_BIT-1)
        pxor    xmm1,xmm5
        psubw   xmm1,xmm5               ; xmm1 = abs(row 1)
        paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
        pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
        pxor    xmm1,xmm5
        psubw   xmm1,xmm5

        ; ---- row 2: xmm2 = value, xmm6 = sign mask ----
        movdqa  xmm2,xmm6
        psraw   xmm6,(WORD_BIT-1)
        pxor    xmm2,xmm6
        psubw   xmm2,xmm6               ; xmm2 = abs(row 2)
        paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
        pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
        pxor    xmm2,xmm6
        psubw   xmm2,xmm6

        ; ---- row 3: xmm3 = value, xmm7 = sign mask ----
        movdqa  xmm3,xmm7
        psraw   xmm7,(WORD_BIT-1)
        pxor    xmm3,xmm7
        psubw   xmm3,xmm7               ; xmm3 = abs(row 3)
        paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]
        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
        pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
        pxor    xmm3,xmm7
        psubw   xmm3,xmm7

        ; write the four quantized rows
        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

        ; advance all three cursors by 32 elements
        add     rsi, byte 32*SIZEOF_DCTELEM
        add     rdx, byte 32*SIZEOF_DCTELEM
        add     rdi, byte 32*SIZEOF_JCOEF
        dec     rax
        jnz     near .next_group

        uncollect_args
        pop     rbp
        ret
|
|
@ -0,0 +1,324 @@
|
|||
;
|
||||
; jcsamss2.asm - downsampling (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Downsample pixel values of a single component.
|
||||
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
|
||||
; without smoothing.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
|
||||
; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
|
||||
; JSAMPARRAY input_data, JSAMPARRAY output_data);
|
||||
;
|
||||
|
||||
; Register roles after collect_args (macro from jsimdext.inc; presumed to
; copy the incoming argument registers into r10..r15 -- confirm there):
;   r10d = JDIMENSION image_width
;   r11d = int max_v_samp_factor
;   r12d = JDIMENSION v_samp_factor
;   r13d = JDIMENSION width_blocks
;   r14  = JSAMPARRAY input_data
;   r15  = JSAMPARRAY output_data
;
; The four scalar arguments are 32-bit; the caller is not required to clear
; the upper halves of their registers, so each one is zero-extended through
; a 32-bit move before it is used in 64-bit arithmetic or as a loop counter.
;
; Scratch: rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm6, xmm7.

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    rbp
        mov     rbp,rsp
        collect_args

        mov     ecx, r13d               ; width_blocks (zero-extended)
        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge
        ; Pad each input row on the right with copies of its last sample
        ; so that it is output_cols*2 samples wide.

        push    rcx
        shl     rcx,1                   ; output_cols * 2
        sub     rcx,rdx                 ; rcx = number of padding samples
        jle     short .expand_end

        mov     eax, r11d               ; max_v_samp_factor (signed 32-bit)
        test    eax,eax
        jle     short .expand_end

        cld
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is used as the fill value below
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi,rdx                 ; rdi = first sample past the image
        mov     al, JSAMPLE [rdi-1]     ; last real sample of this row

        rep stosb

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d               ; rowctr (zero-extended)
        test    eax,eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern
        movd    xmm7,edx
        pcmpeqw xmm6,xmm6
        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than SIZEOF_XMMWORD output columns remain: process one
        ; input vector, zero the second, and force this to be the last pass.
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm1,xmm1
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2,xmm0
        movdqa  xmm3,xmm1

        pand    xmm0,xmm6               ; even-numbered samples
        psrlw   xmm2,BYTE_BIT           ; odd-numbered samples
        pand    xmm1,xmm6
        psrlw   xmm3,BYTE_BIT

        paddw   xmm0,xmm2
        paddw   xmm1,xmm3
        paddw   xmm0,xmm7               ; alternating 0/1 rounding bias
        paddw   xmm1,xmm7
        psrlw   xmm0,1
        psrlw   xmm1,1

        packuswb xmm0,xmm1

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    rcx,rcx
        jnz     short .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Downsample pixel values of a single component.
|
||||
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
|
||||
; without smoothing.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
|
||||
; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
|
||||
; JSAMPARRAY input_data, JSAMPARRAY output_data);
|
||||
;
|
||||
|
||||
; Register roles after collect_args (macro from jsimdext.inc; presumed to
; copy the incoming argument registers into r10..r15 -- confirm there):
;   r10d = JDIMENSION image_width
;   r11d = int max_v_samp_factor
;   r12d = JDIMENSION v_samp_factor
;   r13d = JDIMENSION width_blocks
;   r14  = JSAMPARRAY input_data
;   r15  = JSAMPARRAY output_data
;
; The four scalar arguments are 32-bit; the caller is not required to clear
; the upper halves of their registers, so each one is zero-extended through
; a 32-bit move before it is used in 64-bit arithmetic or as a loop counter.
;
; Scratch: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    rbp
        mov     rbp,rsp
        collect_args

        mov     ecx, r13d               ; width_blocks (zero-extended)
        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge
        ; Pad each input row on the right with copies of its last sample
        ; so that it is output_cols*2 samples wide.

        push    rcx
        shl     rcx,1                   ; output_cols * 2
        sub     rcx,rdx                 ; rcx = number of padding samples
        jle     short .expand_end

        mov     eax, r11d               ; max_v_samp_factor (signed 32-bit)
        test    eax,eax
        jle     short .expand_end

        cld
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is used as the fill value below
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi,rdx                 ; rdi = first sample past the image
        mov     al, JSAMPLE [rdi-1]     ; last real sample of this row

        rep stosb

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, r12d               ; rowctr (zero-extended)
        test    eax,eax
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern
        movd    xmm7,edx
        pcmpeqw xmm6,xmm6
        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx
        push    rdi
        push    rsi

        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     rdi, JSAMPROW [rdi]                     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than SIZEOF_XMMWORD output columns remain: process one
        ; vector per input row, zero the second pair, and force this to be
        ; the last pass.
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm2,xmm2
        pxor    xmm3,xmm3
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        ; horizontal pair sums of the first vector of each row
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        pand    xmm0,xmm6
        psrlw   xmm4,BYTE_BIT
        pand    xmm1,xmm6
        psrlw   xmm5,BYTE_BIT
        paddw   xmm0,xmm4
        paddw   xmm1,xmm5

        ; horizontal pair sums of the second vector of each row
        movdqa  xmm4,xmm2
        movdqa  xmm5,xmm3
        pand    xmm2,xmm6
        psrlw   xmm4,BYTE_BIT
        pand    xmm3,xmm6
        psrlw   xmm5,BYTE_BIT
        paddw   xmm2,xmm4
        paddw   xmm3,xmm5

        ; add the two rows, apply the alternating 1/2 bias, divide by 4
        paddw   xmm0,xmm1
        paddw   xmm2,xmm3
        paddw   xmm0,xmm7
        paddw   xmm2,xmm7
        psrlw   xmm0,2
        psrlw   xmm2,2

        packuswb xmm0,xmm2

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx,rcx
        jnz     near .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
|
|
@ -0,0 +1,483 @@
|
|||
;
|
||||
; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
|
||||
; JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
; JSAMPARRAY output_buf, int num_rows)
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION out_width
|
||||
; r11 = JSAMPIMAGE input_buf
|
||||
; r12 = JDIMENSION input_row
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
; r14 = int num_rows
|
||||
|
||||
%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2

        align   16
        global  EXTN(jsimd_ycc_rgb_convert_sse2)

; YCbCr -> RGB conversion of num_rows rows, 16 pixels per column-loop pass.
;
; Register roles after collect_args (macro defined outside this file;
; presumably it loads r10-r14 from the incoming argument registers):
;   r10 = out_width, r11 = input_buf, r12 = input_row,
;   r13 = output_buf, r14 = num_rows
; Inside the loops:
;   rsi/rbx/rdx = Y/Cb/Cr row pointers, rdi = output pointer,
;   rcx = columns remaining, rax = rows remaining.
;
; All constant-table operands use RIP-relative addressing ("rel") so the
; object contains no 32-bit absolute relocations and can be linked into
; position-independent code.

EXTN(jsimd_ycc_rgb_convert_sse2):
        push    rbp
        mov     rax, rsp                        ; rax = rsp right after saving rbp
        sub     rsp, byte 4                     ; NOTE(review): 4 is inherited from the 32-bit code; the
                                                ; qword store below fits only because rsp is 8-byte aligned
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp], rax                      ; remember the unaligned rsp for the epilogue
        mov     rbp, rsp                        ; rbp = aligned frame base
        lea     rsp, [wk(0)]                    ; reserve wk[WK_NUM] below the frame base
        push    rbx
        collect_args

        mov     rcx, r10                        ; num_cols
        test    rcx, rcx
        jz      near .return

        push    rcx

        mov     rdi, r11
        mov     rcx, r12
        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
        lea     rsi, [rsi+rcx*SIZEOF_JSAMPROW]
        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]

        pop     rcx

        mov     rdi, r13
        mov     rax, r14
        test    rax, rax
        jle     near .return
.rowloop:
        push    rax
        push    rdi
        push    rdx
        push    rbx
        push    rsi
        push    rcx                             ; col

        mov     rsi, JSAMPROW [rsi]             ; inptr0
        mov     rbx, JSAMPROW [rbx]             ; inptr1
        mov     rdx, JSAMPROW [rdx]             ; inptr2
        mov     rdi, JSAMPROW [rdi]             ; outptr
.columnloop:

        movdqa  xmm5, XMMWORD [rbx]             ; xmm5=Cb(0123456789ABCDEF)
        movdqa  xmm1, XMMWORD [rdx]             ; xmm1=Cr(0123456789ABCDEF)

        pcmpeqw xmm4, xmm4
        pcmpeqw xmm7, xmm7
        psrlw   xmm4, BYTE_BIT
        psllw   xmm7, 7                         ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
        movdqa  xmm0, xmm4                      ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}

        pand    xmm4, xmm5                      ; xmm4=Cb(02468ACE)=CbE
        psrlw   xmm5, BYTE_BIT                  ; xmm5=Cb(13579BDF)=CbO
        pand    xmm0, xmm1                      ; xmm0=Cr(02468ACE)=CrE
        psrlw   xmm1, BYTE_BIT                  ; xmm1=Cr(13579BDF)=CrO

        paddw   xmm4, xmm7                      ; adding 0xFF80 == subtracting 128 (mod 2^16)
        paddw   xmm5, xmm7
        paddw   xmm0, xmm7
        paddw   xmm1, xmm7

        ; (Original)
        ; R = Y                + 1.40200 * Cr
        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
        ; B = Y + 1.77200 * Cb
        ;
        ; (This implementation)
        ; R = Y                + 0.40200 * Cr + Cr
        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
        ; B = Y - 0.22800 * Cb + Cb + Cb

        movdqa  xmm2, xmm4                      ; xmm2=CbE
        movdqa  xmm3, xmm5                      ; xmm3=CbO
        paddw   xmm4, xmm4                      ; xmm4=2*CbE
        paddw   xmm5, xmm5                      ; xmm5=2*CbO
        movdqa  xmm6, xmm0                      ; xmm6=CrE
        movdqa  xmm7, xmm1                      ; xmm7=CrO
        paddw   xmm0, xmm0                      ; xmm0=2*CrE
        paddw   xmm1, xmm1                      ; xmm1=2*CrO

        pmulhw  xmm4, [rel PW_MF0228]           ; xmm4=(2*CbE * -FIX(0.22800))
        pmulhw  xmm5, [rel PW_MF0228]           ; xmm5=(2*CbO * -FIX(0.22800))
        pmulhw  xmm0, [rel PW_F0402]            ; xmm0=(2*CrE * FIX(0.40200))
        pmulhw  xmm1, [rel PW_F0402]            ; xmm1=(2*CrO * FIX(0.40200))

        paddw   xmm4, [rel PW_ONE]
        paddw   xmm5, [rel PW_ONE]
        psraw   xmm4, 1                         ; xmm4=(CbE * -FIX(0.22800))
        psraw   xmm5, 1                         ; xmm5=(CbO * -FIX(0.22800))
        paddw   xmm0, [rel PW_ONE]
        paddw   xmm1, [rel PW_ONE]
        psraw   xmm0, 1                         ; xmm0=(CrE * FIX(0.40200))
        psraw   xmm1, 1                         ; xmm1=(CrO * FIX(0.40200))

        paddw   xmm4, xmm2
        paddw   xmm5, xmm3
        paddw   xmm4, xmm2                      ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
        paddw   xmm5, xmm3                      ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
        paddw   xmm0, xmm6                      ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
        paddw   xmm1, xmm7                      ; xmm1=(CrO * FIX(1.40200))=(R-Y)O

        movdqa  XMMWORD [wk(0)], xmm4           ; wk(0)=(B-Y)E
        movdqa  XMMWORD [wk(1)], xmm5           ; wk(1)=(B-Y)O

        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3
        punpcklwd xmm2, xmm6
        punpckhwd xmm4, xmm6
        pmaddwd xmm2, [rel PW_MF0344_F0285]
        pmaddwd xmm4, [rel PW_MF0344_F0285]
        punpcklwd xmm3, xmm7
        punpckhwd xmm5, xmm7
        pmaddwd xmm3, [rel PW_MF0344_F0285]
        pmaddwd xmm5, [rel PW_MF0344_F0285]

        paddd   xmm2, [rel PD_ONEHALF]
        paddd   xmm4, [rel PD_ONEHALF]
        psrad   xmm2, SCALEBITS
        psrad   xmm4, SCALEBITS
        paddd   xmm3, [rel PD_ONEHALF]
        paddd   xmm5, [rel PD_ONEHALF]
        psrad   xmm3, SCALEBITS
        psrad   xmm5, SCALEBITS

        packssdw xmm2, xmm4                     ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
        packssdw xmm3, xmm5                     ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
        psubw   xmm2, xmm6                      ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
        psubw   xmm3, xmm7                      ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

        movdqa  xmm5, XMMWORD [rsi]             ; xmm5=Y(0123456789ABCDEF)

        pcmpeqw xmm4, xmm4
        psrlw   xmm4, BYTE_BIT                  ; xmm4={0xFF 0x00 0xFF 0x00 ..}
        pand    xmm4, xmm5                      ; xmm4=Y(02468ACE)=YE
        psrlw   xmm5, BYTE_BIT                  ; xmm5=Y(13579BDF)=YO

        paddw   xmm0, xmm4                      ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
        paddw   xmm1, xmm5                      ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
        packuswb xmm0, xmm0                     ; xmm0=R(02468ACE********)
        packuswb xmm1, xmm1                     ; xmm1=R(13579BDF********)

        paddw   xmm2, xmm4                      ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
        paddw   xmm3, xmm5                      ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
        packuswb xmm2, xmm2                     ; xmm2=G(02468ACE********)
        packuswb xmm3, xmm3                     ; xmm3=G(13579BDF********)

        paddw   xmm4, XMMWORD [wk(0)]           ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
        paddw   xmm5, XMMWORD [wk(1)]           ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
        packuswb xmm4, xmm4                     ; xmm4=B(02468ACE********)
        packuswb xmm5, xmm5                     ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3 ; ---------------

        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

        punpcklbw xmmA, xmmC            ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
        punpcklbw xmmE, xmmB            ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
        punpcklbw xmmD, xmmF            ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

        movdqa  xmmG, xmmA
        movdqa  xmmH, xmmA
        punpcklwd xmmA, xmmE            ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
        punpckhwd xmmG, xmmE            ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

        psrldq  xmmH, 2                 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
        psrldq  xmmE, 2                 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

        movdqa  xmmC, xmmD
        movdqa  xmmB, xmmD
        punpcklwd xmmD, xmmH            ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
        punpckhwd xmmC, xmmH            ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

        psrldq  xmmB, 2                 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

        movdqa  xmmF, xmmE
        punpcklwd xmmE, xmmB            ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
        punpckhwd xmmF, xmmB            ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

        pshufd  xmmH, xmmA, 0x4E        ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
        movdqa  xmmB, xmmE
        punpckldq xmmA, xmmD            ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
        punpckldq xmmE, xmmH            ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
        punpckhdq xmmD, xmmB            ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

        pshufd  xmmH, xmmG, 0x4E        ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
        movdqa  xmmB, xmmF
        punpckldq xmmG, xmmC            ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
        punpckldq xmmF, xmmH            ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
        punpckhdq xmmC, xmmB            ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

        punpcklqdq xmmA, xmmE           ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        punpcklqdq xmmD, xmmG           ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        punpcklqdq xmmF, xmmC           ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        cmp     rcx, byte SIZEOF_XMMWORD
        jb      short .column_st32              ; fewer than 16 columns left: partial store

        test    rdi, SIZEOF_XMMWORD-1
        jnz     short .out1
        ; --(aligned)-------------------
        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
.out1:  ; --(unaligned)-----------------
        pcmpeqb xmmH, xmmH                      ; xmmH=(all 1's)
        maskmovdqu xmmA, xmmH                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmD, xmmH                   ; movntdqu XMMWORD [rdi], xmmD
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmF, xmmH                   ; movntdqu XMMWORD [rdi], xmmF
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
.out0:
        sub     rcx, byte SIZEOF_XMMWORD
        jz      near .nextrow

        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
        jmp     near .columnloop

.column_st32:
        pcmpeqb xmmH, xmmH                      ; xmmH=(all 1's)
        lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
        cmp     rcx, byte 2*SIZEOF_XMMWORD
        jb      short .column_st16
        maskmovdqu xmmA, xmmH                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmD, xmmH                   ; movntdqu XMMWORD [rdi], xmmD
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA, xmmF
        sub     rcx, byte 2*SIZEOF_XMMWORD
        jmp     short .column_st15
.column_st16:
        cmp     rcx, byte SIZEOF_XMMWORD
        jb      short .column_st15
        maskmovdqu xmmA, xmmH                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA, xmmD
        sub     rcx, byte SIZEOF_XMMWORD
.column_st15:
        ; Build the byte mask (xmmE) for the remaining rcx (<16) output bytes.
        mov     rax, rcx
        xor     rcx, byte 0x0F
        shl     rcx, 2
        movd    xmmB, ecx
        psrlq   xmmH, 4
        pcmpeqb xmmE, xmmE
        psrlq   xmmH, xmmB
        psrlq   xmmE, xmmB
        punpcklbw xmmE, xmmH
        ; ----------------
        mov     rcx, rdi
        and     rcx, byte SIZEOF_XMMWORD-1
        jz      short .adj0
        add     rax, rcx
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .adj0
        and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
        shl     rcx, 3                          ; pslldq xmmA,ecx & pslldq xmmE,ecx
        movdqa  xmmG, xmmA
        movdqa  xmmC, xmmE
        pslldq  xmmA, SIZEOF_XMMWORD/2
        pslldq  xmmE, SIZEOF_XMMWORD/2
        movd    xmmD, ecx
        sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
        jb      short .adj1
        movd    xmmF, ecx
        psllq   xmmA, xmmF
        psllq   xmmE, xmmF
        jmp     short .adj0
.adj1:  neg     ecx
        movd    xmmF, ecx
        psrlq   xmmA, xmmF
        psrlq   xmmE, xmmF
        psllq   xmmG, xmmD
        psllq   xmmC, xmmD
        por     xmmA, xmmG
        por     xmmE, xmmC
.adj0:  ; ----------------
        maskmovdqu xmmA, xmmE                   ; movntdqu XMMWORD [rdi], xmmA

%else ; RGB_PIXELSIZE == 4 ; -----------

%ifdef RGBX_FILLER_0XFF
        pcmpeqb xmm6, xmm6                      ; xmm6=XE=X(02468ACE********)
        pcmpeqb xmm7, xmm7                      ; xmm7=XO=X(13579BDF********)
%else
        pxor    xmm6, xmm6                      ; xmm6=XE=X(02468ACE********)
        pxor    xmm7, xmm7                      ; xmm7=XO=X(13579BDF********)
%endif
        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

        punpcklbw xmmA, xmmC            ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
        punpcklbw xmmE, xmmG            ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
        punpcklbw xmmB, xmmD            ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
        punpcklbw xmmF, xmmH            ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

        movdqa  xmmC, xmmA
        punpcklwd xmmA, xmmE            ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
        punpckhwd xmmC, xmmE            ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
        movdqa  xmmG, xmmB
        punpcklwd xmmB, xmmF            ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
        punpckhwd xmmG, xmmF            ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

        movdqa  xmmD, xmmA
        punpckldq xmmA, xmmB            ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        punpckhdq xmmD, xmmB            ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        movdqa  xmmH, xmmC
        punpckldq xmmC, xmmG            ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        punpckhdq xmmH, xmmG            ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        cmp     rcx, byte SIZEOF_XMMWORD
        jb      short .column_st32              ; fewer than 16 columns left: partial store

        test    rdi, SIZEOF_XMMWORD-1
        jnz     short .out1
        ; --(aligned)-------------------
        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
.out1:  ; --(unaligned)-----------------
        pcmpeqb xmmE, xmmE                      ; xmmE=(all 1's)
        maskmovdqu xmmA, xmmE                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmD, xmmE                   ; movntdqu XMMWORD [rdi], xmmD
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmC, xmmE                   ; movntdqu XMMWORD [rdi], xmmC
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmH, xmmE                   ; movntdqu XMMWORD [rdi], xmmH
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
.out0:
        sub     rcx, byte SIZEOF_XMMWORD
        jz      near .nextrow

        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
        jmp     near .columnloop

.column_st32:
        pcmpeqb xmmE, xmmE                      ; xmmE=(all 1's)
        cmp     rcx, byte SIZEOF_XMMWORD/2
        jb      short .column_st16
        maskmovdqu xmmA, xmmE                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmD, xmmE                   ; movntdqu XMMWORD [rdi], xmmD
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA, xmmC
        movdqa  xmmD, xmmH
        sub     rcx, byte SIZEOF_XMMWORD/2
.column_st16:
        cmp     rcx, byte SIZEOF_XMMWORD/4
        jb      short .column_st15
        maskmovdqu xmmA, xmmE                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA, xmmD
        sub     rcx, byte SIZEOF_XMMWORD/4
.column_st15:
        cmp     rcx, byte SIZEOF_XMMWORD/16
        jb      near .nextrow
        ; Build the byte mask (xmmE) for the remaining rcx (1..3) pixels.
        mov     rax, rcx
        xor     rcx, byte 0x03
        inc     rcx
        shl     rcx, 4
        movd    xmmF, ecx
        psrlq   xmmE, xmmF
        punpcklbw xmmE, xmmE
        ; ----------------
        mov     rcx, rdi
        and     rcx, byte SIZEOF_XMMWORD-1
        jz      short .adj0
        lea     rax, [rcx+rax*4]                ; RGB_PIXELSIZE
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .adj0
        and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
        shl     rcx, 3                          ; pslldq xmmA,ecx & pslldq xmmE,ecx
        movdqa  xmmB, xmmA
        movdqa  xmmG, xmmE
        pslldq  xmmA, SIZEOF_XMMWORD/2
        pslldq  xmmE, SIZEOF_XMMWORD/2
        movd    xmmC, ecx
        sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
        jb      short .adj1
        movd    xmmH, ecx
        psllq   xmmA, xmmH
        psllq   xmmE, xmmH
        jmp     short .adj0
.adj1:  neg     rcx
        movd    xmmH, ecx
        psrlq   xmmA, xmmH
        psrlq   xmmE, xmmH
        psllq   xmmB, xmmC
        psllq   xmmG, xmmC
        por     xmmA, xmmB
        por     xmmE, xmmG
.adj0:  ; ----------------
        maskmovdqu xmmA, xmmE                   ; movntdqu XMMWORD [rdi], xmmA

%endif ; RGB_PIXELSIZE ; ---------------

.nextrow:
        pop     rcx
        pop     rsi
        pop     rbx
        pop     rdx
        pop     rdi
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        add     rbx, byte SIZEOF_JSAMPROW
        add     rdx, byte SIZEOF_JSAMPROW
        add     rdi, byte SIZEOF_JSAMPROW       ; output_buf
        dec     rax                             ; num_rows
        jg      near .rowloop

        sfence                                  ; flush the write buffer

.return:
        uncollect_args
        pop     rbx
        mov     rsp, rbp                        ; rsp <- aligned frame base
        pop     rsp                             ; rsp <- value saved in the prologue (points at saved rbp)
        pop     rbp
        ret
|
|
@ -0,0 +1,117 @@
|
|||
;
|
||||
; jdcolss2.asm - colorspace conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: FIX(x) constants below are x * 2^SCALEBITS, rounded.
%define SCALEBITS       16

F_0_344         equ     22554                   ; FIX(0.34414)
F_0_714         equ     46802                   ; FIX(0.71414)
F_1_402         equ     91881                   ; FIX(1.40200)
F_1_772         equ     116130                  ; FIX(1.77200)

; Derived factors, each small enough to be stored as a signed 16-bit word.
F_0_402         equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
F_0_285         equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
F_0_228         equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_ycc_rgb_convert_sse2)

; Constant table; every entry is one 16-byte xmmword.
EXTN(jconst_ycc_rgb_convert_sse2):

PW_F0402:               times 8 dw  F_0_402
PW_MF0228:              times 8 dw -F_0_228
PW_MF0344_F0285:        times 4 dw -F_0_344, F_0_285
PW_ONE:                 times 8 dw  1
PD_ONEHALF:             times 4 dd  1 << (SCALEBITS-1)

        alignz  16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Instantiate the conversion routine once per output pixel layout by
; re-including the template with different RGB_* settings and a different
; public function name each time.

; Default layout: RGB_* as left by the headers the template includes
; (presumably jcolsamp.inc -- not visible from this file).
%include "jdclrss2-64.asm"

; --- extrgb: R=0 G=1 B=2, pixel size 3 ---
%undef  RGB_RED
%define RGB_RED         0
%undef  RGB_GREEN
%define RGB_GREEN       1
%undef  RGB_BLUE
%define RGB_BLUE        2
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   3
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
%include "jdclrss2-64.asm"

; --- extrgbx: R=0 G=1 B=2, pixel size 4 ---
%undef  RGB_RED
%define RGB_RED         0
%undef  RGB_GREEN
%define RGB_GREEN       1
%undef  RGB_BLUE
%define RGB_BLUE        2
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   4
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
%include "jdclrss2-64.asm"

; --- extbgr: R=2 G=1 B=0, pixel size 3 ---
%undef  RGB_RED
%define RGB_RED         2
%undef  RGB_GREEN
%define RGB_GREEN       1
%undef  RGB_BLUE
%define RGB_BLUE        0
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   3
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
%include "jdclrss2-64.asm"

; --- extbgrx: R=2 G=1 B=0, pixel size 4 ---
%undef  RGB_RED
%define RGB_RED         2
%undef  RGB_GREEN
%define RGB_GREEN       1
%undef  RGB_BLUE
%define RGB_BLUE        0
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   4
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
%include "jdclrss2-64.asm"

; --- extxbgr: R=3 G=2 B=1, pixel size 4 ---
%undef  RGB_RED
%define RGB_RED         3
%undef  RGB_GREEN
%define RGB_GREEN       2
%undef  RGB_BLUE
%define RGB_BLUE        1
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   4
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
%include "jdclrss2-64.asm"

; --- extxrgb: R=1 G=2 B=3, pixel size 4 ---
%undef  RGB_RED
%define RGB_RED         1
%undef  RGB_GREEN
%define RGB_GREEN       2
%undef  RGB_BLUE
%define RGB_BLUE        3
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   4
%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
%include "jdclrss2-64.asm"
|
|
@ -0,0 +1,123 @@
|
|||
;
|
||||
; jdmerss2.asm - merged upsampling/color conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point precision: FIX(x) constants below are x * 2^SCALEBITS, rounded.
%define SCALEBITS       16

F_0_344         equ     22554                   ; FIX(0.34414)
F_0_714         equ     46802                   ; FIX(0.71414)
F_1_402         equ     91881                   ; FIX(1.40200)
F_1_772         equ     116130                  ; FIX(1.77200)

; Derived factors, each small enough to be stored as a signed 16-bit word.
F_0_402         equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
F_0_285         equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
F_0_228         equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_merged_upsample_sse2)

; Constant table; every entry is one 16-byte xmmword.
EXTN(jconst_merged_upsample_sse2):

PW_F0402:               times 8 dw  F_0_402
PW_MF0228:              times 8 dw -F_0_228
PW_MF0344_F0285:        times 4 dw -F_0_344, F_0_285
PW_ONE:                 times 8 dw  1
PD_ONEHALF:             times 4 dd  1 << (SCALEBITS-1)

        alignz  16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Instantiate the merged upsample/convert routines once per output pixel
; layout by re-including the template with different RGB_* settings and
; different public function names each time.

; Default layout: RGB_* as left by the headers the template includes
; (presumably jcolsamp.inc -- not visible from this file).
%include "jdmrgss2-64.asm"

; --- extrgb: R=0 G=1 B=2, pixel size 3 ---
%undef  RGB_RED
%define RGB_RED         0
%undef  RGB_GREEN
%define RGB_GREEN       1
%undef  RGB_BLUE
%define RGB_BLUE        2
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   3
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
%include "jdmrgss2-64.asm"

; --- extrgbx: R=0 G=1 B=2, pixel size 4 ---
%undef  RGB_RED
%define RGB_RED         0
%undef  RGB_GREEN
%define RGB_GREEN       1
%undef  RGB_BLUE
%define RGB_BLUE        2
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   4
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
%include "jdmrgss2-64.asm"

; --- extbgr: R=2 G=1 B=0, pixel size 3 ---
%undef  RGB_RED
%define RGB_RED         2
%undef  RGB_GREEN
%define RGB_GREEN       1
%undef  RGB_BLUE
%define RGB_BLUE        0
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   3
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
%include "jdmrgss2-64.asm"

; --- extbgrx: R=2 G=1 B=0, pixel size 4 ---
%undef  RGB_RED
%define RGB_RED         2
%undef  RGB_GREEN
%define RGB_GREEN       1
%undef  RGB_BLUE
%define RGB_BLUE        0
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   4
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
%include "jdmrgss2-64.asm"

; --- extxbgr: R=3 G=2 B=1, pixel size 4 ---
%undef  RGB_RED
%define RGB_RED         3
%undef  RGB_GREEN
%define RGB_GREEN       2
%undef  RGB_BLUE
%define RGB_BLUE        1
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   4
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
%include "jdmrgss2-64.asm"

; --- extxrgb: R=1 G=2 B=3, pixel size 4 ---
%undef  RGB_RED
%define RGB_RED         1
%undef  RGB_GREEN
%define RGB_GREEN       2
%undef  RGB_BLUE
%define RGB_BLUE        3
%undef  RGB_PIXELSIZE
%define RGB_PIXELSIZE   4
%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
%include "jdmrgss2-64.asm"
|
|
@ -0,0 +1,565 @@
|
|||
;
|
||||
; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
|
||||
; JSAMPIMAGE input_buf,
|
||||
; JDIMENSION in_row_group_ctr,
|
||||
; JSAMPARRAY output_buf);
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION output_width
|
||||
; r11 = JSAMPIMAGE input_buf
|
||||
; r12 = JDIMENSION in_row_group_ctr
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 3
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v1_merged_upsample_sse2)
|
||||
|
||||
;
; jsimd_h2v1_merged_upsample_sse2
;
; Converts one row of Y samples plus one row each of half-width Cb and Cr
; samples into one row of packed output pixels (RGB_PIXELSIZE bytes each).
;
; Arguments (as found in r10-r13 after collect_args):
;   r10d = JDIMENSION output_width      (32-bit value)
;   r11  = JSAMPIMAGE input_buf
;   r12d = JDIMENSION in_row_group_ctr  (32-bit value)
;   r13  = JSAMPARRAY output_buf
;
; Both JDIMENSION arguments are 32 bits wide.  The upper halves of the
; registers that carry them are not guaranteed to be zero, so they are read
; through their 32-bit views (which zero-extends) before being used as a
; 64-bit counter or index.
;
; Register roles inside the loops:
;   rsi = inptr0 (Y)     rbx = inptr1 (Cb)     rdx = inptr2 (Cr)
;   rdi = outptr         rcx = columns left    al  = Yctr
;
; Stack: rbp is realigned to 16 bytes so that wk(0..WK_NUM-1), addressed
; below rbp, can be accessed with movdqa.  The original stack pointer is
; kept at [rbp] and restored in the epilogue.
;
; Constant tables are addressed RIP-relative ([rel ...]) so the object can
; be linked into position-independent code.
;
EXTN(jsimd_h2v1_merged_upsample_sse2):
        push    rbp
        mov     rax, rsp                        ; rax -> saved caller rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp], rax                      ; remember unaligned rsp
        mov     rbp, rsp                        ; rbp = aligned frame base
        lea     rsp, [wk(0)]                    ; reserve the wk[] area
        push    rbx
        collect_args

        mov     ecx, r10d                       ; col = output_width (zero-extended)
        test    rcx, rcx
        jz      near .return

        push    rcx                             ; keep col while rcx is the row index

        mov     rdi, r11                        ; input_buf
        mov     ecx, r12d                       ; in_row_group_ctr (zero-extended)
        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
        mov     rdi, r13                        ; output_buf
        mov     rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]         ; inptr0
        mov     rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]         ; inptr1
        mov     rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]         ; inptr2
        mov     rdi, JSAMPROW [rdi]                             ; outptr

        pop     rcx                             ; col

.columnloop:

        movdqa  xmm6, XMMWORD [rbx]             ; xmm6=Cb(0123456789ABCDEF)
        movdqa  xmm7, XMMWORD [rdx]             ; xmm7=Cr(0123456789ABCDEF)

        pxor    xmm1, xmm1                      ; xmm1=(all 0's)
        pcmpeqw xmm3, xmm3
        psllw   xmm3, 7                         ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}

        movdqa    xmm4, xmm6
        punpckhbw xmm6, xmm1                    ; xmm6=Cb(89ABCDEF)=CbH
        punpcklbw xmm4, xmm1                    ; xmm4=Cb(01234567)=CbL
        movdqa    xmm0, xmm7
        punpckhbw xmm7, xmm1                    ; xmm7=Cr(89ABCDEF)=CrH
        punpcklbw xmm0, xmm1                    ; xmm0=Cr(01234567)=CrL

        ; Adding 0xFF80 to each word subtracts 128 from every chroma sample.
        paddw   xmm6, xmm3
        paddw   xmm4, xmm3
        paddw   xmm7, xmm3
        paddw   xmm0, xmm3

        ; (Original)
        ; R = Y                + 1.40200 * Cr
        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
        ; B = Y + 1.77200 * Cb
        ;
        ; (This implementation)
        ; R = Y                + 0.40200 * Cr + Cr
        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
        ; B = Y - 0.22800 * Cb + Cb + Cb

        movdqa  xmm5, xmm6                      ; xmm5=CbH
        movdqa  xmm2, xmm4                      ; xmm2=CbL
        paddw   xmm6, xmm6                      ; xmm6=2*CbH
        paddw   xmm4, xmm4                      ; xmm4=2*CbL
        movdqa  xmm1, xmm7                      ; xmm1=CrH
        movdqa  xmm3, xmm0                      ; xmm3=CrL
        paddw   xmm7, xmm7                      ; xmm7=2*CrH
        paddw   xmm0, xmm0                      ; xmm0=2*CrL

        pmulhw  xmm6, [rel PW_MF0228]           ; xmm6=(2*CbH * -FIX(0.22800))
        pmulhw  xmm4, [rel PW_MF0228]           ; xmm4=(2*CbL * -FIX(0.22800))
        pmulhw  xmm7, [rel PW_F0402]            ; xmm7=(2*CrH * FIX(0.40200))
        pmulhw  xmm0, [rel PW_F0402]            ; xmm0=(2*CrL * FIX(0.40200))

        paddw   xmm6, [rel PW_ONE]
        paddw   xmm4, [rel PW_ONE]
        psraw   xmm6, 1                         ; xmm6=(CbH * -FIX(0.22800))
        psraw   xmm4, 1                         ; xmm4=(CbL * -FIX(0.22800))
        paddw   xmm7, [rel PW_ONE]
        paddw   xmm0, [rel PW_ONE]
        psraw   xmm7, 1                         ; xmm7=(CrH * FIX(0.40200))
        psraw   xmm0, 1                         ; xmm0=(CrL * FIX(0.40200))

        paddw   xmm6, xmm5
        paddw   xmm4, xmm2
        paddw   xmm6, xmm5                      ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
        paddw   xmm4, xmm2                      ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
        paddw   xmm7, xmm1                      ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
        paddw   xmm0, xmm3                      ; xmm0=(CrL * FIX(1.40200))=(R-Y)L

        movdqa  XMMWORD [wk(0)], xmm6           ; wk(0)=(B-Y)H
        movdqa  XMMWORD [wk(1)], xmm7           ; wk(1)=(R-Y)H

        movdqa    xmm6, xmm5
        movdqa    xmm7, xmm2
        punpcklwd xmm5, xmm1
        punpckhwd xmm6, xmm1
        pmaddwd   xmm5, [rel PW_MF0344_F0285]
        pmaddwd   xmm6, [rel PW_MF0344_F0285]
        punpcklwd xmm2, xmm3
        punpckhwd xmm7, xmm3
        pmaddwd   xmm2, [rel PW_MF0344_F0285]
        pmaddwd   xmm7, [rel PW_MF0344_F0285]

        paddd   xmm5, [rel PD_ONEHALF]
        paddd   xmm6, [rel PD_ONEHALF]
        psrad   xmm5, SCALEBITS
        psrad   xmm6, SCALEBITS
        paddd   xmm2, [rel PD_ONEHALF]
        paddd   xmm7, [rel PD_ONEHALF]
        psrad   xmm2, SCALEBITS
        psrad   xmm7, SCALEBITS

        packssdw xmm5, xmm6                     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
        packssdw xmm2, xmm7                     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
        psubw    xmm5, xmm1                     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
        psubw    xmm2, xmm3                     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

        movdqa  XMMWORD [wk(2)], xmm5           ; wk(2)=(G-Y)H

        ; Each 16-sample chroma block covers two 16-sample luma blocks:
        ; the first pass uses the low halves still in xmm0/xmm2/xmm4, the
        ; second pass reloads the high halves saved in wk[].
        mov     al, 2                           ; Yctr
        jmp     short .Yloop_1st

.Yloop_2nd:
        movdqa  xmm0, XMMWORD [wk(1)]           ; xmm0=(R-Y)H
        movdqa  xmm2, XMMWORD [wk(2)]           ; xmm2=(G-Y)H
        movdqa  xmm4, XMMWORD [wk(0)]           ; xmm4=(B-Y)H

.Yloop_1st:
        movdqa  xmm7, XMMWORD [rsi]             ; xmm7=Y(0123456789ABCDEF)

        pcmpeqw xmm6, xmm6
        psrlw   xmm6, BYTE_BIT                  ; xmm6={0xFF 0x00 0xFF 0x00 ..}
        pand    xmm6, xmm7                      ; xmm6=Y(02468ACE)=YE
        psrlw   xmm7, BYTE_BIT                  ; xmm7=Y(13579BDF)=YO

        movdqa  xmm1, xmm0                      ; xmm1=xmm0=(R-Y)(L/H)
        movdqa  xmm3, xmm2                      ; xmm3=xmm2=(G-Y)(L/H)
        movdqa  xmm5, xmm4                      ; xmm5=xmm4=(B-Y)(L/H)

        paddw    xmm0, xmm6                     ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
        paddw    xmm1, xmm7                     ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
        packuswb xmm0, xmm0                     ; xmm0=R(02468ACE********)
        packuswb xmm1, xmm1                     ; xmm1=R(13579BDF********)

        paddw    xmm2, xmm6                     ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
        paddw    xmm3, xmm7                     ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
        packuswb xmm2, xmm2                     ; xmm2=G(02468ACE********)
        packuswb xmm3, xmm3                     ; xmm3=G(13579BDF********)

        paddw    xmm4, xmm6                     ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
        paddw    xmm5, xmm7                     ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
        packuswb xmm4, xmm4                     ; xmm4=B(02468ACE********)
        packuswb xmm5, xmm5                     ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3 ; ---------------

        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

        punpcklbw xmmA, xmmC            ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
        punpcklbw xmmE, xmmB            ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
        punpcklbw xmmD, xmmF            ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

        movdqa    xmmG, xmmA
        movdqa    xmmH, xmmA
        punpcklwd xmmA, xmmE            ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
        punpckhwd xmmG, xmmE            ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

        psrldq    xmmH, 2               ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
        psrldq    xmmE, 2               ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

        movdqa    xmmC, xmmD
        movdqa    xmmB, xmmD
        punpcklwd xmmD, xmmH            ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
        punpckhwd xmmC, xmmH            ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

        psrldq    xmmB, 2               ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

        movdqa    xmmF, xmmE
        punpcklwd xmmE, xmmB            ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
        punpckhwd xmmF, xmmB            ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

        pshufd    xmmH, xmmA, 0x4E      ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
        movdqa    xmmB, xmmE
        punpckldq xmmA, xmmD            ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
        punpckldq xmmE, xmmH            ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
        punpckhdq xmmD, xmmB            ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

        pshufd    xmmH, xmmG, 0x4E      ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
        movdqa    xmmB, xmmF
        punpckldq xmmG, xmmC            ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
        punpckldq xmmF, xmmH            ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
        punpckhdq xmmC, xmmB            ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

        punpcklqdq xmmA, xmmE           ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        punpcklqdq xmmD, xmmG           ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        punpcklqdq xmmF, xmmC           ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        cmp     rcx, byte SIZEOF_XMMWORD
        jb      short .column_st32              ; fewer than 16 pixels left

        test    rdi, SIZEOF_XMMWORD-1
        jnz     short .out1
        ; --(aligned)-------------------
        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
.out1:  ; --(unaligned)-----------------
        pcmpeqb    xmmH, xmmH                   ; xmmH=(all 1's)
        maskmovdqu xmmA, xmmH                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmD, xmmH                   ; movntdqu XMMWORD [rdi], xmmD
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmF, xmmH                   ; movntdqu XMMWORD [rdi], xmmF
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
.out0:
        sub     rcx, byte SIZEOF_XMMWORD
        jz      near .endcolumn

        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
        dec     al                              ; Yctr
        jnz     near .Yloop_2nd

        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
        jmp     near .columnloop

.column_st32:
        pcmpeqb xmmH, xmmH                      ; xmmH=(all 1's)
        lea     rcx, [rcx+rcx*2]                ; rcx = pixels left * RGB_PIXELSIZE (bytes)
        cmp     rcx, byte 2*SIZEOF_XMMWORD
        jb      short .column_st16
        maskmovdqu xmmA, xmmH                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmD, xmmH                   ; movntdqu XMMWORD [rdi], xmmD
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA, xmmF
        sub     rcx, byte 2*SIZEOF_XMMWORD
        jmp     short .column_st15
.column_st16:
        cmp     rcx, byte SIZEOF_XMMWORD
        jb      short .column_st15
        maskmovdqu xmmA, xmmH                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA, xmmD
        sub     rcx, byte SIZEOF_XMMWORD
.column_st15:
        ; Build a byte mask in xmmE selecting only the remaining bytes.
        mov     rax, rcx
        xor     rcx, byte 0x0F
        shl     rcx, 2
        movd    xmmB, ecx
        psrlq   xmmH, 4
        pcmpeqb xmmE, xmmE
        psrlq   xmmH, xmmB
        psrlq   xmmE, xmmB
        punpcklbw xmmE, xmmH
        ; ----------------
        mov     rcx, rdi
        and     rcx, byte SIZEOF_XMMWORD-1
        jz      short .adj0
        add     rax, rcx
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .adj0
        and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
        shl     rcx, 3                  ; pslldq xmmA,rcx & pslldq xmmE,rcx
        movdqa  xmmG, xmmA
        movdqa  xmmC, xmmE
        pslldq  xmmA, SIZEOF_XMMWORD/2
        pslldq  xmmE, SIZEOF_XMMWORD/2
        movd    xmmD, ecx
        sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
        jb      short .adj1
        movd    xmmF, ecx
        psllq   xmmA, xmmF
        psllq   xmmE, xmmF
        jmp     short .adj0
.adj1:  neg     rcx
        movd    xmmF, ecx
        psrlq   xmmA, xmmF
        psrlq   xmmE, xmmF
        psllq   xmmG, xmmD
        psllq   xmmC, xmmD
        por     xmmA, xmmG
        por     xmmE, xmmC
.adj0:  ; ----------------
        maskmovdqu xmmA, xmmE                   ; movntdqu XMMWORD [rdi], xmmA

%else ; RGB_PIXELSIZE == 4 ; -----------

%ifdef RGBX_FILLER_0XFF
        pcmpeqb xmm6, xmm6                      ; xmm6=XE=X(02468ACE********)
        pcmpeqb xmm7, xmm7                      ; xmm7=XO=X(13579BDF********)
%else
        pxor    xmm6, xmm6                      ; xmm6=XE=X(02468ACE********)
        pxor    xmm7, xmm7                      ; xmm7=XO=X(13579BDF********)
%endif
        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

        punpcklbw xmmA, xmmC            ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
        punpcklbw xmmE, xmmG            ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
        punpcklbw xmmB, xmmD            ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
        punpcklbw xmmF, xmmH            ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

        movdqa    xmmC, xmmA
        punpcklwd xmmA, xmmE            ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
        punpckhwd xmmC, xmmE            ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
        movdqa    xmmG, xmmB
        punpcklwd xmmB, xmmF            ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
        punpckhwd xmmG, xmmF            ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

        movdqa    xmmD, xmmA
        punpckldq xmmA, xmmB            ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        punpckhdq xmmD, xmmB            ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        movdqa    xmmH, xmmC
        punpckldq xmmC, xmmG            ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        punpckhdq xmmH, xmmG            ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        cmp     rcx, byte SIZEOF_XMMWORD
        jb      short .column_st32              ; fewer than 16 pixels left

        test    rdi, SIZEOF_XMMWORD-1
        jnz     short .out1
        ; --(aligned)-------------------
        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
.out1:  ; --(unaligned)-----------------
        pcmpeqb    xmmE, xmmE                   ; xmmE=(all 1's)
        maskmovdqu xmmA, xmmE                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmD, xmmE                   ; movntdqu XMMWORD [rdi], xmmD
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmC, xmmE                   ; movntdqu XMMWORD [rdi], xmmC
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmH, xmmE                   ; movntdqu XMMWORD [rdi], xmmH
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
.out0:
        sub     rcx, byte SIZEOF_XMMWORD
        jz      near .endcolumn

        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
        dec     al                              ; Yctr
        jnz     near .Yloop_2nd

        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
        jmp     near .columnloop

.column_st32:
        pcmpeqb xmmE, xmmE                      ; xmmE=(all 1's)
        cmp     rcx, byte SIZEOF_XMMWORD/2
        jb      short .column_st16
        maskmovdqu xmmA, xmmE                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        maskmovdqu xmmD, xmmE                   ; movntdqu XMMWORD [rdi], xmmD
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA, xmmC
        movdqa  xmmD, xmmH
        sub     rcx, byte SIZEOF_XMMWORD/2
.column_st16:
        cmp     rcx, byte SIZEOF_XMMWORD/4
        jb      short .column_st15
        maskmovdqu xmmA, xmmE                   ; movntdqu XMMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA, xmmD
        sub     rcx, byte SIZEOF_XMMWORD/4
.column_st15:
        cmp     rcx, byte SIZEOF_XMMWORD/16
        jb      near .endcolumn
        ; Build a byte mask in xmmE selecting only the remaining pixels.
        mov     rax, rcx
        xor     rcx, byte 0x03
        inc     rcx
        shl     rcx, 4
        movd    xmmF, ecx
        psrlq   xmmE, xmmF
        punpcklbw xmmE, xmmE
        ; ----------------
        mov     rcx, rdi
        and     rcx, byte SIZEOF_XMMWORD-1
        jz      short .adj0
        lea     rax, [rcx+rax*4]        ; RGB_PIXELSIZE
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .adj0
        and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
        shl     rcx, 3                  ; pslldq xmmA,rcx & pslldq xmmE,rcx
        movdqa  xmmB, xmmA
        movdqa  xmmG, xmmE
        pslldq  xmmA, SIZEOF_XMMWORD/2
        pslldq  xmmE, SIZEOF_XMMWORD/2
        movd    xmmC, ecx
        sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
        jb      short .adj1
        movd    xmmH, ecx
        psllq   xmmA, xmmH
        psllq   xmmE, xmmH
        jmp     short .adj0
.adj1:  neg     rcx
        movd    xmmH, ecx
        psrlq   xmmA, xmmH
        psrlq   xmmE, xmmH
        psllq   xmmB, xmmC
        psllq   xmmG, xmmC
        por     xmmA, xmmB
        por     xmmE, xmmG
.adj0:  ; ----------------
        maskmovdqu xmmA, xmmE                   ; movntdqu XMMWORD [rdi], xmmA

%endif ; RGB_PIXELSIZE ; ---------------

.endcolumn:
        sfence                                  ; flush the write buffer

.return:
        uncollect_args
        pop     rbx
        mov     rsp, rbp                        ; rsp <- aligned frame base
        pop     rsp                             ; rsp <- value saved at [rbp] (-> saved rbp)
        pop     rbp
        ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
|
||||
; JSAMPIMAGE input_buf,
|
||||
; JDIMENSION in_row_group_ctr,
|
||||
; JSAMPARRAY output_buf);
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION output_width
|
||||
; r11 = JSAMPIMAGE input_buf
|
||||
; r12 = JDIMENSION in_row_group_ctr
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v2_merged_upsample_sse2)
|
||||
|
||||
;
; jsimd_h2v2_merged_upsample_sse2
;
; Produces two output rows by calling jsimd_h2v1_merged_upsample_sse2 twice
; with a temporary three-entry JSAMPIMAGE built on the stack.
;
; Arguments (as found in r10-r13 after collect_args):
;   r10d = JDIMENSION output_width      (32-bit value)
;   r11  = JSAMPIMAGE input_buf
;   r12d = JDIMENSION in_row_group_ctr  (32-bit value)
;   r13  = JSAMPARRAY output_buf
;
; Both JDIMENSION arguments are 32 bits wide.  The upper halves of the
; registers that carry them are not guaranteed to be zero, and
; in_row_group_ctr is scaled into a pointer below, so both are read through
; their 32-bit views (which zero-extends).
;
; The temporary image's luma entry is input_buf[0] advanced by
; in_row_group_ctr rows.  The h2v1 routine indexes it by in_row_group_ctr
; again, so the first call reads luma row 2*in_row_group_ctr and -- after
; the entry is advanced by one more row -- the second call reads luma row
; 2*in_row_group_ctr+1.  The chroma entries are passed through unchanged.
;
EXTN(jsimd_h2v2_merged_upsample_sse2):
        push    rbp
        mov     rbp, rsp
        push    rbx
        collect_args

        mov     eax, r10d                       ; output_width (zero-extended)

        mov     rdi, r11                        ; input_buf
        mov     ecx, r12d                       ; in_row_group_ctr (zero-extended)
        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
        mov     rdi, r13                        ; output_buf
        lea     rsi, [rsi+rcx*SIZEOF_JSAMPROW]

        ; ---- first output row ------------------------------------------------
        push    rdx                             ; inptr2
        push    rbx                             ; inptr1
        push    rsi                             ; inptr00
        mov     rbx, rsp                        ; rbx -> temporary JSAMPIMAGE

        push    rdi                             ; preserved across the call
        push    rcx
        push    rax

        mov     rdx, rcx                        ; arg 3: in_row_group_ctr
        mov     rcx, rdi                        ; arg 4: output_buf
        mov     rdi, rax                        ; arg 1: output_width
        mov     rsi, rbx                        ; arg 2: temporary JSAMPIMAGE

        call    EXTN(jsimd_h2v1_merged_upsample_sse2)

        pop     rax
        pop     rcx
        pop     rdi
        pop     rsi                             ; inptr00
        pop     rbx                             ; inptr1
        pop     rdx                             ; inptr2

        add     rdi, byte SIZEOF_JSAMPROW       ; outptr1
        add     rsi, byte SIZEOF_JSAMPROW       ; inptr01

        ; ---- second output row -----------------------------------------------
        push    rdx                             ; inptr2
        push    rbx                             ; inptr1
        push    rsi                             ; inptr01
        mov     rbx, rsp                        ; rbx -> temporary JSAMPIMAGE

        push    rdi
        push    rcx
        push    rax

        mov     rdx, rcx                        ; arg 3: in_row_group_ctr
        mov     rcx, rdi                        ; arg 4: output_buf + 1 row
        mov     rdi, rax                        ; arg 1: output_width
        mov     rsi, rbx                        ; arg 2: temporary JSAMPIMAGE

        call    EXTN(jsimd_h2v1_merged_upsample_sse2)

        pop     rax
        pop     rcx
        pop     rdi
        pop     rsi
        pop     rbx
        pop     rdx

        uncollect_args
        pop     rbx
        pop     rbp
        ret
|
|
@ -0,0 +1,664 @@
|
|||
;
|
||||
; jdsamss2.asm - upsampling (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for the fancy-upsampling routines in this file.  Every
; entry is one XMM word holding eight identical 16-bit values.
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_fancy_upsample_sse2)
|
||||
|
||||
EXTN(jconst_fancy_upsample_sse2):
|
||||
|
||||
; Added before the >>2 that produces the even h2v1 output samples.
PW_ONE times 8 dw 1
|
||||
; Added before the >>2 that produces the odd h2v1 output samples.
PW_TWO times 8 dw 2
|
||||
; Weight applied (pmullw) to the nearer input sample.
PW_THREE times 8 dw 3
|
||||
; Added before the >>4 that produces the odd h2v2 output samples.
PW_SEVEN times 8 dw 7
|
||||
; Added before the >>4 that produces the even h2v2 output samples.
PW_EIGHT times 8 dw 8
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
|
||||
;
|
||||
; The upsampling algorithm is linear interpolation between pixel centers,
|
||||
; also known as a "triangle filter". This is a good compromise between
|
||||
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
|
||||
; of the way between input pixel centers.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
|
||||
; JDIMENSION downsampled_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY * output_data_ptr);
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11 = JDIMENSION downsampled_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY * output_data_ptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v1_fancy_upsample_sse2)
|
||||
|
||||
;
; jsimd_h2v1_fancy_upsample_sse2
;
; Doubles each input row horizontally.  Every output sample is
; (3 * nearer input + further input + rounding) >> 2, with rounding 1 for
; even output columns and 2 for odd ones.
;
; Arguments (as found in r10-r13 after collect_args):
;   r10d = int max_v_samp_factor        (32-bit signed value)
;   r11d = JDIMENSION downsampled_width (32-bit value)
;   r12  = JSAMPARRAY input_data
;   r13  = JSAMPARRAY * output_data_ptr
;
; The two integer arguments are 32 bits wide.  The upper halves of the
; registers that carry them are not guaranteed to be defined, so the width
; is zero-extended and the (signed) row count sign-extended before being
; used as 64-bit counters.
;
; Register roles inside the loops:
;   rax = colctr     rcx = rowctr     rsi = inptr     rdi = outptr
;   xmm7 = last sample of the previous block (left neighbour of sample 0)
;   xmm6 = first sample of the next block (right neighbour of sample 15)
;
; Constant tables are addressed RIP-relative ([rel ...]) so the object can
; be linked into position-independent code.
;
EXTN(jsimd_h2v1_fancy_upsample_sse2):
        push    rbp
        mov     rbp, rsp
        collect_args

        mov     eax, r11d                       ; colctr (zero-extended)
        test    rax, rax
        jz      near .return

        movsxd  rcx, r10d                       ; rowctr (sign-extended int)
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                        ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]           ; output_data
.rowloop:
        push    rax                             ; colctr
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]             ; inptr
        mov     rdi, JSAMPROW [rdi]             ; outptr

        ; If the width is not a multiple of 16, replicate the last real
        ; sample one position past the end of the row.
        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    xmm0, xmm0                      ; xmm0=(all 0's)
        pcmpeqb xmm7, xmm7
        psrldq  xmm7, (SIZEOF_XMMWORD-1)
        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; left neighbour of sample 0 = sample 0

        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD       ; round colctr up to a multiple of 16
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; Last block: the right neighbour of sample 15 is sample 15 itself.
        pcmpeqb xmm6, xmm6
        pslldq  xmm6, (SIZEOF_XMMWORD-1)
        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        jmp     short .upsample

.columnloop:
        ; Not the last block: take sample 0 of the next block.
        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        pslldq  xmm6, (SIZEOF_XMMWORD-1)

.upsample:
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, xmm1
        movdqa  xmm3, xmm1                      ; xmm1=( 0  1  2 ... 13 14 15)
        pslldq  xmm2, 1                         ; xmm2=(--  0  1 ... 12 13 14)
        psrldq  xmm3, 1                         ; xmm3=( 1  2  3 ... 14 15 --)

        por     xmm2, xmm7                      ; xmm2=(-1  0  1 ... 12 13 14)
        por     xmm3, xmm6                      ; xmm3=( 1  2  3 ... 14 15 16)

        movdqa  xmm7, xmm1
        psrldq  xmm7, (SIZEOF_XMMWORD-1)        ; xmm7=(15 -- -- ... -- -- --)

        movdqa    xmm4, xmm1
        punpcklbw xmm1, xmm0                    ; xmm1=( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4, xmm0                    ; xmm4=( 8  9 10 11 12 13 14 15)
        movdqa    xmm5, xmm2
        punpcklbw xmm2, xmm0                    ; xmm2=(-1  0  1  2  3  4  5  6)
        punpckhbw xmm5, xmm0                    ; xmm5=( 7  8  9 10 11 12 13 14)
        movdqa    xmm6, xmm3
        punpcklbw xmm3, xmm0                    ; xmm3=( 1  2  3  4  5  6  7  8)
        punpckhbw xmm6, xmm0                    ; xmm6=( 9 10 11 12 13 14 15 16)

        pmullw  xmm1, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]
        paddw   xmm2, [rel PW_ONE]
        paddw   xmm5, [rel PW_ONE]
        paddw   xmm3, [rel PW_TWO]
        paddw   xmm6, [rel PW_TWO]

        paddw   xmm2, xmm1
        paddw   xmm5, xmm4
        psrlw   xmm2, 2                         ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5, 2                         ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
        paddw   xmm3, xmm1
        paddw   xmm6, xmm4
        psrlw   xmm3, 2                         ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm6, 2                         ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

        ; Interleave even and odd results: odd samples go to the high byte
        ; of each word.
        psllw   xmm3, BYTE_BIT
        psllw   xmm6, BYTE_BIT
        por     xmm2, xmm3                      ; xmm2=OutL=( 0  1  2 ... 13 14 15)
        por     xmm5, xmm6                      ; xmm5=OutH=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

        sub     rax, byte SIZEOF_XMMWORD
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop
        test    eax, eax                        ; rax is 0 or SIZEOF_XMMWORD here
        jnz     near .columnloop_last

        pop     rsi
        pop     rdi
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
; Again a triangle filter; see comments for h2v1 case, above.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
|
||||
; JDIMENSION downsampled_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY * output_data_ptr);
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11 = JDIMENSION downsampled_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY * output_data_ptr
|
||||
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 4
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v2_fancy_upsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
||||
push rbp
|
||||
mov rax,rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp],rax
|
||||
mov rbp,rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
push rbx
|
||||
collect_args
|
||||
|
||||
mov rax, r11 ; colctr
|
||||
test rax,rax
|
||||
jz near .return
|
||||
|
||||
mov rcx, r10 ; rowctr
|
||||
test rcx,rcx
|
||||
jz near .return
|
||||
|
||||
mov rsi, r12 ; input_data
|
||||
mov rdi, r13
|
||||
mov rdi, JSAMPARRAY [rdi] ; output_data
|
||||
.rowloop:
|
||||
push rax ; colctr
|
||||
push rcx
|
||||
push rdi
|
||||
push rsi
|
||||
|
||||
mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
|
||||
mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
|
||||
mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
|
||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
|
||||
mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
|
||||
|
||||
test rax, SIZEOF_XMMWORD-1
|
||||
jz short .skip
|
||||
push rdx
|
||||
mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
|
||||
mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
|
||||
mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
|
||||
mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
|
||||
pop rdx
|
||||
.skip:
|
||||
; -- process the first column block
|
||||
|
||||
movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
|
||||
movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
|
||||
movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
|
||||
|
||||
pxor xmm3,xmm3 ; xmm3=(all 0's)
|
||||
movdqa xmm4,xmm0
|
||||
punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
||||
movdqa xmm5,xmm1
|
||||
punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
||||
movdqa xmm6,xmm2
|
||||
punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
||||
|
||||
pmullw xmm0,[PW_THREE]
|
||||
pmullw xmm4,[PW_THREE]
|
||||
|
||||
pcmpeqb xmm7,xmm7
|
||||
psrldq xmm7,(SIZEOF_XMMWORD-2)
|
||||
|
||||
paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
||||
paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
||||
paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
||||
paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
||||
|
||||
movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
||||
movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
|
||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
|
||||
|
||||
pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
|
||||
pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm1
|
||||
movdqa XMMWORD [wk(1)], xmm2
|
||||
|
||||
add rax, byte SIZEOF_XMMWORD-1
|
||||
and rax, byte -SIZEOF_XMMWORD
|
||||
cmp rax, byte SIZEOF_XMMWORD
|
||||
ja short .columnloop
|
||||
|
||||
.columnloop_last:
|
||||
; -- process the last column block
|
||||
|
||||
pcmpeqb xmm1,xmm1
|
||||
pslldq xmm1,(SIZEOF_XMMWORD-2)
|
||||
movdqa xmm2,xmm1
|
||||
|
||||
pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
|
||||
pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
|
||||
|
||||
movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
|
||||
movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
|
||||
|
||||
jmp near .upsample
|
||||
|
||||
.columnloop:
|
||||
; -- process the next column block
|
||||
|
||||
movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
|
||||
movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
|
||||
movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
|
||||
|
||||
pxor xmm3,xmm3 ; xmm3=(all 0's)
|
||||
movdqa xmm4,xmm0
|
||||
punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
|
||||
movdqa xmm5,xmm1
|
||||
punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
|
||||
movdqa xmm6,xmm2
|
||||
punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
|
||||
punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
|
||||
|
||||
pmullw xmm0,[PW_THREE]
|
||||
pmullw xmm4,[PW_THREE]
|
||||
|
||||
paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
|
||||
paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
|
||||
paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
|
||||
paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
|
||||
|
||||
movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
|
||||
movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
|
||||
movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
|
||||
movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
|
||||
|
||||
pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
|
||||
pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
|
||||
|
||||
movdqa XMMWORD [wk(2)], xmm1
|
||||
movdqa XMMWORD [wk(3)], xmm2
|
||||
|
||||
.upsample:
|
||||
; -- process the upper row
|
||||
|
||||
movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
|
||||
movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
|
||||
|
||||
movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
|
||||
movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
|
||||
psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
|
||||
pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
|
||||
movdqa xmm5,xmm7
|
||||
movdqa xmm6,xmm3
|
||||
psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
|
||||
pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
|
||||
|
||||
por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
|
||||
por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
|
||||
|
||||
movdqa xmm1,xmm7
|
||||
movdqa xmm2,xmm3
|
||||
pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
||||
psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
|
||||
movdqa xmm4,xmm3
|
||||
psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
|
||||
|
||||
por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
||||
por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm4
|
||||
|
||||
pmullw xmm7,[PW_THREE]
|
||||
pmullw xmm3,[PW_THREE]
|
||||
paddw xmm1,[PW_EIGHT]
|
||||
paddw xmm5,[PW_EIGHT]
|
||||
paddw xmm0,[PW_SEVEN]
|
||||
paddw xmm2,[PW_SEVEN]
|
||||
|
||||
paddw xmm1,xmm7
|
||||
paddw xmm5,xmm3
|
||||
psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
|
||||
psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
|
||||
paddw xmm0,xmm7
|
||||
paddw xmm2,xmm3
|
||||
psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
|
||||
psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
|
||||
|
||||
psllw xmm0,BYTE_BIT
|
||||
psllw xmm2,BYTE_BIT
|
||||
por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
|
||||
por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
|
||||
|
||||
movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
|
||||
movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
|
||||
|
||||
; -- process the lower row
|
||||
|
||||
movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
|
||||
movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
|
||||
|
||||
movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
|
||||
movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
|
||||
psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
|
||||
pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
|
||||
movdqa xmm0,xmm6
|
||||
movdqa xmm2,xmm4
|
||||
psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
|
||||
pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
|
||||
|
||||
por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
|
||||
por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
|
||||
|
||||
movdqa xmm1,xmm6
|
||||
movdqa xmm5,xmm4
|
||||
pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
|
||||
psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
|
||||
movdqa xmm3,xmm4
|
||||
psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
|
||||
|
||||
por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
|
||||
por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
|
||||
|
||||
movdqa XMMWORD [wk(1)], xmm3
|
||||
|
||||
pmullw xmm6,[PW_THREE]
|
||||
pmullw xmm4,[PW_THREE]
|
||||
paddw xmm1,[PW_EIGHT]
|
||||
paddw xmm0,[PW_EIGHT]
|
||||
paddw xmm7,[PW_SEVEN]
|
||||
paddw xmm5,[PW_SEVEN]
|
||||
|
||||
paddw xmm1,xmm6
|
||||
paddw xmm0,xmm4
|
||||
psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
|
||||
psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
|
||||
paddw xmm7,xmm6
|
||||
paddw xmm5,xmm4
|
||||
psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
|
||||
psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
|
||||
|
||||
psllw xmm7,BYTE_BIT
|
||||
psllw xmm5,BYTE_BIT
|
||||
por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
|
||||
por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
|
||||
|
||||
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
|
||||
movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
|
||||
|
||||
sub rax, byte SIZEOF_XMMWORD
|
||||
add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
|
||||
add rbx, byte 1*SIZEOF_XMMWORD ; inptr0
|
||||
add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
|
||||
add rdx, byte 2*SIZEOF_XMMWORD ; outptr0
|
||||
add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
|
||||
cmp rax, byte SIZEOF_XMMWORD
|
||||
ja near .columnloop
|
||||
test rax,rax
|
||||
jnz near .columnloop_last
|
||||
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop rcx
|
||||
pop rax
|
||||
|
||||
add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
|
||||
add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
|
||||
sub rcx, byte 2 ; rowctr
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
uncollect_args
|
||||
pop rbx
|
||||
mov rsp,rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
|
||||
; It's still a box filter.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
|
||||
; JDIMENSION output_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY * output_data_ptr);
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11 = JDIMENSION output_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY * output_data_ptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v1_upsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v1_upsample_sse2):
        ; 2:1 horizontal / 1:1 vertical box-filter upsampling: every input
        ; byte is written twice to the output row.
        ;
        ; Arguments (as named by the file after collect_args):
        ;   r10 = int max_v_samp_factor        (32-bit, signed)
        ;   r11 = JDIMENSION output_width      (32-bit; assumed unsigned int
        ;                                       as in libjpeg's jmorecfg.h)
        ;   r12 = JSAMPARRAY input_data
        ;   r13 = JSAMPARRAY * output_data_ptr
        ;
        ; Register roles inside the function:
        ;   rdx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
        ;   rcx = rowctr (rows left), rax = colctr (output bytes left in row)
        ;   rsi / rdi = row-pointer cursors, then inptr / outptr inside a row
        ;
        ; Rows are accessed with movdqa, so every row pointer must be
        ; XMMWORD-aligned and the buffers padded to the rounded-up width.

        push        rbp
        mov         rbp, rsp
        collect_args

        ; The upper 32 bits of a register carrying a 32-bit argument are
        ; undefined in the System V AMD64 ABI, so only the low halves of
        ; r11/r10 may be consumed.  Writing edx zero-extends into rdx.
        mov         edx, r11d
        add         rdx, byte (2*SIZEOF_XMMWORD)-1
        and         rdx, byte -(2*SIZEOF_XMMWORD)
        jz          near .return                ; zero-width output: nothing to do

        movsxd      rcx, r10d                   ; rowctr (sign-extended int)
        test        rcx, rcx
        jz          short .return

        mov         rsi, r12                    ; input_data
        mov         rdi, r13
        mov         rdi, JSAMPARRAY [rdi]       ; output_data

.rowloop:
        push        rdi                         ; keep the row-pointer cursors
        push        rsi

        mov         rsi, JSAMPROW [rsi]         ; inptr
        mov         rdi, JSAMPROW [rdi]         ; outptr
        mov         rax, rdx                    ; colctr

.columnloop:
        ; First half-step: one XMMWORD of input -> two XMMWORDs of output.
        movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa      xmm1, xmm0
        punpcklbw   xmm0, xmm0                  ; duplicate the low 8 bytes
        punpckhbw   xmm1, xmm1                  ; duplicate the high 8 bytes

        movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub         rax, byte 2*SIZEOF_XMMWORD
        jz          short .nextrow

        ; Second half-step (the loop body is unrolled twice).
        movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa      xmm3, xmm2
        punpcklbw   xmm2, xmm2
        punpckhbw   xmm3, xmm3

        movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub         rax, byte 2*SIZEOF_XMMWORD
        jz          short .nextrow

        add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
        add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
        jmp         short .columnloop

.nextrow:
        pop         rsi
        pop         rdi

        add         rsi, byte SIZEOF_JSAMPROW   ; input_data
        add         rdi, byte SIZEOF_JSAMPROW   ; output_data
        dec         rcx                         ; rowctr
        jg          short .rowloop              ; signed: stop at rowctr <= 0

.return:
        uncollect_args
        pop         rbp
        ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
|
||||
; It's still a box filter.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
|
||||
; JDIMENSION output_width,
|
||||
; JSAMPARRAY input_data,
|
||||
; JSAMPARRAY * output_data_ptr);
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11 = JDIMENSION output_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY * output_data_ptr
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_h2v2_upsample_sse2)
|
||||
|
||||
EXTN(jsimd_h2v2_upsample_sse2):
        ; 2:1 horizontal / 2:1 vertical box-filter upsampling: every input
        ; byte is written twice horizontally, and each expanded row is stored
        ; to two consecutive output rows.
        ;
        ; Arguments (as named by the file after collect_args):
        ;   r10 = int max_v_samp_factor        (32-bit, signed)
        ;   r11 = JDIMENSION output_width      (32-bit; assumed unsigned int
        ;                                       as in libjpeg's jmorecfg.h)
        ;   r12 = JSAMPARRAY input_data
        ;   r13 = JSAMPARRAY * output_data_ptr
        ;
        ; Register roles inside the function:
        ;   rdx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
        ;   rcx = rowctr (output rows left), rax = colctr (output bytes left)
        ;   rsi = inptr, rbx = outptr0, rdi = outptr1 inside a row
        ;   rbx is saved/restored here around its use as outptr0.
        ;
        ; Rows are accessed with movdqa, so every row pointer must be
        ; XMMWORD-aligned and the buffers padded to the rounded-up width.

        push        rbp
        mov         rbp, rsp
        push        rbx
        collect_args

        ; The upper 32 bits of a register carrying a 32-bit argument are
        ; undefined in the System V AMD64 ABI, so only the low halves of
        ; r11/r10 may be consumed.  Writing edx zero-extends into rdx.
        mov         edx, r11d
        add         rdx, byte (2*SIZEOF_XMMWORD)-1
        and         rdx, byte -(2*SIZEOF_XMMWORD)
        jz          near .return                ; zero-width output: nothing to do

        movsxd      rcx, r10d                   ; rowctr (sign-extended int)
        test        rcx, rcx
        jz          near .return

        mov         rsi, r12                    ; input_data
        mov         rdi, r13
        mov         rdi, JSAMPARRAY [rdi]       ; output_data

.rowloop:
        push        rdi                         ; keep the row-pointer cursors
        push        rsi

        mov         rsi, JSAMPROW [rsi]                     ; inptr
        mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov         rax, rdx                                ; colctr

.columnloop:
        ; First half-step: one XMMWORD of input -> two XMMWORDs per out row.
        movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa      xmm1, xmm0
        punpcklbw   xmm0, xmm0                  ; duplicate the low 8 bytes
        punpckhbw   xmm1, xmm1                  ; duplicate the high 8 bytes

        movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
        movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
        movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub         rax, byte 2*SIZEOF_XMMWORD
        jz          short .nextrow

        ; Second half-step (the loop body is unrolled twice).
        movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa      xmm3, xmm2
        punpcklbw   xmm2, xmm2
        punpckhbw   xmm3, xmm3

        movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
        movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
        movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub         rax, byte 2*SIZEOF_XMMWORD
        jz          short .nextrow

        add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
        add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
        add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
        jmp         short .columnloop

.nextrow:
        pop         rsi
        pop         rdi

        add         rsi, byte 1*SIZEOF_JSAMPROW ; input_data: one row consumed
        add         rdi, byte 2*SIZEOF_JSAMPROW ; output_data: two rows produced
        sub         rcx, byte 2                 ; rowctr
        jg          near .rowloop               ; signed: stop at rowctr <= 0

.return:
        uncollect_args
        pop         rbx
        pop         rbp
        ret
|
|
@ -0,0 +1,388 @@
|
|||
;
|
||||
; jfss2fst.asm - fast integer FDCT (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a fast, not so accurate integer implementation of
|
||||
; the forward DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
|
||||
; for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define CONST_BITS 8 ; 14 is also OK.
|
||||
|
||||
%if CONST_BITS == 8
|
||||
F_0_382 equ 98 ; FIX(0.382683433)
|
||||
F_0_541 equ 139 ; FIX(0.541196100)
|
||||
F_0_707 equ 181 ; FIX(0.707106781)
|
||||
F_1_306 equ 334 ; FIX(1.306562965)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||
F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
|
||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
|
||||
F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
|
||||
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
|
||||
|
||||
%define PRE_MULTIPLY_SCALE_BITS 2
|
||||
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_fdct_ifast_sse2)
|
||||
|
||||
EXTN(jconst_fdct_ifast_sse2):
|
||||
|
||||
PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
|
||||
PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
|
||||
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
|
||||
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_ifast_sse2 (DCTELEM * data)
|
||||
;
|
||||
|
||||
; r10 = DCTELEM * data
|
||||
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_fdct_ifast_sse2)
|
||||
|
||||
EXTN(jsimd_fdct_ifast_sse2):
        ; Fast (less accurate) integer forward DCT on one block, in place.
        ;
        ; Argument (as named by the file after collect_args):
        ;   r10 = DCTELEM * data   -- read and written with movdqa, so it
        ;                             must be XMMWORD-aligned.
        ;
        ; Structure: pass 1 transposes the block and runs the 1-D transform,
        ; pass 2 transposes again and runs the same 1-D transform, then
        ; stores the eight result rows back through rdx.
        ;
        ; Scratch: rax, rdx, xmm0-xmm7, and WK_NUM XMMWORD slots (wk(0),
        ; wk(1)) carved out of an XMMWORD-aligned stack frame below rbp.
        ;
        ; The PW_* multiplier tables are addressed RIP-relative ([rel ...])
        ; so that the object can be linked into position-independent code;
        ; absolute [PW_*] operands would need 32-bit absolute relocations.

        push        rbp
        mov         rax, rsp                    ; rax = rsp after push (-> saved rbp)
        sub         rsp, byte 4                 ; room for the saved pointer; still
                                                ; >= 8 bytes once aligned, given an
                                                ; 8-byte-aligned incoming rsp
        and         rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
        mov         [rsp], rax                  ; remember the unaligned frame
        mov         rbp, rsp                    ; rbp = aligned rbp
        lea         rsp, [wk(0)]                ; reserve wk[WK_NUM] below rbp
        collect_args

        ; ---- Pass 1: process rows.

        mov         rdx, r10                    ; (DCTELEM *)

        movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
        movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
        movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
        movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]

        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)

        movdqa      xmm4, xmm0                  ; transpose coefficients(phase 1)
        punpcklwd   xmm0, xmm1                  ; xmm0=(00 10 01 11 02 12 03 13)
        punpckhwd   xmm4, xmm1                  ; xmm4=(04 14 05 15 06 16 07 17)
        movdqa      xmm5, xmm2                  ; transpose coefficients(phase 1)
        punpcklwd   xmm2, xmm3                  ; xmm2=(20 30 21 31 22 32 23 33)
        punpckhwd   xmm5, xmm3                  ; xmm5=(24 34 25 35 26 36 27 37)

        movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
        movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
        movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
        movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]

        ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
        ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)

        movdqa      XMMWORD [wk(0)], xmm2       ; wk(0)=(20 30 21 31 22 32 23 33)
        movdqa      XMMWORD [wk(1)], xmm5       ; wk(1)=(24 34 25 35 26 36 27 37)

        movdqa      xmm2, xmm6                  ; transpose coefficients(phase 1)
        punpcklwd   xmm6, xmm7                  ; xmm6=(40 50 41 51 42 52 43 53)
        punpckhwd   xmm2, xmm7                  ; xmm2=(44 54 45 55 46 56 47 57)
        movdqa      xmm5, xmm1                  ; transpose coefficients(phase 1)
        punpcklwd   xmm1, xmm3                  ; xmm1=(60 70 61 71 62 72 63 73)
        punpckhwd   xmm5, xmm3                  ; xmm5=(64 74 65 75 66 76 67 77)

        movdqa      xmm7, xmm6                  ; transpose coefficients(phase 2)
        punpckldq   xmm6, xmm1                  ; xmm6=(40 50 60 70 41 51 61 71)
        punpckhdq   xmm7, xmm1                  ; xmm7=(42 52 62 72 43 53 63 73)
        movdqa      xmm3, xmm2                  ; transpose coefficients(phase 2)
        punpckldq   xmm2, xmm5                  ; xmm2=(44 54 64 74 45 55 65 75)
        punpckhdq   xmm3, xmm5                  ; xmm3=(46 56 66 76 47 57 67 77)

        movdqa      xmm1, XMMWORD [wk(0)]       ; xmm1=(20 30 21 31 22 32 23 33)
        movdqa      xmm5, XMMWORD [wk(1)]       ; xmm5=(24 34 25 35 26 36 27 37)
        movdqa      XMMWORD [wk(0)], xmm7       ; wk(0)=(42 52 62 72 43 53 63 73)
        movdqa      XMMWORD [wk(1)], xmm2       ; wk(1)=(44 54 64 74 45 55 65 75)

        movdqa      xmm7, xmm0                  ; transpose coefficients(phase 2)
        punpckldq   xmm0, xmm1                  ; xmm0=(00 10 20 30 01 11 21 31)
        punpckhdq   xmm7, xmm1                  ; xmm7=(02 12 22 32 03 13 23 33)
        movdqa      xmm2, xmm4                  ; transpose coefficients(phase 2)
        punpckldq   xmm4, xmm5                  ; xmm4=(04 14 24 34 05 15 25 35)
        punpckhdq   xmm2, xmm5                  ; xmm2=(06 16 26 36 07 17 27 37)

        movdqa      xmm1, xmm0                  ; transpose coefficients(phase 3)
        punpcklqdq  xmm0, xmm6                  ; xmm0=(00 10 20 30 40 50 60 70)=data0
        punpckhqdq  xmm1, xmm6                  ; xmm1=(01 11 21 31 41 51 61 71)=data1
        movdqa      xmm5, xmm2                  ; transpose coefficients(phase 3)
        punpcklqdq  xmm2, xmm3                  ; xmm2=(06 16 26 36 46 56 66 76)=data6
        punpckhqdq  xmm5, xmm3                  ; xmm5=(07 17 27 37 47 57 67 77)=data7

        movdqa      xmm6, xmm1
        movdqa      xmm3, xmm0
        psubw       xmm1, xmm2                  ; xmm1=data1-data6=tmp6
        psubw       xmm0, xmm5                  ; xmm0=data0-data7=tmp7
        paddw       xmm6, xmm2                  ; xmm6=data1+data6=tmp1
        paddw       xmm3, xmm5                  ; xmm3=data0+data7=tmp0

        movdqa      xmm2, XMMWORD [wk(0)]       ; xmm2=(42 52 62 72 43 53 63 73)
        movdqa      xmm5, XMMWORD [wk(1)]       ; xmm5=(44 54 64 74 45 55 65 75)
        movdqa      XMMWORD [wk(0)], xmm1       ; wk(0)=tmp6
        movdqa      XMMWORD [wk(1)], xmm0       ; wk(1)=tmp7

        movdqa      xmm1, xmm7                  ; transpose coefficients(phase 3)
        punpcklqdq  xmm7, xmm2                  ; xmm7=(02 12 22 32 42 52 62 72)=data2
        punpckhqdq  xmm1, xmm2                  ; xmm1=(03 13 23 33 43 53 63 73)=data3
        movdqa      xmm0, xmm4                  ; transpose coefficients(phase 3)
        punpcklqdq  xmm4, xmm5                  ; xmm4=(04 14 24 34 44 54 64 74)=data4
        punpckhqdq  xmm0, xmm5                  ; xmm0=(05 15 25 35 45 55 65 75)=data5

        movdqa      xmm2, xmm1
        movdqa      xmm5, xmm7
        paddw       xmm1, xmm4                  ; xmm1=data3+data4=tmp3
        paddw       xmm7, xmm0                  ; xmm7=data2+data5=tmp2
        psubw       xmm2, xmm4                  ; xmm2=data3-data4=tmp4
        psubw       xmm5, xmm0                  ; xmm5=data2-data5=tmp5

        ; -- Even part

        movdqa      xmm4, xmm3
        movdqa      xmm0, xmm6
        psubw       xmm3, xmm1                  ; xmm3=tmp13
        psubw       xmm6, xmm7                  ; xmm6=tmp12
        paddw       xmm4, xmm1                  ; xmm4=tmp10
        paddw       xmm0, xmm7                  ; xmm0=tmp11

        paddw       xmm6, xmm3
        psllw       xmm6, PRE_MULTIPLY_SCALE_BITS
        pmulhw      xmm6, [rel PW_F0707]        ; xmm6=z1

        movdqa      xmm1, xmm4
        movdqa      xmm7, xmm3
        psubw       xmm4, xmm0                  ; xmm4=data4
        psubw       xmm3, xmm6                  ; xmm3=data6
        paddw       xmm1, xmm0                  ; xmm1=data0
        paddw       xmm7, xmm6                  ; xmm7=data2

        movdqa      xmm0, XMMWORD [wk(0)]       ; xmm0=tmp6
        movdqa      xmm6, XMMWORD [wk(1)]       ; xmm6=tmp7
        movdqa      XMMWORD [wk(0)], xmm4       ; wk(0)=data4
        movdqa      XMMWORD [wk(1)], xmm3       ; wk(1)=data6

        ; -- Odd part

        paddw       xmm2, xmm5                  ; xmm2=tmp10
        paddw       xmm5, xmm0                  ; xmm5=tmp11
        paddw       xmm0, xmm6                  ; xmm0=tmp12, xmm6=tmp7

        psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
        psllw       xmm0, PRE_MULTIPLY_SCALE_BITS

        psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
        pmulhw      xmm5, [rel PW_F0707]        ; xmm5=z3

        movdqa      xmm4, xmm2                  ; xmm4=tmp10
        psubw       xmm2, xmm0
        pmulhw      xmm2, [rel PW_F0382]        ; xmm2=z5
        pmulhw      xmm4, [rel PW_F0541]        ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
        pmulhw      xmm0, [rel PW_F1306]        ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
        paddw       xmm4, xmm2                  ; xmm4=z2
        paddw       xmm0, xmm2                  ; xmm0=z4

        movdqa      xmm3, xmm6
        psubw       xmm6, xmm5                  ; xmm6=z13
        paddw       xmm3, xmm5                  ; xmm3=z11

        movdqa      xmm2, xmm6
        movdqa      xmm5, xmm3
        psubw       xmm6, xmm4                  ; xmm6=data3
        psubw       xmm3, xmm0                  ; xmm3=data7
        paddw       xmm2, xmm4                  ; xmm2=data5
        paddw       xmm5, xmm0                  ; xmm5=data1

        ; ---- Pass 2: process columns.

        ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
        ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)

        movdqa      xmm4, xmm1                  ; transpose coefficients(phase 1)
        punpcklwd   xmm1, xmm5                  ; xmm1=(00 01 10 11 20 21 30 31)
        punpckhwd   xmm4, xmm5                  ; xmm4=(40 41 50 51 60 61 70 71)
        movdqa      xmm0, xmm7                  ; transpose coefficients(phase 1)
        punpcklwd   xmm7, xmm6                  ; xmm7=(02 03 12 13 22 23 32 33)
        punpckhwd   xmm0, xmm6                  ; xmm0=(42 43 52 53 62 63 72 73)

        movdqa      xmm5, XMMWORD [wk(0)]       ; xmm5=col4
        movdqa      xmm6, XMMWORD [wk(1)]       ; xmm6=col6

        ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
        ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)

        movdqa      XMMWORD [wk(0)], xmm7       ; wk(0)=(02 03 12 13 22 23 32 33)
        movdqa      XMMWORD [wk(1)], xmm0       ; wk(1)=(42 43 52 53 62 63 72 73)

        movdqa      xmm7, xmm5                  ; transpose coefficients(phase 1)
        punpcklwd   xmm5, xmm2                  ; xmm5=(04 05 14 15 24 25 34 35)
        punpckhwd   xmm7, xmm2                  ; xmm7=(44 45 54 55 64 65 74 75)
        movdqa      xmm0, xmm6                  ; transpose coefficients(phase 1)
        punpcklwd   xmm6, xmm3                  ; xmm6=(06 07 16 17 26 27 36 37)
        punpckhwd   xmm0, xmm3                  ; xmm0=(46 47 56 57 66 67 76 77)

        movdqa      xmm2, xmm5                  ; transpose coefficients(phase 2)
        punpckldq   xmm5, xmm6                  ; xmm5=(04 05 06 07 14 15 16 17)
        punpckhdq   xmm2, xmm6                  ; xmm2=(24 25 26 27 34 35 36 37)
        movdqa      xmm3, xmm7                  ; transpose coefficients(phase 2)
        punpckldq   xmm7, xmm0                  ; xmm7=(44 45 46 47 54 55 56 57)
        punpckhdq   xmm3, xmm0                  ; xmm3=(64 65 66 67 74 75 76 77)

        movdqa      xmm6, XMMWORD [wk(0)]       ; xmm6=(02 03 12 13 22 23 32 33)
        movdqa      xmm0, XMMWORD [wk(1)]       ; xmm0=(42 43 52 53 62 63 72 73)
        movdqa      XMMWORD [wk(0)], xmm2       ; wk(0)=(24 25 26 27 34 35 36 37)
        movdqa      XMMWORD [wk(1)], xmm7       ; wk(1)=(44 45 46 47 54 55 56 57)

        movdqa      xmm2, xmm1                  ; transpose coefficients(phase 2)
        punpckldq   xmm1, xmm6                  ; xmm1=(00 01 02 03 10 11 12 13)
        punpckhdq   xmm2, xmm6                  ; xmm2=(20 21 22 23 30 31 32 33)
        movdqa      xmm7, xmm4                  ; transpose coefficients(phase 2)
        punpckldq   xmm4, xmm0                  ; xmm4=(40 41 42 43 50 51 52 53)
        punpckhdq   xmm7, xmm0                  ; xmm7=(60 61 62 63 70 71 72 73)

        movdqa      xmm6, xmm1                  ; transpose coefficients(phase 3)
        punpcklqdq  xmm1, xmm5                  ; xmm1=(00 01 02 03 04 05 06 07)=data0
        punpckhqdq  xmm6, xmm5                  ; xmm6=(10 11 12 13 14 15 16 17)=data1
        movdqa      xmm0, xmm7                  ; transpose coefficients(phase 3)
        punpcklqdq  xmm7, xmm3                  ; xmm7=(60 61 62 63 64 65 66 67)=data6
        punpckhqdq  xmm0, xmm3                  ; xmm0=(70 71 72 73 74 75 76 77)=data7

        movdqa      xmm5, xmm6
        movdqa      xmm3, xmm1
        psubw       xmm6, xmm7                  ; xmm6=data1-data6=tmp6
        psubw       xmm1, xmm0                  ; xmm1=data0-data7=tmp7
        paddw       xmm5, xmm7                  ; xmm5=data1+data6=tmp1
        paddw       xmm3, xmm0                  ; xmm3=data0+data7=tmp0

        movdqa      xmm7, XMMWORD [wk(0)]       ; xmm7=(24 25 26 27 34 35 36 37)
        movdqa      xmm0, XMMWORD [wk(1)]       ; xmm0=(44 45 46 47 54 55 56 57)
        movdqa      XMMWORD [wk(0)], xmm6       ; wk(0)=tmp6
        movdqa      XMMWORD [wk(1)], xmm1       ; wk(1)=tmp7

        movdqa      xmm6, xmm2                  ; transpose coefficients(phase 3)
        punpcklqdq  xmm2, xmm7                  ; xmm2=(20 21 22 23 24 25 26 27)=data2
        punpckhqdq  xmm6, xmm7                  ; xmm6=(30 31 32 33 34 35 36 37)=data3
        movdqa      xmm1, xmm4                  ; transpose coefficients(phase 3)
        punpcklqdq  xmm4, xmm0                  ; xmm4=(40 41 42 43 44 45 46 47)=data4
        punpckhqdq  xmm1, xmm0                  ; xmm1=(50 51 52 53 54 55 56 57)=data5

        movdqa      xmm7, xmm6
        movdqa      xmm0, xmm2
        paddw       xmm6, xmm4                  ; xmm6=data3+data4=tmp3
        paddw       xmm2, xmm1                  ; xmm2=data2+data5=tmp2
        psubw       xmm7, xmm4                  ; xmm7=data3-data4=tmp4
        psubw       xmm0, xmm1                  ; xmm0=data2-data5=tmp5

        ; -- Even part

        movdqa      xmm4, xmm3
        movdqa      xmm1, xmm5
        psubw       xmm3, xmm6                  ; xmm3=tmp13
        psubw       xmm5, xmm2                  ; xmm5=tmp12
        paddw       xmm4, xmm6                  ; xmm4=tmp10
        paddw       xmm1, xmm2                  ; xmm1=tmp11

        paddw       xmm5, xmm3
        psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
        pmulhw      xmm5, [rel PW_F0707]        ; xmm5=z1

        movdqa      xmm6, xmm4
        movdqa      xmm2, xmm3
        psubw       xmm4, xmm1                  ; xmm4=data4
        psubw       xmm3, xmm5                  ; xmm3=data6
        paddw       xmm6, xmm1                  ; xmm6=data0
        paddw       xmm2, xmm5                  ; xmm2=data2

        movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
        movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
        movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
        movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2

        ; -- Odd part

        movdqa      xmm1, XMMWORD [wk(0)]       ; xmm1=tmp6
        movdqa      xmm5, XMMWORD [wk(1)]       ; xmm5=tmp7

        paddw       xmm7, xmm0                  ; xmm7=tmp10
        paddw       xmm0, xmm1                  ; xmm0=tmp11
        paddw       xmm1, xmm5                  ; xmm1=tmp12, xmm5=tmp7

        psllw       xmm7, PRE_MULTIPLY_SCALE_BITS
        psllw       xmm1, PRE_MULTIPLY_SCALE_BITS

        psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
        pmulhw      xmm0, [rel PW_F0707]        ; xmm0=z3

        movdqa      xmm4, xmm7                  ; xmm4=tmp10
        psubw       xmm7, xmm1
        pmulhw      xmm7, [rel PW_F0382]        ; xmm7=z5
        pmulhw      xmm4, [rel PW_F0541]        ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
        pmulhw      xmm1, [rel PW_F1306]        ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
        paddw       xmm4, xmm7                  ; xmm4=z2
        paddw       xmm1, xmm7                  ; xmm1=z4

        movdqa      xmm3, xmm5
        psubw       xmm5, xmm0                  ; xmm5=z13
        paddw       xmm3, xmm0                  ; xmm3=z11

        movdqa      xmm6, xmm5
        movdqa      xmm2, xmm3
        psubw       xmm5, xmm4                  ; xmm5=data3
        psubw       xmm3, xmm1                  ; xmm3=data7
        paddw       xmm6, xmm4                  ; xmm6=data5
        paddw       xmm2, xmm1                  ; xmm2=data1

        movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
        movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
        movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
        movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2

        uncollect_args
        mov         rsp, rbp                    ; rsp <- aligned rbp
        pop         rsp                         ; rsp <- original rbp
        pop         rbp
        ret
|
|
@ -0,0 +1,618 @@
|
|||
;
|
||||
; jfss2int.asm - accurate integer FDCT (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; and can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; forward DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jfdctint.c; see jfdctint.c for
|
||||
; more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point scaling used by the accurate integer FDCT below.
;   CONST_BITS - number of fractional bits carried by the F_* multipliers.
;   PASS1_BITS - extra fractional bits kept in the output of pass 1.
%define CONST_BITS      13
%define PASS1_BITS      2

; Right-shift amounts applied after the multiplies of pass 1 and pass 2.
%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
%define DESCALE_P2      (CONST_BITS+PASS1_BITS)

; F_x_yyy = FIX(x.yyy...), i.e. the real constant scaled by 2^CONST_BITS.
%if CONST_BITS == 13
; Precomputed values for the default CONST_BITS.
F_0_298         equ      2446           ; FIX(0.298631336)
F_0_390         equ      3196           ; FIX(0.390180644)
F_0_541         equ      4433           ; FIX(0.541196100)
F_0_765         equ      6270           ; FIX(0.765366865)
F_0_899         equ      7373           ; FIX(0.899976223)
F_1_175         equ      9633           ; FIX(1.175875602)
F_1_501         equ     12299           ; FIX(1.501321110)
F_1_847         equ     15137           ; FIX(1.847759065)
F_1_961         equ     16069           ; FIX(1.961570560)
F_2_053         equ     16819           ; FIX(2.053119869)
F_2_562         equ     20995           ; FIX(2.562915447)
F_3_072         equ     25172           ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants, so
; for any other CONST_BITS the values are derived from integer literals
; by a rounding right shift of (30-CONST_BITS) bits.
%define DESCALE(x,n)    (((x)+(1<<((n)-1)))>>(n))
F_0_298         equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
F_0_390         equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
F_0_541         equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
F_0_765         equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
F_0_899         equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
F_1_175         equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
F_1_501         equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
F_1_847         equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
F_1_961         equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
F_2_053         equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
F_2_562         equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
F_3_072         equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fdct_islow_sse2)

; Constant table for jsimd_fdct_islow_sse2.  Every entry is one xmmword.
EXTN(jconst_fdct_islow_sse2):

; Multiplier pairs for PMADDWD.  Each "times 4 dw a, b" row repeats the word
; pair (a, b) four times, so that a register holding interleaved words
; (x, y) yields the dword x*a + y*b in each of its four lanes.
PW_F130_F054:   times 4 dw  (F_0_541+F_0_765),  F_0_541
PW_F054_MF130:  times 4 dw  F_0_541,            (F_0_541-F_1_847)
PW_MF078_F117:  times 4 dw  (F_1_175-F_1_961),  F_1_175
PW_F117_F078:   times 4 dw  F_1_175,            (F_1_175-F_0_390)
PW_MF060_MF089: times 4 dw  (F_0_298-F_0_899), -F_0_899
PW_MF089_F060:  times 4 dw -F_0_899,            (F_1_501-F_0_899)
PW_MF050_MF256: times 4 dw  (F_2_053-F_2_562), -F_2_562
PW_MF256_F050:  times 4 dw -F_2_562,            (F_3_072-F_2_562)

; Rounding terms: half of the unit that the following right shift discards.
PD_DESCALE_P1:  times 4 dd  1 << (DESCALE_P1-1)
PD_DESCALE_P2:  times 4 dd  1 << (DESCALE_P2-1)
PW_DESCALE_P2X: times 8 dw  1 << (PASS1_BITS-1)

        alignz  16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
    BITS 64
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_islow_sse2 (DCTELEM * data)
;
; In:     r10 = DCTELEM * data
;         (read after collect_args; that macro lives in jsimdext.inc and is
;         presumably what moves the ABI argument register into r10 - confirm
;         there).
; Out:    the eight xmmword rows addressed through XMMBLOCK(0..7,0,rdx,...)
;         are overwritten in place with the transformed coefficients.
; Uses:   rax, rdx, xmm0-xmm7, flags, plus whatever collect_args and
;         uncollect_args touch.
; Stack:  WK_NUM xmmwords of 16-byte-aligned scratch addressed as wk(i).
; Notes:  * data is accessed with movdqa, so it must be 16-byte aligned.
;         * All constant-table operands use RIP-relative addressing
;           ([rel ...]).  Plain [label] operands in 64-bit code assemble to
;           32-bit absolute addresses, which cannot be linked into a shared
;           object or position-independent executable.
;

; r10 = DCTELEM * data

%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  6

    align       16
    global      EXTN(jsimd_fdct_islow_sse2)

EXTN(jsimd_fdct_islow_sse2):
    ; Build a 16-byte-aligned frame.  The pre-alignment stack pointer is
    ; parked at [rbp] so that the epilogue can restore it with "pop rsp".
    push        rbp
    mov         rax, rsp                ; rax = rsp, pointing at the saved rbp
    sub         rsp, byte 4             ; step below the saved-rbp slot, then
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
                                        ; (assumes rsp is 8-byte aligned on
                                        ;  entry, so the 8-byte store below
                                        ;  cannot reach the saved rbp)
    mov         [rsp], rax              ; remember the pre-alignment rsp
    mov         rbp, rsp                ; rbp = aligned frame base
    lea         rsp, [wk(0)]            ; reserve wk[0..WK_NUM-1] below rbp
    collect_args

    ; ---- Pass 1: process rows.

    mov         rdx, r10                ; (DCTELEM *)

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]

    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)

    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)

    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]

    ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
    ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)

    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)

    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)

    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)

    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
    movdqa      XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)

    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)

    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7

    movdqa      xmm6, xmm1
    movdqa      xmm3, xmm0
    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0

    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
    movdqa      xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7

    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5

    movdqa      xmm2, xmm1
    movdqa      xmm5, xmm7
    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5

    ; -- Even part

    movdqa      xmm4, xmm3
    movdqa      xmm0, xmm6
    paddw       xmm3, xmm1              ; xmm3=tmp10
    paddw       xmm6, xmm7              ; xmm6=tmp11
    psubw       xmm4, xmm1              ; xmm4=tmp13
    psubw       xmm0, xmm7              ; xmm0=tmp12

    movdqa      xmm1, xmm3
    paddw       xmm3, xmm6              ; xmm3=tmp10+tmp11
    psubw       xmm1, xmm6              ; xmm1=tmp10-tmp11

    psllw       xmm3, PASS1_BITS        ; xmm3=data0
    psllw       xmm1, PASS1_BITS        ; xmm1=data4

    movdqa      XMMWORD [wk(2)], xmm3   ; wk(2)=data0
    movdqa      XMMWORD [wk(3)], xmm1   ; wk(3)=data4

    ; (Original)
    ; z1 = (tmp12 + tmp13) * 0.541196100;
    ; data2 = z1 + tmp13 * 0.765366865;
    ; data6 = z1 + tmp12 * -1.847759065;
    ;
    ; (This implementation)
    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

    movdqa      xmm7, xmm4              ; xmm4=tmp13
    movdqa      xmm6, xmm4
    punpcklwd   xmm7, xmm0              ; xmm0=tmp12
    punpckhwd   xmm6, xmm0
    movdqa      xmm4, xmm7
    movdqa      xmm0, xmm6
    pmaddwd     xmm7, [rel PW_F130_F054]   ; xmm7=data2L
    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=data2H
    pmaddwd     xmm4, [rel PW_F054_MF130]  ; xmm4=data6L
    pmaddwd     xmm0, [rel PW_F054_MF130]  ; xmm0=data6H

    paddd       xmm7, [rel PD_DESCALE_P1]
    paddd       xmm6, [rel PD_DESCALE_P1]
    psrad       xmm7, DESCALE_P1
    psrad       xmm6, DESCALE_P1
    paddd       xmm4, [rel PD_DESCALE_P1]
    paddd       xmm0, [rel PD_DESCALE_P1]
    psrad       xmm4, DESCALE_P1
    psrad       xmm0, DESCALE_P1

    packssdw    xmm7, xmm6              ; xmm7=data2
    packssdw    xmm4, xmm0              ; xmm4=data6

    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=data2
    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=data6

    ; -- Odd part

    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7

    movdqa      xmm6, xmm2              ; xmm2=tmp4
    movdqa      xmm0, xmm5              ; xmm5=tmp5
    paddw       xmm6, xmm3              ; xmm6=z3
    paddw       xmm0, xmm1              ; xmm0=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
    ; z3 += z5; z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

    movdqa      xmm7, xmm6
    movdqa      xmm4, xmm6
    punpcklwd   xmm7, xmm0
    punpckhwd   xmm4, xmm0
    movdqa      xmm6, xmm7
    movdqa      xmm0, xmm4
    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3L
    pmaddwd     xmm4, [rel PW_MF078_F117]  ; xmm4=z3H
    pmaddwd     xmm6, [rel PW_F117_F078]   ; xmm6=z4L
    pmaddwd     xmm0, [rel PW_F117_F078]   ; xmm0=z4H

    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=z3H

    ; (Original)
    ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
    ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
    ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
    ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
    ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
    ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
    ;
    ; (This implementation)
    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    ; data7 = tmp4 + z3; data5 = tmp5 + z4;
    ; data3 = tmp6 + z3; data1 = tmp7 + z4;

    movdqa      xmm7, xmm2
    movdqa      xmm4, xmm2
    punpcklwd   xmm7, xmm1
    punpckhwd   xmm4, xmm1
    movdqa      xmm2, xmm7
    movdqa      xmm1, xmm4
    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp4L
    pmaddwd     xmm4, [rel PW_MF060_MF089]  ; xmm4=tmp4H
    pmaddwd     xmm2, [rel PW_MF089_F060]   ; xmm2=tmp7L
    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp7H

    paddd       xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
    paddd       xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
    paddd       xmm2, xmm6              ; xmm2=data1L
    paddd       xmm1, xmm0              ; xmm1=data1H

    paddd       xmm7, [rel PD_DESCALE_P1]
    paddd       xmm4, [rel PD_DESCALE_P1]
    psrad       xmm7, DESCALE_P1
    psrad       xmm4, DESCALE_P1
    paddd       xmm2, [rel PD_DESCALE_P1]
    paddd       xmm1, [rel PD_DESCALE_P1]
    psrad       xmm2, DESCALE_P1
    psrad       xmm1, DESCALE_P1

    packssdw    xmm7, xmm4              ; xmm7=data7
    packssdw    xmm2, xmm1              ; xmm2=data1

    movdqa      xmm4, xmm5
    movdqa      xmm1, xmm5
    punpcklwd   xmm4, xmm3
    punpckhwd   xmm1, xmm3
    movdqa      xmm5, xmm4
    movdqa      xmm3, xmm1
    pmaddwd     xmm4, [rel PW_MF050_MF256]  ; xmm4=tmp5L
    pmaddwd     xmm1, [rel PW_MF050_MF256]  ; xmm1=tmp5H
    pmaddwd     xmm5, [rel PW_MF256_F050]   ; xmm5=tmp6L
    pmaddwd     xmm3, [rel PW_MF256_F050]   ; xmm3=tmp6H

    paddd       xmm4, xmm6              ; xmm4=data5L
    paddd       xmm1, xmm0              ; xmm1=data5H
    paddd       xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
    paddd       xmm3, XMMWORD [wk(1)]   ; xmm3=data3H

    paddd       xmm4, [rel PD_DESCALE_P1]
    paddd       xmm1, [rel PD_DESCALE_P1]
    psrad       xmm4, DESCALE_P1
    psrad       xmm1, DESCALE_P1
    paddd       xmm5, [rel PD_DESCALE_P1]
    paddd       xmm3, [rel PD_DESCALE_P1]
    psrad       xmm5, DESCALE_P1
    psrad       xmm3, DESCALE_P1

    packssdw    xmm4, xmm1              ; xmm4=data5
    packssdw    xmm5, xmm3              ; xmm5=data3

    ; ---- Pass 2: process columns.

    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=col0
    movdqa      xmm0, XMMWORD [wk(4)]   ; xmm0=col2

    ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
    ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)

    movdqa      xmm1, xmm6              ; transpose coefficients(phase 1)
    punpcklwd   xmm6, xmm2              ; xmm6=(00 01 10 11 20 21 30 31)
    punpckhwd   xmm1, xmm2              ; xmm1=(40 41 50 51 60 61 70 71)
    movdqa      xmm3, xmm0              ; transpose coefficients(phase 1)
    punpcklwd   xmm0, xmm5              ; xmm0=(02 03 12 13 22 23 32 33)
    punpckhwd   xmm3, xmm5              ; xmm3=(42 43 52 53 62 63 72 73)

    movdqa      xmm2, XMMWORD [wk(3)]   ; xmm2=col4
    movdqa      xmm5, XMMWORD [wk(5)]   ; xmm5=col6

    ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
    ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)

    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)

    movdqa      xmm0, xmm2              ; transpose coefficients(phase 1)
    punpcklwd   xmm2, xmm4              ; xmm2=(04 05 14 15 24 25 34 35)
    punpckhwd   xmm0, xmm4              ; xmm0=(44 45 54 55 64 65 74 75)
    movdqa      xmm3, xmm5              ; transpose coefficients(phase 1)
    punpcklwd   xmm5, xmm7              ; xmm5=(06 07 16 17 26 27 36 37)
    punpckhwd   xmm3, xmm7              ; xmm3=(46 47 56 57 66 67 76 77)

    movdqa      xmm4, xmm2              ; transpose coefficients(phase 2)
    punpckldq   xmm2, xmm5              ; xmm2=(04 05 06 07 14 15 16 17)
    punpckhdq   xmm4, xmm5              ; xmm4=(24 25 26 27 34 35 36 37)
    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
    punpckldq   xmm0, xmm3              ; xmm0=(44 45 46 47 54 55 56 57)
    punpckhdq   xmm7, xmm3              ; xmm7=(64 65 66 67 74 75 76 77)

    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
    movdqa      XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)

    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
    punpckldq   xmm6, xmm5              ; xmm6=(00 01 02 03 10 11 12 13)
    punpckhdq   xmm4, xmm5              ; xmm4=(20 21 22 23 30 31 32 33)
    movdqa      xmm0, xmm1              ; transpose coefficients(phase 2)
    punpckldq   xmm1, xmm3              ; xmm1=(40 41 42 43 50 51 52 53)
    punpckhdq   xmm0, xmm3              ; xmm0=(60 61 62 63 70 71 72 73)

    movdqa      xmm5, xmm6              ; transpose coefficients(phase 3)
    punpcklqdq  xmm6, xmm2              ; xmm6=(00 01 02 03 04 05 06 07)=data0
    punpckhqdq  xmm5, xmm2              ; xmm5=(10 11 12 13 14 15 16 17)=data1
    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
    punpcklqdq  xmm0, xmm7              ; xmm0=(60 61 62 63 64 65 66 67)=data6
    punpckhqdq  xmm3, xmm7              ; xmm3=(70 71 72 73 74 75 76 77)=data7

    movdqa      xmm2, xmm5
    movdqa      xmm7, xmm6
    psubw       xmm5, xmm0              ; xmm5=data1-data6=tmp6
    psubw       xmm6, xmm3              ; xmm6=data0-data7=tmp7
    paddw       xmm2, xmm0              ; xmm2=data1+data6=tmp1
    paddw       xmm7, xmm3              ; xmm7=data0+data7=tmp0

    movdqa      xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
    movdqa      xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7

    movdqa      xmm5, xmm4              ; transpose coefficients(phase 3)
    punpcklqdq  xmm4, xmm0              ; xmm4=(20 21 22 23 24 25 26 27)=data2
    punpckhqdq  xmm5, xmm0              ; xmm5=(30 31 32 33 34 35 36 37)=data3
    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
    punpcklqdq  xmm1, xmm3              ; xmm1=(40 41 42 43 44 45 46 47)=data4
    punpckhqdq  xmm6, xmm3              ; xmm6=(50 51 52 53 54 55 56 57)=data5

    movdqa      xmm0, xmm5
    movdqa      xmm3, xmm4
    paddw       xmm5, xmm1              ; xmm5=data3+data4=tmp3
    paddw       xmm4, xmm6              ; xmm4=data2+data5=tmp2
    psubw       xmm0, xmm1              ; xmm0=data3-data4=tmp4
    psubw       xmm3, xmm6              ; xmm3=data2-data5=tmp5

    ; -- Even part

    movdqa      xmm1, xmm7
    movdqa      xmm6, xmm2
    paddw       xmm7, xmm5              ; xmm7=tmp10
    paddw       xmm2, xmm4              ; xmm2=tmp11
    psubw       xmm1, xmm5              ; xmm1=tmp13
    psubw       xmm6, xmm4              ; xmm6=tmp12

    movdqa      xmm5, xmm7
    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11

    ; Round, then shift out the PASS1_BITS of scaling added in pass 1.
    paddw       xmm7, [rel PW_DESCALE_P2X]
    paddw       xmm5, [rel PW_DESCALE_P2X]
    psraw       xmm7, PASS1_BITS        ; xmm7=data0
    psraw       xmm5, PASS1_BITS        ; xmm5=data4

    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5

    ; (Original)
    ; z1 = (tmp12 + tmp13) * 0.541196100;
    ; data2 = z1 + tmp13 * 0.765366865;
    ; data6 = z1 + tmp12 * -1.847759065;
    ;
    ; (This implementation)
    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

    movdqa      xmm4, xmm1              ; xmm1=tmp13
    movdqa      xmm2, xmm1
    punpcklwd   xmm4, xmm6              ; xmm6=tmp12
    punpckhwd   xmm2, xmm6
    movdqa      xmm1, xmm4
    movdqa      xmm6, xmm2
    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=data2L
    pmaddwd     xmm2, [rel PW_F130_F054]   ; xmm2=data2H
    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=data6L
    pmaddwd     xmm6, [rel PW_F054_MF130]  ; xmm6=data6H

    paddd       xmm4, [rel PD_DESCALE_P2]
    paddd       xmm2, [rel PD_DESCALE_P2]
    psrad       xmm4, DESCALE_P2
    psrad       xmm2, DESCALE_P2
    paddd       xmm1, [rel PD_DESCALE_P2]
    paddd       xmm6, [rel PD_DESCALE_P2]
    psrad       xmm1, DESCALE_P2
    psrad       xmm6, DESCALE_P2

    packssdw    xmm4, xmm2              ; xmm4=data2
    packssdw    xmm1, xmm6              ; xmm1=data6

    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1

    ; -- Odd part

    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7

    movdqa      xmm2, xmm0              ; xmm0=tmp4
    movdqa      xmm6, xmm3              ; xmm3=tmp5
    paddw       xmm2, xmm7              ; xmm2=z3
    paddw       xmm6, xmm5              ; xmm6=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
    ; z3 += z5; z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

    movdqa      xmm4, xmm2
    movdqa      xmm1, xmm2
    punpcklwd   xmm4, xmm6
    punpckhwd   xmm1, xmm6
    movdqa      xmm2, xmm4
    movdqa      xmm6, xmm1
    pmaddwd     xmm4, [rel PW_MF078_F117]  ; xmm4=z3L
    pmaddwd     xmm1, [rel PW_MF078_F117]  ; xmm1=z3H
    pmaddwd     xmm2, [rel PW_F117_F078]   ; xmm2=z4L
    pmaddwd     xmm6, [rel PW_F117_F078]   ; xmm6=z4H

    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=z3H

    ; (Original)
    ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
    ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
    ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
    ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
    ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
    ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
    ;
    ; (This implementation)
    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    ; data7 = tmp4 + z3; data5 = tmp5 + z4;
    ; data3 = tmp6 + z3; data1 = tmp7 + z4;

    movdqa      xmm4, xmm0
    movdqa      xmm1, xmm0
    punpcklwd   xmm4, xmm5
    punpckhwd   xmm1, xmm5
    movdqa      xmm0, xmm4
    movdqa      xmm5, xmm1
    pmaddwd     xmm4, [rel PW_MF060_MF089]  ; xmm4=tmp4L
    pmaddwd     xmm1, [rel PW_MF060_MF089]  ; xmm1=tmp4H
    pmaddwd     xmm0, [rel PW_MF089_F060]   ; xmm0=tmp7L
    pmaddwd     xmm5, [rel PW_MF089_F060]   ; xmm5=tmp7H

    paddd       xmm4, XMMWORD [wk(0)]   ; xmm4=data7L
    paddd       xmm1, XMMWORD [wk(1)]   ; xmm1=data7H
    paddd       xmm0, xmm2              ; xmm0=data1L
    paddd       xmm5, xmm6              ; xmm5=data1H

    paddd       xmm4, [rel PD_DESCALE_P2]
    paddd       xmm1, [rel PD_DESCALE_P2]
    psrad       xmm4, DESCALE_P2
    psrad       xmm1, DESCALE_P2
    paddd       xmm0, [rel PD_DESCALE_P2]
    paddd       xmm5, [rel PD_DESCALE_P2]
    psrad       xmm0, DESCALE_P2
    psrad       xmm5, DESCALE_P2

    packssdw    xmm4, xmm1              ; xmm4=data7
    packssdw    xmm0, xmm5              ; xmm0=data1

    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0

    movdqa      xmm1, xmm3
    movdqa      xmm5, xmm3
    punpcklwd   xmm1, xmm7
    punpckhwd   xmm5, xmm7
    movdqa      xmm3, xmm1
    movdqa      xmm7, xmm5
    pmaddwd     xmm1, [rel PW_MF050_MF256]  ; xmm1=tmp5L
    pmaddwd     xmm5, [rel PW_MF050_MF256]  ; xmm5=tmp5H
    pmaddwd     xmm3, [rel PW_MF256_F050]   ; xmm3=tmp6L
    pmaddwd     xmm7, [rel PW_MF256_F050]   ; xmm7=tmp6H

    paddd       xmm1, xmm2              ; xmm1=data5L
    paddd       xmm5, xmm6              ; xmm5=data5H
    paddd       xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
    paddd       xmm7, XMMWORD [wk(1)]   ; xmm7=data3H

    paddd       xmm1, [rel PD_DESCALE_P2]
    paddd       xmm5, [rel PD_DESCALE_P2]
    psrad       xmm1, DESCALE_P2
    psrad       xmm5, DESCALE_P2
    paddd       xmm3, [rel PD_DESCALE_P2]
    paddd       xmm7, [rel PD_DESCALE_P2]
    psrad       xmm3, DESCALE_P2
    psrad       xmm7, DESCALE_P2

    packssdw    xmm1, xmm5              ; xmm1=data5
    packssdw    xmm3, xmm7              ; xmm3=data3

    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3

    uncollect_args
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- pre-alignment rsp (at saved rbp)
    pop         rbp                     ; restore the caller's rbp
    ret
|
|
@ -0,0 +1,354 @@
|
|||
;
|
||||
; jfsseflt.asm - floating-point FDCT (64-bit SSE)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; and can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a floating-point implementation of the forward DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; unpcklps2 dst, src
;   Combine the low halves of two packed-single registers:
;   dst=(0 1 2 3) / src=(4 5 6 7)  =>  dst=(0 1 4 5)
;   The SHUFPS selector picks elements 0 and 1 of dst for the low half of
;   the result and elements 0 and 1 of src for the high half.
%macro unpcklps2 2
        shufps  %1, %2, 01000100b
%endmacro
|
||||
|
||||
; unpckhps2 dst, src
;   Combine the high halves of two packed-single registers:
;   dst=(0 1 2 3) / src=(4 5 6 7)  =>  dst=(2 3 6 7)
;   The SHUFPS selector picks elements 2 and 3 of dst for the low half of
;   the result and elements 2 and 3 of src for the high half.
%macro unpckhps2 2
        shufps  %1, %2, 11101110b
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fdct_float_sse)

; Constant table for jsimd_fdct_float_sse.  Each entry is one xmmword
; holding the same single-precision multiplier in all four lanes.
EXTN(jconst_fdct_float_sse):

PD_0_382:       times 4 dd  0.382683432365089771728460
PD_0_707:       times 4 dd  0.707106781186547524400844
PD_0_541:       times 4 dd  0.541196100146196984399723
PD_1_306:       times 4 dd  1.306562964876376527856643

        alignz  16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform the forward DCT on one block of samples.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_fdct_float_sse (FAST_FLOAT * data)
|
||||
;
|
||||
|
||||
; r10 = FAST_FLOAT * data
|
||||
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_fdct_float_sse)
|
||||
|
||||
EXTN(jsimd_fdct_float_sse):
|
||||
push rbp
|
||||
mov rax,rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp],rax
|
||||
mov rbp,rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
mov rdx, r10 ; (FAST_FLOAT *)
|
||||
mov rcx, DCTSIZE/4
|
||||
.rowloop:
|
||||
|
||||
movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
|
||||
; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
|
||||
|
||||
movaps xmm4,xmm0 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
|
||||
unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
|
||||
movaps xmm5,xmm2 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
|
||||
unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
|
||||
|
||||
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
|
||||
; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
|
||||
|
||||
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
|
||||
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
|
||||
|
||||
movaps xmm4,xmm6 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
|
||||
unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
|
||||
movaps xmm2,xmm1 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
|
||||
unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
|
||||
|
||||
movaps xmm7,xmm6 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
|
||||
unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
|
||||
movaps xmm3,xmm2 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
|
||||
unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
|
||||
|
||||
movaps xmm0,xmm7
|
||||
movaps xmm5,xmm6
|
||||
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
|
||||
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
|
||||
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
|
||||
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
|
||||
|
||||
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
|
||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
|
||||
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
||||
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||
|
||||
movaps xmm7,xmm4 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
|
||||
unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
|
||||
movaps xmm6,xmm1 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
|
||||
unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
|
||||
|
||||
movaps xmm2,xmm7
|
||||
movaps xmm3,xmm4
|
||||
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
|
||||
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
|
||||
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
|
||||
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movaps xmm1,xmm5
|
||||
movaps xmm6,xmm0
|
||||
subps xmm5,xmm7 ; xmm5=tmp13
|
||||
subps xmm0,xmm4 ; xmm0=tmp12
|
||||
addps xmm1,xmm7 ; xmm1=tmp10
|
||||
addps xmm6,xmm4 ; xmm6=tmp11
|
||||
|
||||
addps xmm0,xmm5
|
||||
mulps xmm0,[PD_0_707] ; xmm0=z1
|
||||
|
||||
movaps xmm7,xmm1
|
||||
movaps xmm4,xmm5
|
||||
subps xmm1,xmm6 ; xmm1=data4
|
||||
subps xmm5,xmm0 ; xmm5=data6
|
||||
addps xmm7,xmm6 ; xmm7=data0
|
||||
addps xmm4,xmm0 ; xmm4=data2
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
|
||||
movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
||||
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
||||
|
||||
addps xmm2,xmm3 ; xmm2=tmp10
|
||||
addps xmm3,xmm6 ; xmm3=tmp11
|
||||
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
|
||||
|
||||
mulps xmm3,[PD_0_707] ; xmm3=z3
|
||||
|
||||
movaps xmm1,xmm2 ; xmm1=tmp10
|
||||
subps xmm2,xmm6
|
||||
mulps xmm2,[PD_0_382] ; xmm2=z5
|
||||
mulps xmm1,[PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
||||
mulps xmm6,[PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
||||
addps xmm1,xmm2 ; xmm1=z2
|
||||
addps xmm6,xmm2 ; xmm6=z4
|
||||
|
||||
movaps xmm5,xmm0
|
||||
subps xmm0,xmm3 ; xmm0=z13
|
||||
addps xmm5,xmm3 ; xmm5=z11
|
||||
|
||||
movaps xmm7,xmm0
|
||||
movaps xmm4,xmm5
|
||||
subps xmm0,xmm1 ; xmm0=data3
|
||||
subps xmm5,xmm6 ; xmm5=data7
|
||||
addps xmm7,xmm1 ; xmm7=data5
|
||||
addps xmm4,xmm6 ; xmm4=data1
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
|
||||
movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
|
||||
|
||||
add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
|
||||
dec rcx
|
||||
jnz near .rowloop
|
||||
|
||||
; ---- Pass 2: process columns.
|
||||
|
||||
mov rdx, r10 ; (FAST_FLOAT *)
|
||||
mov rcx, DCTSIZE/4
|
||||
.columnloop:
|
||||
|
||||
movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
|
||||
; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
|
||||
|
||||
movaps xmm4,xmm0 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
|
||||
unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
|
||||
movaps xmm5,xmm2 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
|
||||
unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
|
||||
|
||||
movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
|
||||
; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
|
||||
|
||||
movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
|
||||
movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
|
||||
|
||||
movaps xmm4,xmm6 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
|
||||
unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
|
||||
movaps xmm2,xmm1 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
|
||||
unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
|
||||
|
||||
movaps xmm7,xmm6 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
|
||||
unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
|
||||
movaps xmm3,xmm2 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
|
||||
unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
|
||||
|
||||
movaps xmm0,xmm7
|
||||
movaps xmm5,xmm6
|
||||
subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
|
||||
subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
|
||||
addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
|
||||
addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
|
||||
|
||||
movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
|
||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
|
||||
movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
|
||||
movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
|
||||
|
||||
movaps xmm7,xmm4 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
|
||||
unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
|
||||
movaps xmm6,xmm1 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
|
||||
unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
|
||||
|
||||
movaps xmm2,xmm7
|
||||
movaps xmm3,xmm4
|
||||
addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
|
||||
addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
|
||||
subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
|
||||
subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
|
||||
|
||||
; -- Even part
|
||||
|
||||
movaps xmm1,xmm5
|
||||
movaps xmm6,xmm0
|
||||
subps xmm5,xmm7 ; xmm5=tmp13
|
||||
subps xmm0,xmm4 ; xmm0=tmp12
|
||||
addps xmm1,xmm7 ; xmm1=tmp10
|
||||
addps xmm6,xmm4 ; xmm6=tmp11
|
||||
|
||||
addps xmm0,xmm5
|
||||
mulps xmm0,[PD_0_707] ; xmm0=z1
|
||||
|
||||
movaps xmm7,xmm1
|
||||
movaps xmm4,xmm5
|
||||
subps xmm1,xmm6 ; xmm1=data4
|
||||
subps xmm5,xmm0 ; xmm5=data6
|
||||
addps xmm7,xmm6 ; xmm7=data0
|
||||
addps xmm4,xmm0 ; xmm4=data2
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
|
||||
movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
|
||||
movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
|
||||
|
||||
addps xmm2,xmm3 ; xmm2=tmp10
|
||||
addps xmm3,xmm6 ; xmm3=tmp11
|
||||
addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
|
||||
|
||||
mulps xmm3,[PD_0_707] ; xmm3=z3
|
||||
|
||||
movaps xmm1,xmm2 ; xmm1=tmp10
|
||||
subps xmm2,xmm6
|
||||
mulps xmm2,[PD_0_382] ; xmm2=z5
|
||||
mulps xmm1,[PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
|
||||
mulps xmm6,[PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
|
||||
addps xmm1,xmm2 ; xmm1=z2
|
||||
addps xmm6,xmm2 ; xmm6=z4
|
||||
|
||||
movaps xmm5,xmm0
|
||||
subps xmm0,xmm3 ; xmm0=z13
|
||||
addps xmm5,xmm3 ; xmm5=z11
|
||||
|
||||
movaps xmm7,xmm0
|
||||
movaps xmm4,xmm5
|
||||
subps xmm0,xmm1 ; xmm0=data3
|
||||
subps xmm5,xmm6 ; xmm5=data7
|
||||
addps xmm7,xmm1 ; xmm7=data5
|
||||
addps xmm4,xmm6 ; xmm4=data1
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
|
||||
movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
|
||||
|
||||
add rdx, byte 4*SIZEOF_FAST_FLOAT
|
||||
dec rcx
|
||||
jnz near .columnloop
|
||||
|
||||
uncollect_args
|
||||
mov rsp,rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
|
@ -0,0 +1,479 @@
|
|||
;
|
||||
; jiss2flt.asm - floating-point IDCT (64-bit SSE & SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a floating-point implementation of the inverse DCT
|
||||
; (Discrete Cosine Transform). The following code is based directly on
|
||||
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
|
||||
shufps %1,%2,0x44 ; imm 0x44 = 01 00 01 00b: lanes 0,1 taken from %1, lanes 2,3 = lanes 0,1 of %2
|
||||
%endmacro
|
||||
|
||||
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
|
||||
shufps %1,%2,0xEE ; imm 0xEE = 11 10 11 10b: lanes 0,1 = lanes 2,3 of %1, lanes 2,3 = lanes 2,3 of %2
|
||||
%endmacro
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_idct_float_sse2)
|
||||
|
||||
EXTN(jconst_idct_float_sse2):
|
||||
|
||||
PD_1_414 times 4 dd 1.414213562373095048801689 ; sqrt(2) = 2*cos(4*pi/16), replicated in all 4 float lanes
|
||||
PD_1_847 times 4 dd 1.847759065022573512256366 ; 2*cos(2*pi/16)
|
||||
PD_1_082 times 4 dd 1.082392200292393968799446 ; 2*(cos(2*pi/16) - cos(6*pi/16))
|
||||
PD_M2_613 times 4 dd -2.613125929752753055713286 ; -2*(cos(2*pi/16) + cos(6*pi/16))
|
||||
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3); added to the pass-2 results to form roundint(data/8) in the low mantissa bits
|
||||
PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; per-byte bias added with paddb to the packed samples in pass 2
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; r10 = void * dct_table
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13 = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp+0
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
|
||||
; FAST_FLOAT workspace[DCTSIZE2]
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_idct_float_sse2)
|
||||
|
||||
EXTN(jsimd_idct_float_sse2):
|
||||
push rbp ; save caller's rbp
|
||||
mov rax,rsp ; rax = rsp after the push (points at the saved rbp)
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp],rax ; save the full 64-bit pre-alignment rsp; the epilogue's "pop rsp" reads a qword, so a 32-bit store (eax) would leave the upper half undefined
|
||||
mov rbp,rsp ; rbp = aligned rbp; [original_rbp] now holds the saved rsp
|
||||
lea rsp, [workspace] ; move rsp below wk[WK_NUM] and workspace[DCTSIZE2]
|
||||
push rbx ; rbx is used as a row pointer in pass 2; preserve it
|
||||
collect_args ; macro defined outside this view; presumably loads the arguments into r10-r13 as listed in the header comment - TODO confirm
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
|
||||
mov rdx, r10 ; quantptr
|
||||
mov rsi, r11 ; inptr
|
||||
lea rdi, [workspace] ; FAST_FLOAT * wsptr
|
||||
mov rcx, DCTSIZE/4 ; ctr
|
||||
.columnloop:
|
||||
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
|
||||
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1,xmm2
|
||||
por xmm3,xmm4
|
||||
por xmm5,xmm6
|
||||
por xmm1,xmm3
|
||||
por xmm5,xmm7
|
||||
por xmm1,xmm5
|
||||
packsswb xmm1,xmm1
|
||||
movd eax,xmm1
|
||||
test rax,rax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
||||
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
|
||||
|
||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm1,xmm0
|
||||
movaps xmm2,xmm0
|
||||
movaps xmm3,xmm0
|
||||
|
||||
shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
|
||||
shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
|
||||
shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
|
||||
shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
|
||||
movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
|
||||
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
jmp near .nextcolumn
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||
punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
|
||||
psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
|
||||
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
|
||||
cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
|
||||
cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
|
||||
|
||||
punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
|
||||
punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
|
||||
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
|
||||
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
|
||||
cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
|
||||
cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
|
||||
|
||||
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm4,xmm0
|
||||
movaps xmm5,xmm1
|
||||
subps xmm0,xmm2 ; xmm0=tmp11
|
||||
subps xmm1,xmm3
|
||||
addps xmm4,xmm2 ; xmm4=tmp10
|
||||
addps xmm5,xmm3 ; xmm5=tmp13
|
||||
|
||||
mulps xmm1,[PD_1_414]
|
||||
subps xmm1,xmm5 ; xmm1=tmp12
|
||||
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm0
|
||||
subps xmm4,xmm5 ; xmm4=tmp3
|
||||
subps xmm0,xmm1 ; xmm0=tmp2
|
||||
addps xmm6,xmm5 ; xmm6=tmp0
|
||||
addps xmm7,xmm1 ; xmm7=tmp1
|
||||
|
||||
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
|
||||
punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
|
||||
punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
|
||||
psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
|
||||
psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
|
||||
cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
|
||||
cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
|
||||
|
||||
punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
|
||||
punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
|
||||
psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
|
||||
psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
|
||||
cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
|
||||
cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
|
||||
|
||||
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
|
||||
|
||||
movaps xmm4,xmm2
|
||||
movaps xmm0,xmm5
|
||||
addps xmm2,xmm1 ; xmm2=z11
|
||||
addps xmm5,xmm3 ; xmm5=z13
|
||||
subps xmm4,xmm1 ; xmm4=z12
|
||||
subps xmm0,xmm3 ; xmm0=z10
|
||||
|
||||
movaps xmm1,xmm2
|
||||
subps xmm2,xmm5
|
||||
addps xmm1,xmm5 ; xmm1=tmp7
|
||||
|
||||
mulps xmm2,[PD_1_414] ; xmm2=tmp11
|
||||
|
||||
movaps xmm3,xmm0
|
||||
addps xmm0,xmm4
|
||||
mulps xmm0,[PD_1_847] ; xmm0=z5
|
||||
mulps xmm3,[PD_M2_613] ; xmm3=(z10 * -2.613125930)
|
||||
mulps xmm4,[PD_1_082] ; xmm4=(z12 * 1.082392200)
|
||||
addps xmm3,xmm0 ; xmm3=tmp12
|
||||
subps xmm4,xmm0 ; xmm4=tmp10
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
subps xmm3,xmm1 ; xmm3=tmp6
|
||||
movaps xmm5,xmm6
|
||||
movaps xmm0,xmm7
|
||||
addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
|
||||
addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
|
||||
subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
|
||||
subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
|
||||
subps xmm2,xmm3 ; xmm2=tmp5
|
||||
|
||||
movaps xmm1,xmm6 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
|
||||
unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
|
||||
movaps xmm3,xmm0 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
|
||||
unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
|
||||
|
||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
||||
movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
|
||||
|
||||
movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
|
||||
movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
|
||||
|
||||
addps xmm4,xmm2 ; xmm4=tmp4
|
||||
movaps xmm0,xmm7
|
||||
movaps xmm3,xmm5
|
||||
addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
|
||||
addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
|
||||
subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
|
||||
subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
|
||||
|
||||
movaps xmm2,xmm7 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
|
||||
unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
|
||||
movaps xmm4,xmm5 ; transpose coefficients(phase 1)
|
||||
unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
|
||||
unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
|
||||
|
||||
movaps xmm3,xmm6 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
|
||||
unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
|
||||
movaps xmm0,xmm1 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
|
||||
unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
|
||||
|
||||
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
|
||||
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
|
||||
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
|
||||
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
|
||||
|
||||
movaps xmm6,xmm5 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
|
||||
unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
|
||||
movaps xmm3,xmm4 ; transpose coefficients(phase 2)
|
||||
unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
|
||||
unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
|
||||
|
||||
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
|
||||
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
|
||||
movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
|
||||
movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
|
||||
|
||||
.nextcolumn:
|
||||
add rsi, byte 4*SIZEOF_JCOEF ; coef_block
|
||||
add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
|
||||
add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
|
||||
dec rcx ; ctr
|
||||
jnz near .columnloop
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
lea rsi, [workspace] ; FAST_FLOAT * wsptr
|
||||
mov rdi, r12 ; (JSAMPROW *) output_buf
|
||||
mov eax, r13d ; rax = output_col, zero-extended: it is a 32-bit value used as a 64-bit index in the stores below, so the upper half of r13 must not be carried over
|
||||
mov rcx, DCTSIZE/4 ; ctr
|
||||
.rowloop:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
movaps xmm4,xmm0
|
||||
movaps xmm5,xmm1
|
||||
subps xmm0,xmm2 ; xmm0=tmp11
|
||||
subps xmm1,xmm3
|
||||
addps xmm4,xmm2 ; xmm4=tmp10
|
||||
addps xmm5,xmm3 ; xmm5=tmp13
|
||||
|
||||
mulps xmm1,[PD_1_414]
|
||||
subps xmm1,xmm5 ; xmm1=tmp12
|
||||
|
||||
movaps xmm6,xmm4
|
||||
movaps xmm7,xmm0
|
||||
subps xmm4,xmm5 ; xmm4=tmp3
|
||||
subps xmm0,xmm1 ; xmm0=tmp2
|
||||
addps xmm6,xmm5 ; xmm6=tmp0
|
||||
addps xmm7,xmm1 ; xmm7=tmp1
|
||||
|
||||
movaps XMMWORD [wk(1)], xmm4 ; tmp3
|
||||
movaps XMMWORD [wk(0)], xmm0 ; tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
|
||||
|
||||
movaps xmm4,xmm2
|
||||
movaps xmm0,xmm5
|
||||
addps xmm2,xmm1 ; xmm2=z11
|
||||
addps xmm5,xmm3 ; xmm5=z13
|
||||
subps xmm4,xmm1 ; xmm4=z12
|
||||
subps xmm0,xmm3 ; xmm0=z10
|
||||
|
||||
movaps xmm1,xmm2
|
||||
subps xmm2,xmm5
|
||||
addps xmm1,xmm5 ; xmm1=tmp7
|
||||
|
||||
mulps xmm2,[PD_1_414] ; xmm2=tmp11
|
||||
|
||||
movaps xmm3,xmm0
|
||||
addps xmm0,xmm4
|
||||
mulps xmm0,[PD_1_847] ; xmm0=z5
|
||||
mulps xmm3,[PD_M2_613] ; xmm3=(z10 * -2.613125930)
|
||||
mulps xmm4,[PD_1_082] ; xmm4=(z12 * 1.082392200)
|
||||
addps xmm3,xmm0 ; xmm3=tmp12
|
||||
subps xmm4,xmm0 ; xmm4=tmp10
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
subps xmm3,xmm1 ; xmm3=tmp6
|
||||
movaps xmm5,xmm6
|
||||
movaps xmm0,xmm7
|
||||
addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
|
||||
addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
|
||||
subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
|
||||
subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
|
||||
subps xmm2,xmm3 ; xmm2=tmp5
|
||||
|
||||
movaps xmm1,[PD_RNDINT_MAGIC] ; xmm1=[PD_RNDINT_MAGIC]
|
||||
pcmpeqd xmm3,xmm3
|
||||
psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
||||
|
||||
addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
|
||||
addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
|
||||
addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
|
||||
addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
|
||||
|
||||
pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
|
||||
pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
|
||||
pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
|
||||
pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
|
||||
por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
|
||||
por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
|
||||
|
||||
movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
|
||||
movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
|
||||
|
||||
addps xmm4,xmm2 ; xmm4=tmp4
|
||||
movaps xmm7,xmm1
|
||||
movaps xmm5,xmm3
|
||||
addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
|
||||
addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
|
||||
subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
|
||||
subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
|
||||
|
||||
movaps xmm2,[PD_RNDINT_MAGIC] ; xmm2=[PD_RNDINT_MAGIC]
|
||||
pcmpeqd xmm4,xmm4
|
||||
psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
|
||||
|
||||
addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
|
||||
addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
|
||||
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
|
||||
addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
|
||||
|
||||
pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
|
||||
pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
|
||||
pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
|
||||
pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
|
||||
por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
|
||||
por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
|
||||
|
||||
movdqa xmm2,[PB_CENTERJSAMP] ; xmm2=[PB_CENTERJSAMP]
|
||||
|
||||
packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
|
||||
packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
|
||||
paddb xmm6,xmm2
|
||||
paddb xmm1,xmm2
|
||||
|
||||
movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||
punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||
|
||||
movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||
punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||
|
||||
pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||
pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||
|
||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||
mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
|
||||
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
|
||||
mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
|
||||
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
|
||||
add rdi, byte 4*SIZEOF_JSAMPROW
|
||||
dec rcx ; ctr
|
||||
jnz near .rowloop
|
||||
|
||||
uncollect_args ; macro defined outside this view; presumably undoes collect_args - TODO confirm
|
||||
pop rbx ; restore rbx saved by "push rbx" in the prologue
|
||||
mov rsp,rbp ; rsp <- aligned rbp (discards wk[] and workspace[])
|
||||
pop rsp ; rsp <- qword at [original_rbp], i.e. the pre-alignment rsp stored by the prologue (points at the saved rbp)
|
||||
pop rbp ; restore caller's rbp
|
||||
ret
|
|
@ -0,0 +1,488 @@
|
|||
;
|
||||
; jiss2fst.asm - fast integer IDCT (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a fast, not so accurate integer implementation of
|
||||
; the inverse DCT (Discrete Cosine Transform). The following code is
|
||||
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
|
||||
; for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define CONST_BITS 8 ; 14 is also OK.
|
||||
%define PASS1_BITS 2
|
||||
|
||||
%if IFAST_SCALE_BITS != PASS1_BITS
|
||||
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
|
||||
%endif
|
||||
|
||||
%if CONST_BITS == 8
|
||||
F_1_082 equ 277 ; FIX(1.082392200)
|
||||
F_1_414 equ 362 ; FIX(1.414213562)
|
||||
F_1_847 equ 473 ; FIX(1.847759065)
|
||||
F_2_613 equ 669 ; FIX(2.613125930)
|
||||
F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||
F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
|
||||
F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
|
||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
||||
F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
|
||||
F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
|
||||
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
|
||||
|
||||
%define PRE_MULTIPLY_SCALE_BITS 2
|
||||
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_idct_ifast_sse2)
|
||||
|
||||
EXTN(jconst_idct_ifast_sse2):
|
||||
|
||||
PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
|
||||
PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
|
||||
PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
|
||||
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
|
||||
PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; r10 = void * dct_table
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13 = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp+0
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_idct_ifast_sse2)
|
||||
|
||||
EXTN(jsimd_idct_ifast_sse2):
|
||||
push rbp
|
||||
mov rax,rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp],eax
|
||||
mov rbp,rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
mov rdx, r10 ; quantptr
|
||||
mov rsi, r11 ; inptr
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
|
||||
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1,xmm0
|
||||
packsswb xmm1,xmm1
|
||||
packsswb xmm1,xmm1
|
||||
movd eax,xmm1
|
||||
test rax,rax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
|
||||
punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
|
||||
punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
|
||||
|
||||
pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
|
||||
pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
|
||||
pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
|
||||
pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
|
||||
pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
|
||||
pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
|
||||
pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
|
||||
pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
|
||||
jmp near .column_end
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
|
||||
movdqa xmm4,xmm0
|
||||
movdqa xmm5,xmm1
|
||||
psubw xmm0,xmm2 ; xmm0=tmp11
|
||||
psubw xmm1,xmm3
|
||||
paddw xmm4,xmm2 ; xmm4=tmp10
|
||||
paddw xmm5,xmm3 ; xmm5=tmp13
|
||||
|
||||
psllw xmm1,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm1,[PW_F1414]
|
||||
psubw xmm1,xmm5 ; xmm1=tmp12
|
||||
|
||||
movdqa xmm6,xmm4
|
||||
movdqa xmm7,xmm0
|
||||
psubw xmm4,xmm5 ; xmm4=tmp3
|
||||
psubw xmm0,xmm1 ; xmm0=tmp2
|
||||
paddw xmm6,xmm5 ; xmm6=tmp0
|
||||
paddw xmm7,xmm1 ; xmm7=tmp1
|
||||
|
||||
movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
|
||||
movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
|
||||
|
||||
movdqa xmm4,xmm2
|
||||
movdqa xmm0,xmm5
|
||||
psubw xmm2,xmm1 ; xmm2=z12
|
||||
psubw xmm5,xmm3 ; xmm5=z10
|
||||
paddw xmm4,xmm1 ; xmm4=z11
|
||||
paddw xmm0,xmm3 ; xmm0=z13
|
||||
|
||||
movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
|
||||
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
|
||||
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
movdqa xmm3,xmm4
|
||||
psubw xmm4,xmm0
|
||||
paddw xmm3,xmm0 ; xmm3=tmp7
|
||||
|
||||
psllw xmm4,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm4,[PW_F1414] ; xmm4=tmp11
|
||||
|
||||
; To avoid overflow...
|
||||
;
|
||||
; (Original)
|
||||
; tmp12 = -2.613125930 * z10 + z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||
; = -1.613125930 * z10 - z10 + z5;
|
||||
|
||||
movdqa xmm0,xmm5
|
||||
paddw xmm5,xmm2
|
||||
pmulhw xmm5,[PW_F1847] ; xmm5=z5
|
||||
pmulhw xmm0,[PW_MF1613]
|
||||
pmulhw xmm2,[PW_F1082]
|
||||
psubw xmm0,xmm1
|
||||
psubw xmm2,xmm5 ; xmm2=tmp10
|
||||
paddw xmm0,xmm5 ; xmm0=tmp12
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
psubw xmm0,xmm3 ; xmm0=tmp6
|
||||
movdqa xmm1,xmm6
|
||||
movdqa xmm5,xmm7
|
||||
paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
|
||||
paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
|
||||
psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
|
||||
psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
|
||||
psubw xmm4,xmm0 ; xmm4=tmp5
|
||||
|
||||
movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
|
||||
punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
|
||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
|
||||
|
||||
paddw xmm2,xmm4 ; xmm2=tmp4
|
||||
movdqa xmm5,xmm7
|
||||
movdqa xmm0,xmm1
|
||||
paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
|
||||
paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
|
||||
psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
||||
psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
|
||||
|
||||
movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
|
||||
movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
|
||||
punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
|
||||
|
||||
movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
|
||||
movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
|
||||
|
||||
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
|
||||
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
|
||||
punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
|
||||
punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
|
||||
|
||||
movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
|
||||
punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
|
||||
movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
|
||||
punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
|
||||
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
|
||||
movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
|
||||
|
||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
|
||||
punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
|
||||
movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
|
||||
punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
|
||||
.column_end:
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov rax, [original_rbp]
|
||||
mov rdi, r12 ; (JSAMPROW *)
|
||||
mov rax, r13
|
||||
|
||||
; -- Even part
|
||||
|
||||
; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
|
||||
|
||||
movdqa xmm2,xmm6
|
||||
movdqa xmm0,xmm5
|
||||
psubw xmm6,xmm1 ; xmm6=tmp11
|
||||
psubw xmm5,xmm3
|
||||
paddw xmm2,xmm1 ; xmm2=tmp10
|
||||
paddw xmm0,xmm3 ; xmm0=tmp13
|
||||
|
||||
psllw xmm5,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm5,[PW_F1414]
|
||||
psubw xmm5,xmm0 ; xmm5=tmp12
|
||||
|
||||
movdqa xmm1,xmm2
|
||||
movdqa xmm3,xmm6
|
||||
psubw xmm2,xmm0 ; xmm2=tmp3
|
||||
psubw xmm6,xmm5 ; xmm6=tmp2
|
||||
paddw xmm1,xmm0 ; xmm1=tmp0
|
||||
paddw xmm3,xmm5 ; xmm3=tmp1
|
||||
|
||||
movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
|
||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
|
||||
|
||||
; -- Odd part
|
||||
|
||||
; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
|
||||
|
||||
movdqa xmm2,xmm0
|
||||
movdqa xmm6,xmm4
|
||||
psubw xmm0,xmm7 ; xmm0=z12
|
||||
psubw xmm4,xmm5 ; xmm4=z10
|
||||
paddw xmm2,xmm7 ; xmm2=z11
|
||||
paddw xmm6,xmm5 ; xmm6=z13
|
||||
|
||||
movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
|
||||
psllw xmm0,PRE_MULTIPLY_SCALE_BITS
|
||||
psllw xmm4,PRE_MULTIPLY_SCALE_BITS
|
||||
|
||||
movdqa xmm5,xmm2
|
||||
psubw xmm2,xmm6
|
||||
paddw xmm5,xmm6 ; xmm5=tmp7
|
||||
|
||||
psllw xmm2,PRE_MULTIPLY_SCALE_BITS
|
||||
pmulhw xmm2,[PW_F1414] ; xmm2=tmp11
|
||||
|
||||
; To avoid overflow...
|
||||
;
|
||||
; (Original)
|
||||
; tmp12 = -2.613125930 * z10 + z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp12 = (-1.613125930 - 1) * z10 + z5;
|
||||
; = -1.613125930 * z10 - z10 + z5;
|
||||
|
||||
movdqa xmm6,xmm4
|
||||
paddw xmm4,xmm0
|
||||
pmulhw xmm4,[PW_F1847] ; xmm4=z5
|
||||
pmulhw xmm6,[PW_MF1613]
|
||||
pmulhw xmm0,[PW_F1082]
|
||||
psubw xmm6,xmm7
|
||||
psubw xmm0,xmm4 ; xmm0=tmp10
|
||||
paddw xmm6,xmm4 ; xmm6=tmp12
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
psubw xmm6,xmm5 ; xmm6=tmp6
|
||||
movdqa xmm7,xmm1
|
||||
movdqa xmm4,xmm3
|
||||
paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
|
||||
paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
||||
psraw xmm1,(PASS1_BITS+3) ; descale
|
||||
psraw xmm3,(PASS1_BITS+3) ; descale
|
||||
psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
|
||||
psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
||||
psraw xmm7,(PASS1_BITS+3) ; descale
|
||||
psraw xmm4,(PASS1_BITS+3) ; descale
|
||||
psubw xmm2,xmm6 ; xmm2=tmp5
|
||||
|
||||
packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
|
||||
movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
|
||||
|
||||
paddw xmm0,xmm2 ; xmm0=tmp4
|
||||
movdqa xmm4,xmm5
|
||||
movdqa xmm7,xmm6
|
||||
paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
|
||||
paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
|
||||
psraw xmm5,(PASS1_BITS+3) ; descale
|
||||
psraw xmm6,(PASS1_BITS+3) ; descale
|
||||
psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
||||
psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
|
||||
psraw xmm4,(PASS1_BITS+3) ; descale
|
||||
psraw xmm7,(PASS1_BITS+3) ; descale
|
||||
|
||||
movdqa xmm2,[PB_CENTERJSAMP] ; xmm2=[PB_CENTERJSAMP]
|
||||
|
||||
packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
||||
packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
||||
|
||||
paddb xmm1,xmm2
|
||||
paddb xmm3,xmm2
|
||||
paddb xmm5,xmm2
|
||||
paddb xmm7,xmm2
|
||||
|
||||
movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
|
||||
punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
||||
punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
||||
movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
||||
punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
||||
|
||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||
punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
||||
movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||
punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
||||
|
||||
movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||
punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||
movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||
punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||
|
||||
pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||
pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||
pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||
pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||
|
||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
||||
mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
|
||||
|
||||
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
|
||||
mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
|
||||
|
||||
uncollect_args
|
||||
mov rsp,rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
ret
|
|
@ -0,0 +1,844 @@
|
|||
;
|
||||
; jiss2int.asm - accurate integer IDCT (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains a slow-but-accurate integer implementation of the
|
||||
; inverse DCT (Discrete Cosine Transform). The following code is based
|
||||
; directly on the IJG's original jidctint.c; see jidctint.c for
|
||||
; more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point configuration for the multiplier constants defined below.
; CONST_BITS is the number of fraction bits in every F_x_xxx constant:
; each one is round(x * 2^CONST_BITS), e.g. 0.541196100 * 8192 ~= 4433.
%define CONST_BITS 13
|
||||
; PASS1_BITS: pass 1 results are shifted right by only
; CONST_BITS-PASS1_BITS, so they keep PASS1_BITS extra fraction bits.
%define PASS1_BITS 2
|
||||
|
||||
; Right-shift count applied (after adding PD_DESCALE_P1) to the 32-bit
; pass 1 results.
%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
|
||||
; NOTE(review): presumably the matching shift count for the pass 2 output
; stage -- confirm against the pass 2 code.
%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
|
||||
|
||||
; Fast path: precomputed integer values for the default CONST_BITS.
%if CONST_BITS == 13
|
||||
F_0_298 equ 2446 ; FIX(0.298631336)
|
||||
F_0_390 equ 3196 ; FIX(0.390180644)
|
||||
F_0_541 equ 4433 ; FIX(0.541196100)
|
||||
F_0_765 equ 6270 ; FIX(0.765366865)
|
||||
F_0_899 equ 7373 ; FIX(0.899976223)
|
||||
F_1_175 equ 9633 ; FIX(1.175875602)
|
||||
F_1_501 equ 12299 ; FIX(1.501321110)
|
||||
F_1_847 equ 15137 ; FIX(1.847759065)
|
||||
F_1_961 equ 16069 ; FIX(1.961570560)
|
||||
F_2_053 equ 16819 ; FIX(2.053119869)
|
||||
F_2_562 equ 20995 ; FIX(2.562915447)
|
||||
F_3_072 equ 25172 ; FIX(3.072711026)
|
||||
%else
|
||||
; NASM cannot do compile-time arithmetic on floating-point constants.
|
||||
||||
; Work-around: the integer literals below are the same multipliers
; pre-scaled by 2^30; DESCALE(x,n) shifts right by n bits with
; round-to-nearest (it adds 1<<(n-1) first), leaving CONST_BITS
; fraction bits.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
|
||||
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
|
||||
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
|
||||
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
|
||||
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
|
||||
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
|
||||
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
|
||||
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
|
||||
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
|
||||
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
|
||||
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
|
||||
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
|
||||
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
|
||||
%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table for the accurate integer IDCT (SSE2).
;
; Each PW_* row is one 128-bit vector holding a pair of 16-bit multipliers
; repeated four times.  The code interleaves two 16-bit input vectors with
; punpcklwd/punpckhwd and feeds the result to pmaddwd against one of these
; rows, so every 32-bit lane becomes
;     first_input * first_multiplier + second_input * second_multiplier.
SECTION SEG_CONST
|
||||
|
||||
alignz 16
|
||||
global EXTN(jconst_idct_islow_sse2)
|
||||
|
||||
EXTN(jconst_idct_islow_sse2):
|
||||
|
||||
PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 ; even part: (z2,z3) -> tmp3
|
||||
PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) ; even part: (z2,z3) -> tmp2
|
||||
PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 ; odd part: (z3,z4) -> z3
|
||||
PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) ; odd part: (z3,z4) -> z4
|
||||
PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 ; odd part: (tmp0,tmp3) -> tmp0
|
||||
PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) ; odd part: (tmp0,tmp3) -> tmp3
|
||||
PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 ; odd part: (tmp1,tmp2) -> tmp1
|
||||
PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) ; odd part: (tmp1,tmp2) -> tmp2
|
||||
PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) ; rounding bias added before "psrad ..,DESCALE_P1" in pass 1
|
||||
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) ; NOTE(review): presumably the pass 2 rounding bias -- confirm
|
||||
PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; CENTERJSAMPLE in every byte; NOTE(review): presumably the final sample offset -- confirm in pass 2
|
||||
|
||||
; NOTE(review): alignz is not defined in this file; presumably it pads the
; table to a 16-byte boundary -- confirm in jsimdext.inc.
alignz 16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Perform dequantization and inverse DCT on one block of coefficients.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
|
||||
; JSAMPARRAY output_buf, JDIMENSION output_col)
|
||||
;
|
||||
|
||||
; r10 = void * dct_table
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13 = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp+0
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 12
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_idct_islow_sse2)
|
||||
|
||||
EXTN(jsimd_idct_islow_sse2):
|
||||
push rbp
|
||||
mov rax,rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp],rax
|
||||
mov rbp,rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
mov rdx, r10 ; quantptr
|
||||
mov rsi, r11 ; inptr
|
||||
|
||||
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
|
||||
mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
jnz near .columnDCT
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
por xmm1,xmm0
|
||||
packsswb xmm1,xmm1
|
||||
packsswb xmm1,xmm1
|
||||
movd eax,xmm1
|
||||
test rax,rax
|
||||
jnz short .columnDCT
|
||||
|
||||
; -- AC terms all zero
|
||||
|
||||
movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
psllw xmm5,PASS1_BITS
|
||||
|
||||
movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
|
||||
punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
|
||||
punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
|
||||
|
||||
pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
|
||||
pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
|
||||
pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
|
||||
pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
|
||||
pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
|
||||
pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
|
||||
pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
|
||||
pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
|
||||
movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
|
||||
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
||||
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
||||
jmp near .column_end
|
||||
%endif
|
||||
.columnDCT:
|
||||
|
||||
; -- Even part
|
||||
|
||||
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
movdqa xmm4,xmm1 ; xmm1=in2=z2
|
||||
movdqa xmm5,xmm1
|
||||
punpcklwd xmm4,xmm3 ; xmm3=in6=z3
|
||||
punpckhwd xmm5,xmm3
|
||||
movdqa xmm1,xmm4
|
||||
movdqa xmm3,xmm5
|
||||
pmaddwd xmm4,[PW_F130_F054] ; xmm4=tmp3L
|
||||
pmaddwd xmm5,[PW_F130_F054] ; xmm5=tmp3H
|
||||
pmaddwd xmm1,[PW_F054_MF130] ; xmm1=tmp2L
|
||||
pmaddwd xmm3,[PW_F054_MF130] ; xmm3=tmp2H
|
||||
|
||||
movdqa xmm6,xmm0
|
||||
paddw xmm0,xmm2 ; xmm0=in0+in4
|
||||
psubw xmm6,xmm2 ; xmm6=in0-in4
|
||||
|
||||
pxor xmm7,xmm7
|
||||
pxor xmm2,xmm2
|
||||
punpcklwd xmm7,xmm0 ; xmm7=tmp0L
|
||||
punpckhwd xmm2,xmm0 ; xmm2=tmp0H
|
||||
psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
||||
psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
|
||||
|
||||
movdqa xmm0,xmm7
|
||||
paddd xmm7,xmm4 ; xmm7=tmp10L
|
||||
psubd xmm0,xmm4 ; xmm0=tmp13L
|
||||
movdqa xmm4,xmm2
|
||||
paddd xmm2,xmm5 ; xmm2=tmp10H
|
||||
psubd xmm4,xmm5 ; xmm4=tmp13H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
|
||||
movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
|
||||
movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
|
||||
movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
|
||||
|
||||
pxor xmm5,xmm5
|
||||
pxor xmm7,xmm7
|
||||
punpcklwd xmm5,xmm6 ; xmm5=tmp1L
|
||||
punpckhwd xmm7,xmm6 ; xmm7=tmp1H
|
||||
psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
||||
psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
|
||||
|
||||
movdqa xmm2,xmm5
|
||||
paddd xmm5,xmm1 ; xmm5=tmp11L
|
||||
psubd xmm2,xmm1 ; xmm2=tmp12L
|
||||
movdqa xmm0,xmm7
|
||||
paddd xmm7,xmm3 ; xmm7=tmp11H
|
||||
psubd xmm0,xmm3 ; xmm0=tmp12H
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
||||
movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
|
||||
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
|
||||
movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
|
||||
movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
|
||||
pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
|
||||
|
||||
movdqa xmm5,xmm6
|
||||
movdqa xmm7,xmm4
|
||||
paddw xmm5,xmm3 ; xmm5=z3
|
||||
paddw xmm7,xmm1 ; xmm7=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movdqa xmm2,xmm5
|
||||
movdqa xmm0,xmm5
|
||||
punpcklwd xmm2,xmm7
|
||||
punpckhwd xmm0,xmm7
|
||||
movdqa xmm5,xmm2
|
||||
movdqa xmm7,xmm0
|
||||
pmaddwd xmm2,[PW_MF078_F117] ; xmm2=z3L
|
||||
pmaddwd xmm0,[PW_MF078_F117] ; xmm0=z3H
|
||||
pmaddwd xmm5,[PW_F117_F078] ; xmm5=z4L
|
||||
pmaddwd xmm7,[PW_F117_F078] ; xmm7=z4H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
|
||||
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
movdqa xmm2,xmm3
|
||||
movdqa xmm0,xmm3
|
||||
punpcklwd xmm2,xmm4
|
||||
punpckhwd xmm0,xmm4
|
||||
movdqa xmm3,xmm2
|
||||
movdqa xmm4,xmm0
|
||||
pmaddwd xmm2,[PW_MF060_MF089] ; xmm2=tmp0L
|
||||
pmaddwd xmm0,[PW_MF060_MF089] ; xmm0=tmp0H
|
||||
pmaddwd xmm3,[PW_MF089_F060] ; xmm3=tmp3L
|
||||
pmaddwd xmm4,[PW_MF089_F060] ; xmm4=tmp3H
|
||||
|
||||
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
|
||||
paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
|
||||
paddd xmm3,xmm5 ; xmm3=tmp3L
|
||||
paddd xmm4,xmm7 ; xmm4=tmp3H
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
|
||||
movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
|
||||
|
||||
movdqa xmm2,xmm1
|
||||
movdqa xmm0,xmm1
|
||||
punpcklwd xmm2,xmm6
|
||||
punpckhwd xmm0,xmm6
|
||||
movdqa xmm1,xmm2
|
||||
movdqa xmm6,xmm0
|
||||
pmaddwd xmm2,[PW_MF050_MF256] ; xmm2=tmp1L
|
||||
pmaddwd xmm0,[PW_MF050_MF256] ; xmm0=tmp1H
|
||||
pmaddwd xmm1,[PW_MF256_F050] ; xmm1=tmp2L
|
||||
pmaddwd xmm6,[PW_MF256_F050] ; xmm6=tmp2H
|
||||
|
||||
paddd xmm2,xmm5 ; xmm2=tmp1L
|
||||
paddd xmm0,xmm7 ; xmm0=tmp1H
|
||||
paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
|
||||
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
|
||||
movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
||||
movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
|
||||
|
||||
movdqa xmm2,xmm5
|
||||
movdqa xmm0,xmm7
|
||||
paddd xmm5,xmm3 ; xmm5=data0L
|
||||
paddd xmm7,xmm4 ; xmm7=data0H
|
||||
psubd xmm2,xmm3 ; xmm2=data7L
|
||||
psubd xmm0,xmm4 ; xmm0=data7H
|
||||
|
||||
movdqa xmm3,[PD_DESCALE_P1] ; xmm3=[PD_DESCALE_P1]
|
||||
|
||||
paddd xmm5,xmm3
|
||||
paddd xmm7,xmm3
|
||||
psrad xmm5,DESCALE_P1
|
||||
psrad xmm7,DESCALE_P1
|
||||
paddd xmm2,xmm3
|
||||
paddd xmm0,xmm3
|
||||
psrad xmm2,DESCALE_P1
|
||||
psrad xmm0,DESCALE_P1
|
||||
|
||||
packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
|
||||
packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
|
||||
|
||||
movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
|
||||
movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
|
||||
|
||||
movdqa xmm7,xmm4
|
||||
movdqa xmm0,xmm3
|
||||
paddd xmm4,xmm1 ; xmm4=data1L
|
||||
paddd xmm3,xmm6 ; xmm3=data1H
|
||||
psubd xmm7,xmm1 ; xmm7=data6L
|
||||
psubd xmm0,xmm6 ; xmm0=data6H
|
||||
|
||||
movdqa xmm1,[PD_DESCALE_P1] ; xmm1=[PD_DESCALE_P1]
|
||||
|
||||
paddd xmm4,xmm1
|
||||
paddd xmm3,xmm1
|
||||
psrad xmm4,DESCALE_P1
|
||||
psrad xmm3,DESCALE_P1
|
||||
paddd xmm7,xmm1
|
||||
paddd xmm0,xmm1
|
||||
psrad xmm7,DESCALE_P1
|
||||
psrad xmm0,DESCALE_P1
|
||||
|
||||
packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
|
||||
packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
|
||||
|
||||
movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
|
||||
punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
|
||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
|
||||
punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
|
||||
movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
|
||||
movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
|
||||
movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
|
||||
movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
|
||||
movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
|
||||
movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa xmm5,xmm3
|
||||
movdqa xmm6,xmm0
|
||||
paddd xmm3,xmm4 ; xmm3=data2L
|
||||
paddd xmm0,xmm2 ; xmm0=data2H
|
||||
psubd xmm5,xmm4 ; xmm5=data5L
|
||||
psubd xmm6,xmm2 ; xmm6=data5H
|
||||
|
||||
movdqa xmm7,[PD_DESCALE_P1] ; xmm7=[PD_DESCALE_P1]
|
||||
|
||||
paddd xmm3,xmm7
|
||||
paddd xmm0,xmm7
|
||||
psrad xmm3,DESCALE_P1
|
||||
psrad xmm0,DESCALE_P1
|
||||
paddd xmm5,xmm7
|
||||
paddd xmm6,xmm7
|
||||
psrad xmm5,DESCALE_P1
|
||||
psrad xmm6,DESCALE_P1
|
||||
|
||||
packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
|
||||
packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
|
||||
|
||||
movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
|
||||
movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
|
||||
movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
|
||||
movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
|
||||
|
||||
movdqa xmm0,xmm1
|
||||
movdqa xmm6,xmm4
|
||||
paddd xmm1,xmm2 ; xmm1=data3L
|
||||
paddd xmm4,xmm7 ; xmm4=data3H
|
||||
psubd xmm0,xmm2 ; xmm0=data4L
|
||||
psubd xmm6,xmm7 ; xmm6=data4H
|
||||
|
||||
movdqa xmm2,[PD_DESCALE_P1] ; xmm2=[PD_DESCALE_P1]
|
||||
|
||||
paddd xmm1,xmm2
|
||||
paddd xmm4,xmm2
|
||||
psrad xmm1,DESCALE_P1
|
||||
psrad xmm4,DESCALE_P1
|
||||
paddd xmm0,xmm2
|
||||
paddd xmm6,xmm2
|
||||
psrad xmm0,DESCALE_P1
|
||||
psrad xmm6,DESCALE_P1
|
||||
|
||||
packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
|
||||
packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
|
||||
movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
|
||||
|
||||
movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
|
||||
punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
|
||||
movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
|
||||
punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
|
||||
punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
|
||||
|
||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
|
||||
punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
|
||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
|
||||
punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
|
||||
movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
|
||||
movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
|
||||
punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
|
||||
movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
|
||||
punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
|
||||
punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
|
||||
|
||||
movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
|
||||
punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
|
||||
movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
|
||||
punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
|
||||
|
||||
movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
|
||||
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
|
||||
movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
|
||||
|
||||
movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
|
||||
punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
|
||||
movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
|
||||
punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
|
||||
punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
|
||||
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
|
||||
.column_end:
|
||||
|
||||
; -- Prefetch the next coefficient block
|
||||
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
|
||||
prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov rax, [original_rbp]
|
||||
mov rdi, r12 ; (JSAMPROW *)
|
||||
mov rax, r13
|
||||
|
||||
; -- Even part
|
||||
|
||||
; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
|
||||
|
||||
; (Original)
|
||||
; z1 = (z2 + z3) * 0.541196100;
|
||||
; tmp2 = z1 + z3 * -1.847759065;
|
||||
; tmp3 = z1 + z2 * 0.765366865;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
|
||||
; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
|
||||
|
||||
movdqa xmm6,xmm1 ; xmm1=in2=z2
|
||||
movdqa xmm5,xmm1
|
||||
punpcklwd xmm6,xmm2 ; xmm2=in6=z3
|
||||
punpckhwd xmm5,xmm2
|
||||
movdqa xmm1,xmm6
|
||||
movdqa xmm2,xmm5
|
||||
pmaddwd xmm6,[PW_F130_F054] ; xmm6=tmp3L
|
||||
pmaddwd xmm5,[PW_F130_F054] ; xmm5=tmp3H
|
||||
pmaddwd xmm1,[PW_F054_MF130] ; xmm1=tmp2L
|
||||
pmaddwd xmm2,[PW_F054_MF130] ; xmm2=tmp2H
|
||||
|
||||
movdqa xmm3,xmm7
|
||||
paddw xmm7,xmm0 ; xmm7=in0+in4
|
||||
psubw xmm3,xmm0 ; xmm3=in0-in4
|
||||
|
||||
pxor xmm4,xmm4
|
||||
pxor xmm0,xmm0
|
||||
punpcklwd xmm4,xmm7 ; xmm4=tmp0L
|
||||
punpckhwd xmm0,xmm7 ; xmm0=tmp0H
|
||||
psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
||||
psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
|
||||
|
||||
movdqa xmm7,xmm4
|
||||
paddd xmm4,xmm6 ; xmm4=tmp10L
|
||||
psubd xmm7,xmm6 ; xmm7=tmp13L
|
||||
movdqa xmm6,xmm0
|
||||
paddd xmm0,xmm5 ; xmm0=tmp10H
|
||||
psubd xmm6,xmm5 ; xmm6=tmp13H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
|
||||
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
|
||||
movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
|
||||
movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
|
||||
|
||||
pxor xmm5,xmm5
|
||||
pxor xmm4,xmm4
|
||||
punpcklwd xmm5,xmm3 ; xmm5=tmp1L
|
||||
punpckhwd xmm4,xmm3 ; xmm4=tmp1H
|
||||
psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
|
||||
psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
|
||||
|
||||
movdqa xmm0,xmm5
|
||||
paddd xmm5,xmm1 ; xmm5=tmp11L
|
||||
psubd xmm0,xmm1 ; xmm0=tmp12L
|
||||
movdqa xmm7,xmm4
|
||||
paddd xmm4,xmm2 ; xmm4=tmp11H
|
||||
psubd xmm7,xmm2 ; xmm7=tmp12H
|
||||
|
||||
movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
|
||||
movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
|
||||
movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
|
||||
movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
|
||||
|
||||
; -- Odd part
|
||||
|
||||
movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
|
||||
movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
|
||||
movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
|
||||
movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
|
||||
|
||||
movdqa xmm5,xmm6
|
||||
movdqa xmm4,xmm3
|
||||
paddw xmm5,xmm1 ; xmm5=z3
|
||||
paddw xmm4,xmm2 ; xmm4=z4
|
||||
|
||||
; (Original)
|
||||
; z5 = (z3 + z4) * 1.175875602;
|
||||
; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
|
||||
; z3 += z5; z4 += z5;
|
||||
;
|
||||
; (This implementation)
|
||||
; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
|
||||
; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
|
||||
|
||||
movdqa xmm0,xmm5
|
||||
movdqa xmm7,xmm5
|
||||
punpcklwd xmm0,xmm4
|
||||
punpckhwd xmm7,xmm4
|
||||
movdqa xmm5,xmm0
|
||||
movdqa xmm4,xmm7
|
||||
pmaddwd xmm0,[PW_MF078_F117] ; xmm0=z3L
|
||||
pmaddwd xmm7,[PW_MF078_F117] ; xmm7=z3H
|
||||
pmaddwd xmm5,[PW_F117_F078] ; xmm5=z4L
|
||||
pmaddwd xmm4,[PW_F117_F078] ; xmm4=z4H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
|
||||
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
|
||||
|
||||
; (Original)
|
||||
; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
|
||||
; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
|
||||
; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
|
||||
; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
|
||||
; tmp0 += z1 + z3; tmp1 += z2 + z4;
|
||||
; tmp2 += z2 + z3; tmp3 += z1 + z4;
|
||||
;
|
||||
; (This implementation)
|
||||
; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
|
||||
; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
|
||||
; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
|
||||
; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
|
||||
; tmp0 += z3; tmp1 += z4;
|
||||
; tmp2 += z3; tmp3 += z4;
|
||||
|
||||
movdqa xmm0,xmm1
|
||||
movdqa xmm7,xmm1
|
||||
punpcklwd xmm0,xmm3
|
||||
punpckhwd xmm7,xmm3
|
||||
movdqa xmm1,xmm0
|
||||
movdqa xmm3,xmm7
|
||||
pmaddwd xmm0,[PW_MF060_MF089] ; xmm0=tmp0L
|
||||
pmaddwd xmm7,[PW_MF060_MF089] ; xmm7=tmp0H
|
||||
pmaddwd xmm1,[PW_MF089_F060] ; xmm1=tmp3L
|
||||
pmaddwd xmm3,[PW_MF089_F060] ; xmm3=tmp3H
|
||||
|
||||
paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
|
||||
paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
|
||||
paddd xmm1,xmm5 ; xmm1=tmp3L
|
||||
paddd xmm3,xmm4 ; xmm3=tmp3H
|
||||
|
||||
movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
|
||||
movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
|
||||
|
||||
movdqa xmm0,xmm2
|
||||
movdqa xmm7,xmm2
|
||||
punpcklwd xmm0,xmm6
|
||||
punpckhwd xmm7,xmm6
|
||||
movdqa xmm2,xmm0
|
||||
movdqa xmm6,xmm7
|
||||
pmaddwd xmm0,[PW_MF050_MF256] ; xmm0=tmp1L
|
||||
pmaddwd xmm7,[PW_MF050_MF256] ; xmm7=tmp1H
|
||||
pmaddwd xmm2,[PW_MF256_F050] ; xmm2=tmp2L
|
||||
pmaddwd xmm6,[PW_MF256_F050] ; xmm6=tmp2H
|
||||
|
||||
paddd xmm0,xmm5 ; xmm0=tmp1L
|
||||
paddd xmm7,xmm4 ; xmm7=tmp1H
|
||||
paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
|
||||
paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
|
||||
|
||||
movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
|
||||
movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
|
||||
|
||||
; -- Final output stage
|
||||
|
||||
movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
|
||||
movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
|
||||
|
||||
movdqa xmm0,xmm5
|
||||
movdqa xmm7,xmm4
|
||||
paddd xmm5,xmm1 ; xmm5=data0L
|
||||
paddd xmm4,xmm3 ; xmm4=data0H
|
||||
psubd xmm0,xmm1 ; xmm0=data7L
|
||||
psubd xmm7,xmm3 ; xmm7=data7H
|
||||
|
||||
movdqa xmm1,[PD_DESCALE_P2] ; xmm1=[PD_DESCALE_P2]
|
||||
|
||||
paddd xmm5,xmm1
|
||||
paddd xmm4,xmm1
|
||||
psrad xmm5,DESCALE_P2
|
||||
psrad xmm4,DESCALE_P2
|
||||
paddd xmm0,xmm1
|
||||
paddd xmm7,xmm1
|
||||
psrad xmm0,DESCALE_P2
|
||||
psrad xmm7,DESCALE_P2
|
||||
|
||||
packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
|
||||
packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
|
||||
movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
|
||||
|
||||
movdqa xmm4,xmm3
|
||||
movdqa xmm7,xmm1
|
||||
paddd xmm3,xmm2 ; xmm3=data1L
|
||||
paddd xmm1,xmm6 ; xmm1=data1H
|
||||
psubd xmm4,xmm2 ; xmm4=data6L
|
||||
psubd xmm7,xmm6 ; xmm7=data6H
|
||||
|
||||
movdqa xmm2,[PD_DESCALE_P2] ; xmm2=[PD_DESCALE_P2]
|
||||
|
||||
paddd xmm3,xmm2
|
||||
paddd xmm1,xmm2
|
||||
psrad xmm3,DESCALE_P2
|
||||
psrad xmm1,DESCALE_P2
|
||||
paddd xmm4,xmm2
|
||||
paddd xmm7,xmm2
|
||||
psrad xmm4,DESCALE_P2
|
||||
psrad xmm7,DESCALE_P2
|
||||
|
||||
packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
|
||||
packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
|
||||
|
||||
packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
|
||||
movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
|
||||
movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
|
||||
movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
|
||||
|
||||
movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
movdqa xmm4,xmm6
|
||||
movdqa xmm0,xmm2
|
||||
paddd xmm6,xmm1 ; xmm6=data2L
|
||||
paddd xmm2,xmm7 ; xmm2=data2H
|
||||
psubd xmm4,xmm1 ; xmm4=data5L
|
||||
psubd xmm0,xmm7 ; xmm0=data5H
|
||||
|
||||
movdqa xmm5,[PD_DESCALE_P2] ; xmm5=[PD_DESCALE_P2]
|
||||
|
||||
paddd xmm6,xmm5
|
||||
paddd xmm2,xmm5
|
||||
psrad xmm6,DESCALE_P2
|
||||
psrad xmm2,DESCALE_P2
|
||||
paddd xmm4,xmm5
|
||||
paddd xmm0,xmm5
|
||||
psrad xmm4,DESCALE_P2
|
||||
psrad xmm0,DESCALE_P2
|
||||
|
||||
packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
|
||||
packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
|
||||
|
||||
movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
|
||||
movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
|
||||
movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
|
||||
movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
|
||||
|
||||
movdqa xmm2,xmm3
|
||||
movdqa xmm0,xmm1
|
||||
paddd xmm3,xmm7 ; xmm3=data3L
|
||||
paddd xmm1,xmm5 ; xmm1=data3H
|
||||
psubd xmm2,xmm7 ; xmm2=data4L
|
||||
psubd xmm0,xmm5 ; xmm0=data4H
|
||||
|
||||
movdqa xmm7,[PD_DESCALE_P2] ; xmm7=[PD_DESCALE_P2]
|
||||
|
||||
paddd xmm3,xmm7
|
||||
paddd xmm1,xmm7
|
||||
psrad xmm3,DESCALE_P2
|
||||
psrad xmm1,DESCALE_P2
|
||||
paddd xmm2,xmm7
|
||||
paddd xmm0,xmm7
|
||||
psrad xmm2,DESCALE_P2
|
||||
psrad xmm0,DESCALE_P2
|
||||
|
||||
movdqa xmm5,[PB_CENTERJSAMP] ; xmm5=[PB_CENTERJSAMP]
|
||||
|
||||
packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
|
||||
packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
|
||||
|
||||
movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
|
||||
movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
|
||||
|
||||
packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
|
||||
packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
|
||||
|
||||
paddb xmm7,xmm5
|
||||
paddb xmm1,xmm5
|
||||
paddb xmm6,xmm5
|
||||
paddb xmm3,xmm5
|
||||
|
||||
movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
|
||||
punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
|
||||
punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
|
||||
movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
|
||||
punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
|
||||
punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
|
||||
|
||||
movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
|
||||
punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
|
||||
movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
|
||||
punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
|
||||
punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
|
||||
|
||||
movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
|
||||
punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
|
||||
movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
|
||||
punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
|
||||
punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
|
||||
|
||||
pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
|
||||
pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
|
||||
pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
|
||||
pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
|
||||
|
||||
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
|
||||
mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
|
||||
mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
|
||||
mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
|
||||
|
||||
uncollect_args
|
||||
mov rsp,rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
|
@ -0,0 +1,571 @@
|
|||
;
|
||||
; jiss2red.asm - reduced-size IDCT (64-bit SSE2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; This file contains inverse-DCT routines that produce reduced-size
|
||||
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
|
||||
; The following code is based directly on the IJG's original jidctred.c;
|
||||
; see jidctred.c for more details.
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
%include "jdct.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
; Fixed-point configuration.
;   CONST_BITS : fractional bits carried by the F_x_xxx multipliers below.
;   PASS1_BITS : extra fractional bits kept in the output of pass 1.
%define CONST_BITS	13
%define PASS1_BITS	2

; Right-shift counts used to descale the results of each pass.
; The _4 variants are used by the 4x4 routine, the _2 variants by the
; 2x2 routine; P1 = after pass 1 (columns), P2 = after pass 2 (rows).
%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)

; F_a_bcd = FIX(a.bcd...), i.e. the real constant scaled by 2^CONST_BITS
; and rounded to the nearest integer.
%if CONST_BITS == 13

; Precomputed values for the default CONST_BITS.
F_0_211		equ	 1730		; FIX(0.211164243)
F_0_509		equ	 4176		; FIX(0.509795579)
F_0_601		equ	 4926		; FIX(0.601344887)
F_0_720		equ	 5906		; FIX(0.720959822)
F_0_765		equ	 6270		; FIX(0.765366865)
F_0_850		equ	 6967		; FIX(0.850430095)
F_0_899		equ	 7373		; FIX(0.899976223)
F_1_061		equ	 8697		; FIX(1.061594337)
F_1_272		equ	10426		; FIX(1.272758580)
F_1_451		equ	11893		; FIX(1.451774981)
F_1_847		equ	15137		; FIX(1.847759065)
F_2_172		equ	17799		; FIX(2.172734803)
F_2_562		equ	20995		; FIX(2.562915447)
F_3_624		equ	29692		; FIX(3.624509785)

%else

; NASM cannot do compile-time arithmetic on floating-point constants,
; so each constant is supplied pre-scaled by 2^30 and rounded down to
; CONST_BITS fractional bits with integer arithmetic.
%define DESCALE(x,n)	(((x)+(1<<((n)-1)))>>(n))

F_0_211		equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
F_0_509		equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
F_0_601		equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
F_0_720		equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
F_0_765		equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
F_0_850		equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
F_0_899		equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
F_1_061		equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
F_1_272		equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
F_1_451		equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
F_1_847		equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
F_2_172		equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
F_2_562		equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
F_3_624		equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)

%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Constant table shared by the reduced-size IDCT routines.
; Every entry is exactly one XMMWORD (16 bytes), so with the table start
; aligned to 16 each entry can be used directly as an aligned SSE2 memory
; operand.
	SECTION	SEG_CONST

	alignz	16
	global	EXTN(jconst_idct_red_sse2)

EXTN(jconst_idct_red_sse2):

; Word pairs for pmaddwd: each dword lane holds (lo, hi) multipliers.
PW_F184_MF076	times 4 dw	 F_1_847,-F_0_765
PW_F256_F089	times 4 dw	 F_2_562, F_0_899
PW_F106_MF217	times 4 dw	 F_1_061,-F_2_172
PW_MF060_MF050	times 4 dw	-F_0_601,-F_0_509
PW_F145_MF021	times 4 dw	 F_1_451,-F_0_211
PW_F362_MF127	times 4 dw	 F_3_624,-F_1_272
PW_F085_MF072	times 4 dw	 F_0_850,-F_0_720

; Rounding terms added before the arithmetic right shift by DESCALE_*.
PD_DESCALE_P1_4	times 4 dd	1 << (DESCALE_P1_4-1)
PD_DESCALE_P2_4	times 4 dd	1 << (DESCALE_P2_4-1)
PD_DESCALE_P1_2	times 4 dd	1 << (DESCALE_P1_2-1)
PD_DESCALE_P2_2	times 4 dd	1 << (DESCALE_P2_2-1)

; Byte bias added to the packed signed samples before they are stored.
PB_CENTERJSAMP	times 16 db	CENTERJSAMPLE

	alignz	16
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 4x4 output block.
;
; GLOBAL(void)
; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
;                      JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Register roles once collect_args has run (collect_args/uncollect_args are
; defined in jsimdext.inc, not visible here; the mapping below is the one
; documented by the original author):
;
;   r10 = void * dct_table
;   r11 = JCOEFPTR coef_block
;   r12 = JSAMPARRAY output_buf
;   r13 = JDIMENSION output_col
;
; Scratch: rax, rdx, rsi, rdi, xmm0-xmm7, flags.
;
; Stack frame: rbp is re-pointed at a 16-byte aligned slot that holds the
; caller-side rsp (as it was right after "push rbp"); WK_NUM xmmwords of
; aligned scratch live immediately below it and are addressed via wk(i).
;

%define original_rbp	rbp+0
%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
%define WK_NUM		2

	align	16
	global	EXTN(jsimd_idct_4x4_sse2)

EXTN(jsimd_idct_4x4_sse2):
	push	rbp
	mov	rax,rsp				; rax = rsp after push (-> saved rbp)
	sub	rsp, byte 4
	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
	; Save the FULL 64-bit stack pointer.  The epilogue restores it with
	; "pop rsp", which reads 8 bytes; storing only eax would leave the upper
	; half of the restored rsp undefined.  For any 8-byte aligned rsp the
	; sub/and above leaves at least 8 free bytes below the saved rbp.
	mov	[rsp],rax
	mov	rbp,rsp				; rbp = aligned frame base
	lea	rsp, [wk(0)]			; reserve wk[WK_NUM]
	collect_args

	; ---- Pass 1: process columns from input.

	mov	rdx, r10			; quantptr
	mov	rsi, r11			; inptr

%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
	; Quick reject: the first two coefficients of rows 1 and 2.
	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
	jnz	short .columnDCT

	; Full test: OR together rows 1,2,3,5,6,7 (row 4 is not used by the
	; 4x4 IDCT) and fold the result down to 32 bits.
	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
	por	xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
	por	xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
	por	xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
	por	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
	por	xmm0,xmm1
	packsswb xmm0,xmm0
	packsswb xmm0,xmm0
	movd	eax,xmm0			; zero-extends into rax
	test	rax,rax
	jnz	short .columnDCT

	; -- AC terms all zero

	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]

	psllw	xmm0,PASS1_BITS

	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)

	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)

	jmp	near .column_end
%endif
.columnDCT:

	; -- Odd part

	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]

	; Constant operands use RIP-relative addressing so the object can be
	; linked into position-independent code.
	movdqa    xmm4,xmm0
	movdqa    xmm5,xmm0
	punpcklwd xmm4,xmm1
	punpckhwd xmm5,xmm1
	movdqa    xmm0,xmm4
	movdqa    xmm1,xmm5
	pmaddwd   xmm4,[rel PW_F256_F089]	; xmm4=(tmp2L)
	pmaddwd   xmm5,[rel PW_F256_F089]	; xmm5=(tmp2H)
	pmaddwd   xmm0,[rel PW_F106_MF217]	; xmm0=(tmp0L)
	pmaddwd   xmm1,[rel PW_F106_MF217]	; xmm1=(tmp0H)

	movdqa    xmm6,xmm2
	movdqa    xmm7,xmm2
	punpcklwd xmm6,xmm3
	punpckhwd xmm7,xmm3
	movdqa    xmm2,xmm6
	movdqa    xmm3,xmm7
	pmaddwd   xmm6,[rel PW_MF060_MF050]	; xmm6=(tmp2L)
	pmaddwd   xmm7,[rel PW_MF060_MF050]	; xmm7=(tmp2H)
	pmaddwd   xmm2,[rel PW_F145_MF021]	; xmm2=(tmp0L)
	pmaddwd   xmm3,[rel PW_F145_MF021]	; xmm3=(tmp0H)

	paddd	xmm6,xmm4		; xmm6=tmp2L
	paddd	xmm7,xmm5		; xmm7=tmp2H
	paddd	xmm2,xmm0		; xmm2=tmp0L
	paddd	xmm3,xmm1		; xmm3=tmp0H

	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H

	; -- Even part

	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]

	; Sign-extend in0 to dwords and scale by 2^(CONST_BITS+1) in one go:
	; place each word in the high half of a dword, then shift right.
	pxor      xmm1,xmm1
	pxor      xmm2,xmm2
	punpcklwd xmm1,xmm4		; xmm1=tmp0L
	punpckhwd xmm2,xmm4		; xmm2=tmp0H
	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1

	movdqa    xmm3,xmm5		; xmm5=in2=z2
	punpcklwd xmm5,xmm0		; xmm0=in6=z3
	punpckhwd xmm3,xmm0
	pmaddwd   xmm5,[rel PW_F184_MF076]	; xmm5=tmp2L
	pmaddwd   xmm3,[rel PW_F184_MF076]	; xmm3=tmp2H

	movdqa	xmm4,xmm1
	movdqa	xmm0,xmm2
	paddd	xmm1,xmm5		; xmm1=tmp10L
	paddd	xmm2,xmm3		; xmm2=tmp10H
	psubd	xmm4,xmm5		; xmm4=tmp12L
	psubd	xmm0,xmm3		; xmm0=tmp12H

	; -- Final output stage

	movdqa	xmm5,xmm1
	movdqa	xmm3,xmm2
	paddd	xmm1,xmm6		; xmm1=data0L
	paddd	xmm2,xmm7		; xmm2=data0H
	psubd	xmm5,xmm6		; xmm5=data3L
	psubd	xmm3,xmm7		; xmm3=data3H

	movdqa	xmm6,[rel PD_DESCALE_P1_4]	; xmm6=[PD_DESCALE_P1_4]

	paddd	xmm1,xmm6
	paddd	xmm2,xmm6
	psrad	xmm1,DESCALE_P1_4
	psrad	xmm2,DESCALE_P1_4
	paddd	xmm5,xmm6
	paddd	xmm3,xmm6
	psrad	xmm5,DESCALE_P1_4
	psrad	xmm3,DESCALE_P1_4

	packssdw  xmm1,xmm2	; xmm1=data0=(00 01 02 03 04 05 06 07)
	packssdw  xmm5,xmm3	; xmm5=data3=(30 31 32 33 34 35 36 37)

	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H

	movdqa	xmm2,xmm4
	movdqa	xmm3,xmm0
	paddd	xmm4,xmm7		; xmm4=data1L
	paddd	xmm0,xmm6		; xmm0=data1H
	psubd	xmm2,xmm7		; xmm2=data2L
	psubd	xmm3,xmm6		; xmm3=data2H

	movdqa	xmm7,[rel PD_DESCALE_P1_4]	; xmm7=[PD_DESCALE_P1_4]

	paddd	xmm4,xmm7
	paddd	xmm0,xmm7
	psrad	xmm4,DESCALE_P1_4
	psrad	xmm0,DESCALE_P1_4
	paddd	xmm2,xmm7
	paddd	xmm3,xmm7
	psrad	xmm2,DESCALE_P1_4
	psrad	xmm3,DESCALE_P1_4

	packssdw  xmm4,xmm0	; xmm4=data1=(10 11 12 13 14 15 16 17)
	packssdw  xmm2,xmm3	; xmm2=data2=(20 21 22 23 24 25 26 27)

	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)

	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
.column_end:

	; -- Prefetch the next coefficient block

	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

	; ---- Pass 2: process rows, store into output array.

	mov	rdi, r12		; (JSAMPROW *)
	; output_col is used as an index below.  Copy only the low 32 bits
	; (zero-extending into rax) so that stale upper bits of r13 cannot
	; reach the address computation.
	; NOTE(review): assumes JDIMENSION is a 32-bit unsigned type, as in
	; libjpeg's jmorecfg.h - confirm.
	mov	eax, r13d

	; -- Even part

	pxor      xmm4,xmm4
	punpcklwd xmm4,xmm1		; xmm4=tmp0
	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1

	; -- Odd part

	punpckhwd xmm1,xmm0
	punpckhwd xmm6,xmm3
	movdqa    xmm5,xmm1
	movdqa    xmm2,xmm6
	pmaddwd   xmm1,[rel PW_F256_F089]	; xmm1=(tmp2)
	pmaddwd   xmm6,[rel PW_MF060_MF050]	; xmm6=(tmp2)
	pmaddwd   xmm5,[rel PW_F106_MF217]	; xmm5=(tmp0)
	pmaddwd   xmm2,[rel PW_F145_MF021]	; xmm2=(tmp0)

	paddd     xmm6,xmm1		; xmm6=tmp2
	paddd     xmm2,xmm5		; xmm2=tmp0

	; -- Even part

	punpcklwd xmm0,xmm3
	pmaddwd   xmm0,[rel PW_F184_MF076]	; xmm0=tmp2

	movdqa    xmm7,xmm4
	paddd     xmm4,xmm0		; xmm4=tmp10
	psubd     xmm7,xmm0		; xmm7=tmp12

	; -- Final output stage

	movdqa	xmm1,[rel PD_DESCALE_P2_4]	; xmm1=[PD_DESCALE_P2_4]

	movdqa	xmm5,xmm4
	movdqa	xmm3,xmm7
	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)

	paddd	xmm4,xmm1
	paddd	xmm7,xmm1
	psrad	xmm4,DESCALE_P2_4
	psrad	xmm7,DESCALE_P2_4
	paddd	xmm5,xmm1
	paddd	xmm3,xmm1
	psrad	xmm5,DESCALE_P2_4
	psrad	xmm3,DESCALE_P2_4

	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)

	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)

	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)

	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
	paddb     xmm4,[rel PB_CENTERJSAMP]

	; Rotate the four 4-byte output rows so that each one sits in the low
	; dword of its own register, ready for a movd store.
	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)

	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
	mov	rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
	movd	XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
	movd	XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
	mov	rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
	mov	rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
	movd	XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
	movd	XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3

	uncollect_args
	mov	rsp,rbp		; rsp <- aligned frame base
	pop	rsp		; rsp <- 64-bit value saved in the prologue
	pop	rbp		; restore caller's rbp
	ret
|
||||
|
||||
|
||||
; --------------------------------------------------------------------------
;
; Perform dequantization and inverse DCT on one block of coefficients,
; producing a reduced-size 2x2 output block.
;
; GLOBAL(void)
; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
;                      JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Register roles once collect_args has run (collect_args/uncollect_args are
; defined in jsimdext.inc, not visible here; the mapping below is the one
; documented by the original author):
;
;   r10 = void * dct_table
;   r11 = JCOEFPTR coef_block
;   r12 = JSAMPARRAY output_buf
;   r13 = JDIMENSION output_col
;
; Scratch: rax, rcx, rdx, rsi, rdi, xmm0-xmm7, flags.
; rbx is used as scratch and is saved/restored here.
; No stack work area is needed; all intermediates stay in registers.
;

	align	16
	global	EXTN(jsimd_idct_2x2_sse2)

EXTN(jsimd_idct_2x2_sse2):
	push	rbp
	mov	rbp,rsp
	push	rbx
	collect_args

	; ---- Pass 1: process columns from input.

	mov	rdx, r10		; quantptr
	mov	rsi, r11		; inptr

	; | input:                  | result:        |
	; | 00 01 ** 03 ** 05 ** 07 |                |
	; | 10 11 ** 13 ** 15 ** 17 |                |
	; | ** ** ** ** ** ** ** ** |                |
	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
	; | 50 51 ** 53 ** 55 ** 57 |                |
	; | ** ** ** ** ** ** ** ** |                |
	; | 70 71 ** 73 ** 75 ** 77 |                |

	; -- Odd part

	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]

	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)

	; Build a mask that keeps the high word of every dword.
	pcmpeqd   xmm7,xmm7
	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}

	; Constant operands use RIP-relative addressing so the object can be
	; linked into position-independent code.
	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
	pmaddwd   xmm4,[rel PW_F362_MF127]
	pmaddwd   xmm5,[rel PW_F085_MF072]

	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
	pmaddwd	xmm0,[rel PW_F362_MF127]
	pmaddwd	xmm2,[rel PW_F085_MF072]

	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]

	; -- Even part

	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]

	; xmm6=(00 01 ** 03 ** 05 ** 07)

	; Move each wanted word into the high half of its dword, then shift
	; right arithmetically: sign-extends and scales by 2^(CONST_BITS+2).
	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]

	; -- Final output stage

	movdqa	xmm3,xmm6
	movdqa	xmm5,xmm1
	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)

	movdqa	xmm2,[rel PD_DESCALE_P1_2]	; xmm2=[PD_DESCALE_P1_2]

	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)

	movdqa     xmm7,xmm1
	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)

	paddd	xmm6,xmm2
	psrad	xmm6,DESCALE_P1_2

	paddd	xmm1,xmm2
	paddd	xmm7,xmm2
	psrad	xmm1,DESCALE_P1_2
	psrad	xmm7,DESCALE_P1_2

	; -- Prefetch the next coefficient block

	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
	prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

	; ---- Pass 2: process rows, store into output array.

	mov	rdi, r12		; (JSAMPROW *)
	; output_col is used as an index below.  Copy only the low 32 bits
	; (zero-extending into rax) so that stale upper bits of r13 cannot
	; reach the address computation.
	; NOTE(review): assumes JDIMENSION is a 32-bit unsigned type, as in
	; libjpeg's jmorecfg.h - confirm.
	mov	eax, r13d

	; | input:| result:|
	; | A0 B0 |        |
	; | A1 B1 | C0 C1  |
	; | A3 B3 | D0 D1  |
	; | A5 B5 |        |
	; | A7 B7 |        |

	; -- Odd part

	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
	pmaddwd   xmm1,[rel PW_F362_MF127]
	pmaddwd   xmm7,[rel PW_F085_MF072]

	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]

	; -- Even part

	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]

	; -- Final output stage

	movdqa    xmm4,xmm6
	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)

	punpckldq xmm6,xmm4		; xmm6=(C0 D0 C1 D1)

	paddd     xmm6,[rel PD_DESCALE_P2_2]
	psrad     xmm6,DESCALE_P2_2

	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
	paddb     xmm6,[rel PB_CENTERJSAMP]

	; Each output row is two adjacent bytes, i.e. one word of xmm6.
	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --)
	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)

	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
	mov	rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
	mov	WORD [rdx+rax*SIZEOF_JSAMPLE], bx
	mov	WORD [rsi+rax*SIZEOF_JSAMPLE], cx

	uncollect_args
	pop	rbx
	pop	rbp
	ret
|
|
@ -0,0 +1,95 @@
|
|||
;
|
||||
; jsimdcpu.asm - SIMD instruction support check
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright 2009 D. R. Commander
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
; Check if the CPU supports SIMD instructions
;
; GLOBAL(unsigned int)
; jpeg_simd_cpu_support (void)
;
; Returns (in eax) a bitmask built by OR-ing together JSIMD_MMX, JSIMD_SSE,
; JSIMD_SSE2 and JSIMD_3DNOW for each extension that CPUID reports.
; Returns 0 if CPUID cannot be used or reports no standard leaves.
;
; Clobbers: rax, rcx, rdx, rdi, flags.  rbx (overwritten by CPUID) is saved
; and restored.
; NOTE(review): rdi is used as scratch without being saved.  That is fine
; under the System V AMD64 ABI, where rdi is caller-saved; confirm before
; building this for an ABI in which rdi is callee-saved.
;

	align	16
	global	EXTN(jpeg_simd_cpu_support)

EXTN(jpeg_simd_cpu_support):
	push	rbx			; CPUID overwrites ebx

	xor	edi,edi			; simd support flag (32-bit write zeroes rdi)

	; CPUID is usable only if the ID bit (bit 21) of the flags register
	; can be toggled by software.
	pushfq
	pop	rax
	mov	rdx,rax			; rdx = original flags image
	xor	rax, 1<<21		; flip ID bit in EFLAGS
	push	rax
	popfq
	pushfq
	pop	rax
	xor	rax,rdx			; zero if the flip did not stick
	jz	short .return		; CPUID is not supported

	; Check for MMX/SSE/SSE2 instruction support
	xor	eax,eax			; leaf 0: eax = highest standard leaf
	cpuid
	test	eax,eax
	jz	short .return		; leaf 1 is not available

	mov	eax, 1			; leaf 1: feature information
	cpuid
	mov	eax,edx			; eax = Standard feature flags

	test	eax, 1<<23		; bit23:MMX
	jz	short .no_mmx
	or	edi, byte JSIMD_MMX
.no_mmx:
	test	eax, 1<<25		; bit25:SSE
	jz	short .no_sse
	or	edi, byte JSIMD_SSE
.no_sse:
	test	eax, 1<<26		; bit26:SSE2
	jz	short .no_sse2
	or	edi, byte JSIMD_SSE2
.no_sse2:

	; Check for 3DNow! instruction support
	mov	eax, 0x80000000		; eax = highest extended leaf
	cpuid
	cmp	eax, 0x80000000
	jbe	short .return		; leaf 0x80000001 is not available

	mov	eax, 0x80000001
	cpuid
	mov	eax,edx			; eax = Extended feature flags

	test	eax, 1<<31		; bit31:3DNow!(vendor independent)
	jz	short .no_3dnow
	or	edi, byte JSIMD_3DNOW
.no_3dnow:

.return:
	mov	eax,edi			; return the accumulated flag mask

	pop	rbx
	ret
|
||||
|
|
@ -48,15 +48,20 @@
|
|||
%define SEG_TEXT .text align=16 public use32 class=CODE
|
||||
%define SEG_CONST .data align=16 public use32 class=DATA
|
||||
|
||||
%elifdef ELF ; ----(nasm -felf -DELF ...)------------
|
||||
%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
|
||||
; * Linux
|
||||
; * *BSD family Unix using elf format
|
||||
; * Unix System V, including Solaris x86, UnixWare and SCO Unix
|
||||
|
||||
; -- segment definition --
|
||||
;
|
||||
%ifndef __x86_64__
; Built without -D__x86_64__: section attributes are spelled out in full.
%define SEG_TEXT .text progbits alloc exec nowrite align=16
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%else
; Built with -D__x86_64__: only the section type and alignment are given.
%define SEG_TEXT .text progbits align=16
%define SEG_CONST .rodata progbits align=16
%endif
|
||||
|
||||
; To make the code position-independent, append -DPIC to the commandline
|
||||
;
|
||||
|
@ -103,9 +108,15 @@
|
|||
; --------------------------------------------------------------------------
|
||||
; Common types
|
||||
;
|
||||
%ifndef __x86_64__
; Without -D__x86_64__ a pointer is one dword wide.
%define POINTER dword			; general pointer type
%define SIZEOF_POINTER SIZEOF_DWORD	; sizeof(POINTER)
%define POINTER_BIT DWORD_BIT		; sizeof(POINTER)*BYTE_BIT
%else
; With -D__x86_64__ a pointer is one qword wide.
%define POINTER qword			; general pointer type
%define SIZEOF_POINTER SIZEOF_QWORD	; sizeof(POINTER)
%define POINTER_BIT QWORD_BIT		; sizeof(POINTER)*BYTE_BIT
%endif
|
||||
|
||||
%define INT dword ; signed integer type
|
||||
%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
|
||||
|
@ -268,6 +279,32 @@ const_base:
|
|||
align %1, db 0 ; filling zeros
|
||||
%endmacro
|
||||
|
||||
%ifdef __x86_64__

; collect_args: push r10-r15, then copy rdi, rsi, rdx, rcx, r8, r9 (the
; System V AMD64 integer argument registers) into r10, r11, r12, r13, r14,
; r15 respectively.  Lowers rsp by six qwords; pair every use with
; uncollect_args.
%imacro collect_args 0
	push	r10
	push	r11
	push	r12
	push	r13
	push	r14
	push	r15
	mov	r15, r9			; 6th argument
	mov	r14, r8			; 5th argument
	mov	r13, rcx		; 4th argument
	mov	r12, rdx		; 3rd argument
	mov	r11, rsi		; 2nd argument
	mov	r10, rdi		; 1st argument
%endmacro

; uncollect_args: restore r15..r10 by popping in the reverse of the order
; in which collect_args pushed them.
%imacro uncollect_args 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	r11
	pop	r10
%endmacro

%endif
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
; Defines picked up from the C headers
|
||||
|
|
|
@ -11,7 +11,7 @@ while [ $# -gt 0 ]; do
|
|||
pic=yes
|
||||
fi
|
||||
;;
|
||||
-f|-fbin|-faout|-faoutb|-fcoff|-felf|-fas86| \
|
||||
-f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
|
||||
-fobj|-fwin32|-frdf|-fieee|-fmacho)
|
||||
# it's a file format specifier for nasm.
|
||||
command="$command $1"
|
||||
|
|
Загрузка…
Ссылка в новой задаче