/* * jdcolor.c * * This file contains an Optimized Routine for YCbCr->RGB Color Space Conversion * * Copyright (C) 1991-1996, Thomas G. Lane. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * * This file contains output colorspace conversion routines. * */ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" #include "xp_core.h" /* Private subobject */ typedef struct { struct jpeg_color_deconverter pub; /* public fields */ /* Private state for YCC->RGB conversion */ int * Cr_r_tab; /* => table for Cr to R conversion */ int * Cb_b_tab; /* => table for Cb to B conversion */ INT32 * Cr_g_tab; /* => table for Cr to G conversion */ INT32 * Cb_g_tab; /* => table for Cb to G conversion */ } my_color_deconverter; typedef my_color_deconverter * my_cconvert_ptr; #ifdef XP_WIN32 /* Info Added for MMX(TM) Technology Optimization */ extern void MMXYCbCr2RGB( int columns, unsigned char *inY, unsigned char *inU, unsigned char *inV, unsigned char *outRGB); /* These constants correspond to CCIR 601-1 R = [256*Y + 359*(Cr-128)] / 256 G = [256*Y - 88*(Cb-128) - 183*(Cr-128)] / 256 B = [256*Y + 454*(Cb-128)] / 256 Conventional floating point equations: R = Y + 1.40200 * Cr G = Y - 0.34414 * Cb - 0.71414 * Cr B = Y + 1.77200 * Cb */ /*Ry=0100 Ru=0000 Rv=0167*/ /*Gy=0100 Gu=FFA8 Gv=FF49*/ /*By=0100 Bu=01C6 Bv=0000*/ /* constants for YCbCr->RGB and YCbCrA->RGBA*/ static __int64 const_0 = 0x0000000000000000; static __int64 const_sub128 = 0x0080008000800080; static __int64 const_VUmul = 0xFF49FFA8FF49FFA8; static __int64 const_YVmul = 0x0100016701000167; static __int64 const_YUmul = 0x010001C6010001C6; static __int64 mask_highd = 0xFFFFFFFF00000000; static __int64 const_invert = 0x00FFFFFF00FFFFFF; /* End of added info */ #endif /**************** YCbCr -> RGB conversion: most common case **************/ /* * YCbCr is defined per CCIR 601-1, except that Cb and Cr are * normalized to the 
range 0..MAXJSAMPLE rather than -0.5 .. 0.5. * The conversion equations to be implemented are therefore * R = Y + 1.40200 * Cr * G = Y - 0.34414 * Cb - 0.71414 * Cr * B = Y + 1.77200 * Cb * where Cb and Cr represent the incoming values less CENTERJSAMPLE. * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.) * * To avoid floating-point arithmetic, we represent the fractional constants * as integers scaled up by 2^16 (about 4 digits precision); we have to divide * the products by 2^16, with appropriate rounding, to get the correct answer. * Notice that Y, being an integral input, does not contribute any fraction * so it need not participate in the rounding. * * For even more speed, we avoid doing any multiplications in the inner loop * by precalculating the constants times Cb and Cr for all possible values. * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table); * for 12-bit samples it is still acceptable. It's not very reasonable for * 16-bit samples, but if you want lossless storage you shouldn't be changing * colorspace anyway. * The Cr=>R and Cb=>B values can be rounded to integers in advance; the * values for the G calculation are left scaled up, since we must add them * together before rounding. */ #define SCALEBITS 16 /* speediest right-shift on some machines */ #define ONE_HALF ((INT32) 1 << (SCALEBITS-1)) #define FIX(x) ((INT32) ((x) * (1L<RGB colorspace conversion. 
*/ LOCAL void build_ycc_rgb_table (j_decompress_ptr cinfo) { my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; int i; INT32 x; SHIFT_TEMPS cconvert->Cr_r_tab = (int *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE+1) * SIZEOF(int)); cconvert->Cb_b_tab = (int *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE+1) * SIZEOF(int)); cconvert->Cr_g_tab = (INT32 *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE+1) * SIZEOF(INT32)); cconvert->Cb_g_tab = (INT32 *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE+1) * SIZEOF(INT32)); for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) { /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */ /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */ /* Cr=>R value is nearest int to 1.40200 * x */ cconvert->Cr_r_tab[i] = (int) RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS); /* Cb=>B value is nearest int to 1.77200 * x */ cconvert->Cb_b_tab[i] = (int) RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS); /* Cr=>G value is scaled-up -0.71414 * x */ cconvert->Cr_g_tab[i] = (- FIX(0.71414)) * x; /* Cb=>G value is scaled-up -0.34414 * x */ /* We also add in ONE_HALF so that need not do it in inner loop */ cconvert->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF; } } /* * Convert some rows of samples to the output colorspace. * * Note that we change from noninterleaved, one-plane-per-component format * to interleaved-pixel format. The output buffer is therefore three times * as wide as the input buffer. * A starting row offset is provided only for the input buffer. The caller * can easily adjust the passed output_buf value to accommodate any row * offset required on that side. 
*/ METHODDEF void ycc_rgb_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) { my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; register int y, cb, cr; register JSAMPROW outptr; register JSAMPROW inptr0, inptr1, inptr2; register JDIMENSION col; JDIMENSION num_cols = cinfo->output_width; #ifdef XP_WIN32 /* Alignment variables - CRK */ /* JDIMENSION tail_cols = num_cols&7; */ JDIMENSION mmx_cols=num_cols&~7; #endif /* copy these pointers into registers if possible */ register JSAMPLE * range_limit = cinfo->sample_range_limit; register int * Crrtab = cconvert->Cr_r_tab; register int * Cbbtab = cconvert->Cb_b_tab; register INT32 * Crgtab = cconvert->Cr_g_tab; register INT32 * Cbgtab = cconvert->Cb_g_tab; SHIFT_TEMPS #ifdef XP_WIN32 if(MMXAvailable) { //MMX Code - CRK while (--num_rows >= 0) { inptr0 = input_buf[0][input_row]; inptr1 = input_buf[1][input_row]; inptr2 = input_buf[2][input_row]; input_row++; outptr = *output_buf++; MMXYCbCr2RGB(mmx_cols, inptr0, inptr1, inptr2, outptr); outptr += 3*mmx_cols; for (col = mmx_cols; col < num_cols; col++) { y = GETJSAMPLE(inptr0[col]); cb = GETJSAMPLE(inptr1[col]); cr = GETJSAMPLE(inptr2[col]); /* Range-limiting is essential due to noise introduced by DCT losses. */ outptr[RGB_RED] = range_limit[y + Crrtab[cr]]; outptr[RGB_GREEN] = range_limit[y + ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS))]; outptr[RGB_BLUE] = range_limit[y + Cbbtab[cb]]; outptr += RGB_PIXELSIZE; } } __asm emms } else { #endif while (--num_rows >= 0) { inptr0 = input_buf[0][input_row]; inptr1 = input_buf[1][input_row]; inptr2 = input_buf[2][input_row]; input_row++; outptr = *output_buf++; for (col = 0; col < num_cols; col++) { y = GETJSAMPLE(inptr0[col]); cb = GETJSAMPLE(inptr1[col]); cr = GETJSAMPLE(inptr2[col]); /* Range-limiting is essential due to noise introduced by DCT losses. 
*/ outptr[RGB_RED] = range_limit[y + Crrtab[cr]]; outptr[RGB_GREEN] = range_limit[y + ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS))]; outptr[RGB_BLUE] = range_limit[y + Cbbtab[cb]]; outptr += RGB_PIXELSIZE; } } #ifdef XP_WIN32 } #endif } /**************** Cases other than YCbCr -> RGB **************/ /* * Color conversion for no colorspace change: just copy the data, * converting from separate-planes to interleaved representation. */ METHODDEF void null_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) { register JSAMPROW inptr, outptr; register JDIMENSION count; register int num_components = cinfo->num_components; JDIMENSION num_cols = cinfo->output_width; int ci; while (--num_rows >= 0) { for (ci = 0; ci < num_components; ci++) { inptr = input_buf[ci][input_row]; outptr = output_buf[0] + ci; for (count = num_cols; count > 0; count--) { *outptr = *inptr++; /* needn't bother with GETJSAMPLE() here */ outptr += num_components; } } input_row++; output_buf++; } } /* * Color conversion for grayscale: just copy the data. * This also works for YCbCr -> grayscale conversion, in which * we just copy the Y (luminance) component and ignore chrominance. */ METHODDEF void grayscale_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) { jcopy_sample_rows(input_buf[0], (int) input_row, output_buf, 0, num_rows, cinfo->output_width); } /* * Adobe-style YCCK->CMYK conversion. * We convert YCbCr to R=1-C, G=1-M, and B=1-Y using the same * conversion as above, while passing K (black) unchanged. * We assume build_ycc_rgb_table has been called. 
*/ METHODDEF void ycck_cmyk_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) { my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; register int y, cb, cr; register JSAMPROW outptr; register JSAMPROW inptr0, inptr1, inptr2, inptr3; register JDIMENSION col; JDIMENSION num_cols = cinfo->output_width; /* copy these pointers into registers if possible */ register JSAMPLE * range_limit = cinfo->sample_range_limit; register int * Crrtab = cconvert->Cr_r_tab; register int * Cbbtab = cconvert->Cb_b_tab; register INT32 * Crgtab = cconvert->Cr_g_tab; register INT32 * Cbgtab = cconvert->Cb_g_tab; SHIFT_TEMPS while (--num_rows >= 0) { inptr0 = input_buf[0][input_row]; inptr1 = input_buf[1][input_row]; inptr2 = input_buf[2][input_row]; inptr3 = input_buf[3][input_row]; input_row++; outptr = *output_buf++; for (col = 0; col < num_cols; col++) { y = GETJSAMPLE(inptr0[col]); cb = GETJSAMPLE(inptr1[col]); cr = GETJSAMPLE(inptr2[col]); /* Range-limiting is essential due to noise introduced by DCT losses. */ outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])]; /* red */ outptr[1] = range_limit[MAXJSAMPLE - (y + /* green */ ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS)))]; outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])]; /* blue */ /* K passes through unchanged */ outptr[3] = inptr3[col]; /* don't need GETJSAMPLE here */ outptr += 4; } } } /* * Empty method for start_pass. */ METHODDEF void start_pass_dcolor (j_decompress_ptr cinfo) { /* no work needed */ } /* * Module initialization routine for output colorspace conversion. 
*/

GLOBAL void
jinit_color_deconverter (j_decompress_ptr cinfo)
{
  my_cconvert_ptr cconvert;
  int ci;

  /* Allocate the private converter object and hook it into cinfo */
  cconvert = (my_cconvert_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                SIZEOF(my_color_deconverter));
  cinfo->cconvert = (struct jpeg_color_deconverter *) cconvert;
  cconvert->pub.start_pass = start_pass_dcolor;

  /* Make sure num_components agrees with jpeg_color_space */
  switch (cinfo->jpeg_color_space) {
  case JCS_GRAYSCALE:
    if (cinfo->num_components != 1)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;

  case JCS_RGB:
  case JCS_YCbCr:
    if (cinfo->num_components != 3)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;

  case JCS_CMYK:
  case JCS_YCCK:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;

  default:                      /* JCS_UNKNOWN can be anything */
    if (cinfo->num_components < 1)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;
  }

  /* Set out_color_components and conversion method based on requested space.
   * Also clear the component_needed flags for any unused components,
   * so that earlier pipeline stages can avoid useless computation.
   */

  switch (cinfo->out_color_space) {
  case JCS_GRAYSCALE:
    cinfo->out_color_components = 1;
    if (cinfo->jpeg_color_space == JCS_GRAYSCALE ||
        cinfo->jpeg_color_space == JCS_YCbCr) {
      cconvert->pub.color_convert = grayscale_convert;
      /* For color->grayscale conversion, only the Y (0) component is needed */
      for (ci = 1; ci < cinfo->num_components; ci++)
        cinfo->comp_info[ci].component_needed = FALSE;
    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  case JCS_RGB:
    cinfo->out_color_components = RGB_PIXELSIZE;
    if (cinfo->jpeg_color_space == JCS_YCbCr) {
      cconvert->pub.color_convert = ycc_rgb_convert;
      build_ycc_rgb_table(cinfo);
    } else if (cinfo->jpeg_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
      cconvert->pub.color_convert = null_convert;
    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  case JCS_CMYK:
    cinfo->out_color_components = 4;
    if (cinfo->jpeg_color_space == JCS_YCCK) {
      cconvert->pub.color_convert = ycck_cmyk_convert;
      build_ycc_rgb_table(cinfo);
    } else if (cinfo->jpeg_color_space == JCS_CMYK) {
      cconvert->pub.color_convert = null_convert;
    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  default:
    /* Permit null conversion to same output space */
    if (cinfo->out_color_space == cinfo->jpeg_color_space) {
      cinfo->out_color_components = cinfo->num_components;
      cconvert->pub.color_convert = null_convert;
    } else                      /* unsupported non-null conversion */
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;
  }

  if (cinfo->quantize_colors)
    cinfo->output_components = 1; /* single colormapped output component */
  else
    cinfo->output_components = cinfo->out_color_components;
}


#ifdef XP_WIN32

// MMX(tm) technology assembly code additions begin here

/*
 * Convert one row segment of planar Y/Cb/Cr bytes to packed 3-byte RGB
 * using MMX instructions.
 *
 *   columns  number of pixels to convert; columns/4 groups of four pixels
 *            are processed, any remainder (columns % 4) is ignored
 *   inY      luminance samples
 *   inU      Cb samples
 *   inV      Cr samples
 *   outRGB   output; 12 bytes are written per group of four pixels
 *
 * Each loop pass reads 4 bytes from each input pointer and stores three
 * 32-bit words to the output.  The routine does NOT execute emms; the
 * caller must do so after its last call.
 */
void MMXYCbCr2RGB(
  int columns,
  unsigned char *inY,
  unsigned char *inU,
  unsigned char *inV,
  unsigned char *outRGB)
{
  /* The assembly loop below is bottom-tested (dec edi / jnz), so it always
   * runs at least once, and a group count of zero would wrap the counter
   * and keep looping far beyond the buffers.  Return early when there is
   * not even one complete group of four pixels (this also rejects
   * negative counts).
   */
  if (columns < 4)
    return;

  //; This program will compile with Microsoft Visual C++ 4.1 or greater.
  //; Use the /GM compile switch to allow the compilation of MMX(tm) Technology
  //; instructions as inline assembly

  __asm {

    // Initialize all the pointers, loop variables
    mov       eax, inY
    mov       ecx, inV
    mov       edi, columns
    mov       ebx, inU
    shr       edi, 2              ; number of loops = cols/4
    mov       edx, outRGB

    // Main Loop to process 12 bytes
  YUVtoRGB:
    movd      mm0, [eax]          ; 0/0/0/0/Y3/Y2/Y1/Y0
    pxor      mm7, mm7            ; use mm7 as const_0 to achieve better pairing at start
    movd      mm2, [ebx]          ; 0/0/0/0/U3/U2/U1/U0
    punpcklbw mm0, mm7            ; Y3/Y2/Y1/Y0
    movd      mm3, [ecx]          ; 0/0/0/0/V3/V2/V1/V0
    punpcklbw mm2, mm7            ; U3/U2/U1/U0
    psubsw    mm2, const_sub128   ; U3'/U2'/U1'/U0'
    punpcklbw mm3, mm7            ; V3/V2/V1/V0
    psubsw    mm3, const_sub128   ; V3'/V2'/V1'/V0'
    movq      mm4, mm2
    punpcklwd mm2, mm3            ; V1'/U1'/V0'/U0'
    movq      mm1, mm0
    pmaddwd   mm2, const_VUmul    ; gvV1'+guU1'/gvV0'+guU0'
    psllw     mm1, 8              ; Y3*256/Y2*256/Y1*256/Y0*256
    movq      mm6, mm1
    punpcklwd mm1, mm7            ; Y1*256/Y0*256
    punpckhwd mm6, mm7            ; Y3*256/Y2*256
    movq      mm5, mm4
    punpckhwd mm5, mm3            ; V3'/U3'/V2'/U2'
    paddd     mm2, mm1            ; G1*256/G0*256 (mm1 free)
    pmaddwd   mm5, const_VUmul    ; gvV3'+guU3'/gvV2'+guU2'
    movq      mm1, mm3            ; (using mm1)
    punpcklwd mm3, mm0            ; Y1/V1'/Y0/V0'
    movq      mm7, mm4            ; This wipes out the zero constant
    pmaddwd   mm3, const_YVmul    ; ryY1+rvV1'/ryY0+rvV0'
    psrad     mm2, 8              ; G1/G0
    paddd     mm5, mm6            ; G3*256/G2*256 (mm6 free)
    punpcklwd mm4, mm0            ; Y1/U1'/Y0/U0'
    pmaddwd   mm4, const_YUmul    ; // "byY1+buU1'/byY0'+buU0'"
    psrad     mm5, 8              ; G3/G2
    psrad     mm3, 8              ; R1/R0
    punpckhwd mm7 , mm0           ; Y3/U3'/Y2/U2'
    psrad     mm4, 8              ; B1/B0
    movq      mm6, mm3
    pmaddwd   mm7, const_YUmul    ; // "byY3+buU3'/byY2'+buU2'"
    punpckhwd mm1, mm0            ; Y3/V3'/Y2/V2'
    pmaddwd   mm1, const_YVmul    ; ryY3+rvV3'/ryY2+rvV2'
    punpckldq mm3, mm2            ; G0/R0
    punpckhdq mm6, mm2            ; G1/R1 (mm2 free)
    movq      mm0, mm4
    psrad     mm7, 8              ; B3/B2
    punpckldq mm4, const_0        ; 0/B0
    punpckhdq mm0, const_0        ; 0/B1
    psrad     mm1, 8              ; R3/R2
    packssdw  mm3, mm4            ; 0/B0/G0/R0 (mm4 free)
    movq      mm2, mm1
    packssdw  mm6, mm0            ; 0/B1/G1/R1 (mm0 free)
    packuswb  mm3, mm6            ; 0/B1/G1/R1/0/B0/G0/R0 (mm6 free)
    punpckldq mm2, mm5            ; G2/R2
    movq      mm4, mm7
    punpckhdq mm1, mm5            ; G3/R3 (mm5 done)
    punpckldq mm7, const_0        ; 0/B2 (change this line for alpha code)
    punpckhdq mm4, const_0        ; 0/B3 (change this line for alpha code)
    movq      mm0, mm3
    packssdw  mm2, mm7            ; 0/B2/G2/R2
    pand      mm3, mask_highd     ; 0/B1/G1/R1/0/0/0/0
    packssdw  mm1, mm4            ; 0/B3/G3/R3
    psrlq     mm3, 8              ; 0/0/B1/G1/R1/0/0/0
    add       edx, 12
    por       mm0, mm3            ; 0/0/?/?/R1/B0/G0/R0
    packuswb  mm2, mm1            ; 0/B3/G3/R3/0/B2/G2/R2
    psrlq     mm3, 32             ; 0/0/0/0/0/0/B1/G1
    add       eax, 4
    movd      [edx][-12], mm0     ; correct for add
    punpcklwd mm3, mm2            ; 0/B2/0/0/G2/R2/B1/G1
    psrlq     mm2, 24             ; 0/0/0/0/B3/G3/R3/0
    add       ecx, 4
    movd      [edx][-8], mm3      ; correct for previous add
    psrlq     mm3, 48             ; 0/0/0/0/0/0/0/B2
    por       mm2, mm3            ; 0/0/0/0/B3/G3/R3/0
    add       ebx, 4
    movd      [edx][-4], mm2      ; correct for previous add
    dec       edi
    jnz       YUVtoRGB            ; Do 12 more bytes if not zero

    //emms       // "commented out since it is done at the end of the caller's loop"
  } // end of __asm
}

#endif