Bug 573948 - Part 1: Use libjpeg-turbo instead of libjpeg. r=jmuizelaar

2010-07-19 10:34:41 -07:00 · 2010-07-19 10:34:41 -07:00 · 81b4973141
--- a/config/autoconf.mk.in
+++ b/config/autoconf.mk.in
@ -168,6 +168,10 @@ VPX_AS_CONVERSION = @VPX_AS_CONVERSION@
 VPX_ASM_SUFFIX = @VPX_ASM_SUFFIX@
 VPX_X86_ASM = @VPX_X86_ASM@
 VPX_ARM_ASM = @VPX_ARM_ASM@
+LIBJPEG_TURBO_AS = @LIBJPEG_TURBO_AS@
+LIBJPEG_TURBO_ASFLAGS = @LIBJPEG_TURBO_ASFLAGS@
+LIBJPEG_TURBO_X86_ASM = @LIBJPEG_TURBO_X86_ASM@
+LIBJPEG_TURBO_X64_ASM = @LIBJPEG_TURBO_X64_ASM@
 NS_PRINTING = @NS_PRINTING@
 MOZ_PDF_PRINTING = @MOZ_PDF_PRINTING@
 MOZ_CRASHREPORTER = @MOZ_CRASHREPORTER@
--- a/configure.in
+++ b/configure.in
@ -4971,6 +4971,10 @@ VPX_AS_CONVERSION=
 VPX_ASM_SUFFIX=
 VPX_X86_ASM=
 VPX_ARM_ASM=
+LIBJPEG_TURBO_AS=
+LIBJPEG_TURBO_ASFLAGS=
+LIBJPEG_TURBO_X86_ASM=
+LIBJPEG_TURBO_X64_ASM=
 MOZ_PANGO=1
 MOZ_PERMISSIONS=1
 MOZ_PLACES=1
@ -6441,6 +6445,67 @@ if test -z "$MOZ_CRASHREPORTER_ENABLE_PERCENT"; then
 fi
 AC_DEFINE_UNQUOTED(MOZ_CRASHREPORTER_ENABLE_PERCENT, $MOZ_CRASHREPORTER_ENABLE_PERCENT)

+dnl ========================================================
+dnl = libjpeg-turbo configuration
+dnl ========================================================
+
+dnl Detect if we can use yasm to compile libjpeg-turbo's optimized assembly
+dnl files.
+AC_MSG_CHECKING([for YASM assembler])
+AC_CHECK_PROGS(LIBJPEG_TURBO_AS, yasm, "")
+
+dnl XXX jlebar -- need a yasm version check here.
+
+if test -n "LIBJPEG_TURBO_AS"; then
+
+  LIBJPEG_TURBO_AS="yasm"
+
+  dnl We have YASM; see if we support it on this platform.
+  case "$OS_ARCH:$OS_TEST" in
+  Linux:x86|Linux:i?86)
+    LIBJPEG_TURBO_ASFLAGS="-f elf32 -rnasm -pnasm -DPIC -DELF"
+    LIBJPEG_TURBO_X86_ASM=1
+  ;;
+  Linux:x86_64)
+    LIBJPEG_TURBO_ASFLAGS="-f elf64 -rnasm -pnasm -D__x86_64__ -DPIC -DELF"
+    LIBJPEG_TURBO_X64_ASM=1
+  ;;
+  SunOS:i?86)
+    LIBJPEG_TURBO_ASFLAGS="-f elf32 -rnasm -pnasm -DPIC -DELF"
+    LIBJPEG_TURBO_X86_ASM=1
+  ;;
+  SunOS:x86_64)
+    LIBJPEG_TURBO_ASFLAGS="-f elf64 -rnasm -pnasm -D__x86_64__ -DPIC -DELF"
+    LIBJPEG_TURBO_X64_ASM=1
+  ;;
+  Darwin:i?86)
+    LIBJPEG_TURBO_ASFLAGS="-f macho32 -rnasm -pnasm -DPIC -DMACHO"
+    LIBJPEG_TURBO_X86_ASM=1
+  ;;
+  Darwin:x86_64)
+    LIBJPEG_TURBO_ASFLAGS="-f macho64 -rnasm -pnasm -D__x86_64__ -DPIC -DMACHO"
+    LIBJPEG_TURBO_X64_ASM=1
+  ;;
+  WINNT:x86|WINNT:i?86)
+    LIBJPEG_TURBO_ASFLAGS="-f win32 -rnasm -pnasm -DPIC -DWIN32"
+    LIBJPEG_TURBO_X86_ASM=1
+  ;;
+  WINNT:x86_64)
+    LIBJPEG_TURBO_ASFLAGS="-f win64 -rnasm -pnasm -D__x86_64__ -DPIC -DWIN64"
+    LIBJPEG_TURBO_X64_ASM=1
+  ;;
+  esac
+
+fi # end have YASM
+
+if test -n "$LIBJPEG_TURBO_X86_ASM"; then
+  AC_DEFINE(LIBJPEG_TURBO_X86_ASM)
+elif test -n "$LIBJPEG_TURBO_X64_ASM"; then
+  AC_DEFINE(LIBJPEG_TURBO_X64_ASM)
+else
+  AC_MSG_WARN([No assembler or assembly support for libjpeg-turbo.  Using unoptimized C routines.])
+fi
+
 dnl ========================================================
 dnl = Enable compilation of specific extension modules
 dnl ========================================================
@ -9209,6 +9274,10 @@ AC_SUBST(VPX_AS_CONVERSION)
 AC_SUBST(VPX_ASM_SUFFIX)
 AC_SUBST(VPX_X86_ASM)
 AC_SUBST(VPX_ARM_ASM)
+AC_SUBST(LIBJPEG_TURBO_AS)
+AC_SUBST(LIBJPEG_TURBO_ASFLAGS)
+AC_SUBST(LIBJPEG_TURBO_X86_ASM)
+AC_SUBST(LIBJPEG_TURBO_X64_ASM)

 if test "$USING_HCC"; then
   CC='${topsrcdir}/build/hcc'
--- a/jpeg/MOZCHANGES
+++ b/jpeg/MOZCHANGES
@ -1,10 +1,69 @@
+To upgrade to a new revision of libjpeg-turbo, do the following:

-Changes made to pristine jpeg source by mozilla.org developers.
+* Check out libjpeg-turbo from SVN:

-2003/08/18 -- change default mapping for METHODDEF, LOCAL, GLOBAL, EXTERN to better match NSPR
+    $ svn co https://libjpeg-turbo.svn.sourceforge.net/svnroot/libjpeg-turbo/trunk libjpeg-turbo

-2003/03/14  -- mingw bustage fix. w32api uses different header guard define
-               for <basestd.h> than msvc.
+* In a clean clone of mozilla-central, run the following commands

-????/??/??  -- Lots of undocumented changes. :(
+    $ rm -rf jpeg
+    $ svn export --ignore-externals /path/to/libjpeg-turbo jpeg
+    $ cd jpeg

+* Now look through the new files and rm any which are npotb.  When I upgraded
+  to libjpeg-turbo 1.1.0, the only files I kept which didn't match
+
+    *.c  *.h *.asm *.inc
+
+  were README and README-turbo.
+
+  You can easily look for all non *.c, *.h, *.asm, and *.inc files by running
+
+    $ hg status -nu | grep -v '\(c\|h\|asm\|inc\)$'
+
+  Once you're comfortable that you're only deleting files you want to delete
+  (and you've hg add'ed the files you want to keep), you can nuke the remaining
+  files with
+
+    $ hg status -nu | grep -v '\(c\|h\|asm\|inc\)$' | xargs rm
+
+  A helpful command for finding the *.c files which aren't *currently* part of
+  the build is
+
+    diff <(ls *.c | sort) <(grep -o '\w*\.c' Makefile.in | sort)
+
+  of course, libjpeg-turbo might have added some new source files, so you'll
+  have to look though and figure out which of these files to keep.
+
+* Restore files modified in the Mozilla repository.
+
+    $ hg revert --no-backup Makefile.in jconfig.h jmorecfg.h simd/Makefile.in \
+      simd/jsimdcfg.inc jchuff.c jdhuff.c jdhuff.h MOZCHANGES
+
+* Update Makefile.in to build any new files.
+
+* Finally, tell hg that we've added or removed some files:
+
+    $ hg addremove
+
+
+== March 28, 2011 (initial commit, libjpeg-turbo v1.1.0 r469 2011-02-27) ==
+
+* Modified jmorecfg.h to define UINT8, UINT16, INT16, and INT32 in terms of
+  prtypes to fix a build error on Windows.
+
+* Defined INLINE as NS_ALWAYS_INLINE in jconfig.h.
+
+* Removed the following files which are licensed under the wxWindows license:
+
+    bmp.c, bmp.h, jpegut.c, jpgtest.cxx, rrtimer.h, rrutil.h, turbojpeg.h,
+    turbojpegl.c
+
+* Reverted the following files to what was previously in Mozilla's tree
+  (nominally libjpeg 6.2):
+
+    jchuff.c, jdhuff.c, jdhuff.h
+
+  since the versions of these files in libjpeg-turbo are also under the
+  wxWindows license.  (It would have been nicer to revert them to the new
+  libjpeg-8b code, but that doesn't easily integrate with libjpeg-turbo.)
--- a/jpeg/Makefile.in
+++ b/jpeg/Makefile.in
@ -15,11 +15,12 @@
 # The Original Code is mozilla.org code.
 #
 # The Initial Developer of the Original Code is
-# Netscape Communications Corporation.
-# Portions created by the Initial Developer are Copyright (C) 1998
+# Mozilla Corporation
+# Portions created by the Initial Developer are Copyright (C) 2010
 # the Initial Developer. All Rights Reserved.
 #
 # Contributor(s):
+#  Justin Lebar <justin.lebar@gmail.com>
 #
 # Alternatively, the contents of this file may be used under the terms of
 # either the GNU General Public License Version 2 or later (the "GPL"), or
@ -42,6 +43,7 @@ VPATH		= @srcdir@

 include $(DEPTH)/config/autoconf.mk

+DIRS		= simd
 MODULE		= jpeg
 LIBRARY_NAME	= mozjpeg

@ -58,64 +60,136 @@ endif
 GRE_MODULE	= 1

 CSRCS		= \
+		jcomapi.c \
 		jdapimin.c \
 		jdapistd.c \
-		jdatasrc.c \
 		jdatadst.c \
-		jdmaster.c \
-		jdinput.c \
-		jdmarker.c \
-		jdhuff.c \
-		jdphuff.c \
-		jdmainct.c \
+		jdatasrc.c \
 		jdcoefct.c \
-		jdpostct.c \
-		jddctmgr.c \
-		jidctfst.c \
-		jidctflt.c \
-		jidctint.c \
-		jdsample.c \
 		jdcolor.c \
-		jquant1.c \
-		jquant2.c \
+		jddctmgr.c \
+		jdhuff.c \
+		jdinput.c \
+		jdmainct.c \
+		jdmarker.c \
+		jdmaster.c \
 		jdmerge.c \
-		jcomapi.c \
-		jutils.c \
+		jdphuff.c \
+		jdpostct.c \
+		jdsample.c \
+		jdtrans.c \
 		jerror.c \
-		jmemmgr.c \
-		jmemnobs.c \
 		jfdctflt.c \
 		jfdctfst.c \
 		jfdctint.c \
-		$(NULL)
-
-EXPORTS		= \
-		jconfig.h \
-		jerror.h \
-		jinclude.h \
-		jmorecfg.h \
-		jpeglib.h \
-		jpegint.h \
-		jwinfig.h \
-		jos2fig.h \
+		jidctflt.c \
+		jidctfst.c \
+		jidctint.c \
+		jidctred.c \
+		jmemmgr.c \
+		jmemnobs.c \
+		jquant1.c \
+		jquant2.c \
+		jutils.c \
 		$(NULL)

 # These files enable support for writing JPEGs
 CSRCS		+= \
 		jcapimin.c \
-		jcparam.c \
 		jcapistd.c \
-		jcmarker.c \
-		jcinit.c \
-		jcmainct.c \
-		jchuff.c \
-		jcsample.c \
-		jcmaster.c \
 		jccoefct.c \
 		jccolor.c \
-		jcphuff.c \
 		jcdctmgr.c \
+		jchuff.c \
+		jcinit.c \
+		jcmainct.c \
+		jcmarker.c \
+		jcmaster.c \
+		jcparam.c \
+		jcphuff.c \
 		jcprepct.c \
+		jcsample.c \
+		$(NULL)
+
+AS=$(LIBJPEG_TURBO_AS)
+ASM_SUFFIX=asm
+ASFLAGS=$(LIBJPEG_TURBO_ASFLAGS) -I$(topsrcdir)/modules/libjpeg-turbo/simd/
+
+ifeq ($(AS),yasm)
+  # yasm doesn't like -c
+  AS_DASH_C_FLAG=
+endif
+
+# No SIMD support?
+ifeq (,$(LIBJPEG_TURBO_X86_ASM)$(LIBJPEG_TURBO_X64_ASM))
+  CSRCS += jsimd_none.c
+endif
+
+ifeq (1,$(LIBJPEG_TURBO_X64_ASM))
+  CSRCS   += simd/jsimd_x86_64.c
+  ASFILES += \
+	simd/jccolss2-64.asm \
+	simd/jcqnts2f-64.asm \
+	simd/jcqnts2i-64.asm \
+	simd/jcsamss2-64.asm \
+	simd/jdcolss2-64.asm \
+	simd/jdmerss2-64.asm \
+	simd/jdsamss2-64.asm \
+	simd/jfss2fst-64.asm \
+	simd/jfss2int-64.asm \
+	simd/jfsseflt-64.asm \
+	simd/jiss2flt-64.asm \
+	simd/jiss2fst-64.asm \
+	simd/jiss2int-64.asm \
+	simd/jiss2red-64.asm \
+	$(NULL)
+endif
+
+ifeq (1,$(LIBJPEG_TURBO_X86_ASM))
+  CSRCS   +=simd/jsimd_i386.c
+  ASFILES += \
+	simd/jccolmmx.asm \
+	simd/jccolss2.asm \
+	simd/jcqnt3dn.asm \
+	simd/jcqntmmx.asm \
+	simd/jcqnts2f.asm \
+	simd/jcqnts2i.asm \
+	simd/jcqntsse.asm \
+	simd/jcsammmx.asm \
+	simd/jcsamss2.asm \
+	simd/jdcolmmx.asm \
+	simd/jdcolss2.asm \
+	simd/jdmermmx.asm \
+	simd/jdmerss2.asm \
+	simd/jdsammmx.asm \
+	simd/jdsamss2.asm \
+	simd/jf3dnflt.asm \
+	simd/jfmmxfst.asm \
+	simd/jfmmxint.asm \
+	simd/jfss2fst.asm \
+	simd/jfss2int.asm \
+	simd/jfsseflt.asm \
+	simd/ji3dnflt.asm \
+	simd/jimmxfst.asm \
+	simd/jimmxint.asm \
+	simd/jimmxred.asm \
+	simd/jiss2flt.asm \
+	simd/jiss2fst.asm \
+	simd/jiss2int.asm \
+	simd/jiss2red.asm \
+	simd/jisseflt.asm \
+	simd/jsimdcpu.asm \
+	$(NULL)
+endif
+
+# jwinfig.h, jos2fig.h  ? XXX
+EXPORTS		= \
+		jconfig.h \
+		jerror.h \
+		jinclude.h \
+		jmorecfg.h \
+		jpegint.h \
+		jpeglib.h \
 		$(NULL)

 # need static lib for some of the libimg componentry to link properly
--- a/jpeg/README
+++ b/jpeg/README
@ -1,22 +1,20 @@
+libjpeg-turbo note:  This file is mostly taken from the libjpeg v8b README
+file, and it is included only for reference.  Some parts of it may not apply to
+libjpeg-turbo.  Please see README-turbo.txt for information specific to the
+turbo version.
+
+
 The Independent JPEG Group's JPEG software
 ==========================================

-README for release 6b of 27-Mar-1998
-====================================
+This distribution contains a release of the Independent JPEG Group's free JPEG
+software.  You are welcome to redistribute this software and to use it for any
+purpose, subject to the conditions under LEGAL ISSUES, below.

-This distribution contains the sixth public release of the Independent JPEG
-Group's free JPEG software.  You are welcome to redistribute this software and
-to use it for any purpose, subject to the conditions under LEGAL ISSUES, below.
-
-Serious users of this software (particularly those incorporating it into
-larger programs) should contact IJG at jpeg-info@uunet.uu.net to be added to
-our electronic mailing list.  Mailing list members are notified of updates
-and have a chance to participate in technical discussions, etc.
-
-This software is the work of Tom Lane, Philip Gladstone, Jim Boucher,
-Lee Crocker, Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi,
-Guido Vollbeding, Ge' Weijers, and other members of the Independent JPEG
-Group.
+This software is the work of Tom Lane, Guido Vollbeding, Philip Gladstone,
+Bill Allombert, Jim Boucher, Lee Crocker, Bob Friesenhahn, Ben Jackson,
+Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi, Ge' Weijers,
+and other members of the Independent JPEG Group.

 IJG is not affiliated with the official ISO JPEG standards committee.

@ -30,27 +28,26 @@ OVERVIEW            General description of JPEG and the IJG software.
 LEGAL ISSUES        Copyright, lack of warranty, terms of distribution.
 REFERENCES          Where to learn more about JPEG.
 ARCHIVE LOCATIONS   Where to find newer versions of this software.
-RELATED SOFTWARE    Other stuff you should get.
 FILE FORMAT WARS    Software *not* to get.
 TO DO               Plans for future IJG releases.

 Other documentation files in the distribution are:

 User documentation:
-  install.doc       How to configure and install the IJG software.
-  usage.doc         Usage instructions for cjpeg, djpeg, jpegtran,
+  install.txt       How to configure and install the IJG software.
+  usage.txt         Usage instructions for cjpeg, djpeg, jpegtran,
                    rdjpgcom, and wrjpgcom.
-  *.1               Unix-style man pages for programs (same info as usage.doc).
-  wizard.doc        Advanced usage instructions for JPEG wizards only.
+  *.1               Unix-style man pages for programs (same info as usage.txt).
+  wizard.txt        Advanced usage instructions for JPEG wizards only.
  change.log        Version-to-version change highlights.
 Programmer and internal documentation:
-  libjpeg.doc       How to use the JPEG library in your own programs.
+  libjpeg.txt       How to use the JPEG library in your own programs.
  example.c         Sample code for calling the JPEG library.
-  structure.doc     Overview of the JPEG library's internal structure.
-  filelist.doc      Road map of IJG files.
-  coderules.doc     Coding style rules --- please read if you contribute code.
+  structure.txt     Overview of the JPEG library's internal structure.
+  filelist.txt      Road map of IJG files.
+  coderules.txt     Coding style rules --- please read if you contribute code.

-Please read at least the files install.doc and usage.doc.  Useful information
+Please read at least the files install.txt and usage.txt.  Some information
 can also be found in the JPEG FAQ (Frequently Asked Questions) article.  See
 ARCHIVE LOCATIONS below to find out where to obtain the FAQ article.

@ -62,24 +59,27 @@ the order listed) before diving into the code.
 OVERVIEW
 ========

-This package contains C software to implement JPEG image compression and
-decompression.  JPEG (pronounced "jay-peg") is a standardized compression
-method for full-color and gray-scale images.  JPEG is intended for compressing
-"real-world" scenes; line drawings, cartoons and other non-realistic images
-are not its strong suit.  JPEG is lossy, meaning that the output image is not
-exactly identical to the input image.  Hence you must not use JPEG if you
-have to have identical output bits.  However, on typical photographic images,
-very good compression levels can be obtained with no visible change, and
-remarkably high compression levels are possible if you can tolerate a
-low-quality image.  For more details, see the references, or just experiment
-with various compression settings.
+This package contains C software to implement JPEG image encoding, decoding,
+and transcoding.  JPEG (pronounced "jay-peg") is a standardized compression
+method for full-color and gray-scale images.  JPEG's strong suit is compressing
+photographic images or other types of images which have smooth color and
+brightness transitions between neighboring pixels.  Images with sharp lines or
+other abrupt features may not compress well with JPEG, and a higher JPEG
+quality may have to be used to avoid visible compression artifacts with such
+images.
+
+JPEG is lossy, meaning that the output pixels are not necessarily identical to
+the input pixels.  However, on photographic content and other "smooth" images,
+very good compression ratios can be obtained with no visible compression
+artifacts, and extremely high compression ratios are possible if you are
+willing to sacrifice image quality (by reducing the "quality" setting in the
+compressor.)

 This software implements JPEG baseline, extended-sequential, and progressive
 compression processes.  Provision is made for supporting all variants of these
 processes, although some uncommon parameter settings aren't implemented yet.
-For legal reasons, we are not distributing code for the arithmetic-coding
-variants of JPEG; see LEGAL ISSUES.  We have made no provision for supporting
-the hierarchical or lossless processes defined in the standard.
+We have made no provision for supporting the hierarchical or lossless
+processes defined in the standard.

 We provide a set of library routines for reading and writing JPEG image files,
 plus two sample applications "cjpeg" and "djpeg", which use the library to
@ -91,10 +91,11 @@ considerable functionality beyond the bare JPEG coding/decoding capability;
 for example, the color quantization modules are not strictly part of JPEG
 decoding, but they are essential for output to colormapped file formats or
 colormapped displays.  These extra functions can be compiled out of the
-library if not required for a particular application.  We have also included
-"jpegtran", a utility for lossless transcoding between different JPEG
-processes, and "rdjpgcom" and "wrjpgcom", two simple applications for
-inserting and extracting textual comments in JFIF files.
+library if not required for a particular application.
+
+We have also included "jpegtran", a utility for lossless transcoding between
+different JPEG processes, and "rdjpgcom" and "wrjpgcom", two simple
+applications for inserting and extracting textual comments in JFIF files.

 The emphasis in designing this software has been on achieving portability and
 flexibility, while also making it fast enough to be useful.  In particular,
@ -127,7 +128,7 @@ with respect to this software, its quality, accuracy, merchantability, or
 fitness for a particular purpose.  This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.

-This software is copyright (C) 1991-1998, Thomas G. Lane.
+This software is copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.

 Permission is hereby granted to use, copy, modify, and distribute this
@ -170,17 +171,8 @@ the foregoing paragraphs do.
 The Unix configuration script "configure" was produced with GNU Autoconf.
 It is copyright by the Free Software Foundation but is freely distributable.
 The same holds for its supporting scripts (config.guess, config.sub,
-ltconfig, ltmain.sh).  Another support script, install-sh, is copyright
-by M.I.T. but is also freely distributable.
-
-It appears that the arithmetic coding option of the JPEG spec is covered by
-patents owned by IBM, AT&T, and Mitsubishi.  Hence arithmetic coding cannot
-legally be used without obtaining one or more licenses.  For this reason,
-support for arithmetic coding has been removed from the free JPEG software.
-(Since arithmetic coding provides only a marginal gain over the unpatented
-Huffman mode, it is unlikely that very many implementations will support it.)
-So far as we are aware, there are no patent restrictions on the remaining
-code.
+ltmain.sh).  Another support script, install-sh, is copyright by X Consortium
+but is also freely distributable.

 The IJG distribution formerly included code to read and write GIF files.
 To avoid entanglement with the Unisys LZW patent, GIF reading support has
@ -198,7 +190,7 @@ We are required to state that
 REFERENCES
 ==========

-We highly recommend reading one or more of these references before trying to
+We recommend reading one or more of these references before trying to
 understand the innards of the JPEG software.

 The best short technical introduction to the JPEG compression algorithm is
@ -207,7 +199,7 @@ The best short technical introduction to the JPEG compression algorithm is
 (Adjacent articles in that issue discuss MPEG motion picture compression,
 applications of JPEG, and related topics.)  If you don't have the CACM issue
 handy, a PostScript file containing a revised version of Wallace's article is
-available at ftp://ftp.uu.net/graphics/jpeg/wallace.ps.gz.  The file (actually
+available at http://www.ijg.org/files/wallace.ps.gz.  The file (actually
 a preprint for an article that appeared in IEEE Trans. Consumer Electronics)
 omits the sample images that appeared in CACM, but it includes corrections
 and some added material.  Note: the Wallace article is copyright ACM and IEEE,
@ -222,82 +214,53 @@ code but don't know much about data compression in general.  The book's JPEG
 sample code is far from industrial-strength, but when you are ready to look
 at a full implementation, you've got one here...

-The best full description of JPEG is the textbook "JPEG Still Image Data
-Compression Standard" by William B. Pennebaker and Joan L. Mitchell, published
-by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.  Price US$59.95, 638 pp.
-The book includes the complete text of the ISO JPEG standards (DIS 10918-1
-and draft DIS 10918-2).  This is by far the most complete exposition of JPEG
-in existence, and we highly recommend it.
+The best currently available description of JPEG is the textbook "JPEG Still
+Image Data Compression Standard" by William B. Pennebaker and Joan L.
+Mitchell, published by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.
+Price US$59.95, 638 pp.  The book includes the complete text of the ISO JPEG
+standards (DIS 10918-1 and draft DIS 10918-2).

-The JPEG standard itself is not available electronically; you must order a
-paper copy through ISO or ITU.  (Unless you feel a need to own a certified
-official copy, we recommend buying the Pennebaker and Mitchell book instead;
-it's much cheaper and includes a great deal of useful explanatory material.)
-In the USA, copies of the standard may be ordered from ANSI Sales at (212)
-642-4900, or from Global Engineering Documents at (800) 854-7179.  (ANSI
-doesn't take credit card orders, but Global does.)  It's not cheap: as of
-1992, ANSI was charging $95 for Part 1 and $47 for Part 2, plus 7%
-shipping/handling.  The standard is divided into two parts, Part 1 being the
-actual specification, while Part 2 covers compliance testing methods.  Part 1
-is titled "Digital Compression and Coding of Continuous-tone Still Images,
+The original JPEG standard is divided into two parts, Part 1 being the actual
+specification, while Part 2 covers compliance testing methods.  Part 1 is
+titled "Digital Compression and Coding of Continuous-tone Still Images,
 Part 1: Requirements and guidelines" and has document numbers ISO/IEC IS
 10918-1, ITU-T T.81.  Part 2 is titled "Digital Compression and Coding of
 Continuous-tone Still Images, Part 2: Compliance testing" and has document
 numbers ISO/IEC IS 10918-2, ITU-T T.83.

-Some extensions to the original JPEG standard are defined in JPEG Part 3,
-a newer ISO standard numbered ISO/IEC IS 10918-3 and ITU-T T.84.  IJG
-currently does not support any Part 3 extensions.
-
 The JPEG standard does not specify all details of an interchangeable file
 format.  For the omitted details we follow the "JFIF" conventions, revision
-1.02.  A copy of the JFIF spec is available from:
-	Literature Department
-	C-Cube Microsystems, Inc.
-	1778 McCarthy Blvd.
-	Milpitas, CA 95035
-	phone (408) 944-6300,  fax (408) 944-6314
-A PostScript version of this document is available by FTP at
-ftp://ftp.uu.net/graphics/jpeg/jfif.ps.gz.  There is also a plain text
-version at ftp://ftp.uu.net/graphics/jpeg/jfif.txt.gz, but it is missing
-the figures.
+1.02.  JFIF 1.02 has been adopted as an Ecma International Technical Report
+and thus received a formal publication status.  It is available as a free
+download in PDF format from
+http://www.ecma-international.org/publications/techreports/E-TR-098.htm.
+A PostScript version of the JFIF document is available at
+http://www.ijg.org/files/jfif.ps.gz.  There is also a plain text version at
+http://www.ijg.org/files/jfif.txt.gz, but it is missing the figures.

 The TIFF 6.0 file format specification can be obtained by FTP from
 ftp://ftp.sgi.com/graphics/tiff/TIFF6.ps.gz.  The JPEG incorporation scheme
 found in the TIFF 6.0 spec of 3-June-92 has a number of serious problems.
 IJG does not recommend use of the TIFF 6.0 design (TIFF Compression tag 6).
 Instead, we recommend the JPEG design proposed by TIFF Technical Note #2
-(Compression tag 7).  Copies of this Note can be obtained from ftp.sgi.com or
-from ftp://ftp.uu.net/graphics/jpeg/.  It is expected that the next revision
+(Compression tag 7).  Copies of this Note can be obtained from
+http://www.ijg.org/files/.  It is expected that the next revision
 of the TIFF spec will replace the 6.0 JPEG design with the Note's design.
 Although IJG's own code does not support TIFF/JPEG, the free libtiff library
-uses our library to implement TIFF/JPEG per the Note.  libtiff is available
-from ftp://ftp.sgi.com/graphics/tiff/.
+uses our library to implement TIFF/JPEG per the Note.


 ARCHIVE LOCATIONS
 =================

-The "official" archive site for this software is ftp.uu.net (Internet
-address 192.48.96.9).  The most recent released version can always be found
-there in directory graphics/jpeg.  This particular version will be archived
-as ftp://ftp.uu.net/graphics/jpeg/jpegsrc.v6b.tar.gz.  If you don't have
-direct Internet access, UUNET's archives are also available via UUCP; contact
-help@uunet.uu.net for information on retrieving files that way.
+The "official" archive site for this software is www.ijg.org.
+The most recent released version can always be found there in
+directory "files".  This particular version will be archived as
+http://www.ijg.org/files/jpegsrc.v8b.tar.gz, and in Windows-compatible
+"zip" archive format as http://www.ijg.org/files/jpegsr8b.zip.

-Numerous Internet sites maintain copies of the UUNET files.  However, only
-ftp.uu.net is guaranteed to have the latest official version.
-
-You can also obtain this software in DOS-compatible "zip" archive format from
-the SimTel archives (ftp://ftp.simtel.net/pub/simtelnet/msdos/graphics/), or
-on CompuServe in the Graphics Support forum (GO CIS:GRAPHSUP), library 12
-"JPEG Tools".  Again, these versions may sometimes lag behind the ftp.uu.net
-release.
-
-The JPEG FAQ (Frequently Asked Questions) article is a useful source of
-general information about JPEG.  It is updated constantly and therefore is
-not included in this distribution.  The FAQ is posted every two weeks to
-Usenet newsgroups comp.graphics.misc, news.answers, and other groups.
+The JPEG FAQ (Frequently Asked Questions) article is a source of some
+general information about JPEG.
 It is available on the World Wide Web at http://www.faqs.org/faqs/jpeg-faq/
 and other news.answers archive sites, including the official news.answers
 archive at rtfm.mit.edu: ftp://rtfm.mit.edu/pub/usenet/news.answers/jpeg-faq/.
@ -307,79 +270,20 @@ with body
 	send usenet/news.answers/jpeg-faq/part2


-RELATED SOFTWARE
-================
-
-Numerous viewing and image manipulation programs now support JPEG.  (Quite a
-few of them use this library to do so.)  The JPEG FAQ described above lists
-some of the more popular free and shareware viewers, and tells where to
-obtain them on Internet.
-
-If you are on a Unix machine, we highly recommend Jef Poskanzer's free
-PBMPLUS software, which provides many useful operations on PPM-format image
-files.  In particular, it can convert PPM images to and from a wide range of
-other formats, thus making cjpeg/djpeg considerably more useful.  The latest
-version is distributed by the NetPBM group, and is available from numerous
-sites, notably ftp://wuarchive.wustl.edu/graphics/graphics/packages/NetPBM/.
-Unfortunately PBMPLUS/NETPBM is not nearly as portable as the IJG software is;
-you are likely to have difficulty making it work on any non-Unix machine.
-
-A different free JPEG implementation, written by the PVRG group at Stanford,
-is available from ftp://havefun.stanford.edu/pub/jpeg/.  This program
-is designed for research and experimentation rather than production use;
-it is slower, harder to use, and less portable than the IJG code, but it
-is easier to read and modify.  Also, the PVRG code supports lossless JPEG,
-which we do not.  (On the other hand, it doesn't do progressive JPEG.)
-
-
 FILE FORMAT WARS
 ================

-Some JPEG programs produce files that are not compatible with our library.
-The root of the problem is that the ISO JPEG committee failed to specify a
-concrete file format.  Some vendors "filled in the blanks" on their own,
-creating proprietary formats that no one else could read.  (For example, none
-of the early commercial JPEG implementations for the Macintosh were able to
-exchange compressed files.)
-
-The file format we have adopted is called JFIF (see REFERENCES).  This format
-has been agreed to by a number of major commercial JPEG vendors, and it has
-become the de facto standard.  JFIF is a minimal or "low end" representation.
-We recommend the use of TIFF/JPEG (TIFF revision 6.0 as modified by TIFF
-Technical Note #2) for "high end" applications that need to record a lot of
-additional data about an image.  TIFF/JPEG is fairly new and not yet widely
-supported, unfortunately.
-
-The upcoming JPEG Part 3 standard defines a file format called SPIFF.
-SPIFF is interoperable with JFIF, in the sense that most JFIF decoders should
-be able to read the most common variant of SPIFF.  SPIFF has some technical
-advantages over JFIF, but its major claim to fame is simply that it is an
-official standard rather than an informal one.  At this point it is unclear
-whether SPIFF will supersede JFIF or whether JFIF will remain the de-facto
-standard.  IJG intends to support SPIFF once the standard is frozen, but we
-have not decided whether it should become our default output format or not.
-(In any case, our decoder will remain capable of reading JFIF indefinitely.)
-
-Various proprietary file formats incorporating JPEG compression also exist.
-We have little or no sympathy for the existence of these formats.  Indeed,
+The ISO JPEG standards committee actually promotes different formats like
+"JPEG 2000" or "JPEG XR" which are incompatible with original DCT-based
+JPEG.  IJG therefore does not support these formats (see REFERENCES).  Indeed,
 one of the original reasons for developing this free software was to help
-force convergence on common, open format standards for JPEG files.  Don't
-use a proprietary file format!
+force convergence on common, interoperable format standards for JPEG files.
+Don't use an incompatible file format!
+(In any case, our decoder will remain capable of reading existing JPEG
+image files indefinitely.)


 TO DO
 =====

-The major thrust for v7 will probably be improvement of visual quality.
-The current method for scaling the quantization tables is known not to be
-very good at low Q values.  We also intend to investigate block boundary
-smoothing, "poor man's variable quantization", and other means of improving
-quality-vs-file-size performance without sacrificing compatibility.
-
-In future versions, we are considering supporting some of the upcoming JPEG
-Part 3 extensions --- principally, variable quantization and the SPIFF file
-format.
-
-As always, speeding things up is of great interest.
-
-Please send bug reports, offers of help, etc. to jpeg-info@uunet.uu.net.
+Please send bug reports, offers of help, etc. to jpeg-info@uc.ag.
--- a/jpeg/README-turbo.txt
+++ b/jpeg/README-turbo.txt
@ -0,0 +1,304 @@
+*******************************************************************************
+**     Background
+*******************************************************************************
+
+libjpeg-turbo is a derivative of libjpeg which uses SIMD instructions (MMX,
+SSE2, etc.) to accelerate baseline JPEG compression and decompression on x86
+and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-4x as fast
+as the unmodified version of libjpeg, all else being equal.
+
+libjpeg-turbo was originally based on libjpeg/SIMD by Miyasaka Masaru, but
+the TigerVNC and VirtualGL projects made numerous enhancements to the codec in
+2009, including improved support for Mac OS X, 64-bit support, support for
+32-bit and big endian pixel formats (RGBX, XBGR, etc.), accelerated Huffman
+encoding/decoding, and various bug fixes.  The goal was to produce a fully open
+source codec that could replace the partially closed source TurboJPEG/IPP codec
+used by VirtualGL and TurboVNC.  libjpeg-turbo generally performs in the range
+of 80-120% of TurboJPEG/IPP.  It is faster in some areas but slower in others.
+
+In early 2010, libjpeg-turbo spun off into its own independent project, with
+the goal of making high-speed JPEG compression/decompression technology
+available to a broader range of users and developers.  The libjpeg-turbo shared
+libraries can be used as drop-in replacements for libjpeg on most systems.
+
+
+*******************************************************************************
+**     License
+*******************************************************************************
+
+The TurboJPEG/OSS wrapper, as well as some of the optimizations to the Huffman
+encoder (jchuff.c) and decoder (jdhuff.c), were borrowed from VirtualGL, and
+thus any distribution of libjpeg-turbo which includes those files must, as a
+whole, be subject to the terms of the wxWindows Library Licence, Version 3.1.
+A copy of this license can be found in this directory under LICENSE.txt.  The
+wxWindows Library License is based on the LGPL but includes provisions which
+allow the Library to be statically linked into proprietary libraries and
+applications without requiring the resulting binaries to be distributed under
+the terms of the LGPL.
+
+The rest of the source code, apart from TurboJPEG/OSS and the Huffman codec
+optimizations, falls under a less restrictive, BSD-style license (see README.)
+You can choose to distribute libjpeg-turbo, as a whole, under this BSD-style
+license by simply removing TurboJPEG/OSS and replacing the optimized jchuff.c
+and jdhuff.c with their unoptimized counterparts from the libjpeg v6b source.
+
+
+*******************************************************************************
+**     Using libjpeg-turbo
+*******************************************************************************
+
+=============================
+Replacing libjpeg at Run Time
+=============================
+
+If a Unix application is dynamically linked with libjpeg, then you can replace
+libjpeg with libjpeg-turbo at run time by manipulating LD_LIBRARY_PATH.
+For instance:
+
+  [Using libjpeg]
+  > time cjpeg <vgl_5674_0098.ppm >vgl_5674_0098.jpg
+  real  0m0.392s
+  user  0m0.074s
+  sys   0m0.020s
+
+  [Using libjpeg-turbo]
+  > export LD_LIBRARY_PATH=/opt/libjpeg-turbo/{lib}:$LD_LIBRARY_PATH
+  > time cjpeg <vgl_5674_0098.ppm >vgl_5674_0098.jpg
+  real  0m0.109s
+  user  0m0.029s
+  sys   0m0.010s
+
+NOTE: {lib} can be lib, lib32, lib64, or lib/64, depending on the O/S and
+architecture.
+
+System administrators can also replace the libjpeg sym links in /usr/{lib} with
+links to the libjpeg dynamic library located in /opt/libjpeg-turbo/{lib}.  This
+will effectively accelerate every dynamically linked libjpeg application on the
+system.
+
+The libjpeg-turbo SDK for Visual C++ installs the libjpeg-turbo DLL
+(jpeg62.dll, jpeg7.dll, or jpeg8.dll, depending on whether libjpeg v6b, v7, or
+v8 emulation is enabled) into c:\libjpeg-turbo[64]\bin, and the PATH
+environment variable can be modified such that this directory is searched
+before any others that might contain a libjpeg DLL.  However, if a libjpeg
+DLL exists in an application's install directory, then Windows will load this
+DLL first whenever the application is launched.  Thus, if an application ships
+with jpeg62.dll, jpeg7.dll, or jpeg8.dll, then back up the application's
+version of this DLL and copy c:\libjpeg-turbo[64]\bin\jpeg*.dll into the
+application's install directory to accelerate it.
+
+The version of the libjpeg-turbo DLL distributed in the libjpeg-turbo SDK for
+Visual C++ requires the Visual C++ 2008 C run time DLL (msvcr90.dll).
+msvcr90.dll ships with more recent versions of Windows, but users of older
+Windows releases can obtain it from the Visual C++ 2008 Redistributable
+Package, which is available as a free download from Microsoft's web site.
+
+NOTE:  Features of libjpeg which require passing a C run time structure, such
+as a file handle, from an application to libjpeg will probably not work with
+the version of the libjpeg-turbo DLL distributed in the libjpeg-turbo SDK for
+Visual C++, unless the application is also built to use the Visual C++ 2008 C
+run time DLL.  In particular, this affects jpeg_stdio_dest() and
+jpeg_stdio_src().
+
+Mac applications typically embed their own copies of the libjpeg dylib inside
+the (hidden) application bundle, so it is not possible to globally replace
+libjpeg on OS X systems.  If an application uses a shared library version of
+libjpeg, then it may be possible to replace the application's version of it.
+This would generally involve copying libjpeg.*.dylib from libjpeg-turbo into
+the appropriate place in the application bundle and using install_name_tool to
+repoint the dylib to the new directory.  This requires an advanced knowledge of
+OS X and would not survive an upgrade or a re-install of the application.
+Thus, it is not recommended for most users.
+
+=======================
+Replacing TurboJPEG/IPP
+=======================
+
+libjpeg-turbo is a drop-in replacement for the TurboJPEG/IPP SDK used by
+VirtualGL 2.1.x and TurboVNC 0.6 (and prior.)  libjpeg-turbo contains a wrapper
+library (TurboJPEG/OSS) that emulates the TurboJPEG API using libjpeg-turbo
+instead of the closed source Intel Performance Primitives.  You can replace the
+TurboJPEG/IPP package on Linux systems with the libjpeg-turbo package in order
+to make existing releases of VirtualGL 2.1.x and TurboVNC 0.x use the new codec
+at run time.  Note that the 64-bit libjpeg-turbo packages contain only 64-bit
+binaries, whereas the TurboJPEG/IPP 64-bit packages contained both 64-bit and
+32-bit binaries.  Thus, to replace a TurboJPEG/IPP 64-bit package, install
+both the 64-bit and 32-bit versions of libjpeg-turbo.
+
+You can also build the VirtualGL 2.1.x and TurboVNC 0.6 source code with
+the libjpeg-turbo SDK instead of TurboJPEG/IPP.  It should work identically.
+libjpeg-turbo also includes static library versions of TurboJPEG/OSS, which
+are used to build TurboVNC 1.0 and later.
+
+========================================
+Using libjpeg-turbo in Your Own Programs
+========================================
+
+For the most part, libjpeg-turbo should work identically to libjpeg, so in
+most cases, an application can be built against libjpeg and then run against
+libjpeg-turbo.  On Unix systems (including Cygwin), you can build against
+libjpeg-turbo instead of libjpeg by setting
+
+  CPATH=/opt/libjpeg-turbo/include
+  and
+  LIBRARY_PATH=/opt/libjpeg-turbo/{lib}
+
+({lib} = lib32 or lib64, depending on whether you are building a 32-bit or a
+64-bit application.)
+
+If using MinGW, then set
+
+  CPATH=/c/libjpeg-turbo-gcc[64]/include
+  and
+  LIBRARY_PATH=/c/libjpeg-turbo-gcc[64]/lib
+
+Building against libjpeg-turbo is useful, for instance, if you want to build an
+application that leverages the libjpeg-turbo colorspace extensions (see below.)
+On Linux and Solaris systems, you would still need to manipulate
+LD_LIBRARY_PATH or create appropriate sym links to use libjpeg-turbo at run
+time.  On such systems, you can pass -R /opt/libjpeg-turbo/{lib} to the linker
+to force the use of libjpeg-turbo at run time rather than libjpeg (also useful
+if you want to leverage the colorspace extensions), or you can link against the
+libjpeg-turbo static library.
+
+To force a Linux, Solaris, or MinGW application to link against the static
+version of libjpeg-turbo, you can use the following linker options:
+
+  -Wl,-Bstatic -ljpeg -Wl,-Bdynamic
+
+On OS X, simply add /opt/libjpeg-turbo/lib/libjpeg.a to the linker command
+line (this also works on Linux and Solaris.)
+
+To build Visual C++ applications using libjpeg-turbo, add
+c:\libjpeg-turbo[64]\include to the system or user INCLUDE environment
+variable and c:\libjpeg-turbo[64]\lib to the system or user LIB environment
+variable, and then link against either jpeg.lib (to use the DLL version of
+libjpeg-turbo) or jpeg-static.lib (to use the static version of libjpeg-turbo.)
+
+=====================
+Colorspace Extensions
+=====================
+
+libjpeg-turbo includes extensions which allow JPEG images to be compressed
+directly from (and decompressed directly to) buffers which use BGR, BGRX,
+RGBX, XBGR, and XRGB pixel ordering.  This is implemented with six new
+colorspace constants:
+
+  JCS_EXT_RGB   /* red/green/blue */
+  JCS_EXT_RGBX  /* red/green/blue/x */
+  JCS_EXT_BGR   /* blue/green/red */
+  JCS_EXT_BGRX  /* blue/green/red/x */
+  JCS_EXT_XBGR  /* x/blue/green/red */
+  JCS_EXT_XRGB  /* x/red/green/blue */
+
+Setting cinfo.in_color_space (compression) or cinfo.out_color_space
+(decompression) to one of these values will cause libjpeg-turbo to read the
+red, green, and blue values from (or write them to) the appropriate position in
+the pixel when YUV conversion is performed.
+
+Your application can check for the existence of these extensions at compile
+time with:
+
+  #ifdef JCS_EXTENSIONS
+
+At run time, attempting to use these extensions with a version of libjpeg
+that doesn't support them will result in a "Bogus input colorspace" error.
+
+=================================
+libjpeg v7 and v8 API/ABI support
+=================================
+
+libjpeg v7 and v8 added new features to the API/ABI, and, unfortunately, the
+compression and decompression structures were extended in a backward-
+incompatible manner to accommodate these features.  Thus, programs which are
+built to use libjpeg v7 or v8 did not work with libjpeg-turbo, since it is
+based on the libjpeg v6b code base.  Although libjpeg v7 and v8 are still not
+as widely used as v6b, enough programs (including a few Linux distros) have
+made the switch that it was desirable to provide support for the libjpeg v7/v8
+API/ABI in libjpeg-turbo.
+
+Some of the libjpeg v7 and v8 features -- DCT scaling, to name one -- involve
+deep modifications to the code which cannot be accommodated by libjpeg-turbo
+without either breaking compatibility with libjpeg v6b or producing an
+unsupportable mess.  In order to fully support libjpeg v8 with all of its
+features, we would have to essentially port the SIMD extensions to the libjpeg
+v8 code base and maintain two separate code trees.  We are hesitant to do this
+until/unless the newer libjpeg code bases garner more community support and
+involvement and until/unless we have some notion of whether future libjpeg
+releases will also be backward-incompatible.
+
+By passing an argument of --with-jpeg7 or --with-jpeg8 to configure, or an
+argument of -DWITH_JPEG7=1 or -DWITH_JPEG8=1 to cmake, you can build a version
+of libjpeg-turbo which emulates the libjpeg v7 or v8 API/ABI, so that programs
+which are built against libjpeg v7 or v8 can be run with libjpeg-turbo.  The
+following section describes which libjpeg v7+ features are supported and which
+aren't.
+
+libjpeg v7 and v8 Features:
+---------------------------
+
+Fully supported:
+
+-- cjpeg: Separate quality settings for luminance and chrominance
+   Note that the libpjeg v7+ API was extended to accommodate this feature only
+   for convenience purposes.  It has always been possible to implement this
+   feature with libjpeg v6b (see rdswitch.c for an example.)
+
+-- cjpeg: 32-bit BMP support
+
+-- jpegtran: lossless cropping
+
+-- jpegtran: -perfect option
+
+-- rdjpgcom: -raw option
+
+-- rdjpgcom: locale awareness
+
+
+Fully supported when using libjpeg v7/v8 emulation:
+
+-- libjpeg: In-memory source and destination managers
+
+
+Not supported:
+
+-- libjpeg: DCT scaling in compressor
+   cinfo.scale_num and cinfo.scale_denom are silently ignored.
+
+-- libjpeg: IDCT scaling extensions in decompressor
+   libjpeg-turbo still supports IDCT scaling with scaling factors of 1/2, 1/4,
+   and 1/8 (same as libjpeg v6b.)
+
+-- libjpeg: Fancy downsampling in compressor
+   cinfo.do_fancy_downsampling is silently ignored.
+
+-- jpegtran: Scaling
+   Seems to depend on the DCT scaling feature, which isn't supported.
+
+
+*******************************************************************************
+**     Performance pitfalls
+*******************************************************************************
+
+===============
+Restart Markers
+===============
+
+The optimized Huffman decoder in libjpeg-turbo does not handle restart markers
+in a way that makes libjpeg happy, so it is necessary to use the slow Huffman
+decoder when decompressing a JPEG image that has restart markers.  This can
+cause the decompression performance to drop by as much as 20%, but the
+performance will still be much much greater than that of libjpeg v6b.  Many
+consumer packages, such as PhotoShop, use restart markers when generating JPEG
+images, so images generated by those programs will experience this issue.
+
+===============================================
+Fast Integer Forward DCT at High Quality Levels
+===============================================
+
+The algorithm used by the SIMD-accelerated quantization function cannot produce
+correct results whenever the fast integer forward DCT is used along with a JPEG
+quality of 98-100.  Thus, libjpeg-turbo must use the non-SIMD quantization
+function in those cases.  This causes performance to drop by as much as 40%.
+It is therefore strongly advised that you use the slow integer forward DCT
+whenever encoding images with a JPEG quality of 98 or higher.
--- a/jpeg/cderror.h
+++ b/jpeg/cderror.h
@ -2,6 +2,7 @@
 * cderror.h
 *
 * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -45,6 +46,7 @@ JMESSAGE(JERR_BMP_BADHEADER, "Invalid BMP file: bad header length")
 JMESSAGE(JERR_BMP_BADPLANES, "Invalid BMP file: biPlanes not equal to 1")
 JMESSAGE(JERR_BMP_COLORSPACE, "BMP output must be grayscale or RGB")
 JMESSAGE(JERR_BMP_COMPRESSED, "Sorry, compressed BMPs not yet supported")
+JMESSAGE(JERR_BMP_EMPTY, "Empty BMP image")
 JMESSAGE(JERR_BMP_NOT, "Not a BMP file - does not start with BM")
 JMESSAGE(JTRC_BMP, "%ux%u 24-bit BMP image")
 JMESSAGE(JTRC_BMP_MAPPED, "%ux%u 8-bit colormapped BMP image")
--- a/jpeg/cdjpeg.c
+++ b/jpeg/cdjpeg.c
@ -1,181 +0,0 @@
-/*
- * cdjpeg.c
- *
- * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains common support routines used by the IJG application
- * programs (cjpeg, djpeg, jpegtran).
- */
-
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include <ctype.h>		/* to declare isupper(), tolower() */
-#ifdef NEED_SIGNAL_CATCHER
-#include <signal.h>		/* to declare signal() */
-#endif
-#ifdef USE_SETMODE
-#include <fcntl.h>		/* to declare setmode()'s parameter macros */
-/* If you have setmode() but not <io.h>, just delete this line: */
-#include <io.h>			/* to declare setmode() */
-#endif
-
-
-/*
- * Signal catcher to ensure that temporary files are removed before aborting.
- * NB: for Amiga Manx C this is actually a global routine named _abort();
- * we put "#define signal_catcher _abort" in jconfig.h.  Talk about bogus...
- */
-
-#ifdef NEED_SIGNAL_CATCHER
-
-static j_common_ptr sig_cinfo;
-
-void				/* must be global for Manx C */
-signal_catcher (int signum)
-{
-  if (sig_cinfo != NULL) {
-    if (sig_cinfo->err != NULL) /* turn off trace output */
-      sig_cinfo->err->trace_level = 0;
-    jpeg_destroy(sig_cinfo);	/* clean up memory allocation & temp files */
-  }
-  exit(EXIT_FAILURE);
-}
-
-
-GLOBAL(void)
-enable_signal_catcher (j_common_ptr cinfo)
-{
-  sig_cinfo = cinfo;
-#ifdef SIGINT			/* not all systems have SIGINT */
-  signal(SIGINT, signal_catcher);
-#endif
-#ifdef SIGTERM			/* not all systems have SIGTERM */
-  signal(SIGTERM, signal_catcher);
-#endif
-}
-
-#endif
-
-
-/*
- * Optional progress monitor: display a percent-done figure on stderr.
- */
-
-#ifdef PROGRESS_REPORT
-
-METHODDEF(void)
-progress_monitor (j_common_ptr cinfo)
-{
-  cd_progress_ptr prog = (cd_progress_ptr) cinfo->progress;
-  int total_passes = prog->pub.total_passes + prog->total_extra_passes;
-  int percent_done = (int) (prog->pub.pass_counter*100L/prog->pub.pass_limit);
-
-  if (percent_done != prog->percent_done) {
-    prog->percent_done = percent_done;
-    if (total_passes > 1) {
-      fprintf(stderr, "\rPass %d/%d: %3d%% ",
-	      prog->pub.completed_passes + prog->completed_extra_passes + 1,
-	      total_passes, percent_done);
-    } else {
-      fprintf(stderr, "\r %3d%% ", percent_done);
-    }
-    fflush(stderr);
-  }
-}
-
-
-GLOBAL(void)
-start_progress_monitor (j_common_ptr cinfo, cd_progress_ptr progress)
-{
-  /* Enable progress display, unless trace output is on */
-  if (cinfo->err->trace_level == 0) {
-    progress->pub.progress_monitor = progress_monitor;
-    progress->completed_extra_passes = 0;
-    progress->total_extra_passes = 0;
-    progress->percent_done = -1;
-    cinfo->progress = &progress->pub;
-  }
-}
-
-
-GLOBAL(void)
-end_progress_monitor (j_common_ptr cinfo)
-{
-  /* Clear away progress display */
-  if (cinfo->err->trace_level == 0) {
-    fprintf(stderr, "\r                \r");
-    fflush(stderr);
-  }
-}
-
-#endif
-
-
-/*
- * Case-insensitive matching of possibly-abbreviated keyword switches.
- * keyword is the constant keyword (must be lower case already),
- * minchars is length of minimum legal abbreviation.
- */
-
-GLOBAL(boolean)
-keymatch (char * arg, const char * keyword, int minchars)
-{
-  register int ca, ck;
-  register int nmatched = 0;
-
-  while ((ca = *arg++) != '\0') {
-    if ((ck = *keyword++) == '\0')
-      return FALSE;		/* arg longer than keyword, no good */
-    if (isupper(ca))		/* force arg to lcase (assume ck is already) */
-      ca = tolower(ca);
-    if (ca != ck)
-      return FALSE;		/* no good */
-    nmatched++;			/* count matched characters */
-  }
-  /* reached end of argument; fail if it's too short for unique abbrev */
-  if (nmatched < minchars)
-    return FALSE;
-  return TRUE;			/* A-OK */
-}
-
-
-/*
- * Routines to establish binary I/O mode for stdin and stdout.
- * Non-Unix systems often require some hacking to get out of text mode.
- */
-
-GLOBAL(FILE *)
-read_stdin (void)
-{
-  FILE * input_file = stdin;
-
-#ifdef USE_SETMODE		/* need to hack file mode? */
-  setmode(fileno(stdin), O_BINARY);
-#endif
-#ifdef USE_FDOPEN		/* need to re-open in binary mode? */
-  if ((input_file = fdopen(fileno(stdin), READ_BINARY)) == NULL) {
-    fprintf(stderr, "Cannot reopen stdin\n");
-    exit(EXIT_FAILURE);
-  }
-#endif
-  return input_file;
-}
-
-
-GLOBAL(FILE *)
-write_stdout (void)
-{
-  FILE * output_file = stdout;
-
-#ifdef USE_SETMODE		/* need to hack file mode? */
-  setmode(fileno(stdout), O_BINARY);
-#endif
-#ifdef USE_FDOPEN		/* need to re-open in binary mode? */
-  if ((output_file = fdopen(fileno(stdout), WRITE_BINARY)) == NULL) {
-    fprintf(stderr, "Cannot reopen stdout\n");
-    exit(EXIT_FAILURE);
-  }
-#endif
-  return output_file;
-}
--- a/jpeg/cdjpeg.h
+++ b/jpeg/cdjpeg.h
@ -104,6 +104,7 @@ typedef struct cdjpeg_progress_mgr * cd_progress_ptr;
 #define jinit_write_targa	jIWrTarga
 #define read_quant_tables	RdQTables
 #define read_scan_script	RdScnScript
+#define set_quality_ratings     SetQRates
 #define set_quant_slots		SetQSlots
 #define set_sample_factors	SetSFacts
 #define read_color_map		RdCMap
@ -131,8 +132,10 @@ EXTERN(djpeg_dest_ptr) jinit_write_targa JPP((j_decompress_ptr cinfo));
 /* cjpeg support routines (in rdswitch.c) */

 EXTERN(boolean) read_quant_tables JPP((j_compress_ptr cinfo, char * filename,
-				    int scale_factor, boolean force_baseline));
+				       boolean force_baseline));
 EXTERN(boolean) read_scan_script JPP((j_compress_ptr cinfo, char * filename));
+EXTERN(boolean) set_quality_ratings JPP((j_compress_ptr cinfo, char *arg,
+					 boolean force_baseline));
 EXTERN(boolean) set_quant_slots JPP((j_compress_ptr cinfo, char *arg));
 EXTERN(boolean) set_sample_factors JPP((j_compress_ptr cinfo, char *arg));

--- a/jpeg/change.log
+++ b/jpeg/change.log
@ -1,217 +0,0 @@
-CHANGE LOG for Independent JPEG Group's JPEG software
-
-
-Version 6b  27-Mar-1998
-----------------------
-
-jpegtran has new features for lossless image transformations (rotation
-and flipping) as well as "lossless" reduction to grayscale.
-
-jpegtran now copies comments by default; it has a -copy switch to enable
-copying all APPn blocks as well, or to suppress comments.  (Formerly it
-always suppressed comments and APPn blocks.)  jpegtran now also preserves
-JFIF version and resolution information.
-
-New decompressor library feature: COM and APPn markers found in the input
-file can be saved in memory for later use by the application.  (Before,
-you had to code this up yourself with a custom marker processor.)
-
-There is an unused field "void * client_data" now in compress and decompress
-parameter structs; this may be useful in some applications.
-
-JFIF version number information is now saved by the decoder and accepted by
-the encoder.  jpegtran uses this to copy the source file's version number,
-to ensure "jpegtran -copy all" won't create bogus files that contain JFXX
-extensions but claim to be version 1.01.  Applications that generate their
-own JFXX extension markers also (finally) have a supported way to cause the
-encoder to emit JFIF version number 1.02.
-
-djpeg's trace mode reports JFIF 1.02 thumbnail images as such, rather
-than as unknown APP0 markers.
-
-In -verbose mode, djpeg and rdjpgcom will try to print the contents of
-APP12 markers as text.  Some digital cameras store useful text information
-in APP12 markers.
-
-Handling of truncated data streams is more robust: blocks beyond the one in
-which the error occurs will be output as uniform gray, or left unchanged
-if decoding a progressive JPEG.  The appearance no longer depends on the
-Huffman tables being used.
-
-Huffman tables are checked for validity much more carefully than before.
-
-To avoid the Unisys LZW patent, djpeg's GIF output capability has been
-changed to produce "uncompressed GIFs", and cjpeg's GIF input capability
-has been removed altogether.  We're not happy about it either, but there
-seems to be no good alternative.
-
-The configure script now supports building libjpeg as a shared library
-on many flavors of Unix (all the ones that GNU libtool knows how to
-build shared libraries for).  Use "./configure --enable-shared" to
-try this out.
-
-New jconfig file and makefiles for Microsoft Visual C++ and Developer Studio.
-Also, a jconfig file and a build script for Metrowerks CodeWarrior
-on Apple Macintosh.  makefile.dj has been updated for DJGPP v2, and there
-are miscellaneous other minor improvements in the makefiles.
-
-jmemmac.c now knows how to create temporary files following Mac System 7
-conventions.
-
-djpeg's -map switch is now able to read raw-format PPM files reliably.
-
-cjpeg -progressive -restart no longer generates any unnecessary DRI markers.
-
-Multiple calls to jpeg_simple_progression for a single JPEG object
-no longer leak memory.
-
-
-Version 6a  7-Feb-96
--------------------
-
-Library initialization sequence modified to detect version mismatches
-and struct field packing mismatches between library and calling application.
-This change requires applications to be recompiled, but does not require
-any application source code change.
-
-All routine declarations changed to the style "GLOBAL(type) name ...",
-that is, GLOBAL, LOCAL, METHODDEF, EXTERN are now macros taking the
-routine's return type as an argument.  This makes it possible to add
-Microsoft-style linkage keywords to all the routines by changing just
-these macros.  Note that any application code that was using these macros
-will have to be changed.
-
-DCT coefficient quantization tables are now stored in normal array order
-rather than zigzag order.  Application code that calls jpeg_add_quant_table,
-or otherwise manipulates quantization tables directly, will need to be
-changed.  If you need to make such code work with either older or newer
-versions of the library, a test like "#if JPEG_LIB_VERSION >= 61" is
-recommended.
-
-djpeg's trace capability now dumps DQT tables in natural order, not zigzag
-order.  This allows the trace output to be made into a "-qtables" file
-more easily.
-
-New system-dependent memory manager module for use on Apple Macintosh.
-
-Fix bug in cjpeg's -smooth option: last one or two scanlines would be
-duplicates of the prior line unless the image height mod 16 was 1 or 2.
-
-Repair minor problems in VMS, BCC, MC6 makefiles.
-
-New configure script based on latest GNU Autoconf.
-
-Correct the list of include files needed by MetroWerks C for ccommand().
-
-Numerous small documentation updates.
-
-
-Version 6  2-Aug-95
-------------------
-
-Progressive JPEG support: library can read and write full progressive JPEG
-files.  A "buffered image" mode supports incremental decoding for on-the-fly
-display of progressive images.  Simply recompiling an existing IJG-v5-based
-decoder with v6 should allow it to read progressive files, though of course
-without any special progressive display.
-
-New "jpegtran" application performs lossless transcoding between different
-JPEG formats; primarily, it can be used to convert baseline to progressive
-JPEG and vice versa.  In support of jpegtran, the library now allows lossless
-reading and writing of JPEG files as DCT coefficient arrays.  This ability
-may be of use in other applications.
-
-Notes for programmers:
-* We changed jpeg_start_decompress() to be able to suspend; this makes all
-decoding modes available to suspending-input applications.  However,
-existing applications that use suspending input will need to be changed
-to check the return value from jpeg_start_decompress().  You don't need to
-do anything if you don't use a suspending data source.
-* We changed the interface to the virtual array routines: access_virt_array
-routines now take a count of the number of rows to access this time.  The
-last parameter to request_virt_array routines is now interpreted as the
-maximum number of rows that may be accessed at once, but not necessarily
-the height of every access.
-
-
-Version 5b  15-Mar-95
---------------------
-
-Correct bugs with grayscale images having v_samp_factor > 1.
-
-jpeg_write_raw_data() now supports output suspension.
-
-Correct bugs in "configure" script for case of compiling in
-a directory other than the one containing the source files.
-
-Repair bug in jquant1.c: sometimes didn't use as many colors as it could.
-
-Borland C makefile and jconfig file work under either MS-DOS or OS/2.
-
-Miscellaneous improvements to documentation.
-
-
-Version 5a  7-Dec-94
--------------------
-
-Changed color conversion roundoff behavior so that grayscale values are
-represented exactly.  (This causes test image files to change.)
-
-Make ordered dither use 16x16 instead of 4x4 pattern for a small quality
-improvement.
-
-New configure script based on latest GNU Autoconf.
-Fix configure script to handle CFLAGS correctly.
-Rename *.auto files to *.cfg, so that configure script still works if
-file names have been truncated for DOS.
-
-Fix bug in rdbmp.c: didn't allow for extra data between header and image.
-
-Modify rdppm.c/wrppm.c to handle 2-byte raw PPM/PGM formats for 12-bit data.
-
-Fix several bugs in rdrle.c.
-
-NEED_SHORT_EXTERNAL_NAMES option was broken.
-
-Revise jerror.h/jerror.c for more flexibility in message table.
-
-Repair oversight in jmemname.c NO_MKTEMP case: file could be there
-but unreadable.
-
-
-Version 5  24-Sep-94
--------------------
-
-Version 5 represents a nearly complete redesign and rewrite of the IJG
-software.  Major user-visible changes include:
-  * Automatic configuration simplifies installation for most Unix systems.
-  * A range of speed vs. image quality tradeoffs are supported.
-    This includes resizing of an image during decompression: scaling down
-    by a factor of 1/2, 1/4, or 1/8 is handled very efficiently.
-  * New programs rdjpgcom and wrjpgcom allow insertion and extraction
-    of text comments in a JPEG file.
-
-The application programmer's interface to the library has changed completely.
-Notable improvements include:
-  * We have eliminated the use of callback routines for handling the
-    uncompressed image data.  The application now sees the library as a
-    set of routines that it calls to read or write image data on a
-    scanline-by-scanline basis.
-  * The application image data is represented in a conventional interleaved-
-    pixel format, rather than as a separate array for each color channel.
-    This can save a copying step in many programs.
-  * The handling of compressed data has been cleaned up: the application can
-    supply routines to source or sink the compressed data.  It is possible to
-    suspend processing on source/sink buffer overrun, although this is not
-    supported in all operating modes.
-  * All static state has been eliminated from the library, so that multiple
-    instances of compression or decompression can be active concurrently.
-  * JPEG abbreviated datastream formats are supported, ie, quantization and
-    Huffman tables can be stored separately from the image data.
-  * And not only that, but the documentation of the library has improved
-    considerably!
-
-
-The last widely used release before the version 5 rewrite was version 4A of
-18-Feb-93.  Change logs before that point have been discarded, since they
-are not of much interest after the rewrite.
--- a/jpeg/cjpeg.c
+++ b/jpeg/cjpeg.c
@ -1,606 +0,0 @@
-/*
- * cjpeg.c
- *
- * Copyright (C) 1991-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains a command-line user interface for the JPEG compressor.
- * It should work on any system with Unix- or MS-DOS-style command lines.
- *
- * Two different command line styles are permitted, depending on the
- * compile-time switch TWO_FILE_COMMANDLINE:
- *	cjpeg [options]  inputfile outputfile
- *	cjpeg [options]  [inputfile]
- * In the second style, output is always to standard output, which you'd
- * normally redirect to a file or pipe to some other program.  Input is
- * either from a named file or from standard input (typically redirected).
- * The second style is convenient on Unix but is unhelpful on systems that
- * don't support pipes.  Also, you MUST use the first style if your system
- * doesn't do binary I/O to stdin/stdout.
- * To simplify script writing, the "-outfile" switch is provided.  The syntax
- *	cjpeg [options]  -outfile outputfile  inputfile
- * works regardless of which command line style is used.
- */
-
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include "jversion.h"		/* for version message */
-
-#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
-#ifdef __MWERKS__
-#include <SIOUX.h>              /* Metrowerks needs this */
-#include <console.h>		/* ... and this */
-#endif
-#ifdef THINK_C
-#include <console.h>		/* Think declares it here */
-#endif
-#endif
-
-
-/* Create the add-on message string table. */
-
-#define JMESSAGE(code,string)	string ,
-
-static const char * const cdjpeg_message_table[] = {
-#include "cderror.h"
-  NULL
-};
-
-
-/*
- * This routine determines what format the input file is,
- * and selects the appropriate input-reading module.
- *
- * To determine which family of input formats the file belongs to,
- * we may look only at the first byte of the file, since C does not
- * guarantee that more than one character can be pushed back with ungetc.
- * Looking at additional bytes would require one of these approaches:
- *     1) assume we can fseek() the input file (fails for piped input);
- *     2) assume we can push back more than one character (works in
- *        some C implementations, but unportable);
- *     3) provide our own buffering (breaks input readers that want to use
- *        stdio directly, such as the RLE library);
- * or  4) don't put back the data, and modify the input_init methods to assume
- *        they start reading after the start of file (also breaks RLE library).
- * #1 is attractive for MS-DOS but is untenable on Unix.
- *
- * The most portable solution for file types that can't be identified by their
- * first byte is to make the user tell us what they are.  This is also the
- * only approach for "raw" file types that contain only arbitrary values.
- * We presently apply this method for Targa files.  Most of the time Targa
- * files start with 0x00, so we recognize that case.  Potentially, however,
- * a Targa file could start with any byte value (byte 0 is the length of the
- * seldom-used ID field), so we provide a switch to force Targa input mode.
- */
-
-static boolean is_targa;	/* records user -targa switch */
-
-
-LOCAL(cjpeg_source_ptr)
-select_file_type (j_compress_ptr cinfo, FILE * infile)
-{
-  int c;
-
-  if (is_targa) {
-#ifdef TARGA_SUPPORTED
-    return jinit_read_targa(cinfo);
-#else
-    ERREXIT(cinfo, JERR_TGA_NOTCOMP);
-#endif
-  }
-
-  if ((c = getc(infile)) == EOF)
-    ERREXIT(cinfo, JERR_INPUT_EMPTY);
-  if (ungetc(c, infile) == EOF)
-    ERREXIT(cinfo, JERR_UNGETC_FAILED);
-
-  switch (c) {
-#ifdef BMP_SUPPORTED
-  case 'B':
-    return jinit_read_bmp(cinfo);
-#endif
-#ifdef GIF_SUPPORTED
-  case 'G':
-    return jinit_read_gif(cinfo);
-#endif
-#ifdef PPM_SUPPORTED
-  case 'P':
-    return jinit_read_ppm(cinfo);
-#endif
-#ifdef RLE_SUPPORTED
-  case 'R':
-    return jinit_read_rle(cinfo);
-#endif
-#ifdef TARGA_SUPPORTED
-  case 0x00:
-    return jinit_read_targa(cinfo);
-#endif
-  default:
-    ERREXIT(cinfo, JERR_UNKNOWN_FORMAT);
-    break;
-  }
-
-  return NULL;			/* suppress compiler warnings */
-}
-
-
-/*
- * Argument-parsing code.
- * The switch parser is designed to be useful with DOS-style command line
- * syntax, ie, intermixed switches and file names, where only the switches
- * to the left of a given file name affect processing of that file.
- * The main program in this file doesn't actually use this capability...
- */
-
-
-static const char * progname;	/* program name for error messages */
-static char * outfilename;	/* for -outfile switch */
-
-
-LOCAL(void)
-usage (void)
-/* complain about bad command line */
-{
-  fprintf(stderr, "usage: %s [switches] ", progname);
-#ifdef TWO_FILE_COMMANDLINE
-  fprintf(stderr, "inputfile outputfile\n");
-#else
-  fprintf(stderr, "[inputfile]\n");
-#endif
-
-  fprintf(stderr, "Switches (names may be abbreviated):\n");
-  fprintf(stderr, "  -quality N     Compression quality (0..100; 5-95 is useful range)\n");
-  fprintf(stderr, "  -grayscale     Create monochrome JPEG file\n");
-#ifdef ENTROPY_OPT_SUPPORTED
-  fprintf(stderr, "  -optimize      Optimize Huffman table (smaller file, but slow compression)\n");
-#endif
-#ifdef C_PROGRESSIVE_SUPPORTED
-  fprintf(stderr, "  -progressive   Create progressive JPEG file\n");
-#endif
-#ifdef TARGA_SUPPORTED
-  fprintf(stderr, "  -targa         Input file is Targa format (usually not needed)\n");
-#endif
-  fprintf(stderr, "Switches for advanced users:\n");
-#ifdef DCT_ISLOW_SUPPORTED
-  fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
-#endif
-#ifdef DCT_IFAST_SUPPORTED
-  fprintf(stderr, "  -dct fast      Use fast integer DCT (less accurate)%s\n",
-	  (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
-#endif
-#ifdef DCT_FLOAT_SUPPORTED
-  fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
-#endif
-  fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
-#ifdef INPUT_SMOOTHING_SUPPORTED
-  fprintf(stderr, "  -smooth N      Smooth dithered input (N=1..100 is strength)\n");
-#endif
-  fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
-  fprintf(stderr, "  -outfile name  Specify name for output file\n");
-  fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
-  fprintf(stderr, "Switches for wizards:\n");
-#ifdef C_ARITH_CODING_SUPPORTED
-  fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
-#endif
-  fprintf(stderr, "  -baseline      Force baseline quantization tables\n");
-  fprintf(stderr, "  -qtables file  Use quantization tables given in file\n");
-  fprintf(stderr, "  -qslots N[,...]    Set component quantization tables\n");
-  fprintf(stderr, "  -sample HxV[,...]  Set component sampling factors\n");
-#ifdef C_MULTISCAN_FILES_SUPPORTED
-  fprintf(stderr, "  -scans file    Create multi-scan JPEG per script file\n");
-#endif
-  exit(EXIT_FAILURE);
-}
-
-
-LOCAL(int)
-parse_switches (j_compress_ptr cinfo, int argc, char **argv,
-		int last_file_arg_seen, boolean for_real)
-/* Parse optional switches.
- * Returns argv[] index of first file-name argument (== argc if none).
- * Any file names with indexes <= last_file_arg_seen are ignored;
- * they have presumably been processed in a previous iteration.
- * (Pass 0 for last_file_arg_seen on the first or only iteration.)
- * for_real is FALSE on the first (dummy) pass; we may skip any expensive
- * processing.
- */
-{
-  int argn;
-  char * arg;
-  int quality;			/* -quality parameter */
-  int q_scale_factor;		/* scaling percentage for -qtables */
-  boolean force_baseline;
-  boolean simple_progressive;
-  char * qtablefile = NULL;	/* saves -qtables filename if any */
-  char * qslotsarg = NULL;	/* saves -qslots parm if any */
-  char * samplearg = NULL;	/* saves -sample parm if any */
-  char * scansarg = NULL;	/* saves -scans parm if any */
-
-  /* Set up default JPEG parameters. */
-  /* Note that default -quality level need not, and does not,
-   * match the default scaling for an explicit -qtables argument.
-   */
-  quality = 75;			/* default -quality value */
-  q_scale_factor = 100;		/* default to no scaling for -qtables */
-  force_baseline = FALSE;	/* by default, allow 16-bit quantizers */
-  simple_progressive = FALSE;
-  is_targa = FALSE;
-  outfilename = NULL;
-  cinfo->err->trace_level = 0;
-
-  /* Scan command line options, adjust parameters */
-
-  for (argn = 1; argn < argc; argn++) {
-    arg = argv[argn];
-    if (*arg != '-') {
-      /* Not a switch, must be a file name argument */
-      if (argn <= last_file_arg_seen) {
-	outfilename = NULL;	/* -outfile applies to just one input file */
-	continue;		/* ignore this name if previously processed */
-      }
-      break;			/* else done parsing switches */
-    }
-    arg++;			/* advance past switch marker character */
-
-    if (keymatch(arg, "arithmetic", 1)) {
-      /* Use arithmetic coding. */
-#ifdef C_ARITH_CODING_SUPPORTED
-      cinfo->arith_code = TRUE;
-#else
-      fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
-	      progname);
-      exit(EXIT_FAILURE);
-#endif
-
-    } else if (keymatch(arg, "baseline", 1)) {
-      /* Force baseline-compatible output (8-bit quantizer values). */
-      force_baseline = TRUE;
-
-    } else if (keymatch(arg, "dct", 2)) {
-      /* Select DCT algorithm. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (keymatch(argv[argn], "int", 1)) {
-	cinfo->dct_method = JDCT_ISLOW;
-      } else if (keymatch(argv[argn], "fast", 2)) {
-	cinfo->dct_method = JDCT_IFAST;
-      } else if (keymatch(argv[argn], "float", 2)) {
-	cinfo->dct_method = JDCT_FLOAT;
-      } else
-	usage();
-
-    } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
-      /* Enable debug printouts. */
-      /* On first -d, print version identification */
-      static boolean printed_version = FALSE;
-
-      if (! printed_version) {
-	fprintf(stderr, "Independent JPEG Group's CJPEG, version %s\n%s\n",
-		JVERSION, JCOPYRIGHT);
-	printed_version = TRUE;
-      }
-      cinfo->err->trace_level++;
-
-    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
-      /* Force a monochrome JPEG file to be generated. */
-      jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
-
-    } else if (keymatch(arg, "maxmemory", 3)) {
-      /* Maximum memory in Kb (or Mb with 'm'). */
-      long lval;
-      char ch = 'x';
-
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
-      if (ch == 'm' || ch == 'M')
-	lval *= 1000L;
-      cinfo->mem->max_memory_to_use = lval * 1000L;
-
-    } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
-      /* Enable entropy parm optimization. */
-#ifdef ENTROPY_OPT_SUPPORTED
-      cinfo->optimize_coding = TRUE;
-#else
-      fprintf(stderr, "%s: sorry, entropy optimization was not compiled\n",
-	      progname);
-      exit(EXIT_FAILURE);
-#endif
-
-    } else if (keymatch(arg, "outfile", 4)) {
-      /* Set output file name. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      outfilename = argv[argn];	/* save it away for later use */
-
-    } else if (keymatch(arg, "progressive", 1)) {
-      /* Select simple progressive mode. */
-#ifdef C_PROGRESSIVE_SUPPORTED
-      simple_progressive = TRUE;
-      /* We must postpone execution until num_components is known. */
-#else
-      fprintf(stderr, "%s: sorry, progressive output was not compiled\n",
-	      progname);
-      exit(EXIT_FAILURE);
-#endif
-
-    } else if (keymatch(arg, "quality", 1)) {
-      /* Quality factor (quantization table scaling factor). */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%d", &quality) != 1)
-	usage();
-      /* Change scale factor in case -qtables is present. */
-      q_scale_factor = jpeg_quality_scaling(quality);
-
-    } else if (keymatch(arg, "qslots", 2)) {
-      /* Quantization table slot numbers. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      qslotsarg = argv[argn];
-      /* Must delay setting qslots until after we have processed any
-       * colorspace-determining switches, since jpeg_set_colorspace sets
-       * default quant table numbers.
-       */
-
-    } else if (keymatch(arg, "qtables", 2)) {
-      /* Quantization tables fetched from file. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      qtablefile = argv[argn];
-      /* We postpone actually reading the file in case -quality comes later. */
-
-    } else if (keymatch(arg, "restart", 1)) {
-      /* Restart interval in MCU rows (or in MCUs with 'b'). */
-      long lval;
-      char ch = 'x';
-
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
-      if (lval < 0 || lval > 65535L)
-	usage();
-      if (ch == 'b' || ch == 'B') {
-	cinfo->restart_interval = (unsigned int) lval;
-	cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
-      } else {
-	cinfo->restart_in_rows = (int) lval;
-	/* restart_interval will be computed during startup */
-      }
-
-    } else if (keymatch(arg, "sample", 2)) {
-      /* Set sampling factors. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      samplearg = argv[argn];
-      /* Must delay setting sample factors until after we have processed any
-       * colorspace-determining switches, since jpeg_set_colorspace sets
-       * default sampling factors.
-       */
-
-    } else if (keymatch(arg, "scans", 2)) {
-      /* Set scan script. */
-#ifdef C_MULTISCAN_FILES_SUPPORTED
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      scansarg = argv[argn];
-      /* We must postpone reading the file in case -progressive appears. */
-#else
-      fprintf(stderr, "%s: sorry, multi-scan output was not compiled\n",
-	      progname);
-      exit(EXIT_FAILURE);
-#endif
-
-    } else if (keymatch(arg, "smooth", 2)) {
-      /* Set input smoothing factor. */
-      int val;
-
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%d", &val) != 1)
-	usage();
-      if (val < 0 || val > 100)
-	usage();
-      cinfo->smoothing_factor = val;
-
-    } else if (keymatch(arg, "targa", 1)) {
-      /* Input file is Targa format. */
-      is_targa = TRUE;
-
-    } else {
-      usage();			/* bogus switch */
-    }
-  }
-
-  /* Post-switch-scanning cleanup */
-
-  if (for_real) {
-
-    /* Set quantization tables for selected quality. */
-    /* Some or all may be overridden if -qtables is present. */
-    jpeg_set_quality(cinfo, quality, force_baseline);
-
-    if (qtablefile != NULL)	/* process -qtables if it was present */
-      if (! read_quant_tables(cinfo, qtablefile,
-			      q_scale_factor, force_baseline))
-	usage();
-
-    if (qslotsarg != NULL)	/* process -qslots if it was present */
-      if (! set_quant_slots(cinfo, qslotsarg))
-	usage();
-
-    if (samplearg != NULL)	/* process -sample if it was present */
-      if (! set_sample_factors(cinfo, samplearg))
-	usage();
-
-#ifdef C_PROGRESSIVE_SUPPORTED
-    if (simple_progressive)	/* process -progressive; -scans can override */
-      jpeg_simple_progression(cinfo);
-#endif
-
-#ifdef C_MULTISCAN_FILES_SUPPORTED
-    if (scansarg != NULL)	/* process -scans if it was present */
-      if (! read_scan_script(cinfo, scansarg))
-	usage();
-#endif
-  }
-
-  return argn;			/* return index of next arg (file name) */
-}
-
-
-/*
- * The main program.
- */
-
-int
-main (int argc, char **argv)
-{
-  struct jpeg_compress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-#ifdef PROGRESS_REPORT
-  struct cdjpeg_progress_mgr progress;
-#endif
-  int file_index;
-  cjpeg_source_ptr src_mgr;
-  FILE * input_file;
-  FILE * output_file;
-  JDIMENSION num_scanlines;
-
-  /* On Mac, fetch a command line. */
-#ifdef USE_CCOMMAND
-  argc = ccommand(&argv);
-#endif
-
-  progname = argv[0];
-  if (progname == NULL || progname[0] == 0)
-    progname = "cjpeg";		/* in case C library doesn't provide it */
-
-  /* Initialize the JPEG compression object with default error handling. */
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_compress(&cinfo);
-  /* Add some application-specific error messages (from cderror.h) */
-  jerr.addon_message_table = cdjpeg_message_table;
-  jerr.first_addon_message = JMSG_FIRSTADDONCODE;
-  jerr.last_addon_message = JMSG_LASTADDONCODE;
-
-  /* Now safe to enable signal catcher. */
-#ifdef NEED_SIGNAL_CATCHER
-  enable_signal_catcher((j_common_ptr) &cinfo);
-#endif
-
-  /* Initialize JPEG parameters.
-   * Much of this may be overridden later.
-   * In particular, we don't yet know the input file's color space,
-   * but we need to provide some value for jpeg_set_defaults() to work.
-   */
-
-  cinfo.in_color_space = JCS_RGB; /* arbitrary guess */
-  jpeg_set_defaults(&cinfo);
-
-  /* Scan command line to find file names.
-   * It is convenient to use just one switch-parsing routine, but the switch
-   * values read here are ignored; we will rescan the switches after opening
-   * the input file.
-   */
-
-  file_index = parse_switches(&cinfo, argc, argv, 0, FALSE);
-
-#ifdef TWO_FILE_COMMANDLINE
-  /* Must have either -outfile switch or explicit output file name */
-  if (outfilename == NULL) {
-    if (file_index != argc-2) {
-      fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
-      usage();
-    }
-    outfilename = argv[file_index+1];
-  } else {
-    if (file_index != argc-1) {
-      fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
-      usage();
-    }
-  }
-#else
-  /* Unix style: expect zero or one file name */
-  if (file_index < argc-1) {
-    fprintf(stderr, "%s: only one input file\n", progname);
-    usage();
-  }
-#endif /* TWO_FILE_COMMANDLINE */
-
-  /* Open the input file. */
-  if (file_index < argc) {
-    if ((input_file = fopen(argv[file_index], READ_BINARY)) == NULL) {
-      fprintf(stderr, "%s: can't open %s\n", progname, argv[file_index]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    /* default input file is stdin */
-    input_file = read_stdin();
-  }
-
-  /* Open the output file. */
-  if (outfilename != NULL) {
-    if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) {
-      fprintf(stderr, "%s: can't open %s\n", progname, outfilename);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    /* default output file is stdout */
-    output_file = write_stdout();
-  }
-
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr) &cinfo, &progress);
-#endif
-
-  /* Figure out the input file format, and set up to read it. */
-  src_mgr = select_file_type(&cinfo, input_file);
-  src_mgr->input_file = input_file;
-
-  /* Read the input file header to obtain file size & colorspace. */
-  (*src_mgr->start_input) (&cinfo, src_mgr);
-
-  /* Now that we know input colorspace, fix colorspace-dependent defaults */
-  jpeg_default_colorspace(&cinfo);
-
-  /* Adjust default compression parameters by re-parsing the options */
-  file_index = parse_switches(&cinfo, argc, argv, 0, TRUE);
-
-  /* Specify data destination for compression */
-  jpeg_stdio_dest(&cinfo, output_file);
-
-  /* Start compressor */
-  jpeg_start_compress(&cinfo, TRUE);
-
-  /* Process data */
-  while (cinfo.next_scanline < cinfo.image_height) {
-    num_scanlines = (*src_mgr->get_pixel_rows) (&cinfo, src_mgr);
-    (void) jpeg_write_scanlines(&cinfo, src_mgr->buffer, num_scanlines);
-  }
-
-  /* Finish compression and release memory */
-  (*src_mgr->finish_input) (&cinfo, src_mgr);
-  jpeg_finish_compress(&cinfo);
-  jpeg_destroy_compress(&cinfo);
-
-  /* Close files, if we opened them */
-  if (input_file != stdin)
-    fclose(input_file);
-  if (output_file != stdout)
-    fclose(output_file);
-
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr) &cinfo);
-#endif
-
-  /* All done. */
-  exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
-  return 0;			/* suppress no-return-value warnings */
-}
--- a/jpeg/ckconfig.c
+++ b/jpeg/ckconfig.c
@ -1,402 +0,0 @@
-/*
- * ckconfig.c
- *
- * Copyright (C) 1991-1994, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- */
-
-/*
- * This program is intended to help you determine how to configure the JPEG
- * software for installation on a particular system.  The idea is to try to
- * compile and execute this program.  If your compiler fails to compile the
- * program, make changes as indicated in the comments below.  Once you can
- * compile the program, run it, and it will produce a "jconfig.h" file for
- * your system.
- *
- * As a general rule, each time you try to compile this program,
- * pay attention only to the *first* error message you get from the compiler.
- * Many C compilers will issue lots of spurious error messages once they
- * have gotten confused.  Go to the line indicated in the first error message,
- * and read the comments preceding that line to see what to change.
- *
- * Almost all of the edits you may need to make to this program consist of
- * changing a line that reads "#define SOME_SYMBOL" to "#undef SOME_SYMBOL",
- * or vice versa.  This is called defining or undefining that symbol.
- */
-
-
-/* First we must see if your system has the include files we need.
- * We start out with the assumption that your system has all the ANSI-standard
- * include files.  If you get any error trying to include one of these files,
- * undefine the corresponding HAVE_xxx symbol.
- */
-
-#define HAVE_STDDEF_H		/* replace 'define' by 'undef' if error here */
-#ifdef HAVE_STDDEF_H		/* next line will be skipped if you undef... */
-#include <stddef.h>
-#endif
-
-#define HAVE_STDLIB_H		/* same thing for stdlib.h */
-#ifdef HAVE_STDLIB_H
-#include <stdlib.h>
-#endif
-
-#include <stdio.h>		/* If you ain't got this, you ain't got C. */
-
-/* We have to see if your string functions are defined by
- * strings.h (old BSD convention) or string.h (everybody else).
- * We try the non-BSD convention first; define NEED_BSD_STRINGS
- * if the compiler says it can't find string.h.
- */
-
-#undef NEED_BSD_STRINGS
-
-#ifdef NEED_BSD_STRINGS
-#include <strings.h>
-#else
-#include <string.h>
-#endif
-
-/* On some systems (especially older Unix machines), type size_t is
- * defined only in the include file <sys/types.h>.  If you get a failure
- * on the size_t test below, try defining NEED_SYS_TYPES_H.
- */
-
-#undef NEED_SYS_TYPES_H		/* start by assuming we don't need it */
-#ifdef NEED_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
-
-/* Usually type size_t is defined in one of the include files we've included
- * above.  If not, you'll get an error on the "typedef size_t my_size_t;" line.
- * In that case, first try defining NEED_SYS_TYPES_H just above.
- * If that doesn't work, you'll have to search through your system library
- * to figure out which include file defines "size_t".  Look for a line that
- * says "typedef something-or-other size_t;".  Then, change the line below
- * that says "#include <someincludefile.h>" to instead include the file
- * you found size_t in, and define NEED_SPECIAL_INCLUDE.  If you can't find
- * type size_t anywhere, try replacing "#include <someincludefile.h>" with
- * "typedef unsigned int size_t;".
- */
-
-#undef NEED_SPECIAL_INCLUDE	/* assume we DON'T need it, for starters */
-
-#ifdef NEED_SPECIAL_INCLUDE
-#include <someincludefile.h>
-#endif
-
-typedef size_t my_size_t;	/* The payoff: do we have size_t now? */
-
-
-/* The next question is whether your compiler supports ANSI-style function
- * prototypes.  You need to know this in order to choose between using
- * makefile.ansi and using makefile.unix.
- * The #define line below is set to assume you have ANSI function prototypes.
- * If you get an error in this group of lines, undefine HAVE_PROTOTYPES.
- */
-
-#define HAVE_PROTOTYPES
-
-#ifdef HAVE_PROTOTYPES
-int testfunction (int arg1, int * arg2); /* check prototypes */
-
-struct methods_struct {		/* check method-pointer declarations */
-  int (*error_exit) (char *msgtext);
-  int (*trace_message) (char *msgtext);
-  int (*another_method) (void);
-};
-
-int testfunction (int arg1, int * arg2) /* check definitions */
-{
-  return arg2[arg1];
-}
-
-int test2function (void)	/* check void arg list */
-{
-  return 0;
-}
-#endif
-
-
-/* Now we want to find out if your compiler knows what "unsigned char" means.
- * If you get an error on the "unsigned char un_char;" line,
- * then undefine HAVE_UNSIGNED_CHAR.
- */
-
-#define HAVE_UNSIGNED_CHAR
-
-#ifdef HAVE_UNSIGNED_CHAR
-unsigned char un_char;
-#endif
-
-
-/* Now we want to find out if your compiler knows what "unsigned short" means.
- * If you get an error on the "unsigned short un_short;" line,
- * then undefine HAVE_UNSIGNED_SHORT.
- */
-
-#define HAVE_UNSIGNED_SHORT
-
-#ifdef HAVE_UNSIGNED_SHORT
-unsigned short un_short;
-#endif
-
-
-/* Now we want to find out if your compiler understands type "void".
- * If you get an error anywhere in here, undefine HAVE_VOID.
- */
-
-#define HAVE_VOID
-
-#ifdef HAVE_VOID
-/* Caution: a C++ compiler will insist on complete prototypes */
-typedef void * void_ptr;	/* check void * */
-#ifdef HAVE_PROTOTYPES		/* check ptr to function returning void */
-typedef void (*void_func) (int a, int b);
-#else
-typedef void (*void_func) ();
-#endif
-
-#ifdef HAVE_PROTOTYPES		/* check void function result */
-void test3function (void_ptr arg1, void_func arg2)
-#else
-void test3function (arg1, arg2)
-     void_ptr arg1;
-     void_func arg2;
-#endif
-{
-  char * locptr = (char *) arg1; /* check casting to and from void * */
-  arg1 = (void *) locptr;
-  (*arg2) (1, 2);		/* check call of fcn returning void */
-}
-#endif
-
-
-/* Now we want to find out if your compiler knows what "const" means.
- * If you get an error here, undefine HAVE_CONST.
- */
-
-#define HAVE_CONST
-
-#ifdef HAVE_CONST
-static const int carray[3] = {1, 2, 3};
-
-#ifdef HAVE_PROTOTYPES
-int test4function (const int arg1)
-#else
-int test4function (arg1)
-     const int arg1;
-#endif
-{
-  return carray[arg1];
-}
-#endif
-
-
-/* If you get an error or warning about this structure definition,
- * define INCOMPLETE_TYPES_BROKEN.
- */
-
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifndef INCOMPLETE_TYPES_BROKEN
-typedef struct undefined_structure * undef_struct_ptr;
-#endif
-
-
-/* If you get an error about duplicate names,
- * define NEED_SHORT_EXTERNAL_NAMES.
- */
-
-#undef NEED_SHORT_EXTERNAL_NAMES
-
-#ifndef NEED_SHORT_EXTERNAL_NAMES
-
-int possibly_duplicate_function ()
-{
-  return 0;
-}
-
-int possibly_dupli_function ()
-{
-  return 1;
-}
-
-#endif
-
-
-
-/************************************************************************
- *  OK, that's it.  You should not have to change anything beyond this
- *  point in order to compile and execute this program.  (You might get
- *  some warnings, but you can ignore them.)
- *  When you run the program, it will make a couple more tests that it
- *  can do automatically, and then it will create jconfig.h and print out
- *  any additional suggestions it has.
- ************************************************************************
- */
-
-
-#ifdef HAVE_PROTOTYPES
-int is_char_signed (int arg)
-#else
-int is_char_signed (arg)
-     int arg;
-#endif
-{
-  if (arg == 189) {		/* expected result for unsigned char */
-    return 0;			/* type char is unsigned */
-  }
-  else if (arg != -67) {	/* expected result for signed char */
-    printf("Hmm, it seems 'char' is not eight bits wide on your machine.\n");
-    printf("I fear the JPEG software will not work at all.\n\n");
-  }
-  return 1;			/* assume char is signed otherwise */
-}
-
-
-#ifdef HAVE_PROTOTYPES
-int is_shifting_signed (long arg)
-#else
-int is_shifting_signed (arg)
-     long arg;
-#endif
-/* See whether right-shift on a long is signed or not. */
-{
-  long res = arg >> 4;
-
-  if (res == -0x7F7E80CL) {	/* expected result for signed shift */
-    return 1;			/* right shift is signed */
-  }
-  /* see if unsigned-shift hack will fix it. */
-  /* we can't just test exact value since it depends on width of long... */
-  res |= (~0L) << (32-4);
-  if (res == -0x7F7E80CL) {	/* expected result now? */
-    return 0;			/* right shift is unsigned */
-  }
-  printf("Right shift isn't acting as I expect it to.\n");
-  printf("I fear the JPEG software will not work at all.\n\n");
-  return 0;			/* try it with unsigned anyway */
-}
-
-
-#ifdef HAVE_PROTOTYPES
-int main (int argc, char ** argv)
-#else
-int main (argc, argv)
-     int argc;
-     char ** argv;
-#endif
-{
-  char signed_char_check = (char) (-67);
-  FILE *outfile;
-
-  /* Attempt to write jconfig.h */
-  if ((outfile = fopen("jconfig.h", "w")) == NULL) {
-    printf("Failed to write jconfig.h\n");
-    return 1;
-  }
-
-  /* Write out all the info */
-  fprintf(outfile, "/* jconfig.h --- generated by ckconfig.c */\n");
-  fprintf(outfile, "/* see jconfig.doc for explanations */\n\n");
-#ifdef HAVE_PROTOTYPES
-  fprintf(outfile, "#define HAVE_PROTOTYPES\n");
-#else
-  fprintf(outfile, "#undef HAVE_PROTOTYPES\n");
-#endif
-#ifdef HAVE_UNSIGNED_CHAR
-  fprintf(outfile, "#define HAVE_UNSIGNED_CHAR\n");
-#else
-  fprintf(outfile, "#undef HAVE_UNSIGNED_CHAR\n");
-#endif
-#ifdef HAVE_UNSIGNED_SHORT
-  fprintf(outfile, "#define HAVE_UNSIGNED_SHORT\n");
-#else
-  fprintf(outfile, "#undef HAVE_UNSIGNED_SHORT\n");
-#endif
-#ifdef HAVE_VOID
-  fprintf(outfile, "/* #define void char */\n");
-#else
-  fprintf(outfile, "#define void char\n");
-#endif
-#ifdef HAVE_CONST
-  fprintf(outfile, "/* #define const */\n");
-#else
-  fprintf(outfile, "#define const\n");
-#endif
-  if (is_char_signed((int) signed_char_check))
-    fprintf(outfile, "#undef CHAR_IS_UNSIGNED\n");
-  else
-    fprintf(outfile, "#define CHAR_IS_UNSIGNED\n");
-#ifdef HAVE_STDDEF_H
-  fprintf(outfile, "#define HAVE_STDDEF_H\n");
-#else
-  fprintf(outfile, "#undef HAVE_STDDEF_H\n");
-#endif
-#ifdef HAVE_STDLIB_H
-  fprintf(outfile, "#define HAVE_STDLIB_H\n");
-#else
-  fprintf(outfile, "#undef HAVE_STDLIB_H\n");
-#endif
-#ifdef NEED_BSD_STRINGS
-  fprintf(outfile, "#define NEED_BSD_STRINGS\n");
-#else
-  fprintf(outfile, "#undef NEED_BSD_STRINGS\n");
-#endif
-#ifdef NEED_SYS_TYPES_H
-  fprintf(outfile, "#define NEED_SYS_TYPES_H\n");
-#else
-  fprintf(outfile, "#undef NEED_SYS_TYPES_H\n");
-#endif
-  fprintf(outfile, "#undef NEED_FAR_POINTERS\n");
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-  fprintf(outfile, "#define NEED_SHORT_EXTERNAL_NAMES\n");
-#else
-  fprintf(outfile, "#undef NEED_SHORT_EXTERNAL_NAMES\n");
-#endif
-#ifdef INCOMPLETE_TYPES_BROKEN
-  fprintf(outfile, "#define INCOMPLETE_TYPES_BROKEN\n");
-#else
-  fprintf(outfile, "#undef INCOMPLETE_TYPES_BROKEN\n");
-#endif
-  fprintf(outfile, "\n#ifdef JPEG_INTERNALS\n\n");
-  if (is_shifting_signed(-0x7F7E80B1L))
-    fprintf(outfile, "#undef RIGHT_SHIFT_IS_UNSIGNED\n");
-  else
-    fprintf(outfile, "#define RIGHT_SHIFT_IS_UNSIGNED\n");
-  fprintf(outfile, "\n#endif /* JPEG_INTERNALS */\n");
-  fprintf(outfile, "\n#ifdef JPEG_CJPEG_DJPEG\n\n");
-  fprintf(outfile, "#define BMP_SUPPORTED		/* BMP image file format */\n");
-  fprintf(outfile, "#define GIF_SUPPORTED		/* GIF image file format */\n");
-  fprintf(outfile, "#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */\n");
-  fprintf(outfile, "#undef RLE_SUPPORTED		/* Utah RLE image file format */\n");
-  fprintf(outfile, "#define TARGA_SUPPORTED		/* Targa image file format */\n\n");
-  fprintf(outfile, "#undef TWO_FILE_COMMANDLINE	/* You may need this on non-Unix systems */\n");
-  fprintf(outfile, "#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */\n");
-  fprintf(outfile, "#undef DONT_USE_B_MODE\n");
-  fprintf(outfile, "/* #define PROGRESS_REPORT */	/* optional */\n");
-  fprintf(outfile, "\n#endif /* JPEG_CJPEG_DJPEG */\n");
-
-  /* Close the jconfig.h file */
-  fclose(outfile);
-
-  /* User report */
-  printf("Configuration check for Independent JPEG Group's software done.\n");
-  printf("\nI have written the jconfig.h file for you.\n\n");
-#ifdef HAVE_PROTOTYPES
-  printf("You should use makefile.ansi as the starting point for your Makefile.\n");
-#else
-  printf("You should use makefile.unix as the starting point for your Makefile.\n");
-#endif
-
-#ifdef NEED_SPECIAL_INCLUDE
-  printf("\nYou'll need to change jconfig.h to include the system include file\n");
-  printf("that you found type size_t in, or add a direct definition of type\n");
-  printf("size_t if that's what you used.  Just add it to the end.\n");
-#endif
-
-  return 0;
-}
--- a/jpeg/coderules.doc
+++ b/jpeg/coderules.doc
@ -1,118 +0,0 @@
-IJG JPEG LIBRARY:  CODING RULES
-
-Copyright (C) 1991-1996, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-Since numerous people will be contributing code and bug fixes, it's important
-to establish a common coding style.  The goal of using similar coding styles
-is much more important than the details of just what that style is.
-
-In general we follow the recommendations of "Recommended C Style and Coding
-Standards" revision 6.1 (Cannon et al. as modified by Spencer, Keppel and
-Brader).  This document is available in the IJG FTP archive (see
-jpeg/doc/cstyle.ms.tbl.Z, or cstyle.txt.Z for those without nroff/tbl).
-
-Block comments should be laid out thusly:
-
-/*
- *  Block comments in this style.
- */
-
-We indent statements in K&R style, e.g.,
-	if (test) {
-	  then-part;
-	} else {
-	  else-part;
-	}
-with two spaces per indentation level.  (This indentation convention is
-handled automatically by GNU Emacs and many other text editors.)
-
-Multi-word names should be written in lower case with underscores, e.g.,
-multi_word_name (not multiWordName).  Preprocessor symbols and enum constants
-are similar but upper case (MULTI_WORD_NAME).  Names should be unique within
-the first fifteen characters.  (On some older systems, global names must be
-unique within six characters.  We accommodate this without cluttering the
-source code by using macros to substitute shorter names.)
-
-We use function prototypes everywhere; we rely on automatic source code
-transformation to feed prototype-less C compilers.  Transformation is done
-by the simple and portable tool 'ansi2knr.c' (courtesy of Ghostscript).
-ansi2knr is not very bright, so it imposes a format requirement on function
-declarations: the function name MUST BEGIN IN COLUMN 1.  Thus all functions
-should be written in the following style:
-
-LOCAL(int *)
-function_name (int a, char *b)
-{
-    code...
-}
-
-Note that each function definition must begin with GLOBAL(type), LOCAL(type),
-or METHODDEF(type).  These macros expand to "static type" or just "type" as
-appropriate.  They provide a readable indication of the routine's usage and
-can readily be changed for special needs.  (For instance, special linkage
-keywords can be inserted for use in Windows DLLs.)
-
-ansi2knr does not transform method declarations (function pointers in
-structs).  We handle these with a macro JMETHOD, defined as
-	#ifdef HAVE_PROTOTYPES
-	#define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
-	#else
-	#define JMETHOD(type,methodname,arglist)  type (*methodname) ()
-	#endif
-which is used like this:
-	struct function_pointers {
-	  JMETHOD(void, init_entropy_encoder, (int somearg, jparms *jp));
-	  JMETHOD(void, term_entropy_encoder, (void));
-	};
-Note the set of parentheses surrounding the parameter list.
-
-A similar solution is used for forward and external function declarations
-(see the EXTERN and JPP macros).
-
-If the code is to work on non-ANSI compilers, we cannot rely on a prototype
-declaration to coerce actual parameters into the right types.  Therefore, use
-explicit casts on actual parameters whenever the actual parameter type is not
-identical to the formal parameter.  Beware of implicit conversions to "int".
-
-It seems there are some non-ANSI compilers in which the sizeof() operator
-is defined to return int, yet size_t is defined as long.  Needless to say,
-this is brain-damaged.  Always use the SIZEOF() macro in place of sizeof(),
-so that the result is guaranteed to be of type size_t.
-
-
-The JPEG library is intended to be used within larger programs.  Furthermore,
-we want it to be reentrant so that it can be used by applications that process
-multiple images concurrently.  The following rules support these requirements:
-
-1. Avoid direct use of file I/O, "malloc", error report printouts, etc;
-pass these through the common routines provided.
-
-2. Minimize global namespace pollution.  Functions should be declared static
-wherever possible.  (Note that our method-based calling conventions help this
-a lot: in many modules only the initialization function will ever need to be
-called directly, so only that function need be externally visible.)  All
-global function names should begin with "jpeg_", and should have an
-abbreviated name (unique in the first six characters) substituted by macro
-when NEED_SHORT_EXTERNAL_NAMES is set.
-
-3. Don't use global variables; anything that must be used in another module
-should be in the common data structures.
-
-4. Don't use static variables except for read-only constant tables.  Variables
-that should be private to a module can be placed into private structures (see
-the system architecture document, structure.doc).
-
-5. Source file names should begin with "j" for files that are part of the
-library proper; source files that are not part of the library, such as cjpeg.c
-and djpeg.c, do not begin with "j".  Keep source file names to eight
-characters (plus ".c" or ".h", etc) to make life easy for MS-DOSers.  Keep
-compression and decompression code in separate source files --- some
-applications may want only one half of the library.
-
-Note: these rules (particularly #4) are not followed religiously in the
-modules that are used in cjpeg/djpeg but are not part of the JPEG library
-proper.  Those modules are not really intended to be used in other
-applications.
--- a/jpeg/djpeg.c
+++ b/jpeg/djpeg.c
@ -1,616 +0,0 @@
-/*
- * djpeg.c
- *
- * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains a command-line user interface for the JPEG decompressor.
- * It should work on any system with Unix- or MS-DOS-style command lines.
- *
- * Two different command line styles are permitted, depending on the
- * compile-time switch TWO_FILE_COMMANDLINE:
- *	djpeg [options]  inputfile outputfile
- *	djpeg [options]  [inputfile]
- * In the second style, output is always to standard output, which you'd
- * normally redirect to a file or pipe to some other program.  Input is
- * either from a named file or from standard input (typically redirected).
- * The second style is convenient on Unix but is unhelpful on systems that
- * don't support pipes.  Also, you MUST use the first style if your system
- * doesn't do binary I/O to stdin/stdout.
- * To simplify script writing, the "-outfile" switch is provided.  The syntax
- *	djpeg [options]  -outfile outputfile  inputfile
- * works regardless of which command line style is used.
- */
-
-#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
-#include "jversion.h"		/* for version message */
-
-#include <ctype.h>		/* to declare isprint() */
-
-#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
-#ifdef __MWERKS__
-#include <SIOUX.h>              /* Metrowerks needs this */
-#include <console.h>		/* ... and this */
-#endif
-#ifdef THINK_C
-#include <console.h>		/* Think declares it here */
-#endif
-#endif
-
-
-/* Create the add-on message string table. */
-
-#define JMESSAGE(code,string)	string ,
-
-static const char * const cdjpeg_message_table[] = {
-#include "cderror.h"
-  NULL
-};
-
-
-/*
- * This list defines the known output image formats
- * (not all of which need be supported by a given version).
- * You can change the default output format by defining DEFAULT_FMT;
- * indeed, you had better do so if you undefine PPM_SUPPORTED.
- */
-
-typedef enum {
-	FMT_BMP,		/* BMP format (Windows flavor) */
-	FMT_GIF,		/* GIF format */
-	FMT_OS2,		/* BMP format (OS/2 flavor) */
-	FMT_PPM,		/* PPM/PGM (PBMPLUS formats) */
-	FMT_RLE,		/* RLE format */
-	FMT_TARGA,		/* Targa format */
-	FMT_TIFF		/* TIFF format */
-} IMAGE_FORMATS;
-
-#ifndef DEFAULT_FMT		/* so can override from CFLAGS in Makefile */
-#define DEFAULT_FMT	FMT_PPM
-#endif
-
-static IMAGE_FORMATS requested_fmt;
-
-
-/*
- * Argument-parsing code.
- * The switch parser is designed to be useful with DOS-style command line
- * syntax, ie, intermixed switches and file names, where only the switches
- * to the left of a given file name affect processing of that file.
- * The main program in this file doesn't actually use this capability...
- */
-
-
-static const char * progname;	/* program name for error messages */
-static char * outfilename;	/* for -outfile switch */
-
-
-LOCAL(void)
-usage (void)
-/* complain about bad command line */
-{
-  fprintf(stderr, "usage: %s [switches] ", progname);
-#ifdef TWO_FILE_COMMANDLINE
-  fprintf(stderr, "inputfile outputfile\n");
-#else
-  fprintf(stderr, "[inputfile]\n");
-#endif
-
-  fprintf(stderr, "Switches (names may be abbreviated):\n");
-  fprintf(stderr, "  -colors N      Reduce image to no more than N colors\n");
-  fprintf(stderr, "  -fast          Fast, low-quality processing\n");
-  fprintf(stderr, "  -grayscale     Force grayscale output\n");
-#ifdef IDCT_SCALING_SUPPORTED
-  fprintf(stderr, "  -scale M/N     Scale output image by fraction M/N, eg, 1/8\n");
-#endif
-#ifdef BMP_SUPPORTED
-  fprintf(stderr, "  -bmp           Select BMP output format (Windows style)%s\n",
-	  (DEFAULT_FMT == FMT_BMP ? " (default)" : ""));
-#endif
-#ifdef GIF_SUPPORTED
-  fprintf(stderr, "  -gif           Select GIF output format%s\n",
-	  (DEFAULT_FMT == FMT_GIF ? " (default)" : ""));
-#endif
-#ifdef BMP_SUPPORTED
-  fprintf(stderr, "  -os2           Select BMP output format (OS/2 style)%s\n",
-	  (DEFAULT_FMT == FMT_OS2 ? " (default)" : ""));
-#endif
-#ifdef PPM_SUPPORTED
-  fprintf(stderr, "  -pnm           Select PBMPLUS (PPM/PGM) output format%s\n",
-	  (DEFAULT_FMT == FMT_PPM ? " (default)" : ""));
-#endif
-#ifdef RLE_SUPPORTED
-  fprintf(stderr, "  -rle           Select Utah RLE output format%s\n",
-	  (DEFAULT_FMT == FMT_RLE ? " (default)" : ""));
-#endif
-#ifdef TARGA_SUPPORTED
-  fprintf(stderr, "  -targa         Select Targa output format%s\n",
-	  (DEFAULT_FMT == FMT_TARGA ? " (default)" : ""));
-#endif
-  fprintf(stderr, "Switches for advanced users:\n");
-#ifdef DCT_ISLOW_SUPPORTED
-  fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
-#endif
-#ifdef DCT_IFAST_SUPPORTED
-  fprintf(stderr, "  -dct fast      Use fast integer DCT (less accurate)%s\n",
-	  (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
-#endif
-#ifdef DCT_FLOAT_SUPPORTED
-  fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
-	  (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
-#endif
-  fprintf(stderr, "  -dither fs     Use F-S dithering (default)\n");
-  fprintf(stderr, "  -dither none   Don't use dithering in quantization\n");
-  fprintf(stderr, "  -dither ordered  Use ordered dither (medium speed, quality)\n");
-#ifdef QUANT_2PASS_SUPPORTED
-  fprintf(stderr, "  -map FILE      Map to colors used in named image file\n");
-#endif
-  fprintf(stderr, "  -nosmooth      Don't use high-quality upsampling\n");
-#ifdef QUANT_1PASS_SUPPORTED
-  fprintf(stderr, "  -onepass       Use 1-pass quantization (fast, low quality)\n");
-#endif
-  fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
-  fprintf(stderr, "  -outfile name  Specify name for output file\n");
-  fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
-  exit(EXIT_FAILURE);
-}
-
-
-LOCAL(int)
-parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
-		int last_file_arg_seen, boolean for_real)
-/* Parse optional switches.
- * Returns argv[] index of first file-name argument (== argc if none).
- * Any file names with indexes <= last_file_arg_seen are ignored;
- * they have presumably been processed in a previous iteration.
- * (Pass 0 for last_file_arg_seen on the first or only iteration.)
- * for_real is FALSE on the first (dummy) pass; we may skip any expensive
- * processing.
- */
-{
-  int argn;
-  char * arg;
-
-  /* Set up default JPEG parameters. */
-  requested_fmt = DEFAULT_FMT;	/* set default output file format */
-  outfilename = NULL;
-  cinfo->err->trace_level = 0;
-
-  /* Scan command line options, adjust parameters */
-
-  for (argn = 1; argn < argc; argn++) {
-    arg = argv[argn];
-    if (*arg != '-') {
-      /* Not a switch, must be a file name argument */
-      if (argn <= last_file_arg_seen) {
-	outfilename = NULL;	/* -outfile applies to just one input file */
-	continue;		/* ignore this name if previously processed */
-      }
-      break;			/* else done parsing switches */
-    }
-    arg++;			/* advance past switch marker character */
-
-    if (keymatch(arg, "bmp", 1)) {
-      /* BMP output format. */
-      requested_fmt = FMT_BMP;
-
-    } else if (keymatch(arg, "colors", 1) || keymatch(arg, "colours", 1) ||
-	       keymatch(arg, "quantize", 1) || keymatch(arg, "quantise", 1)) {
-      /* Do color quantization. */
-      int val;
-
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%d", &val) != 1)
-	usage();
-      cinfo->desired_number_of_colors = val;
-      cinfo->quantize_colors = TRUE;
-
-    } else if (keymatch(arg, "dct", 2)) {
-      /* Select IDCT algorithm. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (keymatch(argv[argn], "int", 1)) {
-	cinfo->dct_method = JDCT_ISLOW;
-      } else if (keymatch(argv[argn], "fast", 2)) {
-	cinfo->dct_method = JDCT_IFAST;
-      } else if (keymatch(argv[argn], "float", 2)) {
-	cinfo->dct_method = JDCT_FLOAT;
-      } else
-	usage();
-
-    } else if (keymatch(arg, "dither", 2)) {
-      /* Select dithering algorithm. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (keymatch(argv[argn], "fs", 2)) {
-	cinfo->dither_mode = JDITHER_FS;
-      } else if (keymatch(argv[argn], "none", 2)) {
-	cinfo->dither_mode = JDITHER_NONE;
-      } else if (keymatch(argv[argn], "ordered", 2)) {
-	cinfo->dither_mode = JDITHER_ORDERED;
-      } else
-	usage();
-
-    } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
-      /* Enable debug printouts. */
-      /* On first -d, print version identification */
-      static boolean printed_version = FALSE;
-
-      if (! printed_version) {
-	fprintf(stderr, "Independent JPEG Group's DJPEG, version %s\n%s\n",
-		JVERSION, JCOPYRIGHT);
-	printed_version = TRUE;
-      }
-      cinfo->err->trace_level++;
-
-    } else if (keymatch(arg, "fast", 1)) {
-      /* Select recommended processing options for quick-and-dirty output. */
-      cinfo->two_pass_quantize = FALSE;
-      cinfo->dither_mode = JDITHER_ORDERED;
-      if (! cinfo->quantize_colors) /* don't override an earlier -colors */
-	cinfo->desired_number_of_colors = 216;
-      cinfo->dct_method = JDCT_FASTEST;
-      cinfo->do_fancy_upsampling = FALSE;
-
-    } else if (keymatch(arg, "gif", 1)) {
-      /* GIF output format. */
-      requested_fmt = FMT_GIF;
-
-    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
-      /* Force monochrome output. */
-      cinfo->out_color_space = JCS_GRAYSCALE;
-
-    } else if (keymatch(arg, "map", 3)) {
-      /* Quantize to a color map taken from an input file. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (for_real) {		/* too expensive to do twice! */
-#ifdef QUANT_2PASS_SUPPORTED	/* otherwise can't quantize to supplied map */
-	FILE * mapfile;
-
-	if ((mapfile = fopen(argv[argn], READ_BINARY)) == NULL) {
-	  fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
-	  exit(EXIT_FAILURE);
-	}
-	read_color_map(cinfo, mapfile);
-	fclose(mapfile);
-	cinfo->quantize_colors = TRUE;
-#else
-	ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-      }
-
-    } else if (keymatch(arg, "maxmemory", 3)) {
-      /* Maximum memory in Kb (or Mb with 'm'). */
-      long lval;
-      char ch = 'x';
-
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
-	usage();
-      if (ch == 'm' || ch == 'M')
-	lval *= 1000L;
-      cinfo->mem->max_memory_to_use = lval * 1000L;
-
-    } else if (keymatch(arg, "nosmooth", 3)) {
-      /* Suppress fancy upsampling */
-      cinfo->do_fancy_upsampling = FALSE;
-
-    } else if (keymatch(arg, "onepass", 3)) {
-      /* Use fast one-pass quantization. */
-      cinfo->two_pass_quantize = FALSE;
-
-    } else if (keymatch(arg, "os2", 3)) {
-      /* BMP output format (OS/2 flavor). */
-      requested_fmt = FMT_OS2;
-
-    } else if (keymatch(arg, "outfile", 4)) {
-      /* Set output file name. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      outfilename = argv[argn];	/* save it away for later use */
-
-    } else if (keymatch(arg, "pnm", 1) || keymatch(arg, "ppm", 1)) {
-      /* PPM/PGM output format. */
-      requested_fmt = FMT_PPM;
-
-    } else if (keymatch(arg, "rle", 1)) {
-      /* RLE output format. */
-      requested_fmt = FMT_RLE;
-
-    } else if (keymatch(arg, "scale", 1)) {
-      /* Scale the output image by a fraction M/N. */
-      if (++argn >= argc)	/* advance to next argument */
-	usage();
-      if (sscanf(argv[argn], "%d/%d",
-		 &cinfo->scale_num, &cinfo->scale_denom) != 2)
-	usage();
-
-    } else if (keymatch(arg, "targa", 1)) {
-      /* Targa output format. */
-      requested_fmt = FMT_TARGA;
-
-    } else {
-      usage();			/* bogus switch */
-    }
-  }
-
-  return argn;			/* return index of next arg (file name) */
-}
-
-
-/*
- * Marker processor for COM and interesting APPn markers.
- * This replaces the library's built-in processor, which just skips the marker.
- * We want to print out the marker as text, to the extent possible.
- * Note this code relies on a non-suspending data source.
- */
-
-LOCAL(unsigned int)
-jpeg_getc (j_decompress_ptr cinfo)
-/* Read next byte */
-{
-  struct jpeg_source_mgr * datasrc = cinfo->src;
-
-  if (datasrc->bytes_in_buffer == 0) {
-    if (! (*datasrc->fill_input_buffer) (cinfo))
-      ERREXIT(cinfo, JERR_CANT_SUSPEND);
-  }
-  datasrc->bytes_in_buffer--;
-  return GETJOCTET(*datasrc->next_input_byte++);
-}
-
-
-METHODDEF(boolean)
-print_text_marker (j_decompress_ptr cinfo)
-{
-  boolean traceit = (cinfo->err->trace_level >= 1);
-  INT32 length;
-  unsigned int ch;
-  unsigned int lastch = 0;
-
-  length = jpeg_getc(cinfo) << 8;
-  length += jpeg_getc(cinfo);
-  length -= 2;			/* discount the length word itself */
-
-  if (traceit) {
-    if (cinfo->unread_marker == JPEG_COM)
-      fprintf(stderr, "Comment, length %ld:\n", (long) length);
-    else			/* assume it is an APPn otherwise */
-      fprintf(stderr, "APP%d, length %ld:\n",
-	      cinfo->unread_marker - JPEG_APP0, (long) length);
-  }
-
-  while (--length >= 0) {
-    ch = jpeg_getc(cinfo);
-    if (traceit) {
-      /* Emit the character in a readable form.
-       * Nonprintables are converted to \nnn form,
-       * while \ is converted to \\.
-       * Newlines in CR, CR/LF, or LF form will be printed as one newline.
-       */
-      if (ch == '\r') {
-	fprintf(stderr, "\n");
-      } else if (ch == '\n') {
-	if (lastch != '\r')
-	  fprintf(stderr, "\n");
-      } else if (ch == '\\') {
-	fprintf(stderr, "\\\\");
-      } else if (isprint(ch)) {
-	putc(ch, stderr);
-      } else {
-	fprintf(stderr, "\\%03o", ch);
-      }
-      lastch = ch;
-    }
-  }
-
-  if (traceit)
-    fprintf(stderr, "\n");
-
-  return TRUE;
-}
-
-
-/*
- * The main program.
- */
-
-int
-main (int argc, char **argv)
-{
-  struct jpeg_decompress_struct cinfo;
-  struct jpeg_error_mgr jerr;
-#ifdef PROGRESS_REPORT
-  struct cdjpeg_progress_mgr progress;
-#endif
-  int file_index;
-  djpeg_dest_ptr dest_mgr = NULL;
-  FILE * input_file;
-  FILE * output_file;
-  JDIMENSION num_scanlines;
-
-  /* On Mac, fetch a command line. */
-#ifdef USE_CCOMMAND
-  argc = ccommand(&argv);
-#endif
-
-  progname = argv[0];
-  if (progname == NULL || progname[0] == 0)
-    progname = "djpeg";		/* in case C library doesn't provide it */
-
-  /* Initialize the JPEG decompression object with default error handling. */
-  cinfo.err = jpeg_std_error(&jerr);
-  jpeg_create_decompress(&cinfo);
-  /* Add some application-specific error messages (from cderror.h) */
-  jerr.addon_message_table = cdjpeg_message_table;
-  jerr.first_addon_message = JMSG_FIRSTADDONCODE;
-  jerr.last_addon_message = JMSG_LASTADDONCODE;
-
-  /* Insert custom marker processor for COM and APP12.
-   * APP12 is used by some digital camera makers for textual info,
-   * so we provide the ability to display it as text.
-   * If you like, additional APPn marker types can be selected for display,
-   * but don't try to override APP0 or APP14 this way (see libjpeg.doc).
-   */
-  jpeg_set_marker_processor(&cinfo, JPEG_COM, print_text_marker);
-  jpeg_set_marker_processor(&cinfo, JPEG_APP0+12, print_text_marker);
-
-  /* Now safe to enable signal catcher. */
-#ifdef NEED_SIGNAL_CATCHER
-  enable_signal_catcher((j_common_ptr) &cinfo);
-#endif
-
-  /* Scan command line to find file names. */
-  /* It is convenient to use just one switch-parsing routine, but the switch
-   * values read here are ignored; we will rescan the switches after opening
-   * the input file.
-   * (Exception: tracing level set here controls verbosity for COM markers
-   * found during jpeg_read_header...)
-   */
-
-  file_index = parse_switches(&cinfo, argc, argv, 0, FALSE);
-
-#ifdef TWO_FILE_COMMANDLINE
-  /* Must have either -outfile switch or explicit output file name */
-  if (outfilename == NULL) {
-    if (file_index != argc-2) {
-      fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
-      usage();
-    }
-    outfilename = argv[file_index+1];
-  } else {
-    if (file_index != argc-1) {
-      fprintf(stderr, "%s: must name one input and one output file\n",
-	      progname);
-      usage();
-    }
-  }
-#else
-  /* Unix style: expect zero or one file name */
-  if (file_index < argc-1) {
-    fprintf(stderr, "%s: only one input file\n", progname);
-    usage();
-  }
-#endif /* TWO_FILE_COMMANDLINE */
-
-  /* Open the input file. */
-  if (file_index < argc) {
-    if ((input_file = fopen(argv[file_index], READ_BINARY)) == NULL) {
-      fprintf(stderr, "%s: can't open %s\n", progname, argv[file_index]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    /* default input file is stdin */
-    input_file = read_stdin();
-  }
-
-  /* Open the output file. */
-  if (outfilename != NULL) {
-    if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) {
-      fprintf(stderr, "%s: can't open %s\n", progname, outfilename);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    /* default output file is stdout */
-    output_file = write_stdout();
-  }
-
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr) &cinfo, &progress);
-#endif
-
-  /* Specify data source for decompression */
-  jpeg_stdio_src(&cinfo, input_file);
-
-  /* Read file header, set default decompression parameters */
-  (void) jpeg_read_header(&cinfo, TRUE);
-
-  /* Adjust default decompression parameters by re-parsing the options */
-  file_index = parse_switches(&cinfo, argc, argv, 0, TRUE);
-
-  /* Initialize the output module now to let it override any crucial
-   * option settings (for instance, GIF wants to force color quantization).
-   */
-  switch (requested_fmt) {
-#ifdef BMP_SUPPORTED
-  case FMT_BMP:
-    dest_mgr = jinit_write_bmp(&cinfo, FALSE);
-    break;
-  case FMT_OS2:
-    dest_mgr = jinit_write_bmp(&cinfo, TRUE);
-    break;
-#endif
-#ifdef GIF_SUPPORTED
-  case FMT_GIF:
-    dest_mgr = jinit_write_gif(&cinfo);
-    break;
-#endif
-#ifdef PPM_SUPPORTED
-  case FMT_PPM:
-    dest_mgr = jinit_write_ppm(&cinfo);
-    break;
-#endif
-#ifdef RLE_SUPPORTED
-  case FMT_RLE:
-    dest_mgr = jinit_write_rle(&cinfo);
-    break;
-#endif
-#ifdef TARGA_SUPPORTED
-  case FMT_TARGA:
-    dest_mgr = jinit_write_targa(&cinfo);
-    break;
-#endif
-  default:
-    ERREXIT(&cinfo, JERR_UNSUPPORTED_FORMAT);
-    break;
-  }
-  dest_mgr->output_file = output_file;
-
-  /* Start decompressor */
-  (void) jpeg_start_decompress(&cinfo);
-
-  /* Write output file header */
-  (*dest_mgr->start_output) (&cinfo, dest_mgr);
-
-  /* Process data */
-  while (cinfo.output_scanline < cinfo.output_height) {
-    num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
-					dest_mgr->buffer_height);
-    (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
-  }
-
-#ifdef PROGRESS_REPORT
-  /* Hack: count final pass as done in case finish_output does an extra pass.
-   * The library won't have updated completed_passes.
-   */
-  progress.pub.completed_passes = progress.pub.total_passes;
-#endif
-
-  /* Finish decompression and release memory.
-   * I must do it in this order because output module has allocated memory
-   * of lifespan JPOOL_IMAGE; it needs to finish before releasing memory.
-   */
-  (*dest_mgr->finish_output) (&cinfo, dest_mgr);
-  (void) jpeg_finish_decompress(&cinfo);
-  jpeg_destroy_decompress(&cinfo);
-
-  /* Close files, if we opened them */
-  if (input_file != stdin)
-    fclose(input_file);
-  if (output_file != stdout)
-    fclose(output_file);
-
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr) &cinfo);
-#endif
-
-  /* All done. */
-  exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
-  return 0;			/* suppress no-return-value warnings */
-}
--- a/jpeg/example.c
+++ b/jpeg/example.c
@ -1,433 +0,0 @@
-/*
- * example.c
- *
- * This file illustrates how to use the IJG code as a subroutine library
- * to read or write JPEG image files.  You should look at this code in
- * conjunction with the documentation file libjpeg.doc.
- *
- * This code will not do anything useful as-is, but it may be helpful as a
- * skeleton for constructing routines that call the JPEG library.  
- *
- * We present these routines in the same coding style used in the JPEG code
- * (ANSI function definitions, etc); but you are of course free to code your
- * routines in a different style if you prefer.
- */
-
-#include <stdio.h>
-
-/*
- * Include file for users of JPEG library.
- * You will need to have included system headers that define at least
- * the typedefs FILE and size_t before you can include jpeglib.h.
- * (stdio.h is sufficient on ANSI-conforming systems.)
- * You may also wish to include "jerror.h".
- */
-
-#include "jpeglib.h"
-
-/*
- * <setjmp.h> is used for the optional error recovery mechanism shown in
- * the second part of the example.
- */
-
-#include <setjmp.h>
-
-
-
-/******************** JPEG COMPRESSION SAMPLE INTERFACE *******************/
-
-/* This half of the example shows how to feed data into the JPEG compressor.
- * We present a minimal version that does not worry about refinements such
- * as error recovery (the JPEG code will just exit() if it gets an error).
- */
-
-
-/*
- * IMAGE DATA FORMATS:
- *
- * The standard input image format is a rectangular array of pixels, with
- * each pixel having the same number of "component" values (color channels).
- * Each pixel row is an array of JSAMPLEs (which typically are unsigned chars).
- * If you are working with color data, then the color values for each pixel
- * must be adjacent in the row; for example, R,G,B,R,G,B,R,G,B,... for 24-bit
- * RGB color.
- *
- * For this example, we'll assume that this data structure matches the way
- * our application has stored the image in memory, so we can just pass a
- * pointer to our image buffer.  In particular, let's say that the image is
- * RGB color and is described by:
- */
-
-extern JSAMPLE * image_buffer;	/* Points to large array of R,G,B-order data */
-extern int image_height;	/* Number of rows in image */
-extern int image_width;		/* Number of columns in image */
-
-
-/*
- * Sample routine for JPEG compression.  We assume that the target file name
- * and a compression quality factor are passed in.
- */
-
-GLOBAL(void)
-write_JPEG_file (char * filename, int quality)
-{
-  /* This struct contains the JPEG compression parameters and pointers to
-   * working space (which is allocated as needed by the JPEG library).
-   * It is possible to have several such structures, representing multiple
-   * compression/decompression processes, in existence at once.  We refer
-   * to any one struct (and its associated working data) as a "JPEG object".
-   */
-  struct jpeg_compress_struct cinfo;
-  /* This struct represents a JPEG error handler.  It is declared separately
-   * because applications often want to supply a specialized error handler
-   * (see the second half of this file for an example).  But here we just
-   * take the easy way out and use the standard error handler, which will
-   * print a message on stderr and call exit() if compression fails.
-   * Note that this struct must live as long as the main JPEG parameter
-   * struct, to avoid dangling-pointer problems.
-   */
-  struct jpeg_error_mgr jerr;
-  /* More stuff */
-  FILE * outfile;		/* target file */
-  JSAMPROW row_pointer[1];	/* pointer to JSAMPLE row[s] */
-  int row_stride;		/* physical row width in image buffer */
-
-  /* Step 1: allocate and initialize JPEG compression object */
-
-  /* We have to set up the error handler first, in case the initialization
-   * step fails.  (Unlikely, but it could happen if you are out of memory.)
-   * This routine fills in the contents of struct jerr, and returns jerr's
-   * address which we place into the link field in cinfo.
-   */
-  cinfo.err = jpeg_std_error(&jerr);
-  /* Now we can initialize the JPEG compression object. */
-  jpeg_create_compress(&cinfo);
-
-  /* Step 2: specify data destination (eg, a file) */
-  /* Note: steps 2 and 3 can be done in either order. */
-
-  /* Here we use the library-supplied code to send compressed data to a
-   * stdio stream.  You can also write your own code to do something else.
-   * VERY IMPORTANT: use "b" option to fopen() if you are on a machine that
-   * requires it in order to write binary files.
-   */
-  if ((outfile = fopen(filename, "wb")) == NULL) {
-    fprintf(stderr, "can't open %s\n", filename);
-    exit(1);
-  }
-  jpeg_stdio_dest(&cinfo, outfile);
-
-  /* Step 3: set parameters for compression */
-
-  /* First we supply a description of the input image.
-   * Four fields of the cinfo struct must be filled in:
-   */
-  cinfo.image_width = image_width; 	/* image width and height, in pixels */
-  cinfo.image_height = image_height;
-  cinfo.input_components = 3;		/* # of color components per pixel */
-  cinfo.in_color_space = JCS_RGB; 	/* colorspace of input image */
-  /* Now use the library's routine to set default compression parameters.
-   * (You must set at least cinfo.in_color_space before calling this,
-   * since the defaults depend on the source color space.)
-   */
-  jpeg_set_defaults(&cinfo);
-  /* Now you can set any non-default parameters you wish to.
-   * Here we just illustrate the use of quality (quantization table) scaling:
-   */
-  jpeg_set_quality(&cinfo, quality, TRUE /* limit to baseline-JPEG values */);
-
-  /* Step 4: Start compressor */
-
-  /* TRUE ensures that we will write a complete interchange-JPEG file.
-   * Pass TRUE unless you are very sure of what you're doing.
-   */
-  jpeg_start_compress(&cinfo, TRUE);
-
-  /* Step 5: while (scan lines remain to be written) */
-  /*           jpeg_write_scanlines(...); */
-
-  /* Here we use the library's state variable cinfo.next_scanline as the
-   * loop counter, so that we don't have to keep track ourselves.
-   * To keep things simple, we pass one scanline per call; you can pass
-   * more if you wish, though.
-   */
-  row_stride = image_width * 3;	/* JSAMPLEs per row in image_buffer */
-
-  while (cinfo.next_scanline < cinfo.image_height) {
-    /* jpeg_write_scanlines expects an array of pointers to scanlines.
-     * Here the array is only one element long, but you could pass
-     * more than one scanline at a time if that's more convenient.
-     */
-    row_pointer[0] = & image_buffer[cinfo.next_scanline * row_stride];
-    (void) jpeg_write_scanlines(&cinfo, row_pointer, 1);
-  }
-
-  /* Step 6: Finish compression */
-
-  jpeg_finish_compress(&cinfo);
-  /* After finish_compress, we can close the output file. */
-  fclose(outfile);
-
-  /* Step 7: release JPEG compression object */
-
-  /* This is an important step since it will release a good deal of memory. */
-  jpeg_destroy_compress(&cinfo);
-
-  /* And we're done! */
-}
-
-
-/*
- * SOME FINE POINTS:
- *
- * In the above loop, we ignored the return value of jpeg_write_scanlines,
- * which is the number of scanlines actually written.  We could get away
- * with this because we were only relying on the value of cinfo.next_scanline,
- * which will be incremented correctly.  If you maintain additional loop
- * variables then you should be careful to increment them properly.
- * Actually, for output to a stdio stream you needn't worry, because
- * then jpeg_write_scanlines will write all the lines passed (or else exit
- * with a fatal error).  Partial writes can only occur if you use a data
- * destination module that can demand suspension of the compressor.
- * (If you don't know what that's for, you don't need it.)
- *
- * If the compressor requires full-image buffers (for entropy-coding
- * optimization or a multi-scan JPEG file), it will create temporary
- * files for anything that doesn't fit within the maximum-memory setting.
- * (Note that temp files are NOT needed if you use the default parameters.)
- * On some systems you may need to set up a signal handler to ensure that
- * temporary files are deleted if the program is interrupted.  See libjpeg.doc.
- *
- * Scanlines MUST be supplied in top-to-bottom order if you want your JPEG
- * files to be compatible with everyone else's.  If you cannot readily read
- * your data in that order, you'll need an intermediate array to hold the
- * image.  See rdtarga.c or rdbmp.c for examples of handling bottom-to-top
- * source data using the JPEG code's internal virtual-array mechanisms.
- */
-
-
-
-/******************** JPEG DECOMPRESSION SAMPLE INTERFACE *******************/
-
-/* This half of the example shows how to read data from the JPEG decompressor.
- * It's a bit more refined than the above, in that we show:
- *   (a) how to modify the JPEG library's standard error-reporting behavior;
- *   (b) how to allocate workspace using the library's memory manager.
- *
- * Just to make this example a little different from the first one, we'll
- * assume that we do not intend to put the whole image into an in-memory
- * buffer, but to send it line-by-line someplace else.  We need a one-
- * scanline-high JSAMPLE array as a work buffer, and we will let the JPEG
- * memory manager allocate it for us.  This approach is actually quite useful
- * because we don't need to remember to deallocate the buffer separately: it
- * will go away automatically when the JPEG object is cleaned up.
- */
-
-
-/*
- * ERROR HANDLING:
- *
- * The JPEG library's standard error handler (jerror.c) is divided into
- * several "methods" which you can override individually.  This lets you
- * adjust the behavior without duplicating a lot of code, which you might
- * have to update with each future release.
- *
- * Our example here shows how to override the "error_exit" method so that
- * control is returned to the library's caller when a fatal error occurs,
- * rather than calling exit() as the standard error_exit method does.
- *
- * We use C's setjmp/longjmp facility to return control.  This means that the
- * routine which calls the JPEG library must first execute a setjmp() call to
- * establish the return point.  We want the replacement error_exit to do a
- * longjmp().  But we need to make the setjmp buffer accessible to the
- * error_exit routine.  To do this, we make a private extension of the
- * standard JPEG error handler object.  (If we were using C++, we'd say we
- * were making a subclass of the regular error handler.)
- *
- * Here's the extended error handler struct:
- */
-
-struct my_error_mgr {
-  struct jpeg_error_mgr pub;	/* "public" fields */
-
-  jmp_buf setjmp_buffer;	/* for return to caller */
-};
-
-typedef struct my_error_mgr * my_error_ptr;
-
-/*
- * Here's the routine that will replace the standard error_exit method:
- */
-
-METHODDEF(void)
-my_error_exit (j_common_ptr cinfo)
-{
-  /* cinfo->err really points to a my_error_mgr struct, so coerce pointer */
-  my_error_ptr myerr = (my_error_ptr) cinfo->err;
-
-  /* Always display the message. */
-  /* We could postpone this until after returning, if we chose. */
-  (*cinfo->err->output_message) (cinfo);
-
-  /* Return control to the setjmp point */
-  longjmp(myerr->setjmp_buffer, 1);
-}
-
-
-/*
- * Sample routine for JPEG decompression.  We assume that the source file name
- * is passed in.  We want to return 1 on success, 0 on error.
- */
-
-
-GLOBAL(int)
-read_JPEG_file (char * filename)
-{
-  /* This struct contains the JPEG decompression parameters and pointers to
-   * working space (which is allocated as needed by the JPEG library).
-   */
-  struct jpeg_decompress_struct cinfo;
-  /* We use our private extension JPEG error handler.
-   * Note that this struct must live as long as the main JPEG parameter
-   * struct, to avoid dangling-pointer problems.
-   */
-  struct my_error_mgr jerr;
-  /* More stuff */
-  FILE * infile;		/* source file */
-  JSAMPARRAY buffer;		/* Output row buffer */
-  int row_stride;		/* physical row width in output buffer */
-
-  /* In this example we want to open the input file before doing anything else,
-   * so that the setjmp() error recovery below can assume the file is open.
-   * VERY IMPORTANT: use "b" option to fopen() if you are on a machine that
-   * requires it in order to read binary files.
-   */
-
-  if ((infile = fopen(filename, "rb")) == NULL) {
-    fprintf(stderr, "can't open %s\n", filename);
-    return 0;
-  }
-
-  /* Step 1: allocate and initialize JPEG decompression object */
-
-  /* We set up the normal JPEG error routines, then override error_exit. */
-  cinfo.err = jpeg_std_error(&jerr.pub);
-  jerr.pub.error_exit = my_error_exit;
-  /* Establish the setjmp return context for my_error_exit to use. */
-  if (setjmp(jerr.setjmp_buffer)) {
-    /* If we get here, the JPEG code has signaled an error.
-     * We need to clean up the JPEG object, close the input file, and return.
-     */
-    jpeg_destroy_decompress(&cinfo);
-    fclose(infile);
-    return 0;
-  }
-  /* Now we can initialize the JPEG decompression object. */
-  jpeg_create_decompress(&cinfo);
-
-  /* Step 2: specify data source (eg, a file) */
-
-  jpeg_stdio_src(&cinfo, infile);
-
-  /* Step 3: read file parameters with jpeg_read_header() */
-
-  (void) jpeg_read_header(&cinfo, TRUE);
-  /* We can ignore the return value from jpeg_read_header since
-   *   (a) suspension is not possible with the stdio data source, and
-   *   (b) we passed TRUE to reject a tables-only JPEG file as an error.
-   * See libjpeg.doc for more info.
-   */
-
-  /* Step 4: set parameters for decompression */
-
-  /* In this example, we don't need to change any of the defaults set by
-   * jpeg_read_header(), so we do nothing here.
-   */
-
-  /* Step 5: Start decompressor */
-
-  (void) jpeg_start_decompress(&cinfo);
-  /* We can ignore the return value since suspension is not possible
-   * with the stdio data source.
-   */
-
-  /* We may need to do some setup of our own at this point before reading
-   * the data.  After jpeg_start_decompress() we have the correct scaled
-   * output image dimensions available, as well as the output colormap
-   * if we asked for color quantization.
-   * In this example, we need to make an output work buffer of the right size.
-   */ 
-  /* JSAMPLEs per row in output buffer */
-  row_stride = cinfo.output_width * cinfo.output_components;
-  /* Make a one-row-high sample array that will go away when done with image */
-  buffer = (*cinfo.mem->alloc_sarray)
-		((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
-
-  /* Step 6: while (scan lines remain to be read) */
-  /*           jpeg_read_scanlines(...); */
-
-  /* Here we use the library's state variable cinfo.output_scanline as the
-   * loop counter, so that we don't have to keep track ourselves.
-   */
-  while (cinfo.output_scanline < cinfo.output_height) {
-    /* jpeg_read_scanlines expects an array of pointers to scanlines.
-     * Here the array is only one element long, but you could ask for
-     * more than one scanline at a time if that's more convenient.
-     */
-    (void) jpeg_read_scanlines(&cinfo, buffer, 1);
-    /* Assume put_scanline_someplace wants a pointer and sample count. */
-    put_scanline_someplace(buffer[0], row_stride);
-  }
-
-  /* Step 7: Finish decompression */
-
-  (void) jpeg_finish_decompress(&cinfo);
-  /* We can ignore the return value since suspension is not possible
-   * with the stdio data source.
-   */
-
-  /* Step 8: Release JPEG decompression object */
-
-  /* This is an important step since it will release a good deal of memory. */
-  jpeg_destroy_decompress(&cinfo);
-
-  /* After finish_decompress, we can close the input file.
-   * Here we postpone it until after no more JPEG errors are possible,
-   * so as to simplify the setjmp error logic above.  (Actually, I don't
-   * think that jpeg_destroy can do an error exit, but why assume anything...)
-   */
-  fclose(infile);
-
-  /* At this point you may want to check to see whether any corrupt-data
-   * warnings occurred (test whether jerr.pub.num_warnings is nonzero).
-   */
-
-  /* And we're done! */
-  return 1;
-}
-
-
-/*
- * SOME FINE POINTS:
- *
- * In the above code, we ignored the return value of jpeg_read_scanlines,
- * which is the number of scanlines actually read.  We could get away with
- * this because we asked for only one line at a time and we weren't using
- * a suspending data source.  See libjpeg.doc for more info.
- *
- * We cheated a bit by calling alloc_sarray() after jpeg_start_decompress();
- * we should have done it beforehand to ensure that the space would be
- * counted against the JPEG max_memory setting.  In some systems the above
- * code would risk an out-of-memory error.  However, in general we don't
- * know the output image dimensions before jpeg_start_decompress(), unless we
- * call jpeg_calc_output_dimensions().  See libjpeg.doc for more about this.
- *
- * Scanlines are returned in the same order as they appear in the JPEG file,
- * which is standardly top-to-bottom.  If you must emit data bottom-to-top,
- * you can use one of the virtual arrays provided by the JPEG memory manager
- * to invert the data.  See wrbmp.c for an example.
- *
- * As with compression, some operating modes may require temporary files.
- * On some systems you may need to set up a signal handler to ensure that
- * temporary files are deleted if the program is interrupted.  See libjpeg.doc.
- */
--- a/jpeg/filelist.doc
+++ b/jpeg/filelist.doc
@ -1,210 +0,0 @@
-IJG JPEG LIBRARY:  FILE LIST
-
-Copyright (C) 1994-1998, Thomas G. Lane.
-This file is part of the Independent JPEG Group's software.
-For conditions of distribution and use, see the accompanying README file.
-
-
-Here is a road map to the files in the IJG JPEG distribution.  The
-distribution includes the JPEG library proper, plus two application
-programs ("cjpeg" and "djpeg") which use the library to convert JPEG
-files to and from some other popular image formats.  A third application
-"jpegtran" uses the library to do lossless conversion between different
-variants of JPEG.  There are also two stand-alone applications,
-"rdjpgcom" and "wrjpgcom".
-
-
-THE JPEG LIBRARY
-================
-
-Include files:
-
-jpeglib.h	JPEG library's exported data and function declarations.
-jconfig.h	Configuration declarations.  Note: this file is not present
-		in the distribution; it is generated during installation.
-jmorecfg.h	Additional configuration declarations; need not be changed
-		for a standard installation.
-jerror.h	Declares JPEG library's error and trace message codes.
-jinclude.h	Central include file used by all IJG .c files to reference
-		system include files.
-jpegint.h	JPEG library's internal data structures.
-jchuff.h	Private declarations for Huffman encoder modules.
-jdhuff.h	Private declarations for Huffman decoder modules.
-jdct.h		Private declarations for forward & reverse DCT subsystems.
-jmemsys.h	Private declarations for memory management subsystem.
-jversion.h	Version information.
-
-Applications using the library should include jpeglib.h (which in turn
-includes jconfig.h and jmorecfg.h).  Optionally, jerror.h may be included
-if the application needs to reference individual JPEG error codes.  The
-other include files are intended for internal use and would not normally
-be included by an application program.  (cjpeg/djpeg/etc do use jinclude.h,
-since its function is to improve portability of the whole IJG distribution.
-Most other applications will directly include the system include files they
-want, and hence won't need jinclude.h.)
-
-
-C source code files:
-
-These files contain most of the functions intended to be called directly by
-an application program:
-
-jcapimin.c	Application program interface: core routines for compression.
-jcapistd.c	Application program interface: standard compression.
-jdapimin.c	Application program interface: core routines for decompression.
-jdapistd.c	Application program interface: standard decompression.
-jcomapi.c	Application program interface routines common to compression
-		and decompression.
-jcparam.c	Compression parameter setting helper routines.
-jctrans.c	API and library routines for transcoding compression.
-jdtrans.c	API and library routines for transcoding decompression.
-
-Compression side of the library:
-
-jcinit.c	Initialization: determines which other modules to use.
-jcmaster.c	Master control: setup and inter-pass sequencing logic.
-jcmainct.c	Main buffer controller (preprocessor => JPEG compressor).
-jcprepct.c	Preprocessor buffer controller.
-jccoefct.c	Buffer controller for DCT coefficient buffer.
-jccolor.c	Color space conversion.
-jcsample.c	Downsampling.
-jcdctmgr.c	DCT manager (DCT implementation selection & control).
-jfdctint.c	Forward DCT using slow-but-accurate integer method.
-jfdctfst.c	Forward DCT using faster, less accurate integer method.
-jfdctflt.c	Forward DCT using floating-point arithmetic.
-jchuff.c	Huffman entropy coding for sequential JPEG.
-jcphuff.c	Huffman entropy coding for progressive JPEG.
-jcmarker.c	JPEG marker writing.
-jdatadst.c	Data destination manager for stdio output.
-
-Decompression side of the library:
-
-jdmaster.c	Master control: determines which other modules to use.
-jdinput.c	Input controller: controls input processing modules.
-jdmainct.c	Main buffer controller (JPEG decompressor => postprocessor).
-jdcoefct.c	Buffer controller for DCT coefficient buffer.
-jdpostct.c	Postprocessor buffer controller.
-jdmarker.c	JPEG marker reading.
-jdhuff.c	Huffman entropy decoding for sequential JPEG.
-jdphuff.c	Huffman entropy decoding for progressive JPEG.
-jddctmgr.c	IDCT manager (IDCT implementation selection & control).
-jidctint.c	Inverse DCT using slow-but-accurate integer method.
-jidctfst.c	Inverse DCT using faster, less accurate integer method.
-jidctflt.c	Inverse DCT using floating-point arithmetic.
-jidctred.c	Inverse DCTs with reduced-size outputs.
-jdsample.c	Upsampling.
-jdcolor.c	Color space conversion.
-jdmerge.c	Merged upsampling/color conversion (faster, lower quality).
-jquant1.c	One-pass color quantization using a fixed-spacing colormap.
-jquant2.c	Two-pass color quantization using a custom-generated colormap.
-		Also handles one-pass quantization to an externally given map.
-jdatasrc.c	Data source manager for stdio input.
-
-Support files for both compression and decompression:
-
-jerror.c	Standard error handling routines (application replaceable).
-jmemmgr.c	System-independent (more or less) memory management code.
-jutils.c	Miscellaneous utility routines.
-
-jmemmgr.c relies on a system-dependent memory management module.  The IJG
-distribution includes the following implementations of the system-dependent
-module:
-
-jmemnobs.c	"No backing store": assumes adequate virtual memory exists.
-jmemansi.c	Makes temporary files with ANSI-standard routine tmpfile().
-jmemname.c	Makes temporary files with program-generated file names.
-jmemdos.c	Custom implementation for MS-DOS (16-bit environment only):
-		can use extended and expanded memory as well as temp files.
-jmemmac.c	Custom implementation for Apple Macintosh.
-
-Exactly one of the system-dependent modules should be configured into an
-installed JPEG library (see install.doc for hints about which one to use).
-On unusual systems you may find it worthwhile to make a special
-system-dependent memory manager.
-
-
-Non-C source code files:
-
-jmemdosa.asm	80x86 assembly code support for jmemdos.c; used only in
-		MS-DOS-specific configurations of the JPEG library.
-
-
-CJPEG/DJPEG/JPEGTRAN
-====================
-
-Include files:
-
-cdjpeg.h	Declarations shared by cjpeg/djpeg/jpegtran modules.
-cderror.h	Additional error and trace message codes for cjpeg et al.
-transupp.h	Declarations for jpegtran support routines in transupp.c.
-
-C source code files:
-
-cjpeg.c		Main program for cjpeg.
-djpeg.c		Main program for djpeg.
-jpegtran.c	Main program for jpegtran.
-cdjpeg.c	Utility routines used by all three programs.
-rdcolmap.c	Code to read a colormap file for djpeg's "-map" switch.
-rdswitch.c	Code to process some of cjpeg's more complex switches.
-		Also used by jpegtran.
-transupp.c	Support code for jpegtran: lossless image manipulations.
-
-Image file reader modules for cjpeg:
-
-rdbmp.c		BMP file input.
-rdgif.c		GIF file input (now just a stub).
-rdppm.c		PPM/PGM file input.
-rdrle.c		Utah RLE file input.
-rdtarga.c	Targa file input.
-
-Image file writer modules for djpeg:
-
-wrbmp.c		BMP file output.
-wrgif.c		GIF file output (a mere shadow of its former self).
-wrppm.c		PPM/PGM file output.
-wrrle.c		Utah RLE file output.
-wrtarga.c	Targa file output.
-
-
-RDJPGCOM/WRJPGCOM
-=================
-
-C source code files:
-
-rdjpgcom.c	Stand-alone rdjpgcom application.
-wrjpgcom.c	Stand-alone wrjpgcom application.
-
-These programs do not depend on the IJG library.  They do use
-jconfig.h and jinclude.h, only to improve portability.
-
-
-ADDITIONAL FILES
-================
-
-Documentation (see README for a guide to the documentation files):
-
-README		Master documentation file.
-*.doc		Other documentation files.
-*.1		Documentation in Unix man page format.
-change.log	Version-to-version change highlights.
-example.c	Sample code for calling JPEG library.
-
-Configuration/installation files and programs (see install.doc for more info):
-
-configure	Unix shell script to perform automatic configuration.
-ltconfig	Support scripts for configure (from GNU libtool).
-ltmain.sh
-config.guess
-config.sub
-install-sh	Install shell script for those Unix systems lacking one.
-ckconfig.c	Program to generate jconfig.h on non-Unix systems.
-jconfig.doc	Template for making jconfig.h by hand.
-makefile.*	Sample makefiles for particular systems.
-jconfig.*	Sample jconfig.h for particular systems.
-ansi2knr.c	De-ANSIfier for pre-ANSI C compilers (courtesy of
-		L. Peter Deutsch and Aladdin Enterprises).
-
-Test files (see install.doc for test procedure):
-
-test*.*		Source and comparison files for confidence test.
-		These are binary image files, NOT text files.
--- a/jpeg/install.doc
+++ b/jpeg/install.doc
--- a/jpeg/jaricom.c
+++ b/jpeg/jaricom.c
@ -0,0 +1,152 @@
+/*
+ * jaricom.c
+ *
+ * Developed 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains probability estimation tables for common use in
+ * arithmetic entropy encoding and decoding routines.
+ *
+ * This data represents Table D.2 in the JPEG spec (ISO/IEC IS 10918-1
+ * and CCITT Recommendation ITU-T T.81) and Table 24 in the JBIG spec
+ * (ISO/IEC IS 11544 and CCITT Recommendation ITU-T T.82).
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+/* The following #define specifies the packing of the four components
+ * into the compact INT32 representation.
+ * Note that this formula must match the actual arithmetic encoder
+ * and decoder implementation.  The implementation has to be changed
+ * if this formula is changed.
+ * The current organization is leaned on Markus Kuhn's JBIG
+ * implementation (jbig_tab.c).
+ */
+
+#define V(i,a,b,c,d) (((INT32)a << 16) | ((INT32)c << 8) | ((INT32)d << 7) | b)
+
+const INT32 jpeg_aritab[113+1] = {
+/*
+ * Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
+ */
+  V(   0, 0x5a1d,   1,   1, 1 ),
+  V(   1, 0x2586,  14,   2, 0 ),
+  V(   2, 0x1114,  16,   3, 0 ),
+  V(   3, 0x080b,  18,   4, 0 ),
+  V(   4, 0x03d8,  20,   5, 0 ),
+  V(   5, 0x01da,  23,   6, 0 ),
+  V(   6, 0x00e5,  25,   7, 0 ),
+  V(   7, 0x006f,  28,   8, 0 ),
+  V(   8, 0x0036,  30,   9, 0 ),
+  V(   9, 0x001a,  33,  10, 0 ),
+  V(  10, 0x000d,  35,  11, 0 ),
+  V(  11, 0x0006,   9,  12, 0 ),
+  V(  12, 0x0003,  10,  13, 0 ),
+  V(  13, 0x0001,  12,  13, 0 ),
+  V(  14, 0x5a7f,  15,  15, 1 ),
+  V(  15, 0x3f25,  36,  16, 0 ),
+  V(  16, 0x2cf2,  38,  17, 0 ),
+  V(  17, 0x207c,  39,  18, 0 ),
+  V(  18, 0x17b9,  40,  19, 0 ),
+  V(  19, 0x1182,  42,  20, 0 ),
+  V(  20, 0x0cef,  43,  21, 0 ),
+  V(  21, 0x09a1,  45,  22, 0 ),
+  V(  22, 0x072f,  46,  23, 0 ),
+  V(  23, 0x055c,  48,  24, 0 ),
+  V(  24, 0x0406,  49,  25, 0 ),
+  V(  25, 0x0303,  51,  26, 0 ),
+  V(  26, 0x0240,  52,  27, 0 ),
+  V(  27, 0x01b1,  54,  28, 0 ),
+  V(  28, 0x0144,  56,  29, 0 ),
+  V(  29, 0x00f5,  57,  30, 0 ),
+  V(  30, 0x00b7,  59,  31, 0 ),
+  V(  31, 0x008a,  60,  32, 0 ),
+  V(  32, 0x0068,  62,  33, 0 ),
+  V(  33, 0x004e,  63,  34, 0 ),
+  V(  34, 0x003b,  32,  35, 0 ),
+  V(  35, 0x002c,  33,   9, 0 ),
+  V(  36, 0x5ae1,  37,  37, 1 ),
+  V(  37, 0x484c,  64,  38, 0 ),
+  V(  38, 0x3a0d,  65,  39, 0 ),
+  V(  39, 0x2ef1,  67,  40, 0 ),
+  V(  40, 0x261f,  68,  41, 0 ),
+  V(  41, 0x1f33,  69,  42, 0 ),
+  V(  42, 0x19a8,  70,  43, 0 ),
+  V(  43, 0x1518,  72,  44, 0 ),
+  V(  44, 0x1177,  73,  45, 0 ),
+  V(  45, 0x0e74,  74,  46, 0 ),
+  V(  46, 0x0bfb,  75,  47, 0 ),
+  V(  47, 0x09f8,  77,  48, 0 ),
+  V(  48, 0x0861,  78,  49, 0 ),
+  V(  49, 0x0706,  79,  50, 0 ),
+  V(  50, 0x05cd,  48,  51, 0 ),
+  V(  51, 0x04de,  50,  52, 0 ),
+  V(  52, 0x040f,  50,  53, 0 ),
+  V(  53, 0x0363,  51,  54, 0 ),
+  V(  54, 0x02d4,  52,  55, 0 ),
+  V(  55, 0x025c,  53,  56, 0 ),
+  V(  56, 0x01f8,  54,  57, 0 ),
+  V(  57, 0x01a4,  55,  58, 0 ),
+  V(  58, 0x0160,  56,  59, 0 ),
+  V(  59, 0x0125,  57,  60, 0 ),
+  V(  60, 0x00f6,  58,  61, 0 ),
+  V(  61, 0x00cb,  59,  62, 0 ),
+  V(  62, 0x00ab,  61,  63, 0 ),
+  V(  63, 0x008f,  61,  32, 0 ),
+  V(  64, 0x5b12,  65,  65, 1 ),
+  V(  65, 0x4d04,  80,  66, 0 ),
+  V(  66, 0x412c,  81,  67, 0 ),
+  V(  67, 0x37d8,  82,  68, 0 ),
+  V(  68, 0x2fe8,  83,  69, 0 ),
+  V(  69, 0x293c,  84,  70, 0 ),
+  V(  70, 0x2379,  86,  71, 0 ),
+  V(  71, 0x1edf,  87,  72, 0 ),
+  V(  72, 0x1aa9,  87,  73, 0 ),
+  V(  73, 0x174e,  72,  74, 0 ),
+  V(  74, 0x1424,  72,  75, 0 ),
+  V(  75, 0x119c,  74,  76, 0 ),
+  V(  76, 0x0f6b,  74,  77, 0 ),
+  V(  77, 0x0d51,  75,  78, 0 ),
+  V(  78, 0x0bb6,  77,  79, 0 ),
+  V(  79, 0x0a40,  77,  48, 0 ),
+  V(  80, 0x5832,  80,  81, 1 ),
+  V(  81, 0x4d1c,  88,  82, 0 ),
+  V(  82, 0x438e,  89,  83, 0 ),
+  V(  83, 0x3bdd,  90,  84, 0 ),
+  V(  84, 0x34ee,  91,  85, 0 ),
+  V(  85, 0x2eae,  92,  86, 0 ),
+  V(  86, 0x299a,  93,  87, 0 ),
+  V(  87, 0x2516,  86,  71, 0 ),
+  V(  88, 0x5570,  88,  89, 1 ),
+  V(  89, 0x4ca9,  95,  90, 0 ),
+  V(  90, 0x44d9,  96,  91, 0 ),
+  V(  91, 0x3e22,  97,  92, 0 ),
+  V(  92, 0x3824,  99,  93, 0 ),
+  V(  93, 0x32b4,  99,  94, 0 ),
+  V(  94, 0x2e17,  93,  86, 0 ),
+  V(  95, 0x56a8,  95,  96, 1 ),
+  V(  96, 0x4f46, 101,  97, 0 ),
+  V(  97, 0x47e5, 102,  98, 0 ),
+  V(  98, 0x41cf, 103,  99, 0 ),
+  V(  99, 0x3c3d, 104, 100, 0 ),
+  V( 100, 0x375e,  99,  93, 0 ),
+  V( 101, 0x5231, 105, 102, 0 ),
+  V( 102, 0x4c0f, 106, 103, 0 ),
+  V( 103, 0x4639, 107, 104, 0 ),
+  V( 104, 0x415e, 103,  99, 0 ),
+  V( 105, 0x5627, 105, 106, 1 ),
+  V( 106, 0x50e7, 108, 107, 0 ),
+  V( 107, 0x4b85, 109, 103, 0 ),
+  V( 108, 0x5597, 110, 109, 0 ),
+  V( 109, 0x504f, 111, 107, 0 ),
+  V( 110, 0x5a10, 110, 111, 1 ),
+  V( 111, 0x5522, 112, 109, 0 ),
+  V( 112, 0x59eb, 112, 111, 1 ),
+/*
+ * This last entry is used for fixed probability estimate of 0.5
+ * as recommended in Section 10.3 Table 5 of ITU-T Rec. T.851.
+ */
+  V( 113, 0x5a1d, 113, 113, 0 )
--- a/jpeg/jcapimin.c
+++ b/jpeg/jcapimin.c
@ -2,6 +2,7 @@
 * jcapimin.c
 *
 * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -63,14 +64,25 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)

  cinfo->comp_info = NULL;

-  for (i = 0; i < NUM_QUANT_TBLS; i++)
+  for (i = 0; i < NUM_QUANT_TBLS; i++) {
    cinfo->quant_tbl_ptrs[i] = NULL;
+#if JPEG_LIB_VERSION >= 70
+    cinfo->q_scale_factor[i] = 100;
+#endif
+  }

  for (i = 0; i < NUM_HUFF_TBLS; i++) {
    cinfo->dc_huff_tbl_ptrs[i] = NULL;
    cinfo->ac_huff_tbl_ptrs[i] = NULL;
  }

+#if JPEG_LIB_VERSION >= 80
+  /* Must do it here for emit_dqt in case jpeg_write_tables is used */
+  cinfo->block_size = DCTSIZE;
+  cinfo->natural_order = jpeg_natural_order;
+  cinfo->lim_Se = DCTSIZE2-1;
+#endif
+
  cinfo->script_space = NULL;

  cinfo->input_gamma = 1.0;	/* in case application forgets */
--- a/jpeg/jcarith.c
+++ b/jpeg/jcarith.c
@ -0,0 +1,925 @@
+/*
+ * jcarith.c
+ *
+ * Developed 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains portable arithmetic entropy encoding routines for JPEG
+ * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy encoder object for arithmetic encoding. */
+
+typedef struct {
+  struct jpeg_entropy_encoder pub; /* public fields */
+
+  INT32 c; /* C register, base of coding interval, layout as in sec. D.1.3 */
+  INT32 a;               /* A register, normalized size of coding interval */
+  INT32 sc;        /* counter for stacked 0xFF values which might overflow */
+  INT32 zc;          /* counter for pending 0x00 output values which might *
+                          * be discarded at the end ("Pacman" termination) */
+  int ct;  /* bit shift counter, determines when next byte will be written */
+  int buffer;                /* buffer for most recent output byte != 0xFF */
+
+  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+  int next_restart_num;		/* next restart number to write (0-7) */
+
+  /* Pointers to statistics areas (these workspaces have image lifespan) */
+  unsigned char * dc_stats[NUM_ARITH_TBLS];
+  unsigned char * ac_stats[NUM_ARITH_TBLS];
+
+  /* Statistics bin for coding with fixed probability 0.5 */
+  unsigned char fixed_bin[4];
+} arith_entropy_encoder;
+
+typedef arith_entropy_encoder * arith_entropy_ptr;
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+/* NOTE: Uncomment the following #define if you want to use the
+ * given formula for calculating the AC conditioning parameter Kx
+ * for spectral selection progressive coding in section G.1.3.2
+ * of the spec (Kx = Kmin + SRL (8 + Se - Kmin) 4).
+ * Although the spec and P&M authors claim that this "has proven
+ * to give good results for 8 bit precision samples", I'm not
+ * convinced yet that this is really beneficial.
+ * Early tests gave only very marginal compression enhancements
+ * (a few - around 5 or so - bytes even for very large files),
+ * which would turn out rather negative if we'd suppress the
+ * DAC (Define Arithmetic Conditioning) marker segments for
+ * the default parameters in the future.
+ * Note that currently the marker writing module emits 12-byte
+ * DAC segments for a full-component scan in a color image.
+ * This is not worth worrying about IMHO. However, since the
+ * spec defines the default values to be used if the tables
+ * are omitted (unlike Huffman tables, which are required
+ * anyway), one might optimize this behaviour in the future,
+ * and then it would be disadvantageous to use custom tables if
+ * they don't provide sufficient gain to exceed the DAC size.
+ *
+ * On the other hand, I'd consider it as a reasonable result
+ * that the conditioning has no significant influence on the
+ * compression performance. This means that the basic
+ * statistical model is already rather stable.
+ *
+ * Thus, at the moment, we use the default conditioning values
+ * anyway, and do not use the custom formula.
+ *
+#define CALCULATE_SPECTRAL_CONDITIONING
+ */
+
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
+ * We assume that int right shift is unsigned if INT32 right shift is,
+ * which should be safe.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED
+#define ISHIFT_TEMPS	int ishift_temp;
+#define IRIGHT_SHIFT(x,shft)  \
+	((ishift_temp = (x)) < 0 ? \
+	 (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
+	 (ishift_temp >> (shft)))
+#else
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
+#endif
+
+
+LOCAL(void)
+emit_byte (int val, j_compress_ptr cinfo)
+/* Write next output byte; we do not support suspension in this module. */
+{
+  struct jpeg_destination_mgr * dest = cinfo->dest;
+
+  *dest->next_output_byte++ = (JOCTET) val;
+  if (--dest->free_in_buffer == 0)
+    if (! (*dest->empty_output_buffer) (cinfo))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+}
+
+
+/*
+ * Finish up at the end of an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+finish_pass (j_compress_ptr cinfo)
+{
+  arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  INT32 temp;
+
+  /* Section D.1.8: Termination of encoding */
+
+  /* Find the e->c in the coding interval with the largest
+   * number of trailing zero bits */
+  if ((temp = (e->a - 1 + e->c) & 0xFFFF0000L) < e->c)
+    e->c = temp + 0x8000L;
+  else
+    e->c = temp;
+  /* Send remaining bytes to output */
+  e->c <<= e->ct;
+  if (e->c & 0xF8000000L) {
+    /* One final overflow has to be handled */
+    if (e->buffer >= 0) {
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      emit_byte(e->buffer + 1, cinfo);
+      if (e->buffer + 1 == 0xFF)
+	emit_byte(0x00, cinfo);
+    }
+    e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+    e->sc = 0;
+  } else {
+    if (e->buffer == 0)
+      ++e->zc;
+    else if (e->buffer >= 0) {
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      emit_byte(e->buffer, cinfo);
+    }
+    if (e->sc) {
+      if (e->zc)
+	do emit_byte(0x00, cinfo);
+	while (--e->zc);
+      do {
+	emit_byte(0xFF, cinfo);
+	emit_byte(0x00, cinfo);
+      } while (--e->sc);
+    }
+  }
+  /* Output final bytes only if they are not 0x00 */
+  if (e->c & 0x7FFF800L) {
+    if (e->zc)  /* output final pending zero bytes */
+      do emit_byte(0x00, cinfo);
+      while (--e->zc);
+    emit_byte((e->c >> 19) & 0xFF, cinfo);
+    if (((e->c >> 19) & 0xFF) == 0xFF)
+      emit_byte(0x00, cinfo);
+    if (e->c & 0x7F800L) {
+      emit_byte((e->c >> 11) & 0xFF, cinfo);
+      if (((e->c >> 11) & 0xFF) == 0xFF)
+	emit_byte(0x00, cinfo);
+    }
+  }
+}
+
+
+/*
+ * The core arithmetic encoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Parameter 'val' to be encoded may be 0 or 1 (binary decision).
+ *
+ * Note: I've added full "Pacman" termination support to the
+ * byte output routines, which is equivalent to the optional
+ * Discard_final_zeros procedure (Figure D.15) in the spec.
+ * Thus, we always produce the shortest possible output
+ * stream compliant to the spec (no trailing zero bytes,
+ * except for FF stuffing).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(void)
+arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) 
+{
+  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register unsigned char nl, nm;
+  register INT32 qe, temp;
+  register int sv;
+
+  /* Fetch values from our compact representation of Table D.2:
+   * Qe values and probability estimation state machine
+   */
+  sv = *st;
+  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+
+  /* Encode & estimation procedures per sections D.1.4 & D.1.5 */
+  e->a -= qe;
+  if (val != (sv >> 7)) {
+    /* Encode the less probable symbol */
+    if (e->a >= qe) {
+      /* If the interval size (qe) for the less probable symbol (LPS)
+       * is larger than the interval size for the MPS, then exchange
+       * the two symbols for coding efficiency, otherwise code the LPS
+       * as usual: */
+      e->c += e->a;
+      e->a = qe;
+    }
+    *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+  } else {
+    /* Encode the more probable symbol */
+    if (e->a >= 0x8000L)
+      return;  /* A >= 0x8000 -> ready, no renormalization required */
+    if (e->a < qe) {
+      /* If the interval size (qe) for the less probable symbol (LPS)
+       * is larger than the interval size for the MPS, then exchange
+       * the two symbols for coding efficiency: */
+      e->c += e->a;
+      e->a = qe;
+    }
+    *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+  }
+
+  /* Renormalization & data output per section D.1.6 */
+  do {
+    e->a <<= 1;
+    e->c <<= 1;
+    if (--e->ct == 0) {
+      /* Another byte is ready for output */
+      temp = e->c >> 19;
+      if (temp > 0xFF) {
+	/* Handle overflow over all stacked 0xFF bytes */
+	if (e->buffer >= 0) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  emit_byte(e->buffer + 1, cinfo);
+	  if (e->buffer + 1 == 0xFF)
+	    emit_byte(0x00, cinfo);
+	}
+	e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+	e->sc = 0;
+	/* Note: The 3 spacer bits in the C register guarantee
+	 * that the new buffer byte can't be 0xFF here
+	 * (see page 160 in the P&M JPEG book). */
+	e->buffer = temp & 0xFF;  /* new output byte, might overflow later */
+      } else if (temp == 0xFF) {
+	++e->sc;  /* stack 0xFF byte (which might overflow later) */
+      } else {
+	/* Output all stacked 0xFF bytes, they will not overflow any more */
+	if (e->buffer == 0)
+	  ++e->zc;
+	else if (e->buffer >= 0) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  emit_byte(e->buffer, cinfo);
+	}
+	if (e->sc) {
+	  if (e->zc)
+	    do emit_byte(0x00, cinfo);
+	    while (--e->zc);
+	  do {
+	    emit_byte(0xFF, cinfo);
+	    emit_byte(0x00, cinfo);
+	  } while (--e->sc);
+	}
+	e->buffer = temp & 0xFF;  /* new output byte (can still overflow) */
+      }
+      e->c &= 0x7FFFFL;
+      e->ct += 8;
+    }
+  } while (e->a < 0x8000L);
+}
+
+
+/*
+ * Emit a restart marker & resynchronize predictions.
+ */
+
+LOCAL(void)
+emit_restart (j_compress_ptr cinfo, int restart_num)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci;
+  jpeg_component_info * compptr;
+
+  finish_pass(cinfo);
+
+  emit_byte(0xFF, cinfo);
+  emit_byte(JPEG_RST0 + restart_num, cinfo);
+
+  /* Re-initialize statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    /* DC needs no table for refinement scan */
+    if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      /* Reset DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    /* AC needs no table when not present */
+    if (cinfo->progressive_mode == 0 || cinfo->Se) {
+      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+    }
+  }
+
+  /* Reset arithmetic encoding variables */
+  entropy->c = 0;
+  entropy->a = 0x10000L;
+  entropy->sc = 0;
+  entropy->zc = 0;
+  entropy->ct = 11;
+  entropy->buffer = -1;  /* empty */
+}
+
+
+/*
+ * MCU encoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl;
+  int v, v2, m;
+  ISHIFT_TEMPS
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+    /* Compute the DC value after the required point transform by Al.
+     * This is simply an arithmetic right shift.
+     */
+    m = IRIGHT_SHIFT((int) ((*block)[0]), cinfo->Al);
+
+    /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.4: Encode_DC_DIFF */
+    if ((v = m - entropy->last_dc_val[ci]) == 0) {
+      arith_encode(cinfo, st, 0);
+      entropy->dc_context[ci] = 0;	/* zero diff category */
+    } else {
+      entropy->last_dc_val[ci] = m;
+      arith_encode(cinfo, st, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
+	st += 2;			/* Table F.4: SP = S0 + 2 */
+	entropy->dc_context[ci] = 4;	/* small positive diff category */
+      } else {
+	v = -v;
+	arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
+	st += 3;			/* Table F.4: SN = S0 + 3 */
+	entropy->dc_context[ci] = 8;	/* small negative diff category */
+      }
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+	while (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+      arith_encode(cinfo, st, 0);
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;	/* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] += 8;	/* large diff category */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, k, ke;
+  int v, v2, m;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  /* Encode the MCU data block */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+  /* Establish EOB (end-of-block) index */
+  for (ke = cinfo->Se; ke > 0; ke--)
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if ((v = (*block)[jpeg_natural_order[ke]]) >= 0) {
+      if (v >>= cinfo->Al) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Al) break;
+    }
+
+  /* Figure F.5: Encode_AC_Coefficients */
+  for (k = cinfo->Ss; k <= ke; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    arith_encode(cinfo, st, 0);		/* EOB decision */
+    for (;;) {
+      if ((v = (*block)[jpeg_natural_order[k]]) >= 0) {
+	if (v >>= cinfo->Al) {
+	  arith_encode(cinfo, st + 1, 1);
+	  arith_encode(cinfo, entropy->fixed_bin, 0);
+	  break;
+	}
+      } else {
+	v = -v;
+	if (v >>= cinfo->Al) {
+	  arith_encode(cinfo, st + 1, 1);
+	  arith_encode(cinfo, entropy->fixed_bin, 1);
+	  break;
+	}
+      }
+      arith_encode(cinfo, st + 1, 0); st += 3; k++;
+    }
+    st += 2;
+    /* Figure F.8: Encoding the magnitude category of v */
+    m = 0;
+    if (v -= 1) {
+      arith_encode(cinfo, st, 1);
+      m = 1;
+      v2 = v;
+      if (v2 >>= 1) {
+	arith_encode(cinfo, st, 1);
+	m <<= 1;
+	st = entropy->ac_stats[tbl] +
+	     (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	while (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+    }
+    arith_encode(cinfo, st, 0);
+    /* Figure F.9: Encoding the magnitude bit pattern of v */
+    st += 14;
+    while (m >>= 1)
+      arith_encode(cinfo, st, (m & v) ? 1 : 0);
+  }
+  /* Encode EOB decision only if k <= cinfo->Se */
+  if (k <= cinfo->Se) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    arith_encode(cinfo, st, 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  unsigned char *st;
+  int Al, blkn;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  st = entropy->fixed_bin;	/* use fixed probability estimation */
+  Al = cinfo->Al;
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    /* We simply emit the Al'th bit of the DC coefficient value. */
+    arith_encode(cinfo, st, (MCU_data[blkn][0][0] >> Al) & 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, k, ke, kex;
+  int v;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  /* Encode the MCU data block */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Section G.1.3.3: Encoding of AC coefficients */
+
+  /* Establish EOB (end-of-block) index */
+  for (ke = cinfo->Se; ke > 0; ke--)
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if ((v = (*block)[jpeg_natural_order[ke]]) >= 0) {
+      if (v >>= cinfo->Al) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Al) break;
+    }
+
+  /* Establish EOBx (previous stage end-of-block) index */
+  for (kex = ke; kex > 0; kex--)
+    if ((v = (*block)[jpeg_natural_order[kex]]) >= 0) {
+      if (v >>= cinfo->Ah) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Ah) break;
+    }
+
+  /* Figure G.10: Encode_AC_Coefficients_SA */
+  for (k = cinfo->Ss; k <= ke; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    if (k > kex)
+      arith_encode(cinfo, st, 0);	/* EOB decision */
+    for (;;) {
+      if ((v = (*block)[jpeg_natural_order[k]]) >= 0) {
+	if (v >>= cinfo->Al) {
+	  if (v >> 1)			/* previously nonzero coef */
+	    arith_encode(cinfo, st + 2, (v & 1));
+	  else {			/* newly nonzero coef */
+	    arith_encode(cinfo, st + 1, 1);
+	    arith_encode(cinfo, entropy->fixed_bin, 0);
+	  }
+	  break;
+	}
+      } else {
+	v = -v;
+	if (v >>= cinfo->Al) {
+	  if (v >> 1)			/* previously nonzero coef */
+	    arith_encode(cinfo, st + 2, (v & 1));
+	  else {			/* newly nonzero coef */
+	    arith_encode(cinfo, st + 1, 1);
+	    arith_encode(cinfo, entropy->fixed_bin, 1);
+	  }
+	  break;
+	}
+      }
+      arith_encode(cinfo, st + 1, 0); st += 3; k++;
+    }
+  }
+  /* Encode EOB decision only if k <= cinfo->Se */
+  if (k <= cinfo->Se) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    arith_encode(cinfo, st, 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Encode and output one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * compptr;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, k, ke;
+  int v, v2, m;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+    tbl = compptr->dc_tbl_no;
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.4: Encode_DC_DIFF */
+    if ((v = (*block)[0] - entropy->last_dc_val[ci]) == 0) {
+      arith_encode(cinfo, st, 0);
+      entropy->dc_context[ci] = 0;	/* zero diff category */
+    } else {
+      entropy->last_dc_val[ci] = (*block)[0];
+      arith_encode(cinfo, st, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
+	st += 2;			/* Table F.4: SP = S0 + 2 */
+	entropy->dc_context[ci] = 4;	/* small positive diff category */
+      } else {
+	v = -v;
+	arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
+	st += 3;			/* Table F.4: SN = S0 + 3 */
+	entropy->dc_context[ci] = 8;	/* small negative diff category */
+      }
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+	while (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st += 1;
+	}
+      }
+      arith_encode(cinfo, st, 0);
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;	/* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] += 8;	/* large diff category */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+
+    /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+    tbl = compptr->ac_tbl_no;
+
+    /* Establish EOB (end-of-block) index */
+    for (ke = DCTSIZE2 - 1; ke > 0; ke--)
+      if ((*block)[jpeg_natural_order[ke]]) break;
+
+    /* Figure F.5: Encode_AC_Coefficients */
+    for (k = 1; k <= ke; k++) {
+      st = entropy->ac_stats[tbl] + 3 * (k - 1);
+      arith_encode(cinfo, st, 0);	/* EOB decision */
+      while ((v = (*block)[jpeg_natural_order[k]]) == 0) {
+	arith_encode(cinfo, st + 1, 0); st += 3; k++;
+      }
+      arith_encode(cinfo, st + 1, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+	arith_encode(cinfo, entropy->fixed_bin, 0);
+      } else {
+	v = -v;
+	arith_encode(cinfo, entropy->fixed_bin, 1);
+      }
+      st += 2;
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {
+	arith_encode(cinfo, st, 1);
+	m = 1;
+	v2 = v;
+	if (v2 >>= 1) {
+	  arith_encode(cinfo, st, 1);
+	  m <<= 1;
+	  st = entropy->ac_stats[tbl] +
+	       (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	  while (v2 >>= 1) {
+	    arith_encode(cinfo, st, 1);
+	    m <<= 1;
+	    st += 1;
+	  }
+	}
+      }
+      arith_encode(cinfo, st, 0);
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+    /* Encode EOB decision only if k <= DCTSIZE2 - 1 */
+    if (k <= DCTSIZE2 - 1) {
+      st = entropy->ac_stats[tbl] + 3 * (k - 1);
+      arith_encode(cinfo, st, 1);
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass (j_compress_ptr cinfo, boolean gather_statistics)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (gather_statistics)
+    /* Make sure to avoid that in the master control logic!
+     * We are fully adaptive here and need no extra
+     * statistics gathering pass!
+     */
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+
+  /* We assume jcmaster.c already validated the progressive scan parameters. */
+
+  /* Select execution routines */
+  if (cinfo->progressive_mode) {
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+	entropy->pub.encode_mcu = encode_mcu_DC_first;
+      else
+	entropy->pub.encode_mcu = encode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+	entropy->pub.encode_mcu = encode_mcu_DC_refine;
+      else
+	entropy->pub.encode_mcu = encode_mcu_AC_refine;
+    }
+  } else
+    entropy->pub.encode_mcu = encode_mcu;
+
+  /* Allocate & initialize requested statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    /* DC needs no table for refinement scan */
+    if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      tbl = compptr->dc_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->dc_stats[tbl] == NULL)
+	entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      /* Initialize DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    /* AC needs no table when not present */
+    if (cinfo->progressive_mode == 0 || cinfo->Se) {
+      tbl = compptr->ac_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->ac_stats[tbl] == NULL)
+	entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+#ifdef CALCULATE_SPECTRAL_CONDITIONING
+      if (cinfo->progressive_mode)
+	/* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
+	cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
+#endif
+    }
+  }
+
+  /* Initialize arithmetic encoding variables */
+  entropy->c = 0;
+  entropy->a = 0x10000L;
+  entropy->sc = 0;
+  entropy->zc = 0;
+  entropy->ct = 11;
+  entropy->buffer = -1;  /* empty */
+
+  /* Initialize restart stuff */
+  entropy->restarts_to_go = cinfo->restart_interval;
+  entropy->next_restart_num = 0;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy encoding.
+ */
+
+GLOBAL(void)
+jinit_arith_encoder (j_compress_ptr cinfo)
+{
+  arith_entropy_ptr entropy;
+  int i;
+
+  entropy = (arith_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				SIZEOF(arith_entropy_encoder));
+  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+  entropy->pub.start_pass = start_pass;
+  entropy->pub.finish_pass = finish_pass;
+
+  /* Mark tables unallocated */
+  for (i = 0; i < NUM_ARITH_TBLS; i++) {
+    entropy->dc_stats[i] = NULL;
+    entropy->ac_stats[i] = NULL;
+  }
+
+  /* Initialize index for fixed probability estimation */
+  entropy->fixed_bin[0] = 113;
+}
--- a/jpeg/jccolor.c
+++ b/jpeg/jccolor.c
@ -2,6 +2,8 @@
 * jccolor.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -11,6 +13,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"


 /* Private subobject */
@ -78,6 +81,74 @@ typedef my_color_converter * my_cconvert_ptr;
 #define TABLE_SIZE	(8*(MAXJSAMPLE+1))


+#if BITS_IN_JSAMPLE == 8
+
+static const unsigned char red_lut[256] = {
+  0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 3 , 3 , 4 , 4 , 4 , 4 ,
+  5 , 5 , 5 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 8 , 8 , 8 , 9 , 9 , 9 ,
+  10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14,
+  14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19,
+  19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 24,
+  24, 24, 25, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 28,
+  29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33,
+  33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 38, 38,
+  38, 39, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 42, 43,
+  43, 43, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48,
+  48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52,
+  53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 56, 56, 56, 57, 57, 57,
+  57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 62, 62,
+  62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 67,
+  67, 67, 68, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 71,
+  72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 76, 76, 76
+};
+
+static const unsigned char green_lut[256] = {
+  0  , 1  , 1  , 2  , 2  , 3  , 4  , 4  , 5  , 5  , 6  , 6  ,
+  7  , 8  , 8  , 9  , 9  , 10 , 11 , 11 , 12 , 12 , 13 , 14 ,
+  14 , 15 , 15 , 16 , 16 , 17 , 18 , 18 , 19 , 19 , 20 , 21 ,
+  21 , 22 , 22 , 23 , 23 , 24 , 25 , 25 , 26 , 26 , 27 , 28 ,
+  28 , 29 , 29 , 30 , 31 , 31 , 32 , 32 , 33 , 33 , 34 , 35 ,
+  35 , 36 , 36 , 37 , 38 , 38 , 39 , 39 , 40 , 41 , 41 , 42 ,
+  42 , 43 , 43 , 44 , 45 , 45 , 46 , 46 , 47 , 48 , 48 , 49 ,
+  49 , 50 , 50 , 51 , 52 , 52 , 53 , 53 , 54 , 55 , 55 , 56 ,
+  56 , 57 , 58 , 58 , 59 , 59 , 60 , 60 , 61 , 62 , 62 , 63 ,
+  63 , 64 , 65 , 65 , 66 , 66 , 67 , 68 , 68 , 69 , 69 , 70 ,
+  70 , 71 , 72 , 72 , 73 , 73 , 74 , 75 , 75 , 76 , 76 , 77 ,
+  77 , 78 , 79 , 79 , 80 , 80 , 81 , 82 , 82 , 83 , 83 , 84 ,
+  85 , 85 , 86 , 86 , 87 , 87 , 88 , 89 , 89 , 90 , 90 , 91 ,
+  92 , 92 , 93 , 93 , 94 , 95 , 95 , 96 , 96 , 97 , 97 , 98 ,
+  99 , 99 , 100, 100, 101, 102, 102, 103, 103, 104, 104, 105,
+  106, 106, 107, 107, 108, 109, 109, 110, 110, 111, 112, 112,
+  113, 113, 114, 114, 115, 116, 116, 117, 117, 118, 119, 119,
+  120, 120, 121, 122, 122, 123, 123, 124, 124, 125, 126, 126,
+  127, 127, 128, 129, 129, 130, 130, 131, 131, 132, 133, 133,
+  134, 134, 135, 136, 136, 137, 137, 138, 139, 139, 140, 140,
+  141, 141, 142, 143, 143, 144, 144, 145, 146, 146, 147, 147,
+  148, 149, 149, 150
+};
+
+static const unsigned char blue_lut[256] = {
+  0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 ,
+  2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 ,
+  4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 ,
+  5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 ,
+  7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 ,
+  9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+  11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13,
+  13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
+  18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
+  20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+  22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24,
+  24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+  26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27,
+  27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29
+};
+
+#endif
+
+
 /*
 * Initialize for RGB->YCC colorspace conversion.
 */
@ -146,10 +217,10 @@ rgb_ycc_convert (j_compress_ptr cinfo,
    outptr2 = output_buf[2][output_row];
    output_row++;
    for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
-      inptr += RGB_PIXELSIZE;
+      r = GETJSAMPLE(inptr[rgb_red[cinfo->in_color_space]]);
+      g = GETJSAMPLE(inptr[rgb_green[cinfo->in_color_space]]);
+      b = GETJSAMPLE(inptr[rgb_blue[cinfo->in_color_space]]);
+      inptr += rgb_pixelsize[cinfo->in_color_space];
      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
       * must be too; we do not need an explicit range-limiting operation.
       * Hence the value being shifted is never negative, and we don't
@ -187,27 +258,35 @@ rgb_gray_convert (j_compress_ptr cinfo,
 		  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
 		  JDIMENSION output_row, int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  register int r, g, b;
+  #if BITS_IN_JSAMPLE != 8
  register INT32 * ctab = cconvert->rgb_ycc_tab;
+  #endif
  register JSAMPROW inptr;
  register JSAMPROW outptr;
-  register JDIMENSION col;
+  JSAMPLE *maxoutptr;
  JDIMENSION num_cols = cinfo->image_width;
+  int rindex = rgb_red[cinfo->in_color_space];
+  int gindex = rgb_green[cinfo->in_color_space];
+  int bindex = rgb_blue[cinfo->in_color_space];
+  int rgbstride = rgb_pixelsize[cinfo->in_color_space];

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr = output_buf[0][output_row];
+    maxoutptr = &outptr[num_cols];
    output_row++;
-    for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
-      inptr += RGB_PIXELSIZE;
+    for (; outptr < maxoutptr; outptr++, inptr += rgbstride) {
      /* Y */
-      outptr[col] = (JSAMPLE)
-		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-		 >> SCALEBITS);
+      #if BITS_IN_JSAMPLE == 8
+      *outptr = red_lut[inptr[rindex]] + green_lut[inptr[gindex]]
+	    + blue_lut[inptr[bindex]];
+      #else
+      *outptr = (JSAMPLE)
+	    ((ctab[GETJSAMPLE(inptr[rindex])+R_Y_OFF]
+	     + ctab[GETJSAMPLE(inptr[gindex])+G_Y_OFF]
+	     + ctab[GETJSAMPLE(inptr[bindex])+B_Y_OFF])
+	     >> SCALEBITS);
+      #endif
    }
  }
 }
@ -368,11 +447,15 @@ jinit_color_converter (j_compress_ptr cinfo)
    break;

  case JCS_RGB:
-#if RGB_PIXELSIZE != 3
-    if (cinfo->input_components != RGB_PIXELSIZE)
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+    if (cinfo->input_components != rgb_pixelsize[cinfo->in_color_space])
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;
-#endif /* else share code with YCbCr */

  case JCS_YCbCr:
    if (cinfo->input_components != 3)
@ -398,7 +481,13 @@ jinit_color_converter (j_compress_ptr cinfo)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    if (cinfo->in_color_space == JCS_GRAYSCALE)
      cconvert->pub.color_convert = grayscale_convert;
-    else if (cinfo->in_color_space == JCS_RGB) {
+    else if (cinfo->in_color_space == JCS_RGB ||
+             cinfo->in_color_space == JCS_EXT_RGB ||
+             cinfo->in_color_space == JCS_EXT_RGBX ||
+             cinfo->in_color_space == JCS_EXT_BGR ||
+             cinfo->in_color_space == JCS_EXT_BGRX ||
+             cinfo->in_color_space == JCS_EXT_XBGR ||
+             cinfo->in_color_space == JCS_EXT_XRGB) {
      cconvert->pub.start_pass = rgb_ycc_start;
      cconvert->pub.color_convert = rgb_gray_convert;
    } else if (cinfo->in_color_space == JCS_YCbCr)
@ -408,9 +497,16 @@ jinit_color_converter (j_compress_ptr cinfo)
    break;

  case JCS_RGB:
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
    if (cinfo->num_components != 3)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_RGB && RGB_PIXELSIZE == 3)
+    if (cinfo->in_color_space == cinfo->jpeg_color_space &&
+      rgb_pixelsize[cinfo->in_color_space] == 3)
      cconvert->pub.color_convert = null_convert;
    else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
@ -419,9 +515,19 @@ jinit_color_converter (j_compress_ptr cinfo)
  case JCS_YCbCr:
    if (cinfo->num_components != 3)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_RGB) {
-      cconvert->pub.start_pass = rgb_ycc_start;
-      cconvert->pub.color_convert = rgb_ycc_convert;
+    if (cinfo->in_color_space == JCS_RGB ||
+        cinfo->in_color_space == JCS_EXT_RGB ||
+        cinfo->in_color_space == JCS_EXT_RGBX ||
+        cinfo->in_color_space == JCS_EXT_BGR ||
+        cinfo->in_color_space == JCS_EXT_BGRX ||
+        cinfo->in_color_space == JCS_EXT_XBGR ||
+        cinfo->in_color_space == JCS_EXT_XRGB) {
+      if (jsimd_can_rgb_ycc())
+        cconvert->pub.color_convert = jsimd_rgb_ycc_convert;
+      else {
+        cconvert->pub.start_pass = rgb_ycc_start;
+        cconvert->pub.color_convert = rgb_ycc_convert;
+      }
    } else if (cinfo->in_color_space == JCS_YCbCr)
      cconvert->pub.color_convert = null_convert;
    else
--- a/jpeg/jcdctmgr.c
+++ b/jpeg/jcdctmgr.c
@ -2,6 +2,9 @@
 * jcdctmgr.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011 D. R. Commander
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -15,15 +18,37 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
+#include "jsimddct.h"


 /* Private subobject for this module */

+typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
+typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
+
+typedef JMETHOD(void, convsamp_method_ptr,
+                (JSAMPARRAY sample_data, JDIMENSION start_col,
+                 DCTELEM * workspace));
+typedef JMETHOD(void, float_convsamp_method_ptr,
+                (JSAMPARRAY sample_data, JDIMENSION start_col,
+                 FAST_FLOAT *workspace));
+
+typedef JMETHOD(void, quantize_method_ptr,
+                (JCOEFPTR coef_block, DCTELEM * divisors,
+                 DCTELEM * workspace));
+typedef JMETHOD(void, float_quantize_method_ptr,
+                (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                 FAST_FLOAT * workspace));
+
+METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
+
 typedef struct {
  struct jpeg_forward_dct pub;	/* public fields */

  /* Pointer to the DCT routine actually in use */
-  forward_DCT_method_ptr do_dct;
+  forward_DCT_method_ptr dct;
+  convsamp_method_ptr convsamp;
+  quantize_method_ptr quantize;

  /* The actual post-DCT divisors --- not identical to the quant table
   * entries, because of scaling (especially for an unnormalized DCT).
@ -31,16 +56,147 @@ typedef struct {
   */
  DCTELEM * divisors[NUM_QUANT_TBLS];

+  /* work area for FDCT subroutine */
+  DCTELEM * workspace;
+
 #ifdef DCT_FLOAT_SUPPORTED
  /* Same as above for the floating-point case. */
-  float_DCT_method_ptr do_float_dct;
+  float_DCT_method_ptr float_dct;
+  float_convsamp_method_ptr float_convsamp;
+  float_quantize_method_ptr float_quantize;
  FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
+  FAST_FLOAT * float_workspace;
 #endif
 } my_fdct_controller;

 typedef my_fdct_controller * my_fdct_ptr;


+/*
+ * Find the highest bit in an integer through binary search.
+ */
+LOCAL(int)
+flss (UINT16 val)
+{
+  int bit;
+
+  bit = 16;
+
+  if (!val)
+    return 0;
+
+  if (!(val & 0xff00)) {
+    bit -= 8;
+    val <<= 8;
+  }
+  if (!(val & 0xf000)) {
+    bit -= 4;
+    val <<= 4;
+  }
+  if (!(val & 0xc000)) {
+    bit -= 2;
+    val <<= 2;
+  }
+  if (!(val & 0x8000)) {
+    bit -= 1;
+    val <<= 1;
+  }
+
+  return bit;
+}
+
+/*
+ * Compute values to do a division using reciprocal.
+ *
+ * This implementation is based on an algorithm described in
+ *   "How to optimize for the Pentium family of microprocessors"
+ *   (http://www.agner.org/assem/).
+ * More information about the basic algorithm can be found in
+ * the paper "Integer Division Using Reciprocals" by Robert Alverson.
+ *
+ * The basic idea is to replace x/d by x * d^-1. In order to store
+ * d^-1 with enough precision we shift it left a few places. It turns
+ * out that this algoright gives just enough precision, and also fits
+ * into DCTELEM:
+ *
+ *   b = (the number of significant bits in divisor) - 1
+ *   r = (word size) + b
+ *   f = 2^r / divisor
+ *
+ * f will not be an integer for most cases, so we need to compensate
+ * for the rounding error introduced:
+ *
+ *   no fractional part:
+ *
+ *       result = input >> r
+ *
+ *   fractional part of f < 0.5:
+ *
+ *       round f down to nearest integer
+ *       result = ((input + 1) * f) >> r
+ *
+ *   fractional part of f > 0.5:
+ *
+ *       round f up to nearest integer
+ *       result = (input * f) >> r
+ *
+ * This is the original algorithm that gives truncated results. But we
+ * want properly rounded results, so we replace "input" with
+ * "input + divisor/2".
+ *
+ * In order to allow SIMD implementations we also tweak the values to
+ * allow the same calculation to be made at all times:
+ * 
+ *   dctbl[0] = f rounded to nearest integer
+ *   dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
+ *   dctbl[2] = 1 << ((word size) * 2 - r)
+ *   dctbl[3] = r - (word size)
+ *
+ * dctbl[2] is for stupid instruction sets where the shift operation
+ * isn't member wise (e.g. MMX).
+ *
+ * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
+ * is that most SIMD implementations have a "multiply and store top
+ * half" operation.
+ *
+ * Lastly, we store each of the values in their own table instead
+ * of in a consecutive manner, yet again in order to allow SIMD
+ * routines.
+ */
+LOCAL(int)
+compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
+{
+  UDCTELEM2 fq, fr;
+  UDCTELEM c;
+  int b, r;
+
+  b = flss(divisor) - 1;
+  r  = sizeof(DCTELEM) * 8 + b;
+
+  fq = ((UDCTELEM2)1 << r) / divisor;
+  fr = ((UDCTELEM2)1 << r) % divisor;
+
+  c = divisor / 2; /* for rounding */
+
+  if (fr == 0) { /* divisor is power of two */
+    /* fq will be one bit too large to fit in DCTELEM, so adjust */
+    fq >>= 1;
+    r--;
+  } else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */
+    c++;
+  } else { /* fractional part is > 0.5 */
+    fq++;
+  }
+
+  dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
+  dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
+  dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
+  dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+
+  if(r <= 16) return 0;
+  else return 1;
+}
+
 /*
 * Initialize for a processing pass.
 * Verify that all referenced Q-tables are present, and set up
@ -78,11 +234,13 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
      if (fdct->divisors[qtblno] == NULL) {
 	fdct->divisors[qtblno] = (DCTELEM *)
 	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				      DCTSIZE2 * SIZEOF(DCTELEM));
+				      (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
      }
      dtbl = fdct->divisors[qtblno];
      for (i = 0; i < DCTSIZE2; i++) {
-	dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+	if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i])
+	  && fdct->quantize == jsimd_quantize)
+	  fdct->quantize = quantize;
      }
      break;
 #endif
@ -112,14 +270,16 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 	if (fdct->divisors[qtblno] == NULL) {
 	  fdct->divisors[qtblno] = (DCTELEM *)
 	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-					DCTSIZE2 * SIZEOF(DCTELEM));
+					(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
 	}
 	dtbl = fdct->divisors[qtblno];
 	for (i = 0; i < DCTSIZE2; i++) {
-	  dtbl[i] = (DCTELEM)
+	  if(!compute_reciprocal(
 	    DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
 				  (INT32) aanscales[i]),
-		    CONST_BITS-3);
+		    CONST_BITS-3), &dtbl[i])
+	    && fdct->quantize == jsimd_quantize)
+	    fdct->quantize = quantize;
 	}
      }
      break;
@ -168,6 +328,77 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 }


+/*
+ * Load data into workspace, applying unsigned->signed conversion.
+ */
+
+METHODDEF(void)
+convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
+{
+  register DCTELEM *workspaceptr;
+  register JSAMPROW elemptr;
+  register int elemr;
+
+  workspaceptr = workspace;
+  for (elemr = 0; elemr < DCTSIZE; elemr++) {
+    elemptr = sample_data[elemr] + start_col;
+
+#if DCTSIZE == 8		/* unroll the inner loop */
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+#else
+    {
+      register int elemc;
+      for (elemc = DCTSIZE; elemc > 0; elemc--)
+        *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    }
+#endif
+  }
+}
+
+
+/*
+ * Quantize/descale the coefficients, and store into coef_blocks[].
+ */
+
+METHODDEF(void)
+quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
+{
+  int i;
+  DCTELEM temp;
+  UDCTELEM recip, corr, shift;
+  UDCTELEM2 product;
+  JCOEFPTR output_ptr = coef_block;
+
+  for (i = 0; i < DCTSIZE2; i++) {
+    temp = workspace[i];
+    recip = divisors[i + DCTSIZE2 * 0];
+    corr =  divisors[i + DCTSIZE2 * 1];
+    shift = divisors[i + DCTSIZE2 * 3];
+
+    if (temp < 0) {
+      temp = -temp;
+      product = (UDCTELEM2)(temp + corr) * recip;
+      product >>= shift + sizeof(DCTELEM)*8;
+      temp = product;
+      temp = -temp;
+    } else {
+      product = (UDCTELEM2)(temp + corr) * recip;
+      product >>= shift + sizeof(DCTELEM)*8;
+      temp = product;
+    }
+
+    output_ptr[i] = (JCOEF) temp;
+  }
+}
+
+
 /*
 * Perform forward DCT on one or more blocks of a component.
 *
@ -185,87 +416,87 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
  /* This routine is heavily used, so it's worth coding it tightly. */
  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  forward_DCT_method_ptr do_dct = fdct->do_dct;
  DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
-  DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
+  DCTELEM * workspace;
  JDIMENSION bi;

+  /* Make sure the compiler doesn't look up these every pass */
+  forward_DCT_method_ptr do_dct = fdct->dct;
+  convsamp_method_ptr do_convsamp = fdct->convsamp;
+  quantize_method_ptr do_quantize = fdct->quantize;
+  workspace = fdct->workspace;
+
  sample_data += start_row;	/* fold in the vertical offset once */

  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
    /* Load data into workspace, applying unsigned->signed conversion */
-    { register DCTELEM *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	  }
-	}
-#endif
-      }
-    }
+    (*do_convsamp) (sample_data, start_col, workspace);

    /* Perform the DCT */
    (*do_dct) (workspace);

    /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register DCTELEM temp, qval;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	qval = divisors[i];
-	temp = workspace[i];
-	/* Divide the coefficient value by qval, ensuring proper rounding.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 *
-	 * In most files, at least half of the output values will be zero
-	 * (at default quantization settings, more like three-quarters...)
-	 * so we should ensure that this case is fast.  On many machines,
-	 * a comparison is enough cheaper than a divide to make a special test
-	 * a win.  Since both inputs will be nonnegative, we need only test
-	 * for a < b to discover whether a/b is 0.
-	 * If your machine's division is fast enough, define FAST_DIVIDE.
-	 */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b)	a /= b
-#else
-#define DIVIDE_BY(a,b)	if (a >= b) a /= b; else a = 0
-#endif
-	if (temp < 0) {
-	  temp = -temp;
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	  temp = -temp;
-	} else {
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	}
-	output_ptr[i] = (JCOEF) temp;
-      }
-    }
+    (*do_quantize) (coef_blocks[bi], divisors, workspace);
  }
 }


 #ifdef DCT_FLOAT_SUPPORTED

+
+METHODDEF(void)
+convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
+{
+  register FAST_FLOAT *workspaceptr;
+  register JSAMPROW elemptr;
+  register int elemr;
+
+  workspaceptr = workspace;
+  for (elemr = 0; elemr < DCTSIZE; elemr++) {
+    elemptr = sample_data[elemr] + start_col;
+#if DCTSIZE == 8		/* unroll the inner loop */
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+#else
+    {
+      register int elemc;
+      for (elemc = DCTSIZE; elemc > 0; elemc--)
+        *workspaceptr++ = (FAST_FLOAT)
+                          (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    }
+#endif
+  }
+}
+
+
+METHODDEF(void)
+quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
+{
+  register FAST_FLOAT temp;
+  register int i;
+  register JCOEFPTR output_ptr = coef_block;
+
+  for (i = 0; i < DCTSIZE2; i++) {
+    /* Apply the quantization and scaling factor */
+    temp = workspace[i] * divisors[i];
+
+    /* Round to nearest integer.
+     * Since C does not specify the direction of rounding for negative
+     * quotients, we have to force the dividend positive for portability.
+     * The maximum coefficient size is +-16K (for 12-bit data), so this
+     * code should work for either 16-bit or 32-bit ints.
+     */
+    output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
+  }
+}
+
+
 METHODDEF(void)
 forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
@ -275,62 +506,28 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
  /* This routine is heavily used, so it's worth coding it tightly. */
  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  float_DCT_method_ptr do_dct = fdct->do_float_dct;
  FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
-  FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+  FAST_FLOAT * workspace;
  JDIMENSION bi;

+
+  /* Make sure the compiler doesn't look up these every pass */
+  float_DCT_method_ptr do_dct = fdct->float_dct;
+  float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
+  float_quantize_method_ptr do_quantize = fdct->float_quantize;
+  workspace = fdct->float_workspace;
+
  sample_data += start_row;	/* fold in the vertical offset once */

  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
    /* Load data into workspace, applying unsigned->signed conversion */
-    { register FAST_FLOAT *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = (FAST_FLOAT)
-	      (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	  }
-	}
-#endif
-      }
-    }
+    (*do_convsamp) (sample_data, start_col, workspace);

    /* Perform the DCT */
    (*do_dct) (workspace);

    /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register FAST_FLOAT temp;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	/* Apply the quantization and scaling factor */
-	temp = workspace[i] * divisors[i];
-	/* Round to nearest integer.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 * The maximum coefficient size is +-16K (for 12-bit data), so this
-	 * code should work for either 16-bit or 32-bit ints.
-	 */
-	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
-      }
-    }
+    (*do_quantize) (coef_blocks[bi], divisors, workspace);
  }
 }

@ -353,23 +550,33 @@ jinit_forward_dct (j_compress_ptr cinfo)
  cinfo->fdct = (struct jpeg_forward_dct *) fdct;
  fdct->pub.start_pass = start_pass_fdctmgr;

+  /* First determine the DCT... */
  switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
  case JDCT_ISLOW:
    fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_islow;
+    if (jsimd_can_fdct_islow())
+      fdct->dct = jsimd_fdct_islow;
+    else
+      fdct->dct = jpeg_fdct_islow;
    break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
  case JDCT_IFAST:
    fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_ifast;
+    if (jsimd_can_fdct_ifast())
+      fdct->dct = jsimd_fdct_ifast;
+    else
+      fdct->dct = jpeg_fdct_ifast;
    break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
  case JDCT_FLOAT:
    fdct->pub.forward_DCT = forward_DCT_float;
-    fdct->do_float_dct = jpeg_fdct_float;
+    if (jsimd_can_fdct_float())
+      fdct->float_dct = jsimd_fdct_float;
+    else
+      fdct->float_dct = jpeg_fdct_float;
    break;
 #endif
  default:
@ -377,6 +584,54 @@ jinit_forward_dct (j_compress_ptr cinfo)
    break;
  }

+  /* ...then the supporting stages. */
+  switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+  case JDCT_ISLOW:
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  case JDCT_IFAST:
+#endif
+#if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
+    if (jsimd_can_convsamp())
+      fdct->convsamp = jsimd_convsamp;
+    else
+      fdct->convsamp = convsamp;
+    if (jsimd_can_quantize())
+      fdct->quantize = jsimd_quantize;
+    else
+      fdct->quantize = quantize;
+    break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  case JDCT_FLOAT:
+    if (jsimd_can_convsamp_float())
+      fdct->float_convsamp = jsimd_convsamp_float;
+    else
+      fdct->float_convsamp = convsamp_float;
+    if (jsimd_can_quantize_float())
+      fdct->float_quantize = jsimd_quantize_float;
+    else
+      fdct->float_quantize = quantize_float;
+    break;
+#endif
+  default:
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+    break;
+  }
+
+  /* Allocate workspace memory */
+#ifdef DCT_FLOAT_SUPPORTED
+  if (cinfo->dct_method == JDCT_FLOAT)
+    fdct->float_workspace = (FAST_FLOAT *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  SIZEOF(FAST_FLOAT) * DCTSIZE2);
+  else
+#endif
+    fdct->workspace = (DCTELEM *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  SIZEOF(DCTELEM) * DCTSIZE2);
+
  /* Mark divisor tables unallocated */
  for (i = 0; i < NUM_QUANT_TBLS; i++) {
    fdct->divisors[i] = NULL;
--- a/jpeg/jchuff.c
+++ b/jpeg/jchuff.c
@ -19,7 +19,6 @@
 #include "jpeglib.h"
 #include "jchuff.h"		/* Declarations shared with jcphuff.c */

-
 /* Expanded entropy encoder object for Huffman encoding.
 *
 * The savable_state subrecord contains fields that change within an MCU,
--- a/jpeg/jcinit.c
+++ b/jpeg/jcinit.c
@ -42,7 +42,11 @@ jinit_compress_master (j_compress_ptr cinfo)
  jinit_forward_dct(cinfo);
  /* Entropy encoding: either Huffman or arithmetic coding. */
  if (cinfo->arith_code) {
+#ifdef C_ARITH_CODING_SUPPORTED
+    jinit_arith_encoder(cinfo);
+#else
    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
  } else {
    if (cinfo->progressive_mode) {
 #ifdef C_PROGRESSIVE_SUPPORTED
--- a/jpeg/jcmarker.c
+++ b/jpeg/jcmarker.c
@ -2,6 +2,7 @@
 * jcmarker.c
 *
 * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -11,6 +12,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"


 typedef enum {			/* JPEG marker codes */
@ -75,7 +77,9 @@ typedef enum {			/* JPEG marker codes */
  M_JPG13 = 0xfd,
  M_COM   = 0xfe,
  
-  M_TEM   = 0x01
+  M_TEM   = 0x01,
+  
+  M_ERROR = 0x100
 } JPEG_MARKER;


@ -103,7 +107,7 @@ typedef my_marker_writer * my_marker_ptr;
 */

 LOCAL(void)
-emit_byte (j_compress_ptr cinfo, int16 val)
+emit_byte (j_compress_ptr cinfo, int val)
 /* Emit a byte */
 {
  struct jpeg_destination_mgr * dest = cinfo->dest;
@ -121,12 +125,12 @@ emit_marker (j_compress_ptr cinfo, JPEG_MARKER mark)
 /* Emit a marker code */
 {
  emit_byte(cinfo, 0xFF);
-  emit_byte(cinfo, (int16) mark);
+  emit_byte(cinfo, (int) mark);
 }


 LOCAL(void)
-emit_2bytes (j_compress_ptr cinfo, int16 value)
+emit_2bytes (j_compress_ptr cinfo, int value)
 /* Emit a 2-byte integer; these are always MSB first in JPEG files */
 {
  emit_byte(cinfo, (value >> 8) & 0xFF);
@ -138,14 +142,14 @@ emit_2bytes (j_compress_ptr cinfo, int16 value)
 * Routines to write specific marker types.
 */

-LOCAL(int16)
-emit_dqt (j_compress_ptr cinfo, int16 index)
+LOCAL(int)
+emit_dqt (j_compress_ptr cinfo, int index)
 /* Emit a DQT marker */
 /* Returns the precision used (0 = 8bits, 1 = 16bits) for baseline checking */
 {
  JQUANT_TBL * qtbl = cinfo->quant_tbl_ptrs[index];
-  int16 prec;
-  int16 i;
+  int prec;
+  int i;

  if (qtbl == NULL)
    ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, index);
@ -167,8 +171,8 @@ emit_dqt (j_compress_ptr cinfo, int16 index)
      /* The table entries must be emitted in zigzag order. */
      unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
      if (prec)
-	emit_byte(cinfo, (int16) (qval >> 8));
-      emit_byte(cinfo, (int16) (qval & 0xFF));
+	emit_byte(cinfo, (int) (qval >> 8));
+      emit_byte(cinfo, (int) (qval & 0xFF));
    }

    qtbl->sent_table = TRUE;
@ -179,11 +183,11 @@ emit_dqt (j_compress_ptr cinfo, int16 index)


 LOCAL(void)
-emit_dht (j_compress_ptr cinfo, int16 index, boolean is_ac)
+emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
 /* Emit a DHT marker */
 {
  JHUFF_TBL * htbl;
-  int16 length, i;
+  int length, i;
  
  if (is_ac) {
    htbl = cinfo->ac_huff_tbl_ptrs[index];
@ -225,7 +229,7 @@ emit_dac (j_compress_ptr cinfo)
 #ifdef C_ARITH_CODING_SUPPORTED
  char dc_in_use[NUM_ARITH_TBLS];
  char ac_in_use[NUM_ARITH_TBLS];
-  int16 length, i;
+  int length, i;
  jpeg_component_info *compptr;
  
  for (i = 0; i < NUM_ARITH_TBLS; i++)
@ -267,7 +271,7 @@ emit_dri (j_compress_ptr cinfo)
  
  emit_2bytes(cinfo, 4);	/* fixed length */

-  emit_2bytes(cinfo, (int16) cinfo->restart_interval);
+  emit_2bytes(cinfo, (int) cinfo->restart_interval);
 }


@ -275,7 +279,7 @@ LOCAL(void)
 emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
 /* Emit a SOF marker */
 {
-  int16 ci;
+  int ci;
  jpeg_component_info *compptr;
  
  emit_marker(cinfo, code);
@ -283,13 +287,13 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
  emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */

  /* Make sure image isn't bigger than SOF field can handle */
-  if ((long) cinfo->image_height > 65535L ||
-      (long) cinfo->image_width > 65535L)
+  if ((long) cinfo->_jpeg_height > 65535L ||
+      (long) cinfo->_jpeg_width > 65535L)
    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535);

  emit_byte(cinfo, cinfo->data_precision);
-  emit_2bytes(cinfo, (int16) cinfo->image_height);
-  emit_2bytes(cinfo, (int16) cinfo->image_width);
+  emit_2bytes(cinfo, (int) cinfo->_jpeg_height);
+  emit_2bytes(cinfo, (int) cinfo->_jpeg_width);

  emit_byte(cinfo, cinfo->num_components);

@ -306,7 +310,7 @@ LOCAL(void)
 emit_sos (j_compress_ptr cinfo)
 /* Emit a SOS marker */
 {
-  int16 i, td, ta;
+  int i, td, ta;
  jpeg_component_info *compptr;
  
  emit_marker(cinfo, M_SOS);
@ -371,8 +375,8 @@ emit_jfif_app0 (j_compress_ptr cinfo)
  emit_byte(cinfo, cinfo->JFIF_major_version); /* Version fields */
  emit_byte(cinfo, cinfo->JFIF_minor_version);
  emit_byte(cinfo, cinfo->density_unit); /* Pixel size information */
-  emit_2bytes(cinfo, (int16) cinfo->X_density);
-  emit_2bytes(cinfo, (int16) cinfo->Y_density);
+  emit_2bytes(cinfo, (int) cinfo->X_density);
+  emit_2bytes(cinfo, (int) cinfo->Y_density);
  emit_byte(cinfo, 0);		/* No thumbnail image */
  emit_byte(cinfo, 0);
 }
@ -441,14 +445,14 @@ write_marker_header (j_compress_ptr cinfo, int marker, unsigned int datalen)

  emit_marker(cinfo, (JPEG_MARKER) marker);

-  emit_2bytes(cinfo, (int16) (datalen + 2));	/* total length */
+  emit_2bytes(cinfo, (int) (datalen + 2));	/* total length */
 }

 METHODDEF(void)
 write_marker_byte (j_compress_ptr cinfo, int val)
 /* Emit one byte of marker parameters following write_marker_header */
 {
-  emit_byte(cinfo, (int16) val);
+  emit_byte(cinfo, val);
 }


@ -491,7 +495,7 @@ write_file_header (j_compress_ptr cinfo)
 METHODDEF(void)
 write_frame_header (j_compress_ptr cinfo)
 {
-  int16 ci, prec;
+  int ci, prec;
  boolean is_baseline;
  jpeg_component_info *compptr;
  
@ -549,7 +553,7 @@ METHODDEF(void)
 write_scan_header (j_compress_ptr cinfo)
 {
  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
-  int16 i;
+  int i;
  jpeg_component_info *compptr;

  if (cinfo->arith_code) {
@ -613,7 +617,7 @@ write_file_trailer (j_compress_ptr cinfo)
 METHODDEF(void)
 write_tables_only (j_compress_ptr cinfo)
 {
-  int16 i;
+  int i;

  emit_marker(cinfo, M_SOI);

--- a/jpeg/jcmaster.c
+++ b/jpeg/jcmaster.c
@ -2,6 +2,8 @@
 * jcmaster.c
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
+ * Copyright (C) 2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -14,6 +16,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"


 /* Private state */
@ -42,8 +45,28 @@ typedef my_comp_master * my_master_ptr;
 * Support routines that do various essential calculations.
 */

+#if JPEG_LIB_VERSION >= 70
+/*
+ * Compute JPEG image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
+/* Do computations that are needed before master selection phase */
+{
+  /* Hardwire it to "no scaling" */
+  cinfo->jpeg_width = cinfo->image_width;
+  cinfo->jpeg_height = cinfo->image_height;
+  cinfo->min_DCT_h_scaled_size = DCTSIZE;
+  cinfo->min_DCT_v_scaled_size = DCTSIZE;
+}
+#endif
+
+
 LOCAL(void)
-initial_setup (j_compress_ptr cinfo)
+initial_setup (j_compress_ptr cinfo, boolean transcode_only)
 /* Do computations that are needed before master selection phase */
 {
  int ci;
@ -51,14 +74,19 @@ initial_setup (j_compress_ptr cinfo)
  long samplesperrow;
  JDIMENSION jd_samplesperrow;

+#if JPEG_LIB_VERSION >= 70
+  if (!transcode_only)
+    jpeg_calc_jpeg_dimensions(cinfo);
+#endif
+
  /* Sanity check on image dimensions */
-  if (cinfo->image_height <= 0 || cinfo->image_width <= 0
+  if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0
      || cinfo->num_components <= 0 || cinfo->input_components <= 0)
    ERREXIT(cinfo, JERR_EMPTY_IMAGE);

  /* Make sure image isn't bigger than I can handle */
-  if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION ||
-      (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION)
+  if ((long) cinfo->_jpeg_height > (long) JPEG_MAX_DIMENSION ||
+      (long) cinfo->_jpeg_width > (long) JPEG_MAX_DIMENSION)
    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);

  /* Width of an input scanline must be representable as JDIMENSION. */
@ -96,20 +124,24 @@ initial_setup (j_compress_ptr cinfo)
    /* Fill in the correct component_index value; don't rely on application */
    compptr->component_index = ci;
    /* For compression, we never do DCT scaling. */
+#if JPEG_LIB_VERSION >= 70
+    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = DCTSIZE;
+#else
    compptr->DCT_scaled_size = DCTSIZE;
+#endif
    /* Size in DCT blocks */
    compptr->width_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
+      jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
 		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
    compptr->height_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
+      jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
 		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
    /* Size in samples */
    compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
+      jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
 		    (long) cinfo->max_h_samp_factor);
    compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
+      jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
 		    (long) cinfo->max_v_samp_factor);
    /* Mark component needed (this flag isn't actually used for compression) */
    compptr->component_needed = TRUE;
@ -119,7 +151,7 @@ initial_setup (j_compress_ptr cinfo)
   * main controller will call coefficient controller).
   */
  cinfo->total_iMCU_rows = (JDIMENSION)
-    jdiv_round_up((long) cinfo->image_height,
+    jdiv_round_up((long) cinfo->_jpeg_height,
 		  (long) (cinfo->max_v_samp_factor*DCTSIZE));
 }

@ -347,10 +379,10 @@ per_scan_setup (j_compress_ptr cinfo)
    
    /* Overall image size in MCUs */
    cinfo->MCUs_per_row = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width,
+      jdiv_round_up((long) cinfo->_jpeg_width,
 		    (long) (cinfo->max_h_samp_factor*DCTSIZE));
    cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height,
+      jdiv_round_up((long) cinfo->_jpeg_height,
 		    (long) (cinfo->max_v_samp_factor*DCTSIZE));
    
    cinfo->blocks_in_MCU = 0;
@ -554,7 +586,7 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
  master->pub.is_last_pass = FALSE;

  /* Validate parameters, determine derived values */
-  initial_setup(cinfo);
+  initial_setup(cinfo, transcode_only);

  if (cinfo->scan_info != NULL) {
 #ifdef C_MULTISCAN_FILES_SUPPORTED
--- a/jpeg/jconfig-mac-cw.h
+++ b/jpeg/jconfig-mac-cw.h
@ -1,43 +0,0 @@
-/* jconfig.h --- generated by ckconfig.c */
-/* see jconfig.doc for explanations */
-
-#define ALIGN_TYPE long /* memory alignment */
-#define NO_GETENV /* we do have the function, but it's dead */
-#ifdef __cplusplus
-#define INLINE inline /* we have them in C++ */
-#endif
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE	/* You may need this on non-Unix systems */
-#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
-#undef DONT_USE_B_MODE
-/* #define PROGRESS_REPORT */	/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
--- a/jpeg/jconfig.doc
+++ b/jpeg/jconfig.doc
@ -1,155 +0,0 @@
-/*
- * jconfig.doc
- *
- * Copyright (C) 1991-1994, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file documents the configuration options that are required to
- * customize the JPEG software for a particular system.
- *
- * The actual configuration options for a particular installation are stored
- * in jconfig.h.  On many machines, jconfig.h can be generated automatically
- * or copied from one of the "canned" jconfig files that we supply.  But if
- * you need to generate a jconfig.h file by hand, this file tells you how.
- *
- * DO NOT EDIT THIS FILE --- IT WON'T ACCOMPLISH ANYTHING.
- * EDIT A COPY NAMED JCONFIG.H.
- */
-
-
-/*
- * These symbols indicate the properties of your machine or compiler.
- * #define the symbol if yes, #undef it if no.
- */
-
-/* Does your compiler support function prototypes?
- * (If not, you also need to use ansi2knr, see install.doc)
- */
-#define HAVE_PROTOTYPES
-
-/* Does your compiler support the declaration "unsigned char" ?
- * How about "unsigned short" ?
- */
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-
-/* Define "void" as "char" if your compiler doesn't know about type void.
- * NOTE: be sure to define void such that "void *" represents the most general
- * pointer type, e.g., that returned by malloc().
- */
-/* #define void char */
-
-/* Define "const" as empty if your compiler doesn't know the "const" keyword.
- */
-/* #define const */
-
-/* Define this if an ordinary "char" type is unsigned.
- * If you're not sure, leaving it undefined will work at some cost in speed.
- * If you defined HAVE_UNSIGNED_CHAR then the speed difference is minimal.
- */
-#undef CHAR_IS_UNSIGNED
-
-/* Define this if your system has an ANSI-conforming <stddef.h> file.
- */
-#define HAVE_STDDEF_H
-
-/* Define this if your system has an ANSI-conforming <stdlib.h> file.
- */
-#define HAVE_STDLIB_H
-
-/* Define this if your system does not have an ANSI/SysV <string.h>,
- * but does have a BSD-style <strings.h>.
- */
-#undef NEED_BSD_STRINGS
-
-/* Define this if your system does not provide typedef size_t in any of the
- * ANSI-standard places (stddef.h, stdlib.h, or stdio.h), but places it in
- * <sys/types.h> instead.
- */
-#undef NEED_SYS_TYPES_H
-
-/* For 80x86 machines, you need to define NEED_FAR_POINTERS,
- * unless you are using a large-data memory model or 80386 flat-memory mode.
- * On less brain-damaged CPUs this symbol must not be defined.
- * (Defining this symbol causes large data structures to be referenced through
- * "far" pointers and to be allocated with a special version of malloc.)
- */
-#undef NEED_FAR_POINTERS
-
-/* Define this if your linker needs global names to be unique in less
- * than the first 15 characters.
- */
-#undef NEED_SHORT_EXTERNAL_NAMES
-
-/* Although a real ANSI C compiler can deal perfectly well with pointers to
- * unspecified structures (see "incomplete types" in the spec), a few pre-ANSI
- * and pseudo-ANSI compilers get confused.  To keep one of these bozos happy,
- * define INCOMPLETE_TYPES_BROKEN.  This is not recommended unless you
- * actually get "missing structure definition" warnings or errors while
- * compiling the JPEG code.
- */
-#undef INCOMPLETE_TYPES_BROKEN
-
-
-/*
- * The following options affect code selection within the JPEG library,
- * but they don't need to be visible to applications using the library.
- * To minimize application namespace pollution, the symbols won't be
- * defined unless JPEG_INTERNALS has been defined.
- */
-
-#ifdef JPEG_INTERNALS
-
-/* Define this if your compiler implements ">>" on signed values as a logical
- * (unsigned) shift; leave it undefined if ">>" is a signed (arithmetic) shift,
- * which is the normal and rational definition.
- */
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-
-#endif /* JPEG_INTERNALS */
-
-
-/*
- * The remaining options do not affect the JPEG library proper,
- * but only the sample applications cjpeg/djpeg (see cjpeg.c, djpeg.c).
- * Other applications can ignore these.
- */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-/* These defines indicate which image (non-JPEG) file formats are allowed. */
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-/* Define this if you want to name both input and output files on the command
- * line, rather than using stdout and optionally stdin.  You MUST do this if
- * your system can't cope with binary I/O to stdin/stdout.  See comments at
- * head of cjpeg.c or djpeg.c.
- */
-#undef TWO_FILE_COMMANDLINE
-
-/* Define this if your system needs explicit cleanup of temporary files.
- * This is crucial under MS-DOS, where the temporary "files" may be areas
- * of extended memory; on most other systems it's not as important.
- */
-#undef NEED_SIGNAL_CATCHER
-
-/* By default, we open image files with fopen(...,"rb") or fopen(...,"wb").
- * This is necessary on systems that distinguish text files from binary files,
- * and is harmless on most systems that don't.  If you have one of the rare
- * systems that complains about the "b" spec, define this symbol.
- */
-#undef DONT_USE_B_MODE
-
-/* Define this if you want percent-done progress reports from cjpeg/djpeg.
- */
-#undef PROGRESS_REPORT
-
-
-#endif /* JPEG_CJPEG_DJPEG */
--- a/jpeg/jconfig.h
+++ b/jpeg/jconfig.h
@ -1,107 +1,59 @@
-/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* ***** BEGIN LICENSE BLOCK *****
- * Version: MPL 1.1/GPL 2.0/LGPL 2.1
- *
- * The contents of this file are subject to the Mozilla Public License Version
- * 1.1 (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- * for the specific language governing rights and limitations under the
- * License.
- *
- * The Original Code is mozilla.org code.
- *
- * The Initial Developer of the Original Code is
- * Netscape Communications Corporation.
- * Portions created by the Initial Developer are Copyright (C) 1998
- * the Initial Developer. All Rights Reserved.
- *
- * Contributor(s):
- *
- * Alternatively, the contents of this file may be used under the terms of
- * either the GNU General Public License Version 2 or later (the "GPL"), or
- * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
- * in which case the provisions of the GPL or the LGPL are applicable instead
- * of those above. If you wish to allow use of your version of this file only
- * under the terms of either the GPL or the LGPL, and not to allow others to
- * use your version of this file under the terms of the MPL, indicate your
- * decision by deleting the provisions above and replace them with the notice
- * and other provisions required by the GPL or the LGPL. If you do not delete
- * the provisions above, a recipient may use your version of this file under
- * the terms of any one of the MPL, the GPL or the LGPL.
- *
- * ***** END LICENSE BLOCK ***** */
+/* jconfig.h.  Generated from jconfig.h.in by configure, then manually edited
+   for Mozilla. */

-/*
- * jconfig.h to configure the IJG JPEG library for the Mozilla/Netscape
- * environment.  Note that there are also Mozilla mods in jmorecfg.h.
- */
+/* Export libjpeg v6.2's ABI. */
+#define JPEG_LIB_VERSION 62

-/* We assume an ANSI C or C++ compilation environment */
-#define HAVE_PROTOTYPES 
-#define HAVE_UNSIGNED_CHAR 
-#define HAVE_UNSIGNED_SHORT 
-/* #define void char */
-/* #define const */
-#ifndef HAVE_STDDEF_H 
-#define HAVE_STDDEF_H 
-#endif /* HAVE_STDDEF_H */
-#ifndef HAVE_STDLIB_H
-#define HAVE_STDLIB_H 
-#endif /* HAVE_STDLIB_H */
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS
-#undef NEED_SHORT_EXTERNAL_NAMES
-/* Define this if you get warnings about undefined structures. */
-#undef INCOMPLETE_TYPES_BROKEN
+/* Define if your compiler supports prototypes */
+#define HAVE_PROTOTYPES 1

-/* With this setting, the IJG code will work regardless of whether
- * type "char" is signed or unsigned.
- */
-#undef CHAR_IS_UNSIGNED
+/* Define to 1 if you have the <stddef.h> header file. */
+#define HAVE_STDDEF_H 1

+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1

-/* defines that need not be visible to callers of the IJG library */
+/* Define to 1 if the system has the type `unsigned char'. */
+#define HAVE_UNSIGNED_CHAR 1

-#ifdef JPEG_INTERNALS
+/* Define to 1 if the system has the type `unsigned short'. */
+#define HAVE_UNSIGNED_SHORT 1

-/* If right shift of "long" quantities is unsigned on your machine,
- * you'll have to define this.  Fortunately few people should need it.
- */
-#undef RIGHT_SHIFT_IS_UNSIGNED
+/* Define if you want use complete types */
+/* #define INCOMPLETE_TYPES_BROKEN 1 */

-#endif /* JPEG_INTERNALS */
+/* Define if you have BSD-like bzero and bcopy */
+/* #undef NEED_BSD_STRINGS */

+/* Define if you need short function names */
+/* #undef NEED_SHORT_EXTERNAL_NAMES */

-/* these defines are not interesting for building just the IJG library,
- * but we leave 'em here anyway.
- */
-#ifdef JPEG_CJPEG_DJPEG
+/* Define if you have sys/types.h */
+#define NEED_SYS_TYPES_H 1

-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
+/* Define if shift is unsigned */
+/* #undef RIGHT_SHIFT_IS_UNSIGNED */

-#undef TWO_FILE_COMMANDLINE
-#undef NEED_SIGNAL_CATCHER
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT
+/* Use accelerated SIMD routines. */
+#define WITH_SIMD 1

-#endif /* JPEG_CJPEG_DJPEG */
+/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
+#ifndef __CHAR_UNSIGNED__
+/* # undef __CHAR_UNSIGNED__ */
+#endif

-/* SSE* alignment support - only use on platforms that support declspec and __attribute__ */
+/* Define to empty if `const' does not conform to ANSI C. */
+/* #undef const */

-#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
-#define ALIGN16_const_vector_short(name) __declspec(align(16)) const short name[8]
-#define ALIGN16_const_vector_uchar(name) __declspec(align(16)) const unsigned char name[16]
-#else
-#define ALIGN16_const_vector_short(name) const short name[8] __attribute__ ((aligned (16)))
-#define ALIGN16_const_vector_uchar(name) const unsigned char name[16] __attribute__ ((aligned (16)))
-#endif /* ! XP_WIN32 && _M_IX86 && !__GNUC */
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+/* #undef inline */
+#endif

+/* Define to `unsigned int' if <sys/types.h> does not define. */
+/* #undef size_t */
+
+/* MOZILLA CHANGE: libjpeg-turbo doesn't define INLINE in its config file, so
+ * we define it here. */
+#define INLINE NS_ALWAYS_INLINE
--- a/jpeg/jconfig.h.in
+++ b/jpeg/jconfig.h.in
@ -0,0 +1,60 @@
+/* Version ID for the JPEG library.
+ * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
+ */
+#define JPEG_LIB_VERSION  62	/* Version 6b */
+
+/* Support arithmetic encoding */
+#undef C_ARITH_CODING_SUPPORTED
+
+/* Support arithmetic decoding */
+#undef D_ARITH_CODING_SUPPORTED
+
+/* Define if your compiler supports prototypes */
+#undef HAVE_PROTOTYPES
+
+/* Define to 1 if you have the <stddef.h> header file. */
+#undef HAVE_STDDEF_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if the system has the type `unsigned char'. */
+#undef HAVE_UNSIGNED_CHAR
+
+/* Define to 1 if the system has the type `unsigned short'. */
+#undef HAVE_UNSIGNED_SHORT
+
+/* Define if you want use complete types */
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Define if you have BSD-like bzero and bcopy */
+#undef NEED_BSD_STRINGS
+
+/* Define if you need short function names */
+#undef NEED_SHORT_EXTERNAL_NAMES
+
+/* Define if you have sys/types.h */
+#undef NEED_SYS_TYPES_H
+
+/* Define if shift is unsigned */
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+/* Use accelerated SIMD routines. */
+#undef WITH_SIMD
+
+/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
+#ifndef __CHAR_UNSIGNED__
+# undef __CHAR_UNSIGNED__
+#endif
+
+/* Define to empty if `const' does not conform to ANSI C. */
+#undef const
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+#undef inline
+#endif
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+#undef size_t
--- a/jpeg/jconfig.wat
+++ b/jpeg/jconfig.wat
@ -1,38 +0,0 @@
-/* jconfig.wat --- jconfig.h for Watcom C/C++ on MS-DOS or OS/2. */
-/* see jconfig.doc for explanations */
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#define CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS	/* Watcom uses flat 32-bit addressing */
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#undef TWO_FILE_COMMANDLINE	/* optional */
-#define USE_SETMODE		/* Needed to make one-file style work in Watcom */
-#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
--- a/jpeg/jcparam.c
+++ b/jpeg/jcparam.c
@ -2,6 +2,8 @@
 * jcparam.c
 *
 * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2008 by Guido Vollbeding.
+ * Copyright (C) 2009-2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -60,6 +62,49 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
 }


+/* These are the sample quantization tables given in JPEG spec section K.1.
+ * The spec says that the values given produce "good" quality, and
+ * when divided by 2, "very good" quality.
+ */
+static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
+  16,  11,  10,  16,  24,  40,  51,  61,
+  12,  12,  14,  19,  26,  58,  60,  55,
+  14,  13,  16,  24,  40,  57,  69,  56,
+  14,  17,  22,  29,  51,  87,  80,  62,
+  18,  22,  37,  56,  68, 109, 103,  77,
+  24,  35,  55,  64,  81, 104, 113,  92,
+  49,  64,  78,  87, 103, 121, 120, 101,
+  72,  92,  95,  98, 112, 100, 103,  99
+};
+static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
+  17,  18,  24,  47,  99,  99,  99,  99,
+  18,  21,  26,  66,  99,  99,  99,  99,
+  24,  26,  56,  99,  99,  99,  99,  99,
+  47,  66,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99
+};
+
+
+#if JPEG_LIB_VERSION >= 70
+GLOBAL(void)
+jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+/* Set or change the 'quality' (quantization) setting, using default tables
+ * and straight percentage-scaling quality scales.
+ * This entry point allows different scalings for luminance and chrominance.
+ */
+{
+  /* Set up two quantization tables using the specified scaling */
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+		       cinfo->q_scale_factor[0], force_baseline);
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+		       cinfo->q_scale_factor[1], force_baseline);
+}
+#endif
+
+
 GLOBAL(void)
 jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
 			 boolean force_baseline)
@ -69,35 +114,10 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
 * applications that insist on a linear percentage scaling.
 */
 {
-  /* These are the sample quantization tables given in JPEG spec section K.1.
-   * The spec says that the values given produce "good" quality, and
-   * when divided by 2, "very good" quality.
-   */
-  static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
-    16,  11,  10,  16,  24,  40,  51,  61,
-    12,  12,  14,  19,  26,  58,  60,  55,
-    14,  13,  16,  24,  40,  57,  69,  56,
-    14,  17,  22,  29,  51,  87,  80,  62,
-    18,  22,  37,  56,  68, 109, 103,  77,
-    24,  35,  55,  64,  81, 104, 113,  92,
-    49,  64,  78,  87, 103, 121, 120, 101,
-    72,  92,  95,  98, 112, 100, 103,  99
-  };
-  static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
-    17,  18,  24,  47,  99,  99,  99,  99,
-    18,  21,  26,  66,  99,  99,  99,  99,
-    24,  26,  56,  99,  99,  99,  99,  99,
-    47,  66,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99,
-    99,  99,  99,  99,  99,  99,  99,  99
-  };
-
  /* Set up two quantization tables using the specified scaling */
-  jpeg_add_quant_table(cinfo, 0, (const unsigned int *)std_luminance_quant_tbl,
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
 		       scale_factor, force_baseline);
-  jpeg_add_quant_table(cinfo, 1, (const unsigned int *)std_chrominance_quant_tbl,
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
 		       scale_factor, force_baseline);
 }

@ -284,6 +304,10 @@ jpeg_set_defaults (j_compress_ptr cinfo)

  /* Initialize everything not dependent on the color space */

+#if JPEG_LIB_VERSION >= 70
+  cinfo->scale_num = 1;		/* 1:1 scaling */
+  cinfo->scale_denom = 1;
+#endif
  cinfo->data_precision = BITS_IN_JSAMPLE;
  /* Set up two quantization tables using default quality of 75 */
  jpeg_set_quality(cinfo, 75, TRUE);
@ -320,6 +344,11 @@ jpeg_set_defaults (j_compress_ptr cinfo)
  /* By default, use the simpler non-cosited sampling alignment */
  cinfo->CCIR601_sampling = FALSE;

+#if JPEG_LIB_VERSION >= 70
+  /* By default, apply fancy downsampling */
+  cinfo->do_fancy_downsampling = TRUE;
+#endif
+
  /* No input smoothing */
  cinfo->smoothing_factor = 0;

@ -363,6 +392,12 @@ jpeg_default_colorspace (j_compress_ptr cinfo)
    jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
    break;
  case JCS_RGB:
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
    jpeg_set_colorspace(cinfo, JCS_YCbCr);
    break;
  case JCS_YCbCr:
--- a/jpeg/jcphuff.c
+++ b/jpeg/jcphuff.c
@ -223,7 +223,6 @@ dump_buffer (phuff_entropy_ptr entropy)
 * between calls, so 24 bits are sufficient.
 */

-INLINE
 LOCAL(void)
 emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
 /* Emit some bits, unless we are in gather mode */
@ -276,7 +275,6 @@ flush_bits (phuff_entropy_ptr entropy)
 * Emit (or just count) a Huffman symbol.
 */

-INLINE
 LOCAL(void)
 emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
 {
--- a/jpeg/jcsample.c
+++ b/jpeg/jcsample.c
@ -2,6 +2,7 @@
 * jcsample.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -48,6 +49,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"


 /* Pointer to routine to downsample a single component */
@ -494,7 +496,10 @@ jinit_downsampler (j_compress_ptr cinfo)
    } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
 	       compptr->v_samp_factor == cinfo->max_v_samp_factor) {
      smoothok = FALSE;
-      downsample->methods[ci] = h2v1_downsample;
+      if (jsimd_can_h2v1_downsample())
+        downsample->methods[ci] = jsimd_h2v1_downsample;
+      else
+        downsample->methods[ci] = h2v1_downsample;
    } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
 	       compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
@ -503,7 +508,10 @@ jinit_downsampler (j_compress_ptr cinfo)
 	downsample->pub.need_context_rows = TRUE;
      } else
 #endif
-	downsample->methods[ci] = h2v2_downsample;
+	if (jsimd_can_h2v2_downsample())
+	  downsample->methods[ci] = jsimd_h2v2_downsample;
+	else
+	  downsample->methods[ci] = h2v2_downsample;
    } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
 	       (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
      smoothok = FALSE;
--- a/jpeg/jdapimin.c
+++ b/jpeg/jdapimin.c
@ -20,62 +20,6 @@
 #include "jinclude.h"
 #include "jpeglib.h"

-#ifdef HAVE_MMX_INTEL_MNEMONICS
-#if _MSC_VER >= 1400
-#include "intrin.h"
-#else
-/* no __cpuid intrinsic, use a manually rewritten replacement */
-void __stdcall __cpuid( int CPUInfo[4], int InfoType )
-{
-  int my_eax = 0, my_ebx = 0, my_ecx = 0, my_edx = 0;
-  __asm {
-    /* check eflags bit 21 to see if cpuid is supported */
-    pushfd             /* save eflags to stack */
-    pop eax            /* and put it in eax */
-    mov ecx, eax       /* save a copy in ecx to compare against */
-    xor eax, 0x200000  /* toggle ID bit (bit 21) in eflags */
-    push eax           /* save modified eflags to stack */
-    popfd              /* set eflags register with modified value */
-    pushfd             /* read eflags back out */
-    pop eax
-    xor eax, ecx       /* check for modified eflags */
-    jz NOT_SUPPORTED   /* cpuid not supported */
-
-    /* check to see if the requested cpuid type is supported */
-    xor eax, eax       /* set eax to zero */
-    cpuid
-    cmp eax, InfoType
-    jl NOT_SUPPORTED   /* the requested cpuid type is not supported */
-
-    /* actually make the cpuid call */
-    mov eax, InfoType
-    cpuid
-    mov my_eax, eax
-    mov my_ebx, ebx
-    mov my_ecx, ecx
-    mov my_edx, edx
-NOT_SUPPORTED:
-  }
-  CPUInfo[0] = my_eax;
-  CPUInfo[1] = my_ebx;
-  CPUInfo[2] = my_ecx;
-  CPUInfo[3] = my_edx;
-}
-#endif /* _MSC_VER >= 1400 */
-
-int MMXAvailable;
-static int mmxsupport();
-#endif
-
-#ifdef HAVE_SSE2_INTRINSICS
-int SSE2Available = 0;
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-static int sse2support();
-#else
-static int sse2supportGCC();
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
-#endif /* HAVE_SSE2_INTRINSICS */
-

 /*
 * Initialization of a JPEG decompression object.
@ -87,38 +31,6 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
 {
  int i;

-#ifdef HAVE_MMX_INTEL_MNEMONICS
-  static int cpuidDetected = 0;
-
-  if(!cpuidDetected)
-  {
-	MMXAvailable = mmxsupport();
-
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-	/* only do the sse2 support check if mmx is supported (so
-	   we know the processor supports cpuid) */
-	if (MMXAvailable)
-	    SSE2Available = sse2support();
-#endif
-
-	cpuidDetected = 1;
-  }
-#else
-#ifdef HAVE_SSE2_INTRINSICS
-  static int cpuidDetected = 0;
-
-  if(!cpuidDetected) {
-    SSE2Available = sse2supportGCC();
-    cpuidDetected = 1;
-  }
-
-#endif /* HAVE_SSE2_INTRINSICS */
-#endif /* HAVE_MMX_INTEL_MNEMONICS */
-
-  /* For debugging purposes, zero the whole master structure.
-   * But error manager pointer is already there, so save and restore it.
-   */
-
  /* Guard against version mismatches between library and caller. */
  cinfo->mem = NULL;		/* so jpeg_destroy knows mem mgr not called */
  if (version != JPEG_LIB_VERSION)
@ -193,6 +105,7 @@ jpeg_abort_decompress (j_decompress_ptr cinfo)
  jpeg_abort((j_common_ptr) cinfo); /* use common routine */
 }

+
 /*
 * Set default decompression parameters.
 */
@ -480,51 +393,3 @@ jpeg_finish_decompress (j_decompress_ptr cinfo)
  jpeg_abort((j_common_ptr) cinfo);
  return TRUE;
 }
-
-
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-static int mmxsupport()
-{
-  int CPUInfo[4];
-
-  __cpuid(CPUInfo, 1);
-  if (CPUInfo[3] & (0x1 << 23))
-    return 1;
-  else
-    return 0;
-}
-#endif
-
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-static int sse2support()
-{
-  int CPUInfo[4];
-
-  __cpuid(CPUInfo, 1);
-  if (CPUInfo[3] & (0x1 << 26))
-    return 1;
-  else
-    return 2;
-}
-#else
-#ifdef HAVE_SSE2_INTRINSICS
-static int sse2supportGCC()
-{
-
-  /* Mac Intel started with Core Duo chips which have SSE2 Support */
-
-#if defined(__GNUC__) && defined(__i386__)
-#if defined(XP_MACOSX)
-  return 1;
-#endif /* XP_MACOSX */
-#endif /* GNUC && i386 */
-
-  /* Add checking for SSE2 support for other platforms here */
-
-  /* We don't have SSE2 intrinsics support */
-
-  return 2;
-}
-#endif /* HAVE_SSE2_INTRINSICS */
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
-
--- a/jpeg/jdapistd.c
+++ b/jpeg/jdapistd.c
@ -2,6 +2,7 @@
 * jdapistd.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -17,6 +18,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"


 /* Forward declarations */
@ -202,7 +204,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
  }

  /* Verify that at least one iMCU row can be returned. */
-  lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size;
+  lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size;
  if (max_lines < lines_per_iMCU_row)
    ERREXIT(cinfo, JERR_BUFFER_SIZE);

--- a/jpeg/jdarith.c
+++ b/jpeg/jdarith.c
@ -0,0 +1,761 @@
+/*
+ * jdarith.c
+ *
+ * Developed 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains portable arithmetic entropy decoding routines for JPEG
+ * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy decoder object for arithmetic decoding. */
+
+typedef struct {
+  struct jpeg_entropy_decoder pub; /* public fields */
+
+  INT32 c;       /* C register, base of coding interval + input bit buffer */
+  INT32 a;               /* A register, normalized size of coding interval */
+  int ct;     /* bit shift counter, # of bits left in bit buffer part of C */
+                                                         /* init: ct = -16 */
+                                                         /* run: ct = 0..7 */
+                                                         /* error: ct = -1 */
+  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+
+  /* Pointers to statistics areas (these workspaces have image lifespan) */
+  unsigned char * dc_stats[NUM_ARITH_TBLS];
+  unsigned char * ac_stats[NUM_ARITH_TBLS];
+
+  /* Statistics bin for coding with fixed probability 0.5 */
+  unsigned char fixed_bin[4];
+} arith_entropy_decoder;
+
+typedef arith_entropy_decoder * arith_entropy_ptr;
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+
+LOCAL(int)
+get_byte (j_decompress_ptr cinfo)
+/* Read next input byte; we do not support suspension in this module. */
+{
+  struct jpeg_source_mgr * src = cinfo->src;
+
+  if (src->bytes_in_buffer == 0)
+    if (! (*src->fill_input_buffer) (cinfo))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+  src->bytes_in_buffer--;
+  return GETJOCTET(*src->next_input_byte++);
+}
+
+
+/*
+ * The core arithmetic decoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Return value is 0 or 1 (binary decision).
+ *
+ * Note: I've changed the handling of the code base & bit
+ * buffer register C compared to other implementations
+ * based on the standards layout & procedures.
+ * While it also contains both the actual base of the
+ * coding interval (16 bits) and the next-bits buffer,
+ * the cut-point between these two parts is floating
+ * (instead of fixed) with the bit shift counter CT.
+ * Thus, we also need only one (variable instead of
+ * fixed size) shift for the LPS/MPS decision, and
+ * we can get away with any renormalization update
+ * of C (except for new data insertion, of course).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(int)
+arith_decode (j_decompress_ptr cinfo, unsigned char *st)
+{
+  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register unsigned char nl, nm;
+  register INT32 qe, temp;
+  register int sv, data;
+
+  /* Renormalization & data input per section D.2.6 */
+  while (e->a < 0x8000L) {
+    if (--e->ct < 0) {
+      /* Need to fetch next data byte */
+      if (cinfo->unread_marker)
+	data = 0;		/* stuff zero data */
+      else {
+	data = get_byte(cinfo);	/* read next input byte */
+	if (data == 0xFF) {	/* zero stuff or marker code */
+	  do data = get_byte(cinfo);
+	  while (data == 0xFF);	/* swallow extra 0xFF bytes */
+	  if (data == 0)
+	    data = 0xFF;	/* discard stuffed zero byte */
+	  else {
+	    /* Note: Different from the Huffman decoder, hitting
+	     * a marker while processing the compressed data
+	     * segment is legal in arithmetic coding.
+	     * The convention is to supply zero data
+	     * then until decoding is complete.
+	     */
+	    cinfo->unread_marker = data;
+	    data = 0;
+	  }
+	}
+      }
+      e->c = (e->c << 8) | data; /* insert data into C register */
+      if ((e->ct += 8) < 0)	 /* update bit shift counter */
+	/* Need more initial bytes */
+	if (++e->ct == 0)
+	  /* Got 2 initial bytes -> re-init A and exit loop */
+	  e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */
+    }
+    e->a <<= 1;
+  }
+
+  /* Fetch values from our compact representation of Table D.2:
+   * Qe values and probability estimation state machine
+   */
+  sv = *st;
+  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+
+  /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
+  temp = e->a - qe;
+  e->a = temp;
+  temp <<= e->ct;
+  if (e->c >= temp) {
+    e->c -= temp;
+    /* Conditional LPS (less probable symbol) exchange */
+    if (e->a < qe) {
+      e->a = qe;
+      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    } else {
+      e->a = qe;
+      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+      sv ^= 0x80;		/* Exchange LPS/MPS */
+    }
+  } else if (e->a < 0x8000L) {
+    /* Conditional MPS (more probable symbol) exchange */
+    if (e->a < qe) {
+      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+      sv ^= 0x80;		/* Exchange LPS/MPS */
+    } else {
+      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    }
+  }
+
+  return sv >> 7;
+}
+
+
+/*
+ * Check for a restart marker & resynchronize decoder.
+ */
+
+LOCAL(void)
+process_restart (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci;
+  jpeg_component_info * compptr;
+
+  /* Advance past the RSTn marker */
+  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+  /* Re-initialize statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      /* Reset DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    if (! cinfo->progressive_mode || cinfo->Ss) {
+      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+    }
+  }
+
+  /* Reset arithmetic decoding variables */
+  entropy->c = 0;
+  entropy->a = 0;
+  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+
+  /* Reset restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Arithmetic MCU decoding.
+ * Each of these routines decodes and returns one MCU's worth of
+ * arithmetic-compressed coefficients.
+ * The coefficients are reordered from zigzag order into natural array order,
+ * but are not dequantized.
+ *
+ * The i'th block of the MCU is stored into the block pointed to by
+ * MCU_data[i].  WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
+ */
+
+/*
+ * MCU decoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, sign;
+  int v, m;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+    /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.19: Decode_DC_DIFF */
+    if (arith_decode(cinfo, st) == 0)
+      entropy->dc_context[ci] = 0;
+    else {
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, st + 1);
+      st += 2; st += sign;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;		   /* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+      else
+	entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      entropy->last_dc_val[ci] += v;
+    }
+
+    /* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0) */
+    (*block)[0] = (JCOEF) (entropy->last_dc_val[ci] << cinfo->Al);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, sign, k;
+  int v, m;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* There is always only one block per MCU */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+  /* Figure F.20: Decode_AC_coefficients */
+  for (k = cinfo->Ss; k <= cinfo->Se; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    if (arith_decode(cinfo, st)) break;		/* EOB flag */
+    while (arith_decode(cinfo, st + 1) == 0) {
+      st += 3; k++;
+      if (k > cinfo->Se) {
+	WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	entropy->ct = -1;			/* spectral overflow */
+	return TRUE;
+      }
+    }
+    /* Figure F.21: Decoding nonzero value v */
+    /* Figure F.22: Decoding the sign of v */
+    sign = arith_decode(cinfo, entropy->fixed_bin);
+    st += 2;
+    /* Figure F.23: Decoding the magnitude category of v */
+    if ((m = arith_decode(cinfo, st)) != 0) {
+      if (arith_decode(cinfo, st)) {
+	m <<= 1;
+	st = entropy->ac_stats[tbl] +
+	     (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+    }
+    v = m;
+    /* Figure F.24: Decoding the magnitude bit pattern of v */
+    st += 14;
+    while (m >>= 1)
+      if (arith_decode(cinfo, st)) v |= m;
+    v += 1; if (sign) v = -v;
+    /* Scale and output coefficient in natural (dezigzagged) order */
+    (*block)[jpeg_natural_order[k]] = (JCOEF) (v << cinfo->Al);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  unsigned char *st;
+  int p1, blkn;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  st = entropy->fixed_bin;	/* use fixed probability estimation */
+  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    /* Encoded data is simply the next bit of the two's-complement DC value */
+    if (arith_decode(cinfo, st))
+      MCU_data[blkn][0][0] |= p1;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  JCOEFPTR thiscoef;
+  unsigned char *st;
+  int tbl, k, kex;
+  int p1, m1;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* There is always only one block per MCU */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
+  m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
+
+  /* Establish EOBx (previous stage end-of-block) index */
+  for (kex = cinfo->Se; kex > 0; kex--)
+    if ((*block)[jpeg_natural_order[kex]]) break;
+
+  for (k = cinfo->Ss; k <= cinfo->Se; k++) {
+    st = entropy->ac_stats[tbl] + 3 * (k - 1);
+    if (k > kex)
+      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+    for (;;) {
+      thiscoef = *block + jpeg_natural_order[k];
+      if (*thiscoef) {				/* previously nonzero coef */
+	if (arith_decode(cinfo, st + 2)) {
+	  if (*thiscoef < 0)
+	    *thiscoef += m1;
+	  else
+	    *thiscoef += p1;
+	}
+	break;
+      }
+      if (arith_decode(cinfo, st + 1)) {	/* newly nonzero coef */
+	if (arith_decode(cinfo, entropy->fixed_bin))
+	  *thiscoef = m1;
+	else
+	  *thiscoef = p1;
+	break;
+      }
+      st += 3; k++;
+      if (k > cinfo->Se) {
+	WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	entropy->ct = -1;			/* spectral overflow */
+	return TRUE;
+      }
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Decode one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * compptr;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, sign, k;
+  int v, m;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+    tbl = compptr->dc_tbl_no;
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.19: Decode_DC_DIFF */
+    if (arith_decode(cinfo, st) == 0)
+      entropy->dc_context[ci] = 0;
+    else {
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, st + 1);
+      st += 2; st += sign;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
+	while (arith_decode(cinfo, st)) {
+	  if ((m <<= 1) == 0x8000) {
+	    WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	    entropy->ct = -1;			/* magnitude overflow */
+	    return TRUE;
+	  }
+	  st += 1;
+	}
+      }
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+	entropy->dc_context[ci] = 0;		   /* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+	entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+      else
+	entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      entropy->last_dc_val[ci] += v;
+    }
+
+    (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
+
+    /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+    tbl = compptr->ac_tbl_no;
+
+    /* Figure F.20: Decode_AC_coefficients */
+    for (k = 1; k <= DCTSIZE2 - 1; k++) {
+      st = entropy->ac_stats[tbl] + 3 * (k - 1);
+      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+      while (arith_decode(cinfo, st + 1) == 0) {
+	st += 3; k++;
+	if (k > DCTSIZE2 - 1) {
+	  WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	  entropy->ct = -1;			/* spectral overflow */
+	  return TRUE;
+	}
+      }
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, entropy->fixed_bin);
+      st += 2;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+	if (arith_decode(cinfo, st)) {
+	  m <<= 1;
+	  st = entropy->ac_stats[tbl] +
+	       (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+	  while (arith_decode(cinfo, st)) {
+	    if ((m <<= 1) == 0x8000) {
+	      WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+	      entropy->ct = -1;			/* magnitude overflow */
+	      return TRUE;
+	    }
+	    st += 1;
+	  }
+	}
+      }
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+	if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      (*block)[jpeg_natural_order[k]] = (JCOEF) v;
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (cinfo->progressive_mode) {
+    /* Validate progressive scan parameters */
+    if (cinfo->Ss == 0) {
+      if (cinfo->Se != 0)
+	goto bad;
+    } else {
+      /* need not check Ss/Se < 0 since they came from unsigned bytes */
+      if (cinfo->Se < cinfo->Ss || cinfo->Se > DCTSIZE2 - 1)
+	goto bad;
+      /* AC scans may have only one component */
+      if (cinfo->comps_in_scan != 1)
+	goto bad;
+    }
+    if (cinfo->Ah != 0) {
+      /* Successive approximation refinement scan: must have Al = Ah-1. */
+      if (cinfo->Ah-1 != cinfo->Al)
+	goto bad;
+    }
+    if (cinfo->Al > 13) {	/* need not check for < 0 */
+      bad:
+      ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+	       cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+    }
+    /* Update progression status, and verify that scan order is legal.
+     * Note that inter-scan inconsistencies are treated as warnings
+     * not fatal errors ... not clear if this is right way to behave.
+     */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
+      int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+      if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
+	WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+      for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
+	int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
+	if (cinfo->Ah != expected)
+	  WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+	coef_bit_ptr[coefi] = cinfo->Al;
+      }
+    }
+    /* Select MCU decoding routine */
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_first;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+	entropy->pub.decode_mcu = decode_mcu_DC_refine;
+      else
+	entropy->pub.decode_mcu = decode_mcu_AC_refine;
+    }
+  } else {
+    /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
+     * This ought to be an error condition, but we make it a warning.
+     */
+    if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
+	(cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1))
+      WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
+    /* Select MCU decoding routine */
+    entropy->pub.decode_mcu = decode_mcu;
+  }
+
+  /* Allocate & initialize requested statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      tbl = compptr->dc_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->dc_stats[tbl] == NULL)
+	entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      /* Initialize DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    if (! cinfo->progressive_mode || cinfo->Ss) {
+      tbl = compptr->ac_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+	ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->ac_stats[tbl] == NULL)
+	entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+	  ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+    }
+  }
+
+  /* Initialize arithmetic decoding variables */
+  entropy->c = 0;
+  entropy->a = 0;
+  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+
+  /* Initialize restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy decoding.
+ */
+
+GLOBAL(void)
+jinit_arith_decoder (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy;
+  int i;
+
+  entropy = (arith_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				SIZEOF(arith_entropy_decoder));
+  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+  entropy->pub.start_pass = start_pass;
+
+  /* Mark tables unallocated */
+  for (i = 0; i < NUM_ARITH_TBLS; i++) {
+    entropy->dc_stats[i] = NULL;
+    entropy->ac_stats[i] = NULL;
+  }
+
+  /* Initialize index for fixed probability estimation */
+  entropy->fixed_bin[0] = 113;
+
+  if (cinfo->progressive_mode) {
+    /* Create progression status table */
+    int *coef_bit_ptr, ci;
+    cinfo->coef_bits = (int (*)[DCTSIZE2])
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  cinfo->num_components*DCTSIZE2*SIZEOF(int));
+    coef_bit_ptr = & cinfo->coef_bits[0][0];
+    for (ci = 0; ci < cinfo->num_components; ci++) 
+      for (i = 0; i < DCTSIZE2; i++)
+	*coef_bit_ptr++ = -1;
+  }
+}
--- a/jpeg/jdatadst.c
+++ b/jpeg/jdatadst.c
@ -2,13 +2,14 @@
 * jdatadst.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains compression data destination routines for the case of
- * emitting JPEG data to a file (or any stdio stream).  While these routines
- * are sufficient for most applications, some will want to use a different
- * destination manager.
+ * emitting JPEG data to memory or to a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different destination manager.
 * IMPORTANT: we assume that fwrite() will correctly transcribe an array of
 * JOCTETs into 8-bit-wide elements on external storage.  If char is wider
 * than 8 bits on your machine, you may need to do some tweaking.
@ -19,6 +20,11 @@
 #include "jpeglib.h"
 #include "jerror.h"

+#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
+extern void * malloc JPP((size_t size));
+extern void free JPP((void *ptr));
+#endif
+

 /* Expanded data destination object for stdio output */

@ -34,6 +40,23 @@ typedef my_destination_mgr * my_dest_ptr;
 #define OUTPUT_BUF_SIZE  4096	/* choose an efficiently fwrite'able size */


+#if JPEG_LIB_VERSION >= 80
+/* Expanded data destination object for memory output */
+
+typedef struct {
+  struct jpeg_destination_mgr pub; /* public fields */
+
+  unsigned char ** outbuffer;	/* target buffer */
+  unsigned long * outsize;
+  unsigned char * newbuffer;	/* newly allocated buffer */
+  JOCTET * buffer;		/* start of buffer */
+  size_t bufsize;
+} my_mem_destination_mgr;
+
+typedef my_mem_destination_mgr * my_mem_dest_ptr;
+#endif
+
+
 /*
 * Initialize destination --- called by jpeg_start_compress
 * before any data is actually written.
@ -53,6 +76,14 @@ init_destination (j_compress_ptr cinfo)
  dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
 }

+#if JPEG_LIB_VERSION >= 80
+METHODDEF(void)
+init_mem_destination (j_compress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+#endif
+

 /*
 * Empty the output buffer --- called whenever buffer fills up.
@ -92,6 +123,38 @@ empty_output_buffer (j_compress_ptr cinfo)
  return TRUE;
 }

+#if JPEG_LIB_VERSION >= 80
+METHODDEF(boolean)
+empty_mem_output_buffer (j_compress_ptr cinfo)
+{
+  size_t nextsize;
+  JOCTET * nextbuffer;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+
+  /* Try to allocate new buffer with double size */
+  nextsize = dest->bufsize * 2;
+  nextbuffer = malloc(nextsize);
+
+  if (nextbuffer == NULL)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+
+  MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
+
+  if (dest->newbuffer != NULL)
+    free(dest->newbuffer);
+
+  dest->newbuffer = nextbuffer;
+
+  dest->pub.next_output_byte = nextbuffer + dest->bufsize;
+  dest->pub.free_in_buffer = dest->bufsize;
+
+  dest->buffer = nextbuffer;
+  dest->bufsize = nextsize;
+
+  return TRUE;
+}
+#endif
+

 /*
 * Terminate destination --- called by jpeg_finish_compress
@ -119,6 +182,17 @@ term_destination (j_compress_ptr cinfo)
    ERREXIT(cinfo, JERR_FILE_WRITE);
 }

+#if JPEG_LIB_VERSION >= 80
+METHODDEF(void)
+term_mem_destination (j_compress_ptr cinfo)
+{
+  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+
+  *dest->outbuffer = dest->buffer;
+  *dest->outsize = dest->bufsize - dest->pub.free_in_buffer;
+}
+#endif
+

 /*
 * Prepare for output to a stdio stream.
@ -150,65 +224,54 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE * outfile)
  dest->outfile = outfile;
 }

+
+#if JPEG_LIB_VERSION >= 80
 /*
- * term_destination_file_close --- called by jpeg_finish_compress
- * after all data has been written.  Usually needs to flush buffer.
- * also will need to close file
- * NB: *not* called by jpeg_abort or jpeg_destroy; surrounding
- * application must deal with any cleanup that should happen even
- * for error exit.
- */
-
-METHODDEF(void)
-term_destination_file_close(j_compress_ptr cinfo)
-{
-  my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
-  size_t datacount = OUTPUT_BUF_SIZE - dest->pub.free_in_buffer;
-
-  /* Write any data remaining in the buffer */
-  if (datacount > 0) {
-    if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount)
-      ERREXIT(cinfo, JERR_FILE_WRITE);
-  }
-  fflush(dest->outfile);
-  
-  /* Make sure we wrote the output file OK */
-  if (ferror(dest->outfile))
-    ERREXIT(cinfo, JERR_FILE_WRITE);
-  else
-      fclose(dest->outfile);
-}
-
-
-
-
-
-/*
- * Prepare for output to a file from a char *
- * The caller is responsible
- * for closing it after finishing compression.
+ * Prepare for output to a memory buffer.
+ * The caller may supply an own initial buffer with appropriate size.
+ * Otherwise, or when the actual data output exceeds the given size,
+ * the library adapts the buffer size as necessary.
+ * The standard library functions malloc/free are used for allocating
+ * larger memory, so the buffer is available to the application after
+ * finishing compression, and then the application is responsible for
+ * freeing the requested memory.
 */

 GLOBAL(void)
-jpeg_file_dest (j_compress_ptr cinfo, char * outfile)
+jpeg_mem_dest (j_compress_ptr cinfo,
+	       unsigned char ** outbuffer, unsigned long * outsize)
 {
-  my_dest_ptr dest;
+  my_mem_dest_ptr dest;
+
+  if (outbuffer == NULL || outsize == NULL)	/* sanity check */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);

  /* The destination object is made permanent so that multiple JPEG images
-   * can be written to the same file without re-executing jpeg_stdio_dest.
-   * This makes it dangerous to use this manager and a different destination
-   * manager serially with the same JPEG object, because their private object
-   * sizes may be different.  Caveat programmer.
+   * can be written to the same buffer without re-executing jpeg_mem_dest.
   */
  if (cinfo->dest == NULL) {	/* first time for this JPEG object? */
    cinfo->dest = (struct jpeg_destination_mgr *)
      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				  SIZEOF(my_destination_mgr));
+				  SIZEOF(my_mem_destination_mgr));
  }

-  dest = (my_dest_ptr) cinfo->dest;
-  dest->pub.init_destination = init_destination;
-  dest->pub.empty_output_buffer = empty_output_buffer;
-  dest->pub.term_destination = term_destination_file_close;
-  dest->outfile = fopen(outfile,"wb");
+  dest = (my_mem_dest_ptr) cinfo->dest;
+  dest->pub.init_destination = init_mem_destination;
+  dest->pub.empty_output_buffer = empty_mem_output_buffer;
+  dest->pub.term_destination = term_mem_destination;
+  dest->outbuffer = outbuffer;
+  dest->outsize = outsize;
+  dest->newbuffer = NULL;
+
+  if (*outbuffer == NULL || *outsize == 0) {
+    /* Allocate initial buffer */
+    dest->newbuffer = *outbuffer = malloc(OUTPUT_BUF_SIZE);
+    if (dest->newbuffer == NULL)
+      ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+    *outsize = OUTPUT_BUF_SIZE;
+  }
+
+  dest->pub.next_output_byte = dest->buffer = *outbuffer;
+  dest->pub.free_in_buffer = dest->bufsize = *outsize;
 }
+#endif
--- a/jpeg/jdatasrc.c
+++ b/jpeg/jdatasrc.c
@ -2,13 +2,14 @@
 * jdatasrc.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2010 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains decompression data source routines for the case of
- * reading JPEG data from a file (or any stdio stream).  While these routines
- * are sufficient for most applications, some will want to use a different
- * source manager.
+ * reading JPEG data from memory or from a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different source manager.
 * IMPORTANT: we assume that fread() will correctly transcribe an array of
 * JOCTETs from 8-bit-wide elements on external storage.  If char is wider
 * than 8 bits on your machine, you may need to do some tweaking.
@ -52,6 +53,14 @@ init_source (j_decompress_ptr cinfo)
  src->start_of_file = TRUE;
 }

+#if JPEG_LIB_VERSION >= 80
+METHODDEF(void)
+init_mem_source (j_decompress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+#endif
+

 /*
 * Fill the input buffer --- called whenever buffer is emptied.
@ -111,6 +120,28 @@ fill_input_buffer (j_decompress_ptr cinfo)
  return TRUE;
 }

+#if JPEG_LIB_VERSION >= 80
+METHODDEF(boolean)
+fill_mem_input_buffer (j_decompress_ptr cinfo)
+{
+  static JOCTET mybuffer[4];
+
+  /* The whole JPEG data is expected to reside in the supplied memory
+   * buffer, so any request for more data beyond the given buffer size
+   * is treated as an error.
+   */
+  WARNMS(cinfo, JWRN_JPEG_EOF);
+  /* Insert a fake EOI marker */
+  mybuffer[0] = (JOCTET) 0xFF;
+  mybuffer[1] = (JOCTET) JPEG_EOI;
+
+  cinfo->src->next_input_byte = mybuffer;
+  cinfo->src->bytes_in_buffer = 2;
+
+  return TRUE;
+}
+#endif
+

 /*
 * Skip data --- used to skip over a potentially large amount of
@ -127,22 +158,22 @@ fill_input_buffer (j_decompress_ptr cinfo)
 METHODDEF(void)
 skip_input_data (j_decompress_ptr cinfo, long num_bytes)
 {
-  my_src_ptr src = (my_src_ptr) cinfo->src;
+  struct jpeg_source_mgr * src = cinfo->src;

  /* Just a dumb implementation for now.  Could use fseek() except
   * it doesn't work on pipes.  Not clear that being smart is worth
   * any trouble anyway --- large skips are infrequent.
   */
  if (num_bytes > 0) {
-    while (num_bytes > (long) src->pub.bytes_in_buffer) {
-      num_bytes -= (long) src->pub.bytes_in_buffer;
-      (void) fill_input_buffer(cinfo);
+    while (num_bytes > (long) src->bytes_in_buffer) {
+      num_bytes -= (long) src->bytes_in_buffer;
+      (void) (*src->fill_input_buffer) (cinfo);
      /* note we assume that fill_input_buffer will never return FALSE,
       * so suspension need not be handled.
       */
    }
-    src->pub.next_input_byte += (size_t) num_bytes;
-    src->pub.bytes_in_buffer -= (size_t) num_bytes;
+    src->next_input_byte += (size_t) num_bytes;
+    src->bytes_in_buffer -= (size_t) num_bytes;
  }
 }

@ -210,3 +241,40 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile)
  src->pub.bytes_in_buffer = 0; /* forces fill_input_buffer on first read */
  src->pub.next_input_byte = NULL; /* until buffer loaded */
 }
+
+
+#if JPEG_LIB_VERSION >= 80
+/*
+ * Prepare for input from a supplied memory buffer.
+ * The buffer must contain the whole JPEG data.
+ */
+
+GLOBAL(void)
+jpeg_mem_src (j_decompress_ptr cinfo,
+	      unsigned char * inbuffer, unsigned long insize)
+{
+  struct jpeg_source_mgr * src;
+
+  if (inbuffer == NULL || insize == 0)	/* Treat empty input as fatal error */
+    ERREXIT(cinfo, JERR_INPUT_EMPTY);
+
+  /* The source object is made permanent so that a series of JPEG images
+   * can be read from the same buffer by calling jpeg_mem_src only before
+   * the first one.
+   */
+  if (cinfo->src == NULL) {	/* first time for this JPEG object? */
+    cinfo->src = (struct jpeg_source_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+				  SIZEOF(struct jpeg_source_mgr));
+  }
+
+  src = cinfo->src;
+  src->init_source = init_mem_source;
+  src->fill_input_buffer = fill_mem_input_buffer;
+  src->skip_input_data = skip_input_data;
+  src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
+  src->term_source = term_source;
+  src->bytes_in_buffer = (size_t) insize;
+  src->next_input_byte = (JOCTET *) inbuffer;
+}
+#endif
--- a/jpeg/jdcoefct.c
+++ b/jpeg/jdcoefct.c
@ -2,6 +2,7 @@
 * jdcoefct.c
 *
 * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Copyright (C) 2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -17,6 +18,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"

 /* Block smoothing is only applicable for progressive JPEG, so: */
 #ifndef D_PROGRESSIVE_SUPPORTED
@ -47,6 +49,9 @@ typedef struct {
   */
  JBLOCKROW MCU_buffer[D_MAX_BLOCKS_IN_MCU];

+  /* Temporary workspace for one MCU */
+  JCOEF * workspace;
+
 #ifdef D_MULTISCAN_FILES_SUPPORTED
  /* In multi-pass modes, we need a virtual block array for each component. */
  jvirt_barray_ptr whole_image[MAX_COMPONENTS];
@ -187,7 +192,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
 						    : compptr->last_col_width;
 	output_ptr = output_buf[compptr->component_index] +
-	  yoffset * compptr->DCT_scaled_size;
+	  yoffset * compptr->_DCT_scaled_size;
 	start_col = MCU_col_num * compptr->MCU_sample_width;
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
 	  if (cinfo->input_iMCU_row < last_iMCU_row ||
@ -197,11 +202,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	      (*inverse_DCT) (cinfo, compptr,
 			      (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
 			      output_ptr, output_col);
-	      output_col += compptr->DCT_scaled_size;
+	      output_col += compptr->_DCT_scaled_size;
 	    }
 	  }
 	  blkn += compptr->MCU_width;
-	  output_ptr += compptr->DCT_scaled_size;
+	  output_ptr += compptr->_DCT_scaled_size;
 	}
      }
    }
@ -362,9 +367,9 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
 			output_ptr, output_col);
 	buffer_ptr++;
-	output_col += compptr->DCT_scaled_size;
+	output_col += compptr->_DCT_scaled_size;
      }
-      output_ptr += compptr->DCT_scaled_size;
+      output_ptr += compptr->_DCT_scaled_size;
    }
  }

@ -471,13 +476,16 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
  jpeg_component_info *compptr;
  inverse_DCT_method_ptr inverse_DCT;
  boolean first_row, last_row;
-  JBLOCK workspace;
+  JCOEF * workspace;
  int *coef_bits;
  JQUANT_TBL *quanttbl;
  INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
  int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
  int Al, pred;

+  /* Keep a local variable to avoid looking it up more than once */
+  workspace = coef->workspace;
+
  /* Force some input to be done if we are getting ahead of the input. */
  while (cinfo->input_scan_number <= cinfo->output_scan_number &&
 	 ! cinfo->inputctl->eoi_reached) {
@ -654,9 +662,9 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	DC4 = DC5; DC5 = DC6;
 	DC7 = DC8; DC8 = DC9;
 	buffer_ptr++, prev_block_row++, next_block_row++;
-	output_col += compptr->DCT_scaled_size;
+	output_col += compptr->_DCT_scaled_size;
      }
-      output_ptr += compptr->DCT_scaled_size;
+      output_ptr += compptr->_DCT_scaled_size;
    }
  }

@ -733,4 +741,9 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
    coef->pub.decompress_data = decompress_onepass;
    coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
  }
+
+  /* Allocate the workspace buffer */
+  coef->workspace = (JCOEF *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                SIZEOF(JCOEF) * DCTSIZE2);
 }
--- a/jpeg/jdcolor.c
+++ b/jpeg/jdcolor.c
@ -2,6 +2,8 @@
 * jdcolor.c
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -11,7 +13,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jconfig.h"
+#include "jsimd.h"


 /* Private subobject */
@ -19,15 +21,11 @@
 typedef struct {
  struct jpeg_color_deconverter pub; /* public fields */

-  /* These fields are not needed anymore as these are now static tables */
-
-#if 0
  /* Private state for YCC->RGB conversion */
  int * Cr_r_tab;		/* => table for Cr to R conversion */
  int * Cb_b_tab;		/* => table for Cb to B conversion */
  INT32 * Cr_g_tab;		/* => table for Cr to G conversion */
  INT32 * Cb_g_tab;		/* => table for Cb to G conversion */
-#endif
 } my_color_deconverter;

 typedef my_color_deconverter * my_cconvert_ptr;
@ -66,191 +64,6 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #define ONE_HALF	((INT32) 1 << (SCALEBITS-1))
 #define FIX(x)		((INT32) ((x) * (1L<<SCALEBITS) + 0.5))

-/* Use static tables for color processing. */
-
-const int Cr_r_tab[(MAXJSAMPLE+1) * SIZEOF(int)] ={
-  0xffffff4dUL, 0xffffff4eUL, 0xffffff4fUL, 0xffffff51UL, 0xffffff52UL, 0xffffff54UL, 
-  0xffffff55UL, 0xffffff56UL, 0xffffff58UL, 0xffffff59UL, 0xffffff5bUL, 0xffffff5cUL, 
-  0xffffff5dUL, 0xffffff5fUL, 0xffffff60UL, 0xffffff62UL, 0xffffff63UL, 0xffffff64UL, 
-  0xffffff66UL, 0xffffff67UL, 0xffffff69UL, 0xffffff6aUL, 0xffffff6bUL, 0xffffff6dUL, 
-  0xffffff6eUL, 0xffffff70UL, 0xffffff71UL, 0xffffff72UL, 0xffffff74UL, 0xffffff75UL, 
-  0xffffff77UL, 0xffffff78UL, 0xffffff79UL, 0xffffff7bUL, 0xffffff7cUL, 0xffffff7eUL, 
-  0xffffff7fUL, 0xffffff80UL, 0xffffff82UL, 0xffffff83UL, 0xffffff85UL, 0xffffff86UL, 
-  0xffffff87UL, 0xffffff89UL, 0xffffff8aUL, 0xffffff8cUL, 0xffffff8dUL, 0xffffff8eUL, 
-  0xffffff90UL, 0xffffff91UL, 0xffffff93UL, 0xffffff94UL, 0xffffff95UL, 0xffffff97UL, 
-  0xffffff98UL, 0xffffff9aUL, 0xffffff9bUL, 0xffffff9cUL, 0xffffff9eUL, 0xffffff9fUL, 
-  0xffffffa1UL, 0xffffffa2UL, 0xffffffa3UL, 0xffffffa5UL, 0xffffffa6UL, 0xffffffa8UL, 
-  0xffffffa9UL, 0xffffffaaUL, 0xffffffacUL, 0xffffffadUL, 0xffffffafUL, 0xffffffb0UL, 
-  0xffffffb1UL, 0xffffffb3UL, 0xffffffb4UL, 0xffffffb6UL, 0xffffffb7UL, 0xffffffb8UL, 
-  0xffffffbaUL, 0xffffffbbUL, 0xffffffbdUL, 0xffffffbeUL, 0xffffffc0UL, 0xffffffc1UL, 
-  0xffffffc2UL, 0xffffffc4UL, 0xffffffc5UL, 0xffffffc7UL, 0xffffffc8UL, 0xffffffc9UL, 
-  0xffffffcbUL, 0xffffffccUL, 0xffffffceUL, 0xffffffcfUL, 0xffffffd0UL, 0xffffffd2UL, 
-  0xffffffd3UL, 0xffffffd5UL, 0xffffffd6UL, 0xffffffd7UL, 0xffffffd9UL, 0xffffffdaUL, 
-  0xffffffdcUL, 0xffffffddUL, 0xffffffdeUL, 0xffffffe0UL, 0xffffffe1UL, 0xffffffe3UL, 
-  0xffffffe4UL, 0xffffffe5UL, 0xffffffe7UL, 0xffffffe8UL, 0xffffffeaUL, 0xffffffebUL, 
-  0xffffffecUL, 0xffffffeeUL, 0xffffffefUL, 0xfffffff1UL, 0xfffffff2UL, 0xfffffff3UL, 
-  0xfffffff5UL, 0xfffffff6UL, 0xfffffff8UL, 0xfffffff9UL, 0xfffffffaUL, 0xfffffffcUL, 
-  0xfffffffdUL, 0xffffffffUL,       0x00UL,       0x01UL,       0x03UL,       0x04UL, 
-        0x06UL,       0x07UL,       0x08UL,       0x0aUL,       0x0bUL,       0x0dUL, 
-        0x0eUL,       0x0fUL,       0x11UL,       0x12UL,       0x14UL,       0x15UL, 
-        0x16UL,       0x18UL,       0x19UL,       0x1bUL,       0x1cUL,       0x1dUL, 
-        0x1fUL,       0x20UL,       0x22UL,       0x23UL,       0x24UL,       0x26UL, 
-        0x27UL,       0x29UL,       0x2aUL,       0x2bUL,       0x2dUL,       0x2eUL, 
-        0x30UL,       0x31UL,       0x32UL,       0x34UL,       0x35UL,       0x37UL, 
-        0x38UL,       0x39UL,       0x3bUL,       0x3cUL,       0x3eUL,       0x3fUL, 
-        0x40UL,       0x42UL,       0x43UL,       0x45UL,       0x46UL,       0x48UL, 
-        0x49UL,       0x4aUL,       0x4cUL,       0x4dUL,       0x4fUL,       0x50UL, 
-        0x51UL,       0x53UL,       0x54UL,       0x56UL,       0x57UL,       0x58UL, 
-        0x5aUL,       0x5bUL,       0x5dUL,       0x5eUL,       0x5fUL,       0x61UL, 
-        0x62UL,       0x64UL,       0x65UL,       0x66UL,       0x68UL,       0x69UL, 
-        0x6bUL,       0x6cUL,       0x6dUL,       0x6fUL,       0x70UL,       0x72UL, 
-        0x73UL,       0x74UL,       0x76UL,       0x77UL,       0x79UL,       0x7aUL, 
-        0x7bUL,       0x7dUL,       0x7eUL,       0x80UL,       0x81UL,       0x82UL, 
-        0x84UL,       0x85UL,       0x87UL,       0x88UL,       0x89UL,       0x8bUL, 
-        0x8cUL,       0x8eUL,       0x8fUL,       0x90UL,       0x92UL,       0x93UL, 
-        0x95UL,       0x96UL,       0x97UL,       0x99UL,       0x9aUL,       0x9cUL, 
-        0x9dUL,       0x9eUL,       0xa0UL,       0xa1UL,       0xa3UL,       0xa4UL, 
-        0xa5UL,       0xa7UL,       0xa8UL,       0xaaUL,       0xabUL,       0xacUL, 
-        0xaeUL,       0xafUL,       0xb1UL,       0xb2UL
-  };
-
-const int Cb_b_tab[(MAXJSAMPLE+1) * SIZEOF(int)] ={
-  0xffffff1dUL, 0xffffff1fUL, 0xffffff21UL, 0xffffff22UL, 0xffffff24UL, 0xffffff26UL, 
-  0xffffff28UL, 0xffffff2aUL, 0xffffff2bUL, 0xffffff2dUL, 0xffffff2fUL, 0xffffff31UL, 
-  0xffffff32UL, 0xffffff34UL, 0xffffff36UL, 0xffffff38UL, 0xffffff3aUL, 0xffffff3bUL, 
-  0xffffff3dUL, 0xffffff3fUL, 0xffffff41UL, 0xffffff42UL, 0xffffff44UL, 0xffffff46UL, 
-  0xffffff48UL, 0xffffff49UL, 0xffffff4bUL, 0xffffff4dUL, 0xffffff4fUL, 0xffffff51UL, 
-  0xffffff52UL, 0xffffff54UL, 0xffffff56UL, 0xffffff58UL, 0xffffff59UL, 0xffffff5bUL, 
-  0xffffff5dUL, 0xffffff5fUL, 0xffffff61UL, 0xffffff62UL, 0xffffff64UL, 0xffffff66UL, 
-  0xffffff68UL, 0xffffff69UL, 0xffffff6bUL, 0xffffff6dUL, 0xffffff6fUL, 0xffffff70UL, 
-  0xffffff72UL, 0xffffff74UL, 0xffffff76UL, 0xffffff78UL, 0xffffff79UL, 0xffffff7bUL, 
-  0xffffff7dUL, 0xffffff7fUL, 0xffffff80UL, 0xffffff82UL, 0xffffff84UL, 0xffffff86UL, 
-  0xffffff88UL, 0xffffff89UL, 0xffffff8bUL, 0xffffff8dUL, 0xffffff8fUL, 0xffffff90UL, 
-  0xffffff92UL, 0xffffff94UL, 0xffffff96UL, 0xffffff97UL, 0xffffff99UL, 0xffffff9bUL, 
-  0xffffff9dUL, 0xffffff9fUL, 0xffffffa0UL, 0xffffffa2UL, 0xffffffa4UL, 0xffffffa6UL, 
-  0xffffffa7UL, 0xffffffa9UL, 0xffffffabUL, 0xffffffadUL, 0xffffffaeUL, 0xffffffb0UL, 
-  0xffffffb2UL, 0xffffffb4UL, 0xffffffb6UL, 0xffffffb7UL, 0xffffffb9UL, 0xffffffbbUL, 
-  0xffffffbdUL, 0xffffffbeUL, 0xffffffc0UL, 0xffffffc2UL, 0xffffffc4UL, 0xffffffc6UL, 
-  0xffffffc7UL, 0xffffffc9UL, 0xffffffcbUL, 0xffffffcdUL, 0xffffffceUL, 0xffffffd0UL, 
-  0xffffffd2UL, 0xffffffd4UL, 0xffffffd5UL, 0xffffffd7UL, 0xffffffd9UL, 0xffffffdbUL, 
-  0xffffffddUL, 0xffffffdeUL, 0xffffffe0UL, 0xffffffe2UL, 0xffffffe4UL, 0xffffffe5UL, 
-  0xffffffe7UL, 0xffffffe9UL, 0xffffffebUL, 0xffffffedUL, 0xffffffeeUL, 0xfffffff0UL, 
-  0xfffffff2UL, 0xfffffff4UL, 0xfffffff5UL, 0xfffffff7UL, 0xfffffff9UL, 0xfffffffbUL, 
-  0xfffffffcUL, 0xfffffffeUL,       0x00UL,       0x02UL,       0x04UL,       0x05UL, 
-        0x07UL,       0x09UL,       0x0bUL,       0x0cUL,       0x0eUL,       0x10UL, 
-        0x12UL,       0x13UL,       0x15UL,       0x17UL,       0x19UL,       0x1bUL, 
-        0x1cUL,       0x1eUL,       0x20UL,       0x22UL,       0x23UL,       0x25UL, 
-        0x27UL,       0x29UL,       0x2bUL,       0x2cUL,       0x2eUL,       0x30UL, 
-        0x32UL,       0x33UL,       0x35UL,       0x37UL,       0x39UL,       0x3aUL, 
-        0x3cUL,       0x3eUL,       0x40UL,       0x42UL,       0x43UL,       0x45UL, 
-        0x47UL,       0x49UL,       0x4aUL,       0x4cUL,       0x4eUL,       0x50UL, 
-        0x52UL,       0x53UL,       0x55UL,       0x57UL,       0x59UL,       0x5aUL, 
-        0x5cUL,       0x5eUL,       0x60UL,       0x61UL,       0x63UL,       0x65UL, 
-        0x67UL,       0x69UL,       0x6aUL,       0x6cUL,       0x6eUL,       0x70UL, 
-        0x71UL,       0x73UL,       0x75UL,       0x77UL,       0x78UL,       0x7aUL, 
-        0x7cUL,       0x7eUL,       0x80UL,       0x81UL,       0x83UL,       0x85UL, 
-        0x87UL,       0x88UL,       0x8aUL,       0x8cUL,       0x8eUL,       0x90UL, 
-        0x91UL,       0x93UL,       0x95UL,       0x97UL,       0x98UL,       0x9aUL, 
-        0x9cUL,       0x9eUL,       0x9fUL,       0xa1UL,       0xa3UL,       0xa5UL, 
-        0xa7UL,       0xa8UL,       0xaaUL,       0xacUL,       0xaeUL,       0xafUL, 
-        0xb1UL,       0xb3UL,       0xb5UL,       0xb7UL,       0xb8UL,       0xbaUL, 
-        0xbcUL,       0xbeUL,       0xbfUL,       0xc1UL,       0xc3UL,       0xc5UL, 
-        0xc6UL,       0xc8UL,       0xcaUL,       0xccUL,       0xceUL,       0xcfUL, 
-        0xd1UL,       0xd3UL,       0xd5UL,       0xd6UL,       0xd8UL,       0xdaUL, 
-        0xdcUL,       0xdeUL,       0xdfUL,       0xe1UL
-  };
-
-const int Cr_g_tab[(MAXJSAMPLE+1) * SIZEOF(int)] ={
-    0x5b6900UL,   0x5ab22eUL,   0x59fb5cUL,   0x59448aUL,   0x588db8UL,   0x57d6e6UL, 
-    0x572014UL,   0x566942UL,   0x55b270UL,   0x54fb9eUL,   0x5444ccUL,   0x538dfaUL, 
-    0x52d728UL,   0x522056UL,   0x516984UL,   0x50b2b2UL,   0x4ffbe0UL,   0x4f450eUL, 
-    0x4e8e3cUL,   0x4dd76aUL,   0x4d2098UL,   0x4c69c6UL,   0x4bb2f4UL,   0x4afc22UL, 
-    0x4a4550UL,   0x498e7eUL,   0x48d7acUL,   0x4820daUL,   0x476a08UL,   0x46b336UL, 
-    0x45fc64UL,   0x454592UL,   0x448ec0UL,   0x43d7eeUL,   0x43211cUL,   0x426a4aUL, 
-    0x41b378UL,   0x40fca6UL,   0x4045d4UL,   0x3f8f02UL,   0x3ed830UL,   0x3e215eUL, 
-    0x3d6a8cUL,   0x3cb3baUL,   0x3bfce8UL,   0x3b4616UL,   0x3a8f44UL,   0x39d872UL, 
-    0x3921a0UL,   0x386aceUL,   0x37b3fcUL,   0x36fd2aUL,   0x364658UL,   0x358f86UL, 
-    0x34d8b4UL,   0x3421e2UL,   0x336b10UL,   0x32b43eUL,   0x31fd6cUL,   0x31469aUL, 
-    0x308fc8UL,   0x2fd8f6UL,   0x2f2224UL,   0x2e6b52UL,   0x2db480UL,   0x2cfdaeUL, 
-    0x2c46dcUL,   0x2b900aUL,   0x2ad938UL,   0x2a2266UL,   0x296b94UL,   0x28b4c2UL, 
-    0x27fdf0UL,   0x27471eUL,   0x26904cUL,   0x25d97aUL,   0x2522a8UL,   0x246bd6UL, 
-    0x23b504UL,   0x22fe32UL,   0x224760UL,   0x21908eUL,   0x20d9bcUL,   0x2022eaUL, 
-    0x1f6c18UL,   0x1eb546UL,   0x1dfe74UL,   0x1d47a2UL,   0x1c90d0UL,   0x1bd9feUL, 
-    0x1b232cUL,   0x1a6c5aUL,   0x19b588UL,   0x18feb6UL,   0x1847e4UL,   0x179112UL, 
-    0x16da40UL,   0x16236eUL,   0x156c9cUL,   0x14b5caUL,   0x13fef8UL,   0x134826UL, 
-    0x129154UL,   0x11da82UL,   0x1123b0UL,   0x106cdeUL,    0xfb60cUL,    0xeff3aUL, 
-     0xe4868UL,    0xd9196UL,    0xcdac4UL,    0xc23f2UL,    0xb6d20UL,    0xab64eUL, 
-     0x9ff7cUL,    0x948aaUL,    0x891d8UL,    0x7db06UL,    0x72434UL,    0x66d62UL, 
-     0x5b690UL,    0x4ffbeUL,    0x448ecUL,    0x3921aUL,    0x2db48UL,    0x22476UL, 
-     0x16da4UL,     0xb6d2UL,        0x0UL, 0xffff492eUL, 0xfffe925cUL, 0xfffddb8aUL, 
-  0xfffd24b8UL, 0xfffc6de6UL, 0xfffbb714UL, 0xfffb0042UL, 0xfffa4970UL, 0xfff9929eUL, 
-  0xfff8dbccUL, 0xfff824faUL, 0xfff76e28UL, 0xfff6b756UL, 0xfff60084UL, 0xfff549b2UL, 
-  0xfff492e0UL, 0xfff3dc0eUL, 0xfff3253cUL, 0xfff26e6aUL, 0xfff1b798UL, 0xfff100c6UL, 
-  0xfff049f4UL, 0xffef9322UL, 0xffeedc50UL, 0xffee257eUL, 0xffed6eacUL, 0xffecb7daUL, 
-  0xffec0108UL, 0xffeb4a36UL, 0xffea9364UL, 0xffe9dc92UL, 0xffe925c0UL, 0xffe86eeeUL, 
-  0xffe7b81cUL, 0xffe7014aUL, 0xffe64a78UL, 0xffe593a6UL, 0xffe4dcd4UL, 0xffe42602UL, 
-  0xffe36f30UL, 0xffe2b85eUL, 0xffe2018cUL, 0xffe14abaUL, 0xffe093e8UL, 0xffdfdd16UL, 
-  0xffdf2644UL, 0xffde6f72UL, 0xffddb8a0UL, 0xffdd01ceUL, 0xffdc4afcUL, 0xffdb942aUL, 
-  0xffdadd58UL, 0xffda2686UL, 0xffd96fb4UL, 0xffd8b8e2UL, 0xffd80210UL, 0xffd74b3eUL, 
-  0xffd6946cUL, 0xffd5dd9aUL, 0xffd526c8UL, 0xffd46ff6UL, 0xffd3b924UL, 0xffd30252UL, 
-  0xffd24b80UL, 0xffd194aeUL, 0xffd0dddcUL, 0xffd0270aUL, 0xffcf7038UL, 0xffceb966UL, 
-  0xffce0294UL, 0xffcd4bc2UL, 0xffcc94f0UL, 0xffcbde1eUL, 0xffcb274cUL, 0xffca707aUL, 
-  0xffc9b9a8UL, 0xffc902d6UL, 0xffc84c04UL, 0xffc79532UL, 0xffc6de60UL, 0xffc6278eUL, 
-  0xffc570bcUL, 0xffc4b9eaUL, 0xffc40318UL, 0xffc34c46UL, 0xffc29574UL, 0xffc1dea2UL, 
-  0xffc127d0UL, 0xffc070feUL, 0xffbfba2cUL, 0xffbf035aUL, 0xffbe4c88UL, 0xffbd95b6UL, 
-  0xffbcdee4UL, 0xffbc2812UL, 0xffbb7140UL, 0xffbaba6eUL, 0xffba039cUL, 0xffb94ccaUL, 
-  0xffb895f8UL, 0xffb7df26UL, 0xffb72854UL, 0xffb67182UL, 0xffb5bab0UL, 0xffb503deUL, 
-  0xffb44d0cUL, 0xffb3963aUL, 0xffb2df68UL, 0xffb22896UL, 0xffb171c4UL, 0xffb0baf2UL, 
-  0xffb00420UL, 0xffaf4d4eUL, 0xffae967cUL, 0xffaddfaaUL, 0xffad28d8UL, 0xffac7206UL, 
-  0xffabbb34UL, 0xffab0462UL, 0xffaa4d90UL, 0xffa996beUL, 0xffa8dfecUL, 0xffa8291aUL, 
-  0xffa77248UL, 0xffa6bb76UL, 0xffa604a4UL, 0xffa54dd2UL
- };
-
-const int Cb_g_tab[(MAXJSAMPLE+1) * SIZEOF(int)] ={
-    0x2c8d00UL,   0x2c34e6UL,   0x2bdcccUL,   0x2b84b2UL,   0x2b2c98UL,   0x2ad47eUL, 
-    0x2a7c64UL,   0x2a244aUL,   0x29cc30UL,   0x297416UL,   0x291bfcUL,   0x28c3e2UL, 
-    0x286bc8UL,   0x2813aeUL,   0x27bb94UL,   0x27637aUL,   0x270b60UL,   0x26b346UL, 
-    0x265b2cUL,   0x260312UL,   0x25aaf8UL,   0x2552deUL,   0x24fac4UL,   0x24a2aaUL, 
-    0x244a90UL,   0x23f276UL,   0x239a5cUL,   0x234242UL,   0x22ea28UL,   0x22920eUL, 
-    0x2239f4UL,   0x21e1daUL,   0x2189c0UL,   0x2131a6UL,   0x20d98cUL,   0x208172UL, 
-    0x202958UL,   0x1fd13eUL,   0x1f7924UL,   0x1f210aUL,   0x1ec8f0UL,   0x1e70d6UL, 
-    0x1e18bcUL,   0x1dc0a2UL,   0x1d6888UL,   0x1d106eUL,   0x1cb854UL,   0x1c603aUL, 
-    0x1c0820UL,   0x1bb006UL,   0x1b57ecUL,   0x1affd2UL,   0x1aa7b8UL,   0x1a4f9eUL, 
-    0x19f784UL,   0x199f6aUL,   0x194750UL,   0x18ef36UL,   0x18971cUL,   0x183f02UL, 
-    0x17e6e8UL,   0x178eceUL,   0x1736b4UL,   0x16de9aUL,   0x168680UL,   0x162e66UL, 
-    0x15d64cUL,   0x157e32UL,   0x152618UL,   0x14cdfeUL,   0x1475e4UL,   0x141dcaUL, 
-    0x13c5b0UL,   0x136d96UL,   0x13157cUL,   0x12bd62UL,   0x126548UL,   0x120d2eUL, 
-    0x11b514UL,   0x115cfaUL,   0x1104e0UL,   0x10acc6UL,   0x1054acUL,    0xffc92UL, 
-     0xfa478UL,    0xf4c5eUL,    0xef444UL,    0xe9c2aUL,    0xe4410UL,    0xdebf6UL, 
-     0xd93dcUL,    0xd3bc2UL,    0xce3a8UL,    0xc8b8eUL,    0xc3374UL,    0xbdb5aUL, 
-     0xb8340UL,    0xb2b26UL,    0xad30cUL,    0xa7af2UL,    0xa22d8UL,    0x9cabeUL, 
-     0x972a4UL,    0x91a8aUL,    0x8c270UL,    0x86a56UL,    0x8123cUL,    0x7ba22UL, 
-     0x76208UL,    0x709eeUL,    0x6b1d4UL,    0x659baUL,    0x601a0UL,    0x5a986UL, 
-     0x5516cUL,    0x4f952UL,    0x4a138UL,    0x4491eUL,    0x3f104UL,    0x398eaUL, 
-     0x340d0UL,    0x2e8b6UL,    0x2909cUL,    0x23882UL,    0x1e068UL,    0x1884eUL, 
-     0x13034UL,     0xd81aUL,     0x8000UL,     0x27e6UL, 0xffffcfccUL, 0xffff77b2UL,
-  0xffff1f98UL, 0xfffec77eUL, 0xfffe6f64UL, 0xfffe174aUL, 0xfffdbf30UL, 0xfffd6716UL,
-  0xfffd0efcUL, 0xfffcb6e2UL, 0xfffc5ec8UL, 0xfffc06aeUL, 0xfffbae94UL, 0xfffb567aUL,
-  0xfffafe60UL, 0xfffaa646UL, 0xfffa4e2cUL, 0xfff9f612UL, 0xfff99df8UL, 0xfff945deUL,
-  0xfff8edc4UL, 0xfff895aaUL, 0xfff83d90UL, 0xfff7e576UL, 0xfff78d5cUL, 0xfff73542UL,
-  0xfff6dd28UL, 0xfff6850eUL, 0xfff62cf4UL, 0xfff5d4daUL, 0xfff57cc0UL, 0xfff524a6UL,
-  0xfff4cc8cUL, 0xfff47472UL, 0xfff41c58UL, 0xfff3c43eUL, 0xfff36c24UL, 0xfff3140aUL,
-  0xfff2bbf0UL, 0xfff263d6UL, 0xfff20bbcUL, 0xfff1b3a2UL, 0xfff15b88UL, 0xfff1036eUL,
-  0xfff0ab54UL, 0xfff0533aUL, 0xffeffb20UL, 0xffefa306UL, 0xffef4aecUL, 0xffeef2d2UL,
-  0xffee9ab8UL, 0xffee429eUL, 0xffedea84UL, 0xffed926aUL, 0xffed3a50UL, 0xffece236UL,
-  0xffec8a1cUL, 0xffec3202UL, 0xffebd9e8UL, 0xffeb81ceUL, 0xffeb29b4UL, 0xffead19aUL,
-  0xffea7980UL, 0xffea2166UL, 0xffe9c94cUL, 0xffe97132UL, 0xffe91918UL, 0xffe8c0feUL,
-  0xffe868e4UL, 0xffe810caUL, 0xffe7b8b0UL, 0xffe76096UL, 0xffe7087cUL, 0xffe6b062UL,
-  0xffe65848UL, 0xffe6002eUL, 0xffe5a814UL, 0xffe54ffaUL, 0xffe4f7e0UL, 0xffe49fc6UL,
-  0xffe447acUL, 0xffe3ef92UL, 0xffe39778UL, 0xffe33f5eUL, 0xffe2e744UL, 0xffe28f2aUL,
-  0xffe23710UL, 0xffe1def6UL, 0xffe186dcUL, 0xffe12ec2UL, 0xffe0d6a8UL, 0xffe07e8eUL,
-  0xffe02674UL, 0xffdfce5aUL, 0xffdf7640UL, 0xffdf1e26UL, 0xffdec60cUL, 0xffde6df2UL,
-  0xffde15d8UL, 0xffddbdbeUL, 0xffdd65a4UL, 0xffdd0d8aUL, 0xffdcb570UL, 0xffdc5d56UL,
-  0xffdc053cUL, 0xffdbad22UL, 0xffdb5508UL, 0xffdafceeUL, 0xffdaa4d4UL, 0xffda4cbaUL,
-  0xffd9f4a0UL, 0xffd99c86UL, 0xffd9446cUL, 0xffd8ec52UL, 0xffd89438UL, 0xffd83c1eUL,
-  0xffd7e404UL, 0xffd78beaUL, 0xffd733d0UL, 0xffd6dbb6UL, 0xffd6839cUL, 0xffd62b82UL,
-  0xffd5d368UL, 0xffd57b4eUL, 0xffd52334UL, 0xffd4cb1aUL
- };

 /*
 * Initialize tables for YCC->RGB colorspace conversion.
@ -259,10 +72,6 @@ const int Cb_g_tab[(MAXJSAMPLE+1) * SIZEOF(int)] ={
 LOCAL(void)
 build_ycc_rgb_table (j_decompress_ptr cinfo)
 {
-
-  /* The code below was used to generate the static tables above */
-
-#if 0
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  int i;
  INT32 x;
@ -296,7 +105,6 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
    /* We also add in ONE_HALF so that need not do it in inner loop */
    cconvert->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
  }
-#endif /* 0 */
 }


@ -318,12 +126,16 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
 {
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int y, cb, cr;
-  JSAMPLE * range_limit_y;
-  JSAMPROW outptr;
-  JSAMPROW inptr0, inptr1, inptr2;
-  JDIMENSION col;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->output_width;
-  JSAMPLE * range_limit = cinfo->sample_range_limit;
+  /* copy these pointers into registers if possible */
+  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register int * Crrtab = cconvert->Cr_r_tab;
+  register int * Cbbtab = cconvert->Cb_b_tab;
+  register INT32 * Crgtab = cconvert->Cr_g_tab;
+  register INT32 * Cbgtab = cconvert->Cb_g_tab;
  SHIFT_TEMPS

  while (--num_rows >= 0) {
@ -336,14 +148,13 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
      y  = GETJSAMPLE(inptr0[col]);
      cb = GETJSAMPLE(inptr1[col]);
      cr = GETJSAMPLE(inptr2[col]);
-      range_limit_y = range_limit + y;
      /* Range-limiting is essential due to noise introduced by DCT losses. */
-      outptr[RGB_RED] =   range_limit_y[Cr_r_tab[cr]];
-      outptr[RGB_GREEN] = range_limit_y[
-			      ((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr],
+      outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + Crrtab[cr]];
+      outptr[rgb_green[cinfo->out_color_space]] = range_limit[y +
+			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
 						 SCALEBITS))];
-      outptr[RGB_BLUE] =  range_limit_y[Cb_b_tab[cb]];
-      outptr += RGB_PIXELSIZE;
+      outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + Cbbtab[cb]];
+      outptr += rgb_pixelsize[cinfo->out_color_space];
    }
  }
 }
@ -411,16 +222,20 @@ gray_rgb_convert (j_decompress_ptr cinfo,
 		  JSAMPARRAY output_buf, int num_rows)
 {
  register JSAMPROW inptr, outptr;
-  register JDIMENSION col;
+  JSAMPLE *maxinptr;
  JDIMENSION num_cols = cinfo->output_width;
+  int rindex = rgb_red[cinfo->out_color_space];
+  int gindex = rgb_green[cinfo->out_color_space];
+  int bindex = rgb_blue[cinfo->out_color_space];
+  int rgbstride = rgb_pixelsize[cinfo->out_color_space];

  while (--num_rows >= 0) {
    inptr = input_buf[0][input_row++];
+    maxinptr = &inptr[num_cols];
    outptr = *output_buf++;
-    for (col = 0; col < num_cols; col++) {
+    for (; inptr < maxinptr; inptr++, outptr += rgbstride) {
      /* We can dispense with GETJSAMPLE() here */
-      outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
-      outptr += RGB_PIXELSIZE;
+      outptr[rindex] = outptr[gindex] = outptr[bindex] = *inptr;
    }
  }
 }
@ -430,6 +245,7 @@ gray_rgb_convert (j_decompress_ptr cinfo,
 * Adobe-style YCCK->CMYK conversion.
 * We convert YCbCr to R=1-C, G=1-M, and B=1-Y using the same
 * conversion as above, while passing K (black) unchanged.
+ * We assume build_ycc_rgb_table has been called.
 */

 METHODDEF(void)
@ -445,6 +261,10 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
  JDIMENSION num_cols = cinfo->output_width;
  /* copy these pointers into registers if possible */
  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register int * Crrtab = cconvert->Cr_r_tab;
+  register int * Cbbtab = cconvert->Cb_b_tab;
+  register INT32 * Crgtab = cconvert->Cr_g_tab;
+  register INT32 * Cbgtab = cconvert->Cb_g_tab;
  SHIFT_TEMPS

  while (--num_rows >= 0) {
@ -459,11 +279,11 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
      cb = GETJSAMPLE(inptr1[col]);
      cr = GETJSAMPLE(inptr2[col]);
      /* Range-limiting is essential due to noise introduced by DCT losses. */
-      outptr[0] = range_limit[MAXJSAMPLE - (y + Cr_r_tab[cr])];   /* red */
-      outptr[1] = range_limit[MAXJSAMPLE - (y +                   /* green */
-				  ((int) RIGHT_SHIFT(Cb_g_tab[cb] + Cr_g_tab[cr],
-                         SCALEBITS)))];
-      outptr[2] = range_limit[MAXJSAMPLE - (y + Cb_b_tab[cb])];   /* blue */
+      outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];	/* red */
+      outptr[1] = range_limit[MAXJSAMPLE - (y +			/* green */
+			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+						 SCALEBITS)))];
+      outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];	/* blue */
      /* K passes through unchanged */
      outptr[3] = inptr3[col];	/* don't need GETJSAMPLE here */
      outptr += 4;
@ -543,13 +363,24 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
    break;

  case JCS_RGB:
-    cinfo->out_color_components = RGB_PIXELSIZE;
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+    cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
    if (cinfo->jpeg_color_space == JCS_YCbCr) {
-      cconvert->pub.color_convert = ycc_rgb_convert;
-      build_ycc_rgb_table(cinfo);
+      if (jsimd_can_ycc_rgb())
+        cconvert->pub.color_convert = jsimd_ycc_rgb_convert;
+      else {
+        cconvert->pub.color_convert = ycc_rgb_convert;
+        build_ycc_rgb_table(cinfo);
+      }
    } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
      cconvert->pub.color_convert = gray_rgb_convert;
-    } else if (cinfo->jpeg_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
+    } else if (cinfo->jpeg_color_space == cinfo->out_color_space &&
+      rgb_pixelsize[cinfo->out_color_space] == 3) {
      cconvert->pub.color_convert = null_convert;
    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
--- a/jpeg/jdct.h
+++ b/jpeg/jdct.h
@ -23,18 +23,26 @@
 * have a range of +-8K for 8-bit data, +-128K for 12-bit data.  This
 * convention improves accuracy in integer implementations and saves some
 * work in floating-point ones.
- * Quantization of the output coefficients is done by jcdctmgr.c.
+ * Quantization of the output coefficients is done by jcdctmgr.c. This
+ * step requires an unsigned type and also one with twice the bits.
 */

 #if BITS_IN_JSAMPLE == 8
+#ifndef WITH_SIMD
 typedef int DCTELEM;		/* 16 or 32 bits is fine */
+typedef unsigned int UDCTELEM;
+typedef unsigned long long UDCTELEM2;
+#else
+typedef short DCTELEM;  /* prefer 16 bit with SIMD for parellelism */
+typedef unsigned short UDCTELEM;
+typedef unsigned int UDCTELEM2;
+#endif
 #else
 typedef INT32 DCTELEM;		/* must have 32 bits */
+typedef UINT32 UDCTELEM;
+typedef unsigned long long UDCTELEM2;
 #endif

-typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
-typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
-

 /*
 * An inverse DCT routine is given a pointer to the input JBLOCK and a pointer
--- a/jpeg/jddctmgr.c
+++ b/jpeg/jddctmgr.c
@ -2,6 +2,8 @@
 * jddctmgr.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -19,9 +21,9 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
-#ifdef HAVE_SSE2_INTRINSICS
-extern int SSE2Available;
-#endif
+#include "jsimddct.h"
+#include "jpegcomp.h"
+

 /*
 * The decompressor input side (jdinput.c) saves away the appropriate
@ -80,14 +82,6 @@ typedef union {
 #endif
 #endif

-GLOBAL(void)
-jpeg_idct_islow_sse2 (
-	j_decompress_ptr cinfo, 
-	jpeg_component_info * compptr,
-	JCOEFPTR coef_block,
-	JSAMPARRAY output_buf, 
-	JDIMENSION output_col);
-

 /*
 * Prepare for an output pass.
@ -108,18 +102,24 @@ start_pass (j_decompress_ptr cinfo)
  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
    /* Select the proper IDCT routine for this component's scaling */
-    switch (compptr->DCT_scaled_size) {
+    switch (compptr->_DCT_scaled_size) {
 #ifdef IDCT_SCALING_SUPPORTED
    case 1:
      method_ptr = jpeg_idct_1x1;
      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
      break;
    case 2:
-      method_ptr = jpeg_idct_2x2;
+      if (jsimd_can_idct_2x2())
+        method_ptr = jsimd_idct_2x2;
+      else
+        method_ptr = jpeg_idct_2x2;
      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
      break;
    case 4:
-      method_ptr = jpeg_idct_4x4;
+      if (jsimd_can_idct_4x4())
+        method_ptr = jsimd_idct_4x4;
+      else
+        method_ptr = jpeg_idct_4x4;
      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
      break;
 #endif
@ -127,47 +127,28 @@ start_pass (j_decompress_ptr cinfo)
      switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
      case JDCT_ISLOW:
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-		if(SSE2Available == 1)
-		{
-			method_ptr = jpeg_idct_islow_sse2;
-			method = JDCT_ISLOW;
-		}
-		else
-		{
-			method_ptr = jpeg_idct_islow;
-			method = JDCT_ISLOW;
-		}
-#else
-		method_ptr = jpeg_idct_islow;
-		method = JDCT_ISLOW;
-		  
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
+	if (jsimd_can_idct_islow())
+	  method_ptr = jsimd_idct_islow;
+	else
+	  method_ptr = jpeg_idct_islow;
+	method = JDCT_ISLOW;
 	break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
      case JDCT_IFAST:
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-		if (SSE2Available==1) 
-		{
-			method_ptr = jpeg_idct_islow_sse2;
-			method = JDCT_ISLOW;
-		}
-		else
-		{
-			method_ptr = jpeg_idct_ifast;
-			method = JDCT_IFAST;
-		}
-#else
-		method_ptr = jpeg_idct_ifast;
-		method = JDCT_IFAST;
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
+	if (jsimd_can_idct_ifast())
+	  method_ptr = jsimd_idct_ifast;
+	else
+	  method_ptr = jpeg_idct_ifast;
+	method = JDCT_IFAST;
 	break;
-
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
      case JDCT_FLOAT:
-	method_ptr = jpeg_idct_float;
+	if (jsimd_can_idct_float())
+	  method_ptr = jsimd_idct_float;
+	else
+	  method_ptr = jpeg_idct_float;
 	method = JDCT_FLOAT;
 	break;
 #endif
@ -177,7 +158,7 @@ start_pass (j_decompress_ptr cinfo)
      }
      break;
    default:
-      ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->DCT_scaled_size);
+      ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->_DCT_scaled_size);
      break;
    }
    idct->pub.inverse_DCT[ci] = method_ptr;
--- a/jpeg/jdinput.c
+++ b/jpeg/jdinput.c
@ -2,6 +2,8 @@
 * jdinput.c
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
+ * Copyright (C) 2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -14,6 +16,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"


 /* Private state */
@ -35,6 +38,79 @@ METHODDEF(int) consume_markers JPP((j_decompress_ptr cinfo));
 * Routines to calculate various quantities related to the size of the image.
 */

+
+#if JPEG_LIB_VERSION >= 80
+/*
+ * Compute output image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_core_output_dimensions (j_decompress_ptr cinfo)
+/* Do computations that are needed before master selection phase.
+ * This function is used for transcoding and full decompression.
+ */
+{
+#ifdef IDCT_SCALING_SUPPORTED
+  int ci;
+  jpeg_component_info *compptr;
+
+  /* Compute actual output image dimensions and DCT scaling choices. */
+  if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom) {
+    /* Provide 1/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 1;
+    cinfo->min_DCT_v_scaled_size = 1;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 2) {
+    /* Provide 2/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 2L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 2L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 2;
+    cinfo->min_DCT_v_scaled_size = 2;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 4) {
+    /* Provide 4/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 4L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 4L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 4;
+    cinfo->min_DCT_v_scaled_size = 4;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 8) {
+    /* Provide 8/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 8L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 8L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 8;
+    cinfo->min_DCT_v_scaled_size = 8;
+  }
+  /* Recompute dimensions of components */
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size;
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size;
+  }
+
+#else /* !IDCT_SCALING_SUPPORTED */
+
+  /* Hardwire it to "no scaling" */
+  cinfo->output_width = cinfo->image_width;
+  cinfo->output_height = cinfo->image_height;
+  /* jdinput.c has already initialized DCT_scaled_size,
+   * and has computed unscaled downsampled_width and downsampled_height.
+   */
+
+#endif /* IDCT_SCALING_SUPPORTED */
+}
+#endif
+
+
 LOCAL(void)
 initial_setup (j_decompress_ptr cinfo)
 /* Called once, when first SOS marker is reached */
@ -70,16 +146,30 @@ initial_setup (j_decompress_ptr cinfo)
 				   compptr->v_samp_factor);
  }

+#if JPEG_LIB_VERSION >=80
+    cinfo->block_size = DCTSIZE;
+    cinfo->natural_order = jpeg_natural_order;
+    cinfo->lim_Se = DCTSIZE2-1;
+#endif
+
  /* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE.
   * In the full decompressor, this will be overridden by jdmaster.c;
   * but in the transcoder, jdmaster.c is not used, so we must do it here.
   */
+#if JPEG_LIB_VERSION >= 70
+  cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = DCTSIZE;
+#else
  cinfo->min_DCT_scaled_size = DCTSIZE;
+#endif

  /* Compute dimensions of components */
  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
+#if JPEG_LIB_VERSION >= 70
+    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = DCTSIZE;
+#else
    compptr->DCT_scaled_size = DCTSIZE;
+#endif
    /* Size in DCT blocks */
    compptr->width_in_blocks = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
@ -138,7 +228,7 @@ per_scan_setup (j_decompress_ptr cinfo)
    compptr->MCU_width = 1;
    compptr->MCU_height = 1;
    compptr->MCU_blocks = 1;
-    compptr->MCU_sample_width = compptr->DCT_scaled_size;
+    compptr->MCU_sample_width = compptr->_DCT_scaled_size;
    compptr->last_col_width = 1;
    /* For noninterleaved scans, it is convenient to define last_row_height
     * as the number of block rows present in the last iMCU row.
@ -174,7 +264,7 @@ per_scan_setup (j_decompress_ptr cinfo)
      compptr->MCU_width = compptr->h_samp_factor;
      compptr->MCU_height = compptr->v_samp_factor;
      compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_scaled_size;
+      compptr->MCU_sample_width = compptr->MCU_width * compptr->_DCT_scaled_size;
      /* Figure number of non-dummy blocks in last MCU column & row */
      tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
      if (tmp == 0) tmp = compptr->MCU_width;
--- a/jpeg/jdmainct.c
+++ b/jpeg/jdmainct.c
@ -2,6 +2,7 @@
 * jdmainct.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright (C) 2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -16,6 +17,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"


 /*
@ -161,7 +163,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
 {
  my_main_ptr main = (my_main_ptr) cinfo->main;
  int ci, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->_min_DCT_scaled_size;
  jpeg_component_info *compptr;
  JSAMPARRAY xbuf;

@ -175,8 +177,8 @@ alloc_funny_pointers (j_decompress_ptr cinfo)

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+      cinfo->_min_DCT_scaled_size; /* height of a row group of component */
    /* Get space for pointer lists --- M+4 row groups in each list.
     * We alloc both pointer lists with one call to save a few cycles.
     */
@ -202,14 +204,14 @@ make_funny_pointers (j_decompress_ptr cinfo)
 {
  my_main_ptr main = (my_main_ptr) cinfo->main;
  int ci, i, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->_min_DCT_scaled_size;
  jpeg_component_info *compptr;
  JSAMPARRAY buf, xbuf0, xbuf1;

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+      cinfo->_min_DCT_scaled_size; /* height of a row group of component */
    xbuf0 = main->xbuffer[0][ci];
    xbuf1 = main->xbuffer[1][ci];
    /* First copy the workspace pointers as-is */
@ -242,14 +244,14 @@ set_wraparound_pointers (j_decompress_ptr cinfo)
 {
  my_main_ptr main = (my_main_ptr) cinfo->main;
  int ci, i, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->_min_DCT_scaled_size;
  jpeg_component_info *compptr;
  JSAMPARRAY xbuf0, xbuf1;

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+      cinfo->_min_DCT_scaled_size; /* height of a row group of component */
    xbuf0 = main->xbuffer[0][ci];
    xbuf1 = main->xbuffer[1][ci];
    for (i = 0; i < rgroup; i++) {
@ -277,8 +279,8 @@ set_bottom_pointers (j_decompress_ptr cinfo)
  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
    /* Count sample rows in one iMCU row and in one row group */
-    iMCUheight = compptr->v_samp_factor * compptr->DCT_scaled_size;
-    rgroup = iMCUheight / cinfo->min_DCT_scaled_size;
+    iMCUheight = compptr->v_samp_factor * compptr->_DCT_scaled_size;
+    rgroup = iMCUheight / cinfo->_min_DCT_scaled_size;
    /* Count nondummy sample rows remaining for this component */
    rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight);
    if (rows_left == 0) rows_left = iMCUheight;
@ -357,7 +359,7 @@ process_data_simple_main (j_decompress_ptr cinfo,
  }

  /* There are always min_DCT_scaled_size row groups in an iMCU row. */
-  rowgroups_avail = (JDIMENSION) cinfo->min_DCT_scaled_size;
+  rowgroups_avail = (JDIMENSION) cinfo->_min_DCT_scaled_size;
  /* Note: at the bottom of the image, we may pass extra garbage row groups
   * to the postprocessor.  The postprocessor has to check for bottom
   * of image anyway (at row resolution), so no point in us doing it too.
@ -417,7 +419,7 @@ process_data_context_main (j_decompress_ptr cinfo,
  case CTX_PREPARE_FOR_IMCU:
    /* Prepare to process first M-1 row groups of this iMCU row */
    main->rowgroup_ctr = 0;
-    main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size - 1);
+    main->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size - 1);
    /* Check for bottom of image: if so, tweak pointers to "duplicate"
     * the last sample row, and adjust rowgroups_avail to ignore padding rows.
     */
@ -440,8 +442,8 @@ process_data_context_main (j_decompress_ptr cinfo,
    main->buffer_full = FALSE;
    /* Still need to process last row group of this iMCU row, */
    /* which is saved at index M+1 of the other xbuffer */
-    main->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_scaled_size + 1);
-    main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size + 2);
+    main->rowgroup_ctr = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 1);
+    main->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 2);
    main->context_state = CTX_POSTPONED_ROW;
  }
 }
@ -492,21 +494,21 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
   * ngroups is the number of row groups we need.
   */
  if (cinfo->upsample->need_context_rows) {
-    if (cinfo->min_DCT_scaled_size < 2) /* unsupported, see comments above */
+    if (cinfo->_min_DCT_scaled_size < 2) /* unsupported, see comments above */
      ERREXIT(cinfo, JERR_NOTIMPL);
    alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */
-    ngroups = cinfo->min_DCT_scaled_size + 2;
+    ngroups = cinfo->_min_DCT_scaled_size + 2;
  } else {
-    ngroups = cinfo->min_DCT_scaled_size;
+    ngroups = cinfo->_min_DCT_scaled_size;
  }

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+      cinfo->_min_DCT_scaled_size; /* height of a row group of component */
    main->buffer[ci] = (*cinfo->mem->alloc_sarray)
 			((j_common_ptr) cinfo, JPOOL_IMAGE,
-			 compptr->width_in_blocks * compptr->DCT_scaled_size,
+			 compptr->width_in_blocks * compptr->_DCT_scaled_size,
 			 (JDIMENSION) (rgroup * ngroups));
  }
 }
--- a/jpeg/jdmarker.c
+++ b/jpeg/jdmarker.c
@ -79,7 +79,9 @@ typedef enum {			/* JPEG marker codes */
  M_JPG13 = 0xfd,
  M_COM   = 0xfe,
  
-  M_TEM   = 0x01
+  M_TEM   = 0x01,
+  
+  M_ERROR = 0x100
 } JPEG_MARKER;


--- a/jpeg/jdmaster.c
+++ b/jpeg/jdmaster.c
@ -2,6 +2,7 @@
 * jdmaster.c
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 2009-2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -14,105 +15,8 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"

-/* Use static array */
-
-const JSAMPLE static_range_table[ (5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * SIZEOF(JSAMPLE) ]={
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 
- 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 
- 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 
- 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 
- 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 
- 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 
- 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 
- 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 
- 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 
- 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 
- 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 
- 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 
- 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 
- 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 
- 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 
- 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 
- 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 
- 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 
- 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 
- 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 
- 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 
- 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 
- 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 
- 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 
- 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 
- 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f
-};

 /* Private state */

@ -147,8 +51,14 @@ use_merged_upsample (j_decompress_ptr cinfo)
    return FALSE;
  /* jdmerge.c only supports YCC=>RGB color conversion */
  if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 ||
-      cinfo->out_color_space != JCS_RGB ||
-      cinfo->out_color_components != RGB_PIXELSIZE)
+      (cinfo->out_color_space != JCS_RGB &&
+      cinfo->out_color_space != JCS_EXT_RGB &&
+      cinfo->out_color_space != JCS_EXT_RGBX &&
+      cinfo->out_color_space != JCS_EXT_BGR &&
+      cinfo->out_color_space != JCS_EXT_BGRX &&
+      cinfo->out_color_space != JCS_EXT_XBGR &&
+      cinfo->out_color_space != JCS_EXT_XRGB) ||
+      cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space])
    return FALSE;
  /* and it only handles 2h1v or 2h2v sampling ratios */
  if (cinfo->comp_info[0].h_samp_factor != 2 ||
@ -159,9 +69,9 @@ use_merged_upsample (j_decompress_ptr cinfo)
      cinfo->comp_info[2].v_samp_factor != 1)
    return FALSE;
  /* furthermore, it doesn't work if we've scaled the IDCTs differently */
-  if (cinfo->comp_info[0].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
-      cinfo->comp_info[1].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
-      cinfo->comp_info[2].DCT_scaled_size != cinfo->min_DCT_scaled_size)
+  if (cinfo->comp_info[0]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
+      cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
+      cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
    return FALSE;
  /* ??? also need to test for upsample-time rescaling, when & if supported */
  return TRUE;			/* by golly, it'll work... */
@ -200,26 +110,42 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
      jdiv_round_up((long) cinfo->image_width, 8L);
    cinfo->output_height = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_height, 8L);
+#if JPEG_LIB_VERSION >= 70
+    cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = 1;
+#else
    cinfo->min_DCT_scaled_size = 1;
+#endif
  } else if (cinfo->scale_num * 4 <= cinfo->scale_denom) {
    /* Provide 1/4 scaling */
    cinfo->output_width = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_width, 4L);
    cinfo->output_height = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_height, 4L);
+#if JPEG_LIB_VERSION >= 70
+    cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = 2;
+#else
    cinfo->min_DCT_scaled_size = 2;
+#endif
  } else if (cinfo->scale_num * 2 <= cinfo->scale_denom) {
    /* Provide 1/2 scaling */
    cinfo->output_width = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_width, 2L);
    cinfo->output_height = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_height, 2L);
+#if JPEG_LIB_VERSION >= 70
+    cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = 4;
+#else
    cinfo->min_DCT_scaled_size = 4;
+#endif
  } else {
    /* Provide 1/1 scaling */
    cinfo->output_width = cinfo->image_width;
    cinfo->output_height = cinfo->image_height;
+#if JPEG_LIB_VERSION >= 70
+    cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = DCTSIZE;
+#else
    cinfo->min_DCT_scaled_size = DCTSIZE;
+#endif
  }
  /* In selecting the actual DCT scaling for each component, we try to
   * scale up the chroma components via IDCT scaling rather than upsampling.
@ -228,15 +154,19 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
   */
  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    int ssize = cinfo->min_DCT_scaled_size;
+    int ssize = cinfo->_min_DCT_scaled_size;
    while (ssize < DCTSIZE &&
 	   (compptr->h_samp_factor * ssize * 2 <=
-	    cinfo->max_h_samp_factor * cinfo->min_DCT_scaled_size) &&
+	    cinfo->max_h_samp_factor * cinfo->_min_DCT_scaled_size) &&
 	   (compptr->v_samp_factor * ssize * 2 <=
-	    cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size)) {
+	    cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size)) {
      ssize = ssize * 2;
    }
+#if JPEG_LIB_VERSION >= 70
+    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = ssize;
+#else
    compptr->DCT_scaled_size = ssize;
+#endif
  }

  /* Recompute downsampled dimensions of components;
@ -247,11 +177,11 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
    /* Size in samples, after IDCT scaling */
    compptr->downsampled_width = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_width *
-		    (long) (compptr->h_samp_factor * compptr->DCT_scaled_size),
+		    (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size),
 		    (long) (cinfo->max_h_samp_factor * DCTSIZE));
    compptr->downsampled_height = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_height *
-		    (long) (compptr->v_samp_factor * compptr->DCT_scaled_size),
+		    (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size),
 		    (long) (cinfo->max_v_samp_factor * DCTSIZE));
  }

@ -273,10 +203,14 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
    cinfo->out_color_components = 1;
    break;
  case JCS_RGB:
-#if RGB_PIXELSIZE != 3
-    cinfo->out_color_components = RGB_PIXELSIZE;
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+    cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
    break;
-#endif /* else share code with YCbCr */
  case JCS_YCbCr:
    cinfo->out_color_components = 3;
    break;
@ -346,14 +280,6 @@ LOCAL(void)
 prepare_range_limit_table (j_decompress_ptr cinfo)
 /* Allocate and fill in the sample_range_limit table */
 {
-  /* Use a static table and allow negative subscripts of simple table */
-
-  cinfo->sample_range_limit = (JSAMPLE *) static_range_table + (MAXJSAMPLE+1);
-
-  /* This code is used to create the values for the static table used above */
-
-#if 0
-
  JSAMPLE * table;
  int i;

@ -376,8 +302,6 @@ prepare_range_limit_table (j_decompress_ptr cinfo)
 	  (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * SIZEOF(JSAMPLE));
  MEMCOPY(table + (4 * (MAXJSAMPLE+1) - CENTERJSAMPLE),
 	  cinfo->sample_range_limit, CENTERJSAMPLE * SIZEOF(JSAMPLE));
-
-#endif /* 0 */
 }


@ -481,7 +405,11 @@ master_selection (j_decompress_ptr cinfo)
  jinit_inverse_dct(cinfo);
  /* Entropy decoding: either Huffman or arithmetic coding. */
  if (cinfo->arith_code) {
+#ifdef D_ARITH_CODING_SUPPORTED
+    jinit_arith_decoder(cinfo);
+#else
    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
  } else {
    if (cinfo->progressive_mode) {
 #ifdef D_PROGRESSIVE_SUPPORTED
--- a/jpeg/jdmerge.c
+++ b/jpeg/jdmerge.c
@ -2,6 +2,8 @@
 * jdmerge.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -35,22 +37,10 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"

 #ifdef UPSAMPLE_MERGING_SUPPORTED

-#ifdef HAVE_MMX_INTEL_MNEMONICS
-  __int64 const1 = 0x59BA0000D24B59BA;       // Cr_r Cr_b Cr_g Cr_r
-  __int64 const2 = 0x00007168E9FA0000;		 // Cb-r Cb_b Cb_g Cb_r
-  __int64 const5 = 0x0000D24B59BA0000;		 // Cr_b Cr_g Cr_r Cr_b
-  __int64 const6 = 0x7168E9FA00007168;		 // Cb_b Cb_g Cb_r Cb_b
-
-  // constants for factors (One_Half/fix(x)) << 2
-
-  __int64 const05 = 0x0001000000000001;	// Cr_r Cr_b Cr_g Cr_r
-  __int64 const15 = 0x00000001FFFA0000;	// Cb-r Cb_b Cb_g Cb_r
-  __int64 const45 = 0x0000000000010000;	// Cr_b Cr_g Cr_r Cr_b
-  __int64 const55 = 0x0001FFFA00000001;	// Cb_b Cb_g Cb_r Cb_b
-#endif

 /* Private subobject */

@ -240,9 +230,7 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
 		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
 		      JSAMPARRAY output_buf)
 {
- 
-
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
  register int y, cred, cgreen, cblue;
  int cb, cr;
  register JSAMPROW outptr;
@ -270,15 +258,15 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
    cblue = Cbbtab[cb];
    /* Fetch 2 Y values and emit 2 pixels */
    y  = GETJSAMPLE(*inptr0++);
-    outptr[RGB_RED] =   range_limit[y + cred];
-    outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
-    outptr += RGB_PIXELSIZE;
+    outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
+    outptr += rgb_pixelsize[cinfo->out_color_space];
    y  = GETJSAMPLE(*inptr0++);
-    outptr[RGB_RED] =   range_limit[y + cred];
-    outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
-    outptr += RGB_PIXELSIZE;
+    outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
+    outptr += rgb_pixelsize[cinfo->out_color_space];
  }
  /* If image width is odd, do the last output column separately */
  if (cinfo->output_width & 1) {
@ -288,9 +276,9 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
    cblue = Cbbtab[cb];
    y  = GETJSAMPLE(*inptr0);
-    outptr[RGB_RED] =   range_limit[y + cred];
-    outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE] =  range_limit[y + cblue];
+    outptr[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
  }
 }

@ -299,614 +287,6 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
 * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
 */

-#ifdef HAVE_MMX_INTEL_MNEMONICS
-__inline METHODDEF(void)
-h2v2_merged_upsample_orig (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf);
-__inline METHODDEF(void)
-h2v2_merged_upsample_mmx (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf);
-#endif
- 
-METHODDEF(void)
-h2v2_merged_upsample (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf);
-
-#ifdef HAVE_MMX_INTEL_MNEMONICS
-METHODDEF(void)
-h2v2_merged_upsample (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf)
-{
-if (MMXAvailable && (cinfo->image_width >= 8))
-	h2v2_merged_upsample_mmx (cinfo, input_buf, in_row_group_ctr, output_buf);
-else
-	h2v2_merged_upsample_orig (cinfo, input_buf, in_row_group_ctr, output_buf);
-
-}
-
-__inline METHODDEF(void)
-h2v2_merged_upsample_orig (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf)
-{
-
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-  register int y, cred, cgreen, cblue;
-  int cb, cr;
-  register JSAMPROW outptr0, outptr1;
-  JSAMPROW inptr00, inptr01, inptr1, inptr2;
-  JDIMENSION col;
-  /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  SHIFT_TEMPS
-
-  inptr00 = input_buf[0][in_row_group_ctr*2];
-  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
-  inptr1 = input_buf[1][in_row_group_ctr];
-  inptr2 = input_buf[2][in_row_group_ctr];
-  outptr0 = output_buf[0];
-  outptr1 = output_buf[1];
-  /* Loop for each group of output pixels */
-  for (col = cinfo->output_width >> 1; col > 0; col--) {
-    /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-    /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-  }
-  /* If image width is odd, do the last output column separately */
-  if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr00);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    y  = GETJSAMPLE(*inptr01);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-  }
-}
-
-/*
- * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
- */
-__inline METHODDEF(void)
-h2v2_merged_upsample_mmx (j_decompress_ptr cinfo,
-		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-		      JSAMPARRAY output_buf)
-{
-	// added for MMX
-  __int64 const128 = 0x0080008000800080;
-  __int64 empty = 0x0000000000000000;
-  __int64 davemask = 0x0000FFFFFFFF0000;
-  ////////////////////////////////
-
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-  register int y, cred, cgreen, cblue;
-  int cb, cr;
-  register JSAMPROW outptr0, outptr1;
-  JSAMPROW inptr00, inptr01, inptr1, inptr2;
-  JDIMENSION col;
-  /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  INT32 * Crgtab = upsample->Cr_g_tab;
-  INT32 * Cbgtab = upsample->Cb_g_tab;
-  SHIFT_TEMPS
-  
-
-  // Added for MMX	  
-  register int width = cinfo->image_width;
-  int cols = cinfo->output_width;
-  int cols_asm = (cols >> 3);
-  int diff = cols - (cols_asm<<3);
-  int cols_asm_copy = cols_asm;
-
- ///////////////////////////////////////
-
-  inptr00 = input_buf[0][in_row_group_ctr*2];
-  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
-  inptr1 = input_buf[1][in_row_group_ctr];
-  inptr2 = input_buf[2][in_row_group_ctr];
-  outptr0 = output_buf[0];
-  outptr1 = output_buf[1];
-  /* Loop for each group of output pixels */
-
-	   
-  _asm
-  {
-	  mov esi, inptr00
-
-	  mov eax, inptr01
-	  
-	  mov ebx, inptr2
-
-	  mov ecx, inptr1
-
-	  mov edi, outptr0
-
-	  mov edx, outptr1
-
-do_next16:
-	  
-	  movd mm0, [ebx]			; Cr7 Cr6.....Cr1 Cr0
-
-	  pxor mm6, mm6
-
-	  punpcklbw mm0, mm0		; Cr3 Cr3 Cr2 Cr2 Cr1 Cr1 Cr0 Cr0
-
-	  movq mm7, const128
-
-	  punpcklwd mm0, mm0		; Cr1 Cr1 Cr1 Cr1 Cr0 Cr0 Cr0 Cr0
-
-	  movq mm4, mm0
-
-	  punpcklbw mm0, mm6		; Cr0 Cr0 Cr0 Cr0
-
-	  psubsw mm0, mm7			; Cr0 - 128:Cr0-128:Cr0-128:Cr0 -128
-	  
-	  movd mm1, [ecx]			; Cb7 Cb6...... Cb1 Cb0
-	  	   
-	  psllw mm0, 2				; left shift by 2 bits
-
-	  punpcklbw mm1, mm1		; Cb3 Cb3 Cb2 Cb2 Cb1 Cb1 Cb0 Cb0
-	  
-	  paddsw mm0, const05		; add (one_half/fix(x)) << 2
-
-	  punpcklwd mm1, mm1		; Cb1 Cb1 Cb1 Cb1 Cb0 Cb0 Cb0 Cb0
-
-	  movq mm5, mm1
-
-	  pmulhw mm0, const1		; multiply by (fix(x) >> 1) 
-
-	  punpcklbw mm1, mm6		; Cb0 Cb0 Cb0 Cb0
-
-	  punpckhbw mm4, mm6		; Cr1 Cr1 Cr1 Cr1
-
-	  psubsw mm1, mm7			; Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128
-
-	  punpckhbw mm5, mm6		; Cb1 Cb1 Cb1 Cb1
-
-	  psllw mm1, 2				; left shift by 2 bits
- 
-	  paddsw mm1, const15		; add (one_half/fix(x)) << 2
-
-	  psubsw mm4, mm7			; Cr1 - 128:Cr1-128:Cr1-128:Cr1 -128
-						
-	  psubsw mm5, mm7			; Cb1 - 128:Cb1-128:Cb1-128:Cb1 -128
-
-	  pmulhw mm1, const2		; multiply by (fix(x) >> 1) 
-
-	  psllw mm4, 2				; left shift by 2 bits
-
-	  psllw mm5, 2				; left shift by 2 bits
-
-	  paddsw mm4, const45		; add (one_half/fix(x)) << 2
-
-	  movd mm7, [esi]			;  Y13 Y12 Y9 Y8 Y5 Y4 Y1 Y0
-
-	  pmulhw mm4, const5		; multiply by (fix(x) >> 1) 
-
-	  movq mm6, mm7
-
-	  punpcklbw mm7, mm7		; Y5 Y5 Y4 Y4 Y1 Y1 Y0 Y0
-
-	  paddsw mm5, const55		; add (one_half/fix(x)) << 2
-
-	  paddsw  mm0, mm1			; cred0 cbl0 cgr0 cred0
-
-	  movq mm1, mm7
-
-	  pmulhw mm5, const6		; multiply by (fix(x) >> 1) 
-
-	  movq	mm2, mm0			; cred0 cbl0 cgr0 cred0
-
-	  punpcklwd mm7, mm6		; Y5 Y4 Y1 Y1 Y1 Y0 Y0 Y0
-
-	  pand mm2, davemask		; 0 cbl0 cgr0 0
-
-	  psrlq mm1, 16				; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
-
-	  psrlq	mm2, 16				; 0 0 cbl0 cgr0
-
-	  punpcklbw mm7, empty		; Y1 Y0 Y0 Y0
-
-	  paddsw mm4, mm5			; cbl1 cgr1 cred1 cbl1
-
-	  movq	mm3, mm4			; cbl1 cgr1 cred1 cbl1
-
-	  pand	mm3, davemask		; 0 cgr1 cred1 0
-
-	  paddsw mm7, mm0			; r1 b0 g0 r0
-
-	  psllq	mm3, 16				; cgr1 cred1 0 0
-
-	  movq mm6, mm1				; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
-	
-	  por	mm2, mm3			; cgr1 cred1 cbl0 cgr0
-
-	  punpcklbw mm6, empty		; Y4 Y4 Y1 Y1
-
-	  movd mm3, [eax]			; Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2
-	  
-	  paddsw mm6, mm2			; g4 r4 b1 g1
-
-	  packuswb mm7, mm6			; g4 r4 b1 g1 r1 b0 g0 r0
-
-	  movq mm6, mm3				; Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2
-
-	  punpcklbw mm3, mm3		; Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2
-
-	  movq [edi], mm7			; move to memory g4 r4 b1 g1 r1 b0 g0 r0
-
-	  movq mm5, mm3				; Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2
-
-	  punpcklwd mm3, mm6		; X X X X Y3 Y2 Y2 Y2
-
-	  punpcklbw mm3, empty		; Y3 Y2 Y2 Y2
-
-	  psrlq mm5, 16				; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
-
-	  paddsw mm3, mm0			; r3 b2 g2 r2
-
-	  movq mm6, mm5				; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
-
-	  movq mm0, mm1				; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
-
-	  punpckldq mm6, mm6		; X X X X Y6 Y6 Y3 Y3
-
-	  punpcklbw mm6, empty		; Y6 Y6 Y3 Y3
-
-	  psrlq mm1, 24				; 0 0 0 0 0 Y5 Y5 Y4
-	  
-	  paddsw mm6, mm2			; g6 r6 b3 g3
-
-	  packuswb mm3, mm6			; g6 r6 b3 g3 r3 b2 g2 r2
-
-	  movq mm2, mm5				; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
-
-	  psrlq mm0, 32				; 0 0 0 0 0 0 Y5 Y5
-
-	  movq [edx], mm3			; move to memory g6 r6 b3 g3 r3 b2 g2 r2
-	  
-	  punpcklwd mm1, mm0		; X X X X Y5 Y5 Y5 Y4
-
-	  psrlq mm5, 24				; 0 0 0 0 0 Y7 Y7 Y6 
-
-	  movd mm0, [ebx]			; Cr9 Cr8.....Cr3 Cr2
-
-	  psrlq mm2, 32	   			; 0 0 0 0 0 0 Y7 Y7	 
-	  
-	  psrlq	mm0, 16		
-
-	  punpcklbw mm1, empty		; Y5 Y5 Y5 Y4
-
-	  punpcklwd mm5, mm2		; X X X X Y7 Y7 Y7 Y6
-
-	  paddsw mm1, mm4			; b5 g5 r5 b4
-	 
-	  punpcklbw mm5, empty		; Y7 Y7 Y7 Y6	    
-
-	  pxor mm6, mm6				; clear mm6 registr
-	  
-	  punpcklbw mm0, mm0		; X X X X Cr3 Cr3 Cr2 Cr2
-  
-	  paddsw mm5, mm4			; b7 g7 r7 b6
-	  
-	  punpcklwd mm0, mm0		; Cr3 Cr3 Cr3 Cr3 Cr2 Cr2 Cr2 Cr2
-
-	  movq mm4, mm0
-
-	  movd mm3, [ecx]			; Cb9 Cb8...... Cb3 Cb2
-	  
-	  punpcklbw mm0, mm6		; Cr2 Cr2 Cr2 Cr2
-
-	  psrlq	mm3, 16
-
-	  psubsw mm0, const128		; Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128
-
-	  punpcklbw mm3, mm3		; X X X X Cb3 Cb3 Cb2 Cb2
-
-	  psllw mm0, 2				; left shift by 2 bits
-
-	  paddsw mm0, const05		; add (one_half/fix(x)) << 2
-
-	  punpcklwd mm3, mm3		; Cb3 Cb3 Cb3 Cb3 Cb2 Cb2 Cb2 Cb2
-
-	  movq mm7, mm3
-	  
-	  pmulhw mm0, const1		; multiply by (fix(x) >> 1) 	  	  
-
-	  punpcklbw mm3, mm6		; Cb2 Cb2 Cb2 Cb2
-
-	  psubsw mm3, const128		; Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128
-
-	  punpckhbw mm4, mm6		; Cr3 Cr3 Cr3 Cr3
-	  
-	  psllw mm3, 2				; left shift by 2 bits
-
-	  paddsw mm3, const15		; add (one_half/fix(x)) << 2
-
-	  punpckhbw mm7, mm6		; Cb3 Cb3 Cb3 Cb3
-
-	  pmulhw mm3, const2		; multiply by (fix(x) >> 1) 
-	  
-	  psubsw mm7, const128		; Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128
-
-	  paddsw  mm0, mm3			; cred2 cbl2 cgr2 cred2
-	    
-	  psllw mm7, 2				; left shift by 2 bits
-
-	  psubsw mm4, const128		; Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128
-	  
-	  movd mm3, [esi+4]			;  Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8
-	  
-	  psllw mm4, 2				; left shift by 2 bits
-
-	  paddsw mm7, const55		; add (one_half/fix(x)) << 2
-	  	  
-	  movq mm6, mm3				;  Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8
-
-	  movq	mm2, mm0
-	  	  
-	  pand mm2, davemask
-
-	  punpcklbw mm3, mm3		; Y13 Y13 Y12 Y12 Y9 Y9 Y8 Y8
-
-	  psrlq	mm2, 16
-	    	  
-	  paddsw mm4, const45		; add (one_half/fix(x)) << 2
-
-	  punpcklwd mm3, mm6		; X X X X Y9 Y8 Y8 Y8
-	  
-	  pmulhw mm4, const5		; multiply by (fix(x) >> 1) 
-
-	  pmulhw mm7, const6		; multiply by (fix(x) >> 1) 
-
-	  punpcklbw mm3, empty		; Y9 Y8 Y8 Y8
-	  
-	  paddsw mm4, mm7			; cbl3 cgr3 cred3 cbl3
-
-	  paddsw mm3, mm0			; r9 b8 g8 r8
-
-	  movq	mm7, mm4
-
-	  packuswb mm1, mm3			; r9 b8 g8 r8 b5 g5 r5 b4
-
-	  movd mm3, [eax+4]			; Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10
- 	  
-	  pand	mm7, davemask
-
-	  psrlq mm6, 8				; 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9
-
-	  psllq	mm7, 16
-						   
-	  movq [edi+8], mm1			; move to memory r9 b8 g8 r8 b5 g5 r5 b4
-
-	  por	mm2, mm7
-
-	  movq mm7, mm3				; Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10
-
-	  punpcklbw mm3, mm3		; X X X X Y11 Y11 Y10 Y10
-
-	  pxor mm1, mm1
-
-	  punpcklwd mm3, mm7		; X X X X Y11 Y10 Y10 Y10
-
-	  punpcklbw mm3, mm1		; Y11 Y10 Y10 Y10
-
-	  psrlq mm7, 8				; 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11
-	  
-	  paddsw mm3, mm0			; r11 b10 g10 r10
-
-	  movq mm0, mm7				; 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11
-
-	  packuswb mm5, mm3			; r11 b10 g10 r10 b7 g7 r7 b6
-
-	  punpcklbw mm7, mm7		; X X X X Y14 Y14 Y11 Y11
-
-	  movq [edx+8], mm5			; move to memory r11 b10 g10 r10 b7 g7 r7 b6
-
-	  movq mm3, mm6				; 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9
-
-	  punpcklbw mm6, mm6		; X X X X Y12 Y12 Y9 Y9
-
-	  punpcklbw mm7, mm1		; Y14 Y14 Y11 Y11
-
-	  punpcklbw mm6, mm1		; Y12 Y12 Y9 Y9
-
-	  paddsw mm7, mm2			; g14 r14 b11 g11
-
-	  paddsw mm6, mm2			; g12 r12 b9 g9
-
-	  psrlq mm3, 8				; 0 0 Y21 Y20 Y17 Y16 Y13 Y12
-
-	  movq mm1, mm3				; 0 0 Y21 Y20 Y17 Y16 Y13 Y12
-
-	  punpcklbw mm3, mm3		; X X X X Y13 Y13 Y12 Y12
-
-	  add esi, 8
-
-	  psrlq mm3, 16				; X X X X X X Y13 Y13 modified on 09/24
-
-	  punpcklwd mm1, mm3		; X X X X Y13 Y13 Y13 Y12
-
-	  add eax, 8
-
-	  psrlq mm0, 8				; 0 0 Y23 Y22 Y19 Y18 Y15 Y14	
-
-	  punpcklbw mm1, empty		; Y13 Y13 Y13 Y12
-
-	  movq mm5, mm0				; 0 0 Y23 Y22 Y19 Y18 Y15 Y14	
-
-	  punpcklbw mm0, mm0		; X X X X Y15 Y15 Y14 Y14
-
-	  paddsw mm1, mm4			; b13 g13 r13 b12
-
-	  psrlq mm0, 16				; X X X X X X Y15 Y15
-
-	  add edi, 24
-	  
-	  punpcklwd mm5, mm0		; X X X X Y15 Y15 Y15 Y14
-
-	  packuswb mm6, mm1			; b13 g13 r13 b12 g12 r12 b9 g9
-
-	  add edx, 24
-	  
-	  punpcklbw mm5, empty		; Y15 Y15 Y15 Y14
-
-	  add ebx, 4
-	  	  
-	  paddsw mm5, mm4			; b15 g15 r15 b14
-
-	  movq [edi-8], mm6		; move to memory b13 g13 r13 b12 g12 r12 b9 g9
-
-	  packuswb mm7, mm5			; b15 g15 r15 b14 g14 r14 b11 g11
-
-	  add ecx, 4
-  
-	  movq [edx-8], mm7		; move to memory b15 g15 r15 b14 g14 r14 b11 g11
-
-	  dec cols_asm
-	  
-	  jnz do_next16
-
-	  EMMS
-	  	  
-	  }
-
-	  
-  inptr1 += (cols_asm_copy<<2);
-
-  inptr2 += (cols_asm_copy<<2);
-
-  inptr00 += (cols_asm_copy<<3);
-
-  inptr01 += (cols_asm_copy<<3);
-
-  outptr0 += cols_asm_copy*24;
-
-  outptr1 += cols_asm_copy*24;
-  		  
-  //for (col = cinfo->output_width >> 1; col > 0; col--) {
-      /* Do the chroma part of the calculation */
-    /*cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];*/
-    /* Fetch 4 Y values and emit 4 pixels */
-    /*y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-  }	  */
-
-
-  for (col = diff >> 1; col > 0; col--) {
-      /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-    /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-    outptr1 += RGB_PIXELSIZE;
-  }	  
-
-					  
-  /* If image width is odd, do the last output column separately */
-  //if (cinfo->output_width & 1) {
-  if (diff & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
-    cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr00);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
-    y  = GETJSAMPLE(*inptr01);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
-  }    
-}
-#else
-
-
 METHODDEF(void)
 h2v2_merged_upsample (j_decompress_ptr cinfo,
 		      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
@ -942,24 +322,24 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
    cblue = Cbbtab[cb];
    /* Fetch 4 Y values and emit 4 pixels */
    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+    outptr0[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr0[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
    outptr0 += RGB_PIXELSIZE;
    y  = GETJSAMPLE(*inptr00++);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+    outptr0[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr0[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
    outptr0 += RGB_PIXELSIZE;
    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+    outptr1[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr1[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
    outptr1 += RGB_PIXELSIZE;
    y  = GETJSAMPLE(*inptr01++);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+    outptr1[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr1[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
    outptr1 += RGB_PIXELSIZE;
  }
  /* If image width is odd, do the last output column separately */
@ -970,16 +350,15 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
    cblue = Cbbtab[cb];
    y  = GETJSAMPLE(*inptr00);
-    outptr0[RGB_RED] =   range_limit[y + cred];
-    outptr0[RGB_GREEN] = range_limit[y + cgreen];
-    outptr0[RGB_BLUE] =  range_limit[y + cblue];
+    outptr0[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr0[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr0[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
    y  = GETJSAMPLE(*inptr01);
-    outptr1[RGB_RED] =   range_limit[y + cred];
-    outptr1[RGB_GREEN] = range_limit[y + cgreen];
-    outptr1[RGB_BLUE] =  range_limit[y + cblue];
+    outptr1[rgb_red[cinfo->out_color_space]] =   range_limit[y + cred];
+    outptr1[rgb_green[cinfo->out_color_space]] = range_limit[y + cgreen];
+    outptr1[rgb_blue[cinfo->out_color_space]] =  range_limit[y + cblue];
  }
 }
-#endif


 /*
@ -1006,14 +385,20 @@ jinit_merged_upsampler (j_decompress_ptr cinfo)

  if (cinfo->max_v_samp_factor == 2) {
    upsample->pub.upsample = merged_2v_upsample;
-    upsample->upmethod = h2v2_merged_upsample;
+    if (jsimd_can_h2v2_merged_upsample())
+      upsample->upmethod = jsimd_h2v2_merged_upsample;
+    else
+      upsample->upmethod = h2v2_merged_upsample;
    /* Allocate a spare row buffer */
    upsample->spare_row = (JSAMPROW)
      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 		(size_t) (upsample->out_row_width * SIZEOF(JSAMPLE)));
  } else {
    upsample->pub.upsample = merged_1v_upsample;
-    upsample->upmethod = h2v1_merged_upsample;
+    if (jsimd_can_h2v1_merged_upsample())
+      upsample->upmethod = jsimd_h2v1_merged_upsample;
+    else
+      upsample->upmethod = h2v1_merged_upsample;
    /* No spare row needed */
    upsample->spare_row = NULL;
  }
--- a/jpeg/jdsample.c
+++ b/jpeg/jdsample.c
@ -2,6 +2,8 @@
 * jdsample.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -21,6 +23,8 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsimd.h"
+#include "jpegcomp.h"


 /* Pointer to routine to upsample a single component */
@ -418,7 +422,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
  /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1,
   * so don't ask for it.
   */
-  do_fancy = cinfo->do_fancy_upsampling && cinfo->min_DCT_scaled_size > 1;
+  do_fancy = cinfo->do_fancy_upsampling && cinfo->_min_DCT_scaled_size > 1;

  /* Verify we can handle the sampling factors, select per-component methods,
   * and create storage as needed.
@ -428,10 +432,10 @@ jinit_upsampler (j_decompress_ptr cinfo)
    /* Compute size of an "input group" after IDCT scaling.  This many samples
     * are to be converted to max_h_samp_factor * max_v_samp_factor pixels.
     */
-    h_in_group = (compptr->h_samp_factor * compptr->DCT_scaled_size) /
-		 cinfo->min_DCT_scaled_size;
-    v_in_group = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-		 cinfo->min_DCT_scaled_size;
+    h_in_group = (compptr->h_samp_factor * compptr->_DCT_scaled_size) /
+		 cinfo->_min_DCT_scaled_size;
+    v_in_group = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
+		 cinfo->_min_DCT_scaled_size;
    h_out_group = cinfo->max_h_samp_factor;
    v_out_group = cinfo->max_v_samp_factor;
    upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
@ -447,18 +451,32 @@ jinit_upsampler (j_decompress_ptr cinfo)
    } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group == v_out_group) {
      /* Special cases for 2h1v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2)
-	upsample->methods[ci] = h2v1_fancy_upsample;
-      else
-	upsample->methods[ci] = h2v1_upsample;
+      if (do_fancy && compptr->downsampled_width > 2) {
+	if (jsimd_can_h2v1_fancy_upsample())
+	  upsample->methods[ci] = jsimd_h2v1_fancy_upsample;
+	else
+	  upsample->methods[ci] = h2v1_fancy_upsample;
+      } else {
+	if (jsimd_can_h2v1_upsample())
+	  upsample->methods[ci] = jsimd_h2v1_upsample;
+	else
+	  upsample->methods[ci] = h2v1_upsample;
+      }
    } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group * 2 == v_out_group) {
      /* Special cases for 2h2v upsampling */
      if (do_fancy && compptr->downsampled_width > 2) {
-	upsample->methods[ci] = h2v2_fancy_upsample;
+	if (jsimd_can_h2v2_fancy_upsample())
+	  upsample->methods[ci] = jsimd_h2v2_fancy_upsample;
+	else
+	  upsample->methods[ci] = h2v2_fancy_upsample;
 	upsample->pub.need_context_rows = TRUE;
-      } else
-	upsample->methods[ci] = h2v2_upsample;
+      } else {
+	if (jsimd_can_h2v2_upsample())
+	  upsample->methods[ci] = jsimd_h2v2_upsample;
+	else
+	  upsample->methods[ci] = h2v2_upsample;
+      }
    } else if ((h_out_group % h_in_group) == 0 &&
 	       (v_out_group % v_in_group) == 0) {
      /* Generic integral-factors upsampling method */
--- a/jpeg/jdtrans.c
+++ b/jpeg/jdtrans.c
@ -0,0 +1,147 @@
+/*
+ * jdtrans.c
+ *
+ * Copyright (C) 1995-1997, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains library routines for transcoding decompression,
+ * that is, reading raw DCT coefficient arrays from an input JPEG file.
+ * The routines in jdapimin.c will also be needed by a transcoder.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Forward declarations */
+LOCAL(void) transdecode_master_selection JPP((j_decompress_ptr cinfo));
+
+
+/*
+ * Read the coefficient arrays from a JPEG file.
+ * jpeg_read_header must be completed before calling this.
+ *
+ * The entire image is read into a set of virtual coefficient-block arrays,
+ * one per component.  The return value is a pointer to the array of
+ * virtual-array descriptors.  These can be manipulated directly via the
+ * JPEG memory manager, or handed off to jpeg_write_coefficients().
+ * To release the memory occupied by the virtual arrays, call
+ * jpeg_finish_decompress() when done with the data.
+ *
+ * An alternative usage is to simply obtain access to the coefficient arrays
+ * during a buffered-image-mode decompression operation.  This is allowed
+ * after any jpeg_finish_output() call.  The arrays can be accessed until
+ * jpeg_finish_decompress() is called.  (Note that any call to the library
+ * may reposition the arrays, so don't rely on access_virt_barray() results
+ * to stay valid across library calls.)
+ *
+ * Returns NULL if suspended.  This case need be checked only if
+ * a suspending data source is used.
+ */
+
+GLOBAL(jvirt_barray_ptr *)
+jpeg_read_coefficients (j_decompress_ptr cinfo)
+{
+  if (cinfo->global_state == DSTATE_READY) {
+    /* First call: initialize active modules */
+    transdecode_master_selection(cinfo);
+    cinfo->global_state = DSTATE_RDCOEFS;
+  }
+  if (cinfo->global_state == DSTATE_RDCOEFS) {
+    /* Absorb whole file into the coef buffer */
+    for (;;) {
+      int retcode;
+      /* Call progress monitor hook if present */
+      if (cinfo->progress != NULL)
+	(*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+      /* Absorb some more input */
+      retcode = (*cinfo->inputctl->consume_input) (cinfo);
+      if (retcode == JPEG_SUSPENDED)
+	return NULL;
+      if (retcode == JPEG_REACHED_EOI)
+	break;
+      /* Advance progress counter if appropriate */
+      if (cinfo->progress != NULL &&
+	  (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
+	if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
+	  /* startup underestimated number of scans; ratchet up one scan */
+	  cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+	}
+      }
+    }
+    /* Set state so that jpeg_finish_decompress does the right thing */
+    cinfo->global_state = DSTATE_STOPPING;
+  }
+  /* At this point we should be in state DSTATE_STOPPING if being used
+   * standalone, or in state DSTATE_BUFIMAGE if being invoked to get access
+   * to the coefficients during a full buffered-image-mode decompression.
+   */
+  if ((cinfo->global_state == DSTATE_STOPPING ||
+       cinfo->global_state == DSTATE_BUFIMAGE) && cinfo->buffered_image) {
+    return cinfo->coef->coef_arrays;
+  }
+  /* Oops, improper usage */
+  ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+  return NULL;			/* keep compiler happy */
+}
+
+
+/*
+ * Master selection of decompression modules for transcoding.
+ * This substitutes for jdmaster.c's initialization of the full decompressor.
+ */
+
+LOCAL(void)
+transdecode_master_selection (j_decompress_ptr cinfo)
+{
+  /* This is effectively a buffered-image operation. */
+  cinfo->buffered_image = TRUE;
+
+  /* Entropy decoding: either Huffman or arithmetic coding. */
+  if (cinfo->arith_code) {
+#ifdef D_ARITH_CODING_SUPPORTED
+    jinit_arith_decoder(cinfo);
+#else
+    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
+  } else {
+    if (cinfo->progressive_mode) {
+#ifdef D_PROGRESSIVE_SUPPORTED
+      jinit_phuff_decoder(cinfo);
+#else
+      ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+    } else
+      jinit_huff_decoder(cinfo);
+  }
+
+  /* Always get a full-image coefficient buffer. */
+  jinit_d_coef_controller(cinfo, TRUE);
+
+  /* We can now tell the memory manager to allocate virtual arrays. */
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+
+  /* Initialize input side of decompressor to consume first scan. */
+  (*cinfo->inputctl->start_input_pass) (cinfo);
+
+  /* Initialize progress monitoring. */
+  if (cinfo->progress != NULL) {
+    int nscans;
+    /* Estimate number of scans to set pass_limit. */
+    if (cinfo->progressive_mode) {
+      /* Arbitrarily estimate 2 interleaved DC scans + 3 AC scans/component. */
+      nscans = 2 + 3 * cinfo->num_components;
+    } else if (cinfo->inputctl->has_multiple_scans) {
+      /* For a nonprogressive multiscan file, estimate 1 scan per component. */
+      nscans = cinfo->num_components;
+    } else {
+      nscans = 1;
+    }
+    cinfo->progress->pass_counter = 0L;
+    cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans;
+    cinfo->progress->completed_passes = 0;
+    cinfo->progress->total_passes = 1;
+  }
+}
--- a/jpeg/jerror.c
+++ b/jpeg/jerror.c
@ -18,13 +18,6 @@
 * These routines are used by both the compression and decompression code.
 */

-/*
- * This file has been modified for the Mozilla/Netscape environment.
- * Modifications are distributed under the mozilla.org tri-license and are
- * Copyright (C) 1998 Netscape Communications Corporation.  All Rights
- * Reserved. See http://www.mozilla.org/MPL/
- */
-
 /* this is not a core library module, so it doesn't define JPEG_INTERNALS */
 #include "jinclude.h"
 #include "jpeglib.h"
@ -82,15 +75,7 @@ error_exit (j_common_ptr cinfo)
  /* Let the memory manager delete any temp files before we die */
  jpeg_destroy(cinfo);

-/* Mozilla mod: in some Windows environments, the exit() function doesn't
- * even exist, so don't compile a reference to it.  Heaven help you if
- * you fail to provide a replacement error_exit function, because the
- * IJG library will NOT handle control returning from error_exit!
- */
-
-#ifndef XP_WIN
  exit(EXIT_FAILURE);
-#endif
 }


@ -116,6 +101,15 @@ output_message (j_common_ptr cinfo)

  /* Create the message */
  (*cinfo->err->format_message) (cinfo, buffer);
+
+#ifdef USE_WINDOWS_MESSAGEBOX
+  /* Display it in a message dialog box */
+  MessageBox(GetActiveWindow(), buffer, "JPEG Library Error",
+	     MB_OK | MB_ICONERROR);
+#else
+  /* Send it to stderr, adding a newline */
+  fprintf(stderr, "%s\n", buffer);
+#endif
 }


--- a/jpeg/jerror.h
+++ b/jpeg/jerror.h
@ -2,6 +2,7 @@
 * jerror.h
 *
 * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -40,11 +41,12 @@ JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */

 /* For maintenance convenience, list is alphabetical by message code name */
 JMESSAGE(JERR_ARITH_NOTIMPL,
-	 "Sorry, there are legal restrictions on arithmetic coding")
+	 "Sorry, arithmetic coding is not implemented")
 JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
 JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
 JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode")
 JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS")
+JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
 JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range")
 JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
 JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition")
@ -93,6 +95,7 @@ JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
 JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
 JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
 JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
 JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported")
 JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined")
 JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image")
@ -170,6 +173,7 @@ JMESSAGE(JTRC_UNKNOWN_IDS,
 JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
 JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
 JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 JMESSAGE(JWRN_BOGUS_PROGRESSION,
 	 "Inconsistent progression sequence for component %d coefficient %d")
 JMESSAGE(JWRN_EXTRANEOUS_DATA,
--- a/jpeg/jidctfst.c
+++ b/jpeg/jidctfst.c
--- a/jpeg/jidctint.c
+++ b/jpeg/jidctint.c
@ -386,578 +386,4 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  }
 }

-
-#ifdef HAVE_SSE2_INTEL_MNEMONICS
-
-/*
-* Intel SSE2 optimized Inverse Discrete Cosine Transform
-*
-*
-* Copyright (c) 2001-2002 Intel Corporation
-* All Rights Reserved
-*
-*
-*  Authors:
-*      Danilov G.
-*
-*
-*-----------------------------------------------------------------------------
-*
-* References:
-*    K.R. Rao and P. Yip
-*       Discrete Cosine Transform.
-*       Algorithms, Advantages, Applications.
-*       Academic Press, Inc, London, 1990.
-*    JPEG Group's software.
-*       This implementation is based on Appendix A.2 of the book (R&Y) ...
-*
-*-----------------------------------------------------------------------------
-*/
-
-typedef unsigned char   Ipp8u;
-typedef unsigned short  Ipp16u;
-typedef unsigned int    Ipp32u;
-
-typedef signed char    Ipp8s;
-typedef signed short   Ipp16s;
-typedef signed int     Ipp32s;
-
-#define BITS_INV_ACC  4			
-#define SHIFT_INV_ROW  16 - BITS_INV_ACC
-#define SHIFT_INV_COL 1 + BITS_INV_ACC
-
-#define RND_INV_ROW  1024 * (6 - BITS_INV_ACC)	/* 1 << (SHIFT_INV_ROW-1)		*/
-#define RND_INV_COL = 16 * (BITS_INV_ACC - 3)   /* 1 << (SHIFT_INV_COL-1)		*/
-#define RND_INV_CORR = RND_INV_COL - 1          /* correction -1.0 and round	*/
-
-#define c_inv_corr_0 -1024 * (6 - BITS_INV_ACC) + 65536		/* -0.5 + (16.0 or 32.0)	*/
-#define c_inv_corr_1 1877 * (6 - BITS_INV_ACC)				/* 0.9167	*/	
-#define c_inv_corr_2 1236 * (6 - BITS_INV_ACC)				/* 0.6035	*/					
-#define c_inv_corr_3 680  * (6 - BITS_INV_ACC)				/* 0.3322	*/
-#define c_inv_corr_4 0    * (6 - BITS_INV_ACC)				/* 0.0		*/	
-#define c_inv_corr_5 -569  * (6 - BITS_INV_ACC)				/* -0.278	*/
-#define c_inv_corr_6 -512  * (6 - BITS_INV_ACC)				/* -0.25	*/	
-#define c_inv_corr_7 -651  * (6 - BITS_INV_ACC)				/* -0.3176	*/	
-
-#define RND_INV_ROW_0 RND_INV_ROW + c_inv_corr_0
-#define RND_INV_ROW_1 RND_INV_ROW + c_inv_corr_1
-#define RND_INV_ROW_2 RND_INV_ROW + c_inv_corr_2
-#define RND_INV_ROW_3 RND_INV_ROW + c_inv_corr_3
-#define RND_INV_ROW_4 RND_INV_ROW + c_inv_corr_4
-#define RND_INV_ROW_5 RND_INV_ROW + c_inv_corr_5
-#define RND_INV_ROW_6 RND_INV_ROW + c_inv_corr_6
-#define RND_INV_ROW_7 RND_INV_ROW + c_inv_corr_7
-
-/* Table for rows 0,4 - constants are multiplied on cos_4_16 */
-
-__declspec(align(16)) short tab_i_04[] = { 
-	16384, 21407, 16384, 8867,		
-	-16384, 21407, 16384, -8867,	
-	16384,  -8867,  16384, -21407,  
-    16384,   8867, -16384, -21407,  
-    22725,  19266,  19266,  -4520,  
-    4520,  19266,  19266, -22725,   
-    12873, -22725,   4520, -12873,  
-    12873,   4520, -22725, -12873}; 
-
-/* Table for rows 1,7 - constants are multiplied on cos_1_16 */
-
-__declspec(align(16)) short tab_i_17[] = {
-	22725,  29692,  22725,  12299,   
-    -22725,  29692,  22725, -12299,  
-    22725, -12299,  22725, -29692,   
-    22725,  12299, -22725, -29692,   
-    31521,  26722,  26722,  -6270,   
-    6270,  26722,  26722, -31521,    
-    17855, -31521,   6270, -17855,   
-    17855,   6270, -31521, -17855};  
-
-/* Table for rows 2,6 - constants are multiplied on cos_2_16 */
-
-__declspec(align(16)) short tab_i_26[] = {
-	21407,  27969,  21407,  11585,	
-    -21407,  27969,  21407, -11585,	
-    21407, -11585,  21407, -27969,	
-    21407,  11585, -21407, -27969,	
-    29692,  25172,  25172,  -5906,	
-    5906,  25172,  25172, -29692,	
-    16819, -29692,   5906, -16819,	
-    16819,   5906, -29692, -16819};	
-
-/* Table for rows 3,5 - constants are multiplied on cos_3_16 */
-
-__declspec(align(16)) short tab_i_35[] = {
-	19266,  25172,  19266,  10426,	
-    -19266,  25172,  19266, -10426,	
-    19266, -10426,  19266, -25172,	
-    19266,  10426, -19266, -25172,	
-    26722,  22654,  22654,  -5315,	
-    5315,  22654,  22654, -26722,	
-    15137, -26722,   5315, -15137,	
-    15137,   5315, -26722, -15137};	
-	
-__declspec(align(16)) long round_i_0[] = {RND_INV_ROW_0,RND_INV_ROW_0,
-	RND_INV_ROW_0,RND_INV_ROW_0};
-__declspec(align(16)) long round_i_1[] = {RND_INV_ROW_1,RND_INV_ROW_1,
-	RND_INV_ROW_1,RND_INV_ROW_1};
-__declspec(align(16)) long round_i_2[] = {RND_INV_ROW_2,RND_INV_ROW_2,
-	RND_INV_ROW_2,RND_INV_ROW_2};
-__declspec(align(16)) long round_i_3[] = {RND_INV_ROW_3,RND_INV_ROW_3,
-	RND_INV_ROW_3,RND_INV_ROW_3};
-__declspec(align(16)) long round_i_4[] = {RND_INV_ROW_4,RND_INV_ROW_4,
-	RND_INV_ROW_4,RND_INV_ROW_4};
-__declspec(align(16)) long round_i_5[] = {RND_INV_ROW_5,RND_INV_ROW_5,
-	RND_INV_ROW_5,RND_INV_ROW_5};
-__declspec(align(16)) long round_i_6[] = {RND_INV_ROW_6,RND_INV_ROW_6,
-	RND_INV_ROW_6,RND_INV_ROW_6};
-__declspec(align(16)) long round_i_7[] = {RND_INV_ROW_7,RND_INV_ROW_7,
-	RND_INV_ROW_7,RND_INV_ROW_7};
-
-__declspec(align(16)) short tg_1_16[] = {
-	13036,  13036,  13036,  13036,	/* tg * (2<<16) + 0.5 */
-	13036,  13036,  13036,  13036};
-__declspec(align(16)) short tg_2_16[] = {
-	27146,  27146,  27146,  27146,	/* tg * (2<<16) + 0.5 */
-	27146,  27146,  27146,  27146};
-__declspec(align(16)) short tg_3_16[] = {
-	-21746, -21746, -21746, -21746,	/* tg * (2<<16) + 0.5 */
-	-21746, -21746, -21746, -21746};
-__declspec(align(16)) short cos_4_16[] = {
-	-19195, -19195, -19195, -19195,	/* cos * (2<<16) + 0.5 */
-	-19195, -19195, -19195, -19195};
-
-/*
-* In this implementation the outputs of the iDCT-1D are multiplied
-*    for rows 0,4 - on cos_4_16,
-*    for rows 1,7 - on cos_1_16,
-*    for rows 2,6 - on cos_2_16,
-*    for rows 3,5 - on cos_3_16
-* and are shifted to the left for rise of accuracy
-*
-* For used constants
-*    FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
-*
-*-----------------------------------------------------------------------------
-*
-* On the first stage the calculation is executed at once for two rows.
-* The permutation for each output row is done on second stage
-*    t7 t6 t5 t4 t3 t2 t1 t0 -> t4 t5 t6 t7 t3 t2 t1 t0
-*
-*-----------------------------------------------------------------------------
-*/
-	
-#define DCT_8_INV_ROW_2R(TABLE, ROUND1, ROUND2) __asm {	\
-	__asm pshuflw  xmm1, xmm0, 10001000b				\
-    __asm pshuflw  xmm0, xmm0, 11011101b    			\
-    __asm pshufhw  xmm1, xmm1, 10001000b    			\
-	__asm pshufhw  xmm0, xmm0, 11011101b				\
-	__asm movdqa   xmm2, XMMWORD PTR [TABLE]			\
-	__asm pmaddwd  xmm2, xmm1							\
-	__asm movdqa   xmm3, XMMWORD PTR [TABLE + 32]		\
-	__asm pmaddwd  xmm3, xmm0               			\
-	__asm pmaddwd  xmm1, XMMWORD PTR [TABLE + 16]		\
-	__asm pmaddwd  xmm0, XMMWORD PTR [TABLE + 48]		\
-	__asm pshuflw  xmm5, xmm4, 10001000b				\
-	__asm pshuflw  xmm4, xmm4, 11011101b    			\
-	__asm pshufhw  xmm5, xmm5, 10001000b    			\
-	__asm pshufhw  xmm4, xmm4, 11011101b    			\
-	__asm movdqa   xmm6, XMMWORD PTR [TABLE]			\
-	__asm pmaddwd  xmm6, xmm5               			\
-	__asm movdqa   xmm7, XMMWORD PTR [TABLE + 32]		\
-	__asm pmaddwd  xmm7, xmm4               			\
-	__asm pmaddwd  xmm5, XMMWORD PTR [TABLE + 16]		\
-	__asm pmaddwd  xmm4, XMMWORD PTR [TABLE + 48]		\
-	__asm pshufd   xmm1, xmm1, 01001110b    			\
-	__asm pshufd   xmm0, xmm0, 01001110b    			\
-	__asm paddd    xmm2, XMMWORD PTR [ROUND1]			\
-	__asm paddd    xmm3, xmm0							\
-	__asm paddd    xmm1, xmm2							\
-	__asm pshufd   xmm5, xmm5, 01001110b    			\
-	__asm pshufd   xmm4, xmm4, 01001110b    			\
-	__asm movdqa   xmm2, xmm1             				\
-	__asm psubd    xmm2, xmm3             				\
-	__asm psrad    xmm2, SHIFT_INV_ROW    				\
-	__asm paddd    xmm1, xmm3							\
-	__asm psrad    xmm1, SHIFT_INV_ROW      			\
-	__asm packssdw xmm1, xmm2							\
-	__asm paddd    xmm6, XMMWORD PTR [ROUND2]			\
-	__asm paddd    xmm7, xmm4							\
-	__asm paddd    xmm5, xmm6							\
-	__asm movdqa   xmm6, xmm5	            			\
-	__asm psubd    xmm6, xmm7               			\
-	__asm psrad    xmm6, SHIFT_INV_ROW      			\
-	__asm paddd    xmm5, xmm7							\
-	__asm psrad    xmm5, SHIFT_INV_ROW      			\
-	__asm packssdw xmm5, xmm6							\
-	}
-
-/*
-*
-* The second stage - inverse DCTs of columns
-*
-* The inputs are multiplied
-*    for rows 0,4 - on cos_4_16,
-*    for rows 1,7 - on cos_1_16,
-*    for rows 2,6 - on cos_2_16,
-*    for rows 3,5 - on cos_3_16
-* and are shifted to the left for rise of accuracy
-*/
-
-#define DCT_8_INV_COL_8R(INP, OUTP) __asm {		\
-	__asm movdqa   xmm0, [INP + 5*16]			\
-    __asm movdqa   xmm1, XMMWORD PTR tg_3_16	\
-    __asm movdqa   xmm2, xmm0            		\
-    __asm movdqa   xmm3, [INP + 3*16]   		\
-    __asm pmulhw   xmm0, xmm1           		\
-    __asm movdqa   xmm4, [INP + 7*16]   		\
-    __asm pmulhw   xmm1, xmm3           		\
-    __asm movdqa   xmm5, XMMWORD PTR tg_1_16   	\
-    __asm movdqa   xmm6, xmm4            		\
-    __asm pmulhw   xmm4, xmm5           		\
-    __asm paddsw   xmm0, xmm2           		\
-    __asm pmulhw   xmm5, [INP + 1*16]   		\
-    __asm paddsw   xmm1, xmm3           		\
-    __asm movdqa   xmm7, [INP + 6*16]    		\
-    __asm paddsw   xmm0, xmm3					\
-    __asm movdqa   xmm3, XMMWORD PTR tg_2_16	\
-    __asm psubsw   xmm2, xmm1					\
-    __asm pmulhw   xmm7, xmm3            		\
-    __asm movdqa   xmm1, xmm0            		\
-    __asm pmulhw   xmm3, [INP + 2*16]   		\
-    __asm psubsw   xmm5, xmm6					\
-    __asm paddsw   xmm4, [INP + 1*16]    		\
-    __asm paddsw   xmm0, xmm4            		\
-    __asm psubsw   xmm4, xmm1					\
-    __asm pshufhw  xmm0, xmm0, 00011011b		\
-    __asm paddsw   xmm7, [INP + 2*16]    		\
-    __asm movdqa   xmm6, xmm5					\
-    __asm psubsw   xmm3, [INP + 6*16]    		\
-    __asm psubsw   xmm5, xmm2            		\
-    __asm paddsw   xmm6, xmm2					\
-	__asm movdqa   [OUTP + 7*16], xmm0    		\
-    __asm movdqa   xmm1, xmm4            		\
-    __asm movdqa   xmm2, XMMWORD PTR cos_4_16  	\
-    __asm paddsw   xmm4, xmm5            		\
-    __asm movdqa   xmm0, XMMWORD PTR cos_4_16  	\
-    __asm pmulhw   xmm2, xmm4					\
-    __asm pshufhw  xmm6, xmm6, 00011011b		\
-    __asm movdqa   [OUTP + 3*16], xmm6    		\
-    __asm psubsw   xmm1, xmm5            		\
-    __asm movdqa   xmm6, [INP + 0*16]   		\
-    __asm pmulhw   xmm0, xmm1					\
-    __asm movdqa   xmm5, [INP + 4*16]    		\
-    __asm paddsw   xmm4, xmm2					\
-    __asm paddsw   xmm5, xmm6       			\
-    __asm psubsw   xmm6, [INP + 4*16]   		\
-    __asm paddsw   xmm0, xmm1					\
-    __asm pshufhw  xmm4, xmm4, 00011011b		\
-    __asm movdqa   xmm2, xmm5            		\
-    __asm paddsw   xmm5, xmm7            		\
-    __asm movdqa   xmm1, xmm6					\
-    __asm psubsw   xmm2, xmm7					\
-    __asm movdqa   xmm7, [OUTP + 7*16]    		\
-    __asm paddsw   xmm6, xmm3            		\
-    __asm pshufhw  xmm5, xmm5, 00011011b		\
-	__asm paddsw   xmm7, xmm5					\
-    __asm psubsw   xmm1, xmm3					\
-    __asm pshufhw  xmm6, xmm6, 00011011b		\
-	__asm movdqa   xmm3, xmm6					\
-    __asm paddsw   xmm6, xmm4            		\
-    __asm pshufhw  xmm2, xmm2, 00011011b		\
-    __asm psraw    xmm7, SHIFT_INV_COL   		\
-    __asm movdqa   [OUTP + 0*16], xmm7    		\
-    __asm movdqa   xmm7, xmm1            		\
-    __asm paddsw   xmm1, xmm0					\
-    __asm psraw    xmm6, SHIFT_INV_COL			\
-    __asm movdqa   [OUTP + 1*16], xmm6    		\
-    __asm pshufhw  xmm1, xmm1, 00011011b		\
-	__asm movdqa   xmm6, [OUTP + 3*16]			\
-    __asm psubsw   xmm7, xmm0            		\
-    __asm psraw    xmm1, SHIFT_INV_COL   		\
-    __asm movdqa   [OUTP + 2*16], xmm1    		\
-    __asm psubsw   xmm5, [OUTP + 7*16]			\
-    __asm paddsw   xmm6, xmm2            		\
-    __asm psubsw   xmm2, [OUTP + 3*16]			\
-    __asm psubsw   xmm3, xmm4            		\
-    __asm psraw    xmm7, SHIFT_INV_COL  		\
-    __asm pshufhw  xmm7, xmm7, 00011011b		\
-    __asm movdqa   [OUTP + 5*16], xmm7    		\
-    __asm psraw    xmm5, SHIFT_INV_COL			\
-    __asm movdqa   [OUTP + 7*16], xmm5    		\
-    __asm psraw    xmm6, SHIFT_INV_COL			\
-    __asm movdqa   [OUTP + 3*16], xmm6    		\
-    __asm psraw    xmm2, SHIFT_INV_COL			\
-    __asm movdqa   [OUTP + 4*16], xmm2    		\
-    __asm psraw    xmm3, SHIFT_INV_COL			\
-    __asm movdqa   [OUTP + 6*16], xmm3    		\
-	}
-
-/*
-*
-*  Name:      dct_8x8_inv_16s
-*  Purpose:   Inverse Discrete Cosine Transform 8x8 with
-*             2D buffer of short int data
-*  Context:
-*      void dct_8x8_inv_16s ( short *src, short *dst )
-*  Parameters:
-*      src  - Pointer to the source buffer
-*      dst  - Pointer to the destination buffer
-*
-*/
-
-GLOBAL(void)
-dct_8x8_inv_16s ( short *src, short *dst ) {
-	
-	__asm {
-
-		mov     ecx,  src
-		mov     edx,  dst
-
-		movdqa  xmm0, [ecx+0*16]
-		movdqa  xmm4, [ecx+4*16]
-		DCT_8_INV_ROW_2R(tab_i_04, round_i_0, round_i_4)
-		movdqa     [edx+0*16], xmm1 
-		movdqa     [edx+4*16], xmm5 
-
-		movdqa  xmm0, [ecx+1*16]
-		movdqa  xmm4, [ecx+7*16]
-		DCT_8_INV_ROW_2R(tab_i_17, round_i_1, round_i_7)
-		movdqa     [edx+1*16], xmm1 
-		movdqa     [edx+7*16], xmm5 
-
-		movdqa  xmm0, [ecx+3*16]
-		movdqa  xmm4, [ecx+5*16]
-		DCT_8_INV_ROW_2R(tab_i_35, round_i_3, round_i_5);
-		movdqa     [edx+3*16], xmm1 
-		movdqa     [edx+5*16], xmm5 
-
-		movdqa  xmm0, [ecx+2*16]
-		movdqa  xmm4, [ecx+6*16]
-		DCT_8_INV_ROW_2R(tab_i_26, round_i_2, round_i_6);
-		movdqa     [edx+2*16], xmm1
-		movdqa     [edx+6*16], xmm5    
-
-		DCT_8_INV_COL_8R(edx+0, edx+0);
-	}
-}
-
-
-/*
-*  Name:
-*    ownpj_QuantInv_8x8_16s
-*
-*  Purpose:
-*    Dequantize 8x8 block of DCT coefficients
-*
-*  Context:
-*    void ownpj_QuantInv_8x8_16s
-*            Ipp16s*  pSrc,
-*            Ipp16s*  pDst,
-*      const Ipp16u*  pQTbl)*
-*
-*/
-
-GLOBAL(void)
-ownpj_QuantInv_8x8_16s(short * pSrc, short * pDst, const unsigned short * pQTbl)
-{
-	__asm {
-
-		push        ebx
-		push        ecx
-		push        edx
-		push        esi
-		push        edi
-
-		mov         esi, pSrc
-		mov         edi, pDst
-		mov         edx, pQTbl
-		mov         ecx, 4
-		mov         ebx, 32
-
-	again:
-
-		movq        mm0, QWORD PTR [esi+0]
-		movq        mm1, QWORD PTR [esi+8]
-		movq        mm2, QWORD PTR [esi+16]
-		movq        mm3, QWORD PTR [esi+24]
-
-		prefetcht0  [esi+ebx] ; fetch next cache line
-
-		pmullw      mm0, QWORD PTR [edx+0]
-		pmullw      mm1, QWORD PTR [edx+8]
-		pmullw      mm2, QWORD PTR [edx+16]
-		pmullw      mm3, QWORD PTR [edx+24]
-
-		movq        QWORD PTR [edi+0], mm0
-		movq        QWORD PTR [edi+8], mm1
-		movq        QWORD PTR [edi+16], mm2
-		movq        QWORD PTR [edi+24], mm3
-
-		add         esi, ebx
-		add         edi, ebx
-		add         edx, ebx
-		dec         ecx
-		jnz         again
-
-		emms
-
-		pop         edi
-		pop         esi
-		pop         edx
-		pop         ecx
-		pop         ebx
-	}
-}
-
-
-/*
-*  Name:
-*    ownpj_Add128_8x8_16s8u
-*
-*  Purpose:
-*    signed to unsigned conversion (level shift)
-*    for 8x8 block of DCT coefficients
-*
-*  Context:
-*    void ownpj_Add128_8x8_16s8u
-*      const Ipp16s* pSrc,
-*            Ipp8u*  pDst,
-*            int     DstStep);
-*
-*/
-
-__declspec(align(16)) long const_128[]= {0x00800080, 0x00800080, 0x00800080, 0x00800080};
-
-GLOBAL(void)
-ownpj_Add128_8x8_16s8u(const short * pSrc, unsigned char * pDst, int DstStep)
-{
-	__asm {
-		push        eax
-		push        ebx
-		push        ecx
-		push        edx
-		push        esi
-		push        edi
-
-		mov         esi, pSrc
-		mov         edi, pDst
-		mov         edx, DstStep
-		mov         ecx, 2
-		mov         ebx, edx
-		mov         eax, edx
-		sal         ebx, 1
-		add         eax, ebx
-		movdqa      xmm7, XMMWORD PTR const_128
-
-	again:
-
-		movdqa      xmm0, XMMWORD PTR [esi+0]  ; line 0
-		movdqa      xmm1, XMMWORD PTR [esi+16] ; line 1
-		movdqa      xmm2, XMMWORD PTR [esi+32] ; line 2
-		movdqa      xmm3, XMMWORD PTR [esi+48] ; line 3
-
-		paddw     xmm0, xmm7
-		paddw     xmm1, xmm7
-		paddw     xmm2, xmm7
-		paddw     xmm3, xmm7
-
-		packuswb  xmm0, xmm1
-		packuswb  xmm2, xmm3
-
-		movq      QWORD PTR [edi], xmm0      ;0*DstStep
-		movq      QWORD PTR [edi+ebx], xmm2  ;2*DstStep
-
-		psrldq      xmm0, 8
-		psrldq      xmm2, 8
-
-		movq      QWORD PTR [edi+edx], xmm0  ;1*DstStep
-		movq      QWORD PTR [edi+eax], xmm2  ;3*DstStep
-
-		add         edi, ebx
-		add         esi, 64
-		add         edi, ebx
-		dec         ecx
-		jnz         again
-
-		pop         edi
-		pop         esi
-		pop         edx
-		pop         ecx
-		pop         ebx
-		pop         eax
-	}
-}
-
-
-/* 
-*  Name:
-*    ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R
-*
-*  Purpose:
-*    Inverse DCT transform, de-quantization and level shift
-*
-*  Parameters:
-*    pSrc               - pointer to source
-*    pDst               - pointer to output array
-*    DstStep            - line offset for output data
-*    pEncoderQuantTable - pointer to Quantization table
-*
-*/
-
-GLOBAL(void)
-ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
-  short * pSrc,
-  unsigned char *  pDst,
-  int     DstStep,
-  const unsigned short * pQuantInvTable)
-{
-
-	__declspec(align(16)) Ipp8u buf[DCTSIZE2*sizeof(Ipp16s)];
-	Ipp16s * workbuf = (Ipp16s *)buf;	
-
-	ownpj_QuantInv_8x8_16s(pSrc,workbuf,pQuantInvTable);
-	dct_8x8_inv_16s(workbuf,workbuf);
-	ownpj_Add128_8x8_16s8u(workbuf,pDst,DstStep);
-  
-} 
-
-GLOBAL(void)
-jpeg_idct_islow_sse2 (
-	j_decompress_ptr cinfo, 
-	jpeg_component_info * compptr,
-	JCOEFPTR coef_block,
-	JSAMPARRAY output_buf, 
-	JDIMENSION output_col)
-{
-	int			ctr;
-	JCOEFPTR	inptr;
-	Ipp16u*		quantptr;
-	Ipp8u*		wsptr;
-	__declspec(align(16)) Ipp8u workspace[DCTSIZE2];  	
-	JSAMPROW	outptr;
-
-	inptr = coef_block;
-	quantptr = (Ipp16u*)compptr->dct_table;
-	wsptr = workspace;
-	
-	ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(inptr, workspace, 8, quantptr);
-
-	for(ctr = 0; ctr < DCTSIZE; ctr++)
-	{
-		outptr = output_buf[ctr] + output_col;
-
-		outptr[0] = wsptr[0];
-		outptr[1] = wsptr[1];
-		outptr[2] = wsptr[2];
-		outptr[3] = wsptr[3];
-		outptr[4] = wsptr[4];
-		outptr[5] = wsptr[5];
-		outptr[6] = wsptr[6];
-		outptr[7] = wsptr[7];
-
-		wsptr += DCTSIZE;
-	}
-}
-#endif /* HAVE_SSE2_INTEL_MNEMONICS */
-
 #endif /* DCT_ISLOW_SUPPORTED */
--- a/jpeg/jmemansi.c
+++ b/jpeg/jmemansi.c
@ -1,167 +0,0 @@
-/*
- * jmemansi.c
- *
- * Copyright (C) 1992-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides a simple generic implementation of the system-
- * dependent portion of the JPEG memory manager.  This implementation
- * assumes that you have the ANSI-standard library routine tmpfile().
- * Also, the problem of determining the amount of memory available
- * is shoved onto the user.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-#endif
-
-#ifndef SEEK_SET		/* pre-ANSI systems may not define this; */
-#define SEEK_SET  0		/* if not, assume 0 is correct */
-#endif
-
-
-/*
- * Memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * "Large" objects are treated the same as "small" ones.
- * NB: although we include FAR keywords in the routine declarations,
- * this file won't actually work in 80x86 small/medium model; at least,
- * you probably won't be able to process useful-size images in only 64KB.
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void FAR *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM		/* so can override from makefile */
-#define DEFAULT_MAX_MEM		1000000L /* default: one megabyte */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
-{
-  return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed.  You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-
-METHODDEF(void)
-read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		    void FAR * buffer_address,
-		    long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFREAD(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		     void FAR * buffer_address,
-		     long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFWRITE(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  fclose(info->temp_file);
-  /* Since this implementation uses tmpfile() to create the file,
-   * no explicit file deletion is needed.
-   */
-}
-
-
-/*
- * Initial opening of a backing-store object.
- *
- * This version uses tmpfile(), which constructs a suitable file name
- * behind the scenes.  We don't have to use info->temp_name[] at all;
- * indeed, we can't even find out the actual name of the temp file.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
-{
-  if ((info->temp_file = tmpfile()) == NULL)
-    ERREXITS(cinfo, JERR_TFILE_CREATE, "");
-  info->read_backing_store = read_backing_store;
-  info->write_backing_store = write_backing_store;
-  info->close_backing_store = close_backing_store;
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
-  return DEFAULT_MAX_MEM;	/* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
-  /* no work */
-}
--- a/jpeg/jmemdos.c
+++ b/jpeg/jmemdos.c
@ -1,638 +0,0 @@
-/*
- * jmemdos.c
- *
- * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides an MS-DOS-compatible implementation of the system-
- * dependent portion of the JPEG memory manager.  Temporary data can be
- * stored in extended or expanded memory as well as in regular DOS files.
- *
- * If you use this file, you must be sure that NEED_FAR_POINTERS is defined
- * if you compile in a small-data memory model; it should NOT be defined if
- * you use a large-data memory model.  This file is not recommended if you
- * are using a flat-memory-space 386 environment such as DJGCC or Watcom C.
- * Also, this code will NOT work if struct fields are aligned on greater than
- * 2-byte boundaries.
- *
- * Based on code contributed by Ge' Weijers.
- */
-
-/*
- * If you have both extended and expanded memory, you may want to change the
- * order in which they are tried in jopen_backing_store.  On a 286 machine
- * expanded memory is usually faster, since extended memory access involves
- * an expensive protected-mode-and-back switch.  On 386 and better, extended
- * memory is usually faster.  As distributed, the code tries extended memory
- * first (what? not everyone has a 386? :-).
- *
- * You can disable use of extended/expanded memory entirely by altering these
- * definitions or overriding them from the Makefile (eg, -DEMS_SUPPORTED=0).
- */
-
-#ifndef XMS_SUPPORTED
-#define XMS_SUPPORTED  1
-#endif
-#ifndef EMS_SUPPORTED
-#define EMS_SUPPORTED  1
-#endif
-
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare these */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-extern char * getenv JPP((const char * name));
-#endif
-
-#ifdef NEED_FAR_POINTERS
-
-#ifdef __TURBOC__
-/* These definitions work for Borland C (Turbo C) */
-#include <alloc.h>		/* need farmalloc(), farfree() */
-#define far_malloc(x)	farmalloc(x)
-#define far_free(x)	farfree(x)
-#else
-/* These definitions work for Microsoft C and compatible compilers */
-#include <malloc.h>		/* need _fmalloc(), _ffree() */
-#define far_malloc(x)	_fmalloc(x)
-#define far_free(x)	_ffree(x)
-#endif
-
-#else /* not NEED_FAR_POINTERS */
-
-#define far_malloc(x)	malloc(x)
-#define far_free(x)	free(x)
-
-#endif /* NEED_FAR_POINTERS */
-
-#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
-#define READ_BINARY	"r"
-#else
-#define READ_BINARY	"rb"
-#endif
-
-#ifndef USE_MSDOS_MEMMGR	/* make sure user got configuration right */
-  You forgot to define USE_MSDOS_MEMMGR in jconfig.h. /* deliberate syntax error */
-#endif
-
-#if MAX_ALLOC_CHUNK >= 65535L	/* make sure jconfig.h got this right */
-  MAX_ALLOC_CHUNK should be less than 64K. /* deliberate syntax error */
-#endif
-
-
-/*
- * Declarations for assembly-language support routines (see jmemdosa.asm).
- *
- * The functions are declared "far" as are all their pointer arguments;
- * this ensures the assembly source code will work regardless of the
- * compiler memory model.  We assume "short" is 16 bits, "long" is 32.
- */
-
-typedef void far * XMSDRIVER;	/* actually a pointer to code */
-typedef struct {		/* registers for calling XMS driver */
-	unsigned short ax, dx, bx;
-	void far * ds_si;
-      } XMScontext;
-typedef struct {		/* registers for calling EMS driver */
-	unsigned short ax, dx, bx;
-	void far * ds_si;
-      } EMScontext;
-
-extern short far jdos_open JPP((short far * handle, char far * filename));
-extern short far jdos_close JPP((short handle));
-extern short far jdos_seek JPP((short handle, long offset));
-extern short far jdos_read JPP((short handle, void far * buffer,
-				unsigned short count));
-extern short far jdos_write JPP((short handle, void far * buffer,
-				 unsigned short count));
-extern void far jxms_getdriver JPP((XMSDRIVER far *));
-extern void far jxms_calldriver JPP((XMSDRIVER, XMScontext far *));
-extern short far jems_available JPP((void));
-extern void far jems_calldriver JPP((EMScontext far *));
-
-
-/*
- * Selection of a file name for a temporary file.
- * This is highly system-dependent, and you may want to customize it.
- */
-
-static int next_file_num;	/* to distinguish among several temp files */
-
-LOCAL(void)
-select_file_name (char * fname)
-{
-  const char * env;
-  char * ptr;
-  FILE * tfile;
-
-  /* Keep generating file names till we find one that's not in use */
-  for (;;) {
-    /* Get temp directory name from environment TMP or TEMP variable;
-     * if none, use "."
-     */
-    if ((env = (const char *) getenv("TMP")) == NULL)
-      if ((env = (const char *) getenv("TEMP")) == NULL)
-	env = ".";
-    if (*env == '\0')		/* null string means "." */
-      env = ".";
-    ptr = fname;		/* copy name to fname */
-    while (*env != '\0')
-      *ptr++ = *env++;
-    if (ptr[-1] != '\\' && ptr[-1] != '/')
-      *ptr++ = '\\';		/* append backslash if not in env variable */
-    /* Append a suitable file name */
-    next_file_num++;		/* advance counter */
-    sprintf(ptr, "JPG%03d.TMP", next_file_num);
-    /* Probe to see if file name is already in use */
-    if ((tfile = fopen(fname, READ_BINARY)) == NULL)
-      break;
-    fclose(tfile);		/* oops, it's there; close tfile & try again */
-  }
-}
-
-
-/*
- * Near-memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * "Large" objects are allocated in far memory, if possible
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void FAR *) far_malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
-  far_free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM		/* so can override from makefile */
-#define DEFAULT_MAX_MEM		300000L /* for total usage about 450K */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
-{
-  return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed.  You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-/*
- * For MS-DOS we support three types of backing storage:
- *   1. Conventional DOS files.  We access these by direct DOS calls rather
- *      than via the stdio package.  This provides a bit better performance,
- *      but the real reason is that the buffers to be read or written are FAR.
- *      The stdio library for small-data memory models can't cope with that.
- *   2. Extended memory, accessed per the XMS V2.0 specification.
- *   3. Expanded memory, accessed per the LIM/EMS 4.0 specification.
- * You'll need copies of those specs to make sense of the related code.
- * The specs are available by Internet FTP from the SIMTEL archives 
- * (oak.oakland.edu and its various mirror sites).  See files
- * pub/msdos/microsoft/xms20.arc and pub/msdos/info/limems41.zip.
- */
-
-
-/*
- * Access methods for a DOS file.
- */
-
-
-METHODDEF(void)
-read_file_store (j_common_ptr cinfo, backing_store_ptr info,
-		 void FAR * buffer_address,
-		 long file_offset, long byte_count)
-{
-  if (jdos_seek(info->handle.file_handle, file_offset))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  /* Since MAX_ALLOC_CHUNK is less than 64K, byte_count will be too. */
-  if (byte_count > 65535L)	/* safety check */
-    ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
-  if (jdos_read(info->handle.file_handle, buffer_address,
-		(unsigned short) byte_count))
-    ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_file_store (j_common_ptr cinfo, backing_store_ptr info,
-		  void FAR * buffer_address,
-		  long file_offset, long byte_count)
-{
-  if (jdos_seek(info->handle.file_handle, file_offset))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  /* Since MAX_ALLOC_CHUNK is less than 64K, byte_count will be too. */
-  if (byte_count > 65535L)	/* safety check */
-    ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
-  if (jdos_write(info->handle.file_handle, buffer_address,
-		 (unsigned short) byte_count))
-    ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_file_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  jdos_close(info->handle.file_handle);	/* close the file */
-  remove(info->temp_name);	/* delete the file */
-/* If your system doesn't have remove(), try unlink() instead.
- * remove() is the ANSI-standard name for this function, but
- * unlink() was more common in pre-ANSI systems.
- */
-  TRACEMSS(cinfo, 1, JTRC_TFILE_CLOSE, info->temp_name);
-}
-
-
-LOCAL(boolean)
-open_file_store (j_common_ptr cinfo, backing_store_ptr info,
-		 long total_bytes_needed)
-{
-  short handle;
-
-  select_file_name(info->temp_name);
-  if (jdos_open((short far *) & handle, (char far *) info->temp_name)) {
-    /* might as well exit since jpeg_open_backing_store will fail anyway */
-    ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
-    return FALSE;
-  }
-  info->handle.file_handle = handle;
-  info->read_backing_store = read_file_store;
-  info->write_backing_store = write_file_store;
-  info->close_backing_store = close_file_store;
-  TRACEMSS(cinfo, 1, JTRC_TFILE_OPEN, info->temp_name);
-  return TRUE;			/* succeeded */
-}
-
-
-/*
- * Access methods for extended memory.
- */
-
-#if XMS_SUPPORTED
-
-static XMSDRIVER xms_driver;	/* saved address of XMS driver */
-
-typedef union {			/* either long offset or real-mode pointer */
-	long offset;
-	void far * ptr;
-      } XMSPTR;
-
-typedef struct {		/* XMS move specification structure */
-	long length;
-	XMSH src_handle;
-	XMSPTR src;
-	XMSH dst_handle;
-	XMSPTR dst;
-      } XMSspec;
-
-#define ODD(X)	(((X) & 1L) != 0)
-
-
-METHODDEF(void)
-read_xms_store (j_common_ptr cinfo, backing_store_ptr info,
-		void FAR * buffer_address,
-		long file_offset, long byte_count)
-{
-  XMScontext ctx;
-  XMSspec spec;
-  char endbuffer[2];
-
-  /* The XMS driver can't cope with an odd length, so handle the last byte
-   * specially if byte_count is odd.  We don't expect this to be common.
-   */
-
-  spec.length = byte_count & (~ 1L);
-  spec.src_handle = info->handle.xms_handle;
-  spec.src.offset = file_offset;
-  spec.dst_handle = 0;
-  spec.dst.ptr = buffer_address;
-  
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x0b00;		/* EMB move */
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax != 1)
-    ERREXIT(cinfo, JERR_XMS_READ);
-
-  if (ODD(byte_count)) {
-    read_xms_store(cinfo, info, (void FAR *) endbuffer,
-		   file_offset + byte_count - 1L, 2L);
-    ((char FAR *) buffer_address)[byte_count - 1L] = endbuffer[0];
-  }
-}
-
-
-METHODDEF(void)
-write_xms_store (j_common_ptr cinfo, backing_store_ptr info,
-		 void FAR * buffer_address,
-		 long file_offset, long byte_count)
-{
-  XMScontext ctx;
-  XMSspec spec;
-  char endbuffer[2];
-
-  /* The XMS driver can't cope with an odd length, so handle the last byte
-   * specially if byte_count is odd.  We don't expect this to be common.
-   */
-
-  spec.length = byte_count & (~ 1L);
-  spec.src_handle = 0;
-  spec.src.ptr = buffer_address;
-  spec.dst_handle = info->handle.xms_handle;
-  spec.dst.offset = file_offset;
-
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x0b00;		/* EMB move */
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax != 1)
-    ERREXIT(cinfo, JERR_XMS_WRITE);
-
-  if (ODD(byte_count)) {
-    read_xms_store(cinfo, info, (void FAR *) endbuffer,
-		   file_offset + byte_count - 1L, 2L);
-    endbuffer[0] = ((char FAR *) buffer_address)[byte_count - 1L];
-    write_xms_store(cinfo, info, (void FAR *) endbuffer,
-		    file_offset + byte_count - 1L, 2L);
-  }
-}
-
-
-METHODDEF(void)
-close_xms_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  XMScontext ctx;
-
-  ctx.dx = info->handle.xms_handle;
-  ctx.ax = 0x0a00;
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  TRACEMS1(cinfo, 1, JTRC_XMS_CLOSE, info->handle.xms_handle);
-  /* we ignore any error return from the driver */
-}
-
-
-LOCAL(boolean)
-open_xms_store (j_common_ptr cinfo, backing_store_ptr info,
-		long total_bytes_needed)
-{
-  XMScontext ctx;
-
-  /* Get address of XMS driver */
-  jxms_getdriver((XMSDRIVER far *) & xms_driver);
-  if (xms_driver == NULL)
-    return FALSE;		/* no driver to be had */
-
-  /* Get version number, must be >= 2.00 */
-  ctx.ax = 0x0000;
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax < (unsigned short) 0x0200)
-    return FALSE;
-
-  /* Try to get space (expressed in kilobytes) */
-  ctx.dx = (unsigned short) ((total_bytes_needed + 1023L) >> 10);
-  ctx.ax = 0x0900;
-  jxms_calldriver(xms_driver, (XMScontext far *) & ctx);
-  if (ctx.ax != 1)
-    return FALSE;
-
-  /* Succeeded, save the handle and away we go */
-  info->handle.xms_handle = ctx.dx;
-  info->read_backing_store = read_xms_store;
-  info->write_backing_store = write_xms_store;
-  info->close_backing_store = close_xms_store;
-  TRACEMS1(cinfo, 1, JTRC_XMS_OPEN, ctx.dx);
-  return TRUE;			/* succeeded */
-}
-
-#endif /* XMS_SUPPORTED */
-
-
-/*
- * Access methods for expanded memory.
- */
-
-#if EMS_SUPPORTED
-
-/* The EMS move specification structure requires word and long fields aligned
- * at odd byte boundaries.  Some compilers will align struct fields at even
- * byte boundaries.  While it's usually possible to force byte alignment,
- * that causes an overall performance penalty and may pose problems in merging
- * JPEG into a larger application.  Instead we accept some rather dirty code
- * here.  Note this code would fail if the hardware did not allow odd-byte
- * word & long accesses, but all 80x86 CPUs do.
- */
-
-typedef void far * EMSPTR;
-
-typedef union {			/* EMS move specification structure */
-	long length;		/* It's easy to access first 4 bytes */
-	char bytes[18];		/* Misaligned fields in here! */
-      } EMSspec;
-
-/* Macros for accessing misaligned fields */
-#define FIELD_AT(spec,offset,type)  (*((type *) &(spec.bytes[offset])))
-#define SRC_TYPE(spec)		FIELD_AT(spec,4,char)
-#define SRC_HANDLE(spec)	FIELD_AT(spec,5,EMSH)
-#define SRC_OFFSET(spec)	FIELD_AT(spec,7,unsigned short)
-#define SRC_PAGE(spec)		FIELD_AT(spec,9,unsigned short)
-#define SRC_PTR(spec)		FIELD_AT(spec,7,EMSPTR)
-#define DST_TYPE(spec)		FIELD_AT(spec,11,char)
-#define DST_HANDLE(spec)	FIELD_AT(spec,12,EMSH)
-#define DST_OFFSET(spec)	FIELD_AT(spec,14,unsigned short)
-#define DST_PAGE(spec)		FIELD_AT(spec,16,unsigned short)
-#define DST_PTR(spec)		FIELD_AT(spec,14,EMSPTR)
-
-#define EMSPAGESIZE	16384L	/* gospel, see the EMS specs */
-
-#define HIBYTE(W)  (((W) >> 8) & 0xFF)
-#define LOBYTE(W)  ((W) & 0xFF)
-
-
-METHODDEF(void)
-read_ems_store (j_common_ptr cinfo, backing_store_ptr info,
-		void FAR * buffer_address,
-		long file_offset, long byte_count)
-{
-  EMScontext ctx;
-  EMSspec spec;
-
-  spec.length = byte_count;
-  SRC_TYPE(spec) = 1;
-  SRC_HANDLE(spec) = info->handle.ems_handle;
-  SRC_PAGE(spec)   = (unsigned short) (file_offset / EMSPAGESIZE);
-  SRC_OFFSET(spec) = (unsigned short) (file_offset % EMSPAGESIZE);
-  DST_TYPE(spec) = 0;
-  DST_HANDLE(spec) = 0;
-  DST_PTR(spec)    = buffer_address;
-  
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x5700;		/* move memory region */
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    ERREXIT(cinfo, JERR_EMS_READ);
-}
-
-
-METHODDEF(void)
-write_ems_store (j_common_ptr cinfo, backing_store_ptr info,
-		 void FAR * buffer_address,
-		 long file_offset, long byte_count)
-{
-  EMScontext ctx;
-  EMSspec spec;
-
-  spec.length = byte_count;
-  SRC_TYPE(spec) = 0;
-  SRC_HANDLE(spec) = 0;
-  SRC_PTR(spec)    = buffer_address;
-  DST_TYPE(spec) = 1;
-  DST_HANDLE(spec) = info->handle.ems_handle;
-  DST_PAGE(spec)   = (unsigned short) (file_offset / EMSPAGESIZE);
-  DST_OFFSET(spec) = (unsigned short) (file_offset % EMSPAGESIZE);
-  
-  ctx.ds_si = (void far *) & spec;
-  ctx.ax = 0x5700;		/* move memory region */
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    ERREXIT(cinfo, JERR_EMS_WRITE);
-}
-
-
-METHODDEF(void)
-close_ems_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  EMScontext ctx;
-
-  ctx.ax = 0x4500;
-  ctx.dx = info->handle.ems_handle;
-  jems_calldriver((EMScontext far *) & ctx);
-  TRACEMS1(cinfo, 1, JTRC_EMS_CLOSE, info->handle.ems_handle);
-  /* we ignore any error return from the driver */
-}
-
-
-LOCAL(boolean)
-open_ems_store (j_common_ptr cinfo, backing_store_ptr info,
-		long total_bytes_needed)
-{
-  EMScontext ctx;
-
-  /* Is EMS driver there? */
-  if (! jems_available())
-    return FALSE;
-
-  /* Get status, make sure EMS is OK */
-  ctx.ax = 0x4000;
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    return FALSE;
-
-  /* Get version, must be >= 4.0 */
-  ctx.ax = 0x4600;
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0 || LOBYTE(ctx.ax) < 0x40)
-    return FALSE;
-
-  /* Try to allocate requested space */
-  ctx.ax = 0x4300;
-  ctx.bx = (unsigned short) ((total_bytes_needed + EMSPAGESIZE-1L) / EMSPAGESIZE);
-  jems_calldriver((EMScontext far *) & ctx);
-  if (HIBYTE(ctx.ax) != 0)
-    return FALSE;
-
-  /* Succeeded, save the handle and away we go */
-  info->handle.ems_handle = ctx.dx;
-  info->read_backing_store = read_ems_store;
-  info->write_backing_store = write_ems_store;
-  info->close_backing_store = close_ems_store;
-  TRACEMS1(cinfo, 1, JTRC_EMS_OPEN, ctx.dx);
-  return TRUE;			/* succeeded */
-}
-
-#endif /* EMS_SUPPORTED */
-
-
-/*
- * Initial opening of a backing-store object.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
-{
-  /* Try extended memory, then expanded memory, then regular file. */
-#if XMS_SUPPORTED
-  if (open_xms_store(cinfo, info, total_bytes_needed))
-    return;
-#endif
-#if EMS_SUPPORTED
-  if (open_ems_store(cinfo, info, total_bytes_needed))
-    return;
-#endif
-  if (open_file_store(cinfo, info, total_bytes_needed))
-    return;
-  ERREXITS(cinfo, JERR_TFILE_CREATE, "");
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
-  next_file_num = 0;		/* initialize temp file name generator */
-  return DEFAULT_MAX_MEM;	/* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
-  /* Microsoft C, at least in v6.00A, will not successfully reclaim freed
-   * blocks of size > 32Kbytes unless we give it a kick in the rear, like so:
-   */
-#ifdef NEED_FHEAPMIN
-  _fheapmin();
-#endif
-}
--- a/jpeg/jmemdosa.asm
+++ b/jpeg/jmemdosa.asm
@ -1,379 +0,0 @@
-;
-; jmemdosa.asm
-;
-; Copyright (C) 1992, Thomas G. Lane.
-; This file is part of the Independent JPEG Group's software.
-; For conditions of distribution and use, see the accompanying README file.
-;
-; This file contains low-level interface routines to support the MS-DOS
-; backing store manager (jmemdos.c).  Routines are provided to access disk
-; files through direct DOS calls, and to access XMS and EMS drivers.
-;
-; This file should assemble with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).  If you haven't got
-; a compatible assembler, better fall back to jmemansi.c or jmemname.c.
-;
-; To minimize dependence on the C compiler's register usage conventions,
-; we save and restore all 8086 registers, even though most compilers only
-; require SI,DI,DS to be preserved.  Also, we use only 16-bit-wide return
-; values, which everybody returns in AX.
-;
-; Based on code contributed by Ge' Weijers.
-;
-
-JMEMDOSA_TXT	segment byte public 'CODE'
-
-		assume	cs:JMEMDOSA_TXT
-
-		public	_jdos_open
-		public	_jdos_close
-		public	_jdos_seek
-		public	_jdos_read
-		public	_jdos_write
-		public	_jxms_getdriver
-		public	_jxms_calldriver
-		public	_jems_available
-		public	_jems_calldriver
-
-;
-; short far jdos_open (short far * handle, char far * filename)
-;
-; Create and open a temporary file
-;
-_jdos_open	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	cx,0			; normal file attributes
-		lds	dx,dword ptr [bp+10]	; get filename pointer
-		mov	ah,3ch			; create file
-		int	21h
-		jc	open_err		; if failed, return error code
-		lds	bx,dword ptr [bp+6]	; get handle pointer
-		mov	word ptr [bx],ax	; save the handle
-		xor	ax,ax			; return zero for OK
-open_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_open	endp
-
-
-;
-; short far jdos_close (short handle)
-;
-; Close the file handle
-;
-_jdos_close	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		mov	ah,3eh			; close file
-		int	21h
-		jc	close_err		; if failed, return error code
-		xor	ax,ax			; return zero for OK
-close_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_close	endp
-
-
-;
-; short far jdos_seek (short handle, long offset)
-;
-; Set file position
-;
-_jdos_seek	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		mov	dx,word ptr [bp+8]	; LS offset
-		mov	cx,word ptr [bp+10]	; MS offset
-		mov	ax,4200h		; absolute seek
-		int	21h
-		jc	seek_err		; if failed, return error code
-		xor	ax,ax			; return zero for OK
-seek_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_seek	endp
-
-
-;
-; short far jdos_read (short handle, void far * buffer, unsigned short count)
-;
-; Read from file
-;
-_jdos_read	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		lds	dx,dword ptr [bp+8]	; buffer address
-		mov	cx,word ptr [bp+12]	; number of bytes
-		mov	ah,3fh			; read file
-		int	21h
-		jc	read_err		; if failed, return error code
-		cmp	ax,word ptr [bp+12]	; make sure all bytes were read
-		je	read_ok
-		mov	ax,1			; else return 1 for not OK
-		jmp	short read_err
-read_ok:	xor	ax,ax			; return zero for OK
-read_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_read	endp
-
-
-;
-; short far jdos_write (short handle, void far * buffer, unsigned short count)
-;
-; Write to file
-;
-_jdos_write	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	bx,word ptr [bp+6]	; file handle
-		lds	dx,dword ptr [bp+8]	; buffer address
-		mov	cx,word ptr [bp+12]	; number of bytes
-		mov	ah,40h			; write file
-		int	21h
-		jc	write_err		; if failed, return error code
-		cmp	ax,word ptr [bp+12]	; make sure all bytes written
-		je	write_ok
-		mov	ax,1			; else return 1 for not OK
-		jmp	short write_err
-write_ok:	xor	ax,ax			; return zero for OK
-write_err:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jdos_write	endp
-
-
-;
-; void far jxms_getdriver (XMSDRIVER far *)
-;
-; Get the address of the XMS driver, or NULL if not available
-;
-_jxms_getdriver	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov 	ax,4300h		; call multiplex interrupt with
-		int	2fh			; a magic cookie, hex 4300
-		cmp 	al,80h			; AL should contain hex 80
-		je	xmsavail
-		xor 	dx,dx			; no XMS driver available
-		xor 	ax,ax			; return a nil pointer
-		jmp	short xmsavail_done
-xmsavail:	mov 	ax,4310h		; fetch driver address with
-		int	2fh			; another magic cookie
-		mov 	dx,es			; copy address to dx:ax
-		mov 	ax,bx
-xmsavail_done:	les 	bx,dword ptr [bp+6]	; get pointer to return value
-		mov	word ptr es:[bx],ax
-		mov	word ptr es:[bx+2],dx
-		pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop	bp
-		ret
-_jxms_getdriver	endp
-
-
-;
-; void far jxms_calldriver (XMSDRIVER, XMScontext far *)
-;
-; The XMScontext structure contains values for the AX,DX,BX,SI,DS registers.
-; These are loaded, the XMS call is performed, and the new values of the
-; AX,DX,BX registers are written back to the context structure.
-;
-_jxms_calldriver 	proc	far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		les 	bx,dword ptr [bp+10]	; get XMScontext pointer
-		mov 	ax,word ptr es:[bx]	; load registers
-		mov 	dx,word ptr es:[bx+2]
-		mov 	si,word ptr es:[bx+6]
-		mov 	ds,word ptr es:[bx+8]
-		mov 	bx,word ptr es:[bx+4]
-		call	dword ptr [bp+6]	; call the driver
-		mov	cx,bx			; save returned BX for a sec
-		les 	bx,dword ptr [bp+10]	; get XMScontext pointer
-		mov 	word ptr es:[bx],ax	; put back ax,dx,bx
-		mov 	word ptr es:[bx+2],dx
-		mov 	word ptr es:[bx+4],cx
-		pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jxms_calldriver 	endp
-
-
-;
-; short far jems_available (void)
-;
-; Have we got an EMS driver? (this comes straight from the EMS 4.0 specs)
-;
-_jems_available	proc	far
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		mov	ax,3567h		; get interrupt vector 67h
-		int	21h
-		push	cs
-		pop	ds
-		mov	di,000ah		; check offs 10 in returned seg
-		lea	si,ASCII_device_name	; against literal string
-		mov	cx,8
-		cld
-		repe cmpsb
-		jne	no_ems
-		mov	ax,1			; match, it's there
-		jmp	short avail_done
-no_ems:		xor	ax,ax			; it's not there
-avail_done:	pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		ret
-
-ASCII_device_name	db	"EMMXXXX0"
-
-_jems_available	endp
-
-
-;
-; void far jems_calldriver (EMScontext far *)
-;
-; The EMScontext structure contains values for the AX,DX,BX,SI,DS registers.
-; These are loaded, the EMS trap is performed, and the new values of the
-; AX,DX,BX registers are written back to the context structure.
-;
-_jems_calldriver	proc far
-		push	bp			; linkage
-		mov 	bp,sp
-		push	si			; save all registers for safety
-		push	di
-		push	bx
-		push	cx
-		push	dx
-		push	es
-		push	ds
-		les 	bx,dword ptr [bp+6]	; get EMScontext pointer
-		mov 	ax,word ptr es:[bx]	; load registers
-		mov 	dx,word ptr es:[bx+2]
-		mov 	si,word ptr es:[bx+6]
-		mov 	ds,word ptr es:[bx+8]
-		mov 	bx,word ptr es:[bx+4]
-		int	67h			; call the EMS driver
-		mov	cx,bx			; save returned BX for a sec
-		les 	bx,dword ptr [bp+6]	; get EMScontext pointer
-		mov 	word ptr es:[bx],ax	; put back ax,dx,bx
-		mov 	word ptr es:[bx+2],dx
-		mov 	word ptr es:[bx+4],cx
-		pop	ds			; restore registers and exit
-		pop	es
-		pop	dx
-		pop	cx
-		pop	bx
-		pop	di
-		pop	si
-		pop 	bp
-		ret
-_jems_calldriver	endp
-
-JMEMDOSA_TXT	ends
-
-		end
--- a/jpeg/jmemmgr.c
+++ b/jpeg/jmemmgr.c
@ -57,22 +57,25 @@ extern char * getenv JPP((const char * name));
 * requirement, and we had better do so too.
 * There isn't any really portable way to determine the worst-case alignment
 * requirement.  This module assumes that the alignment requirement is
- * multiples of sizeof(ALIGN_TYPE).
- * By default, we define ALIGN_TYPE as double.  This is necessary on some
+ * multiples of ALIGN_SIZE.
+ * By default, we define ALIGN_SIZE as sizeof(double).  This is necessary on some
 * workstations (where doubles really do need 8-byte alignment) and will work
 * fine on nearly everything.  If your machine has lesser alignment needs,
- * you can save a few bytes by making ALIGN_TYPE smaller.
+ * you can save a few bytes by making ALIGN_SIZE smaller.
 * The only place I know of where this will NOT work is certain Macintosh
 * 680x0 compilers that define double as a 10-byte IEEE extended float.
 * Doing 10-byte alignment is counterproductive because longwords won't be
- * aligned well.  Put "#define ALIGN_TYPE long" in jconfig.h if you have
+ * aligned well.  Put "#define ALIGN_SIZE 4" in jconfig.h if you have
 * such a compiler.
 */

-#ifndef ALIGN_TYPE		/* so can override from jconfig.h */
-#define ALIGN_TYPE  double
+#ifndef ALIGN_SIZE		/* so can override from jconfig.h */
+#ifndef WITH_SIMD
+#define ALIGN_SIZE  SIZEOF(double)
+#else
+#define ALIGN_SIZE  16 /* Most SIMD implementations require this */
+#endif
 #endif
-

 /*
 * We allocate objects from "pools", where each pool is gotten with a single
@ -81,34 +84,24 @@ extern char * getenv JPP((const char * name));
 * header with a link to the next pool of the same class.
 * Small and large pool headers are identical except that the latter's
 * link pointer must be FAR on 80x86 machines.
- * Notice that the "real" header fields are union'ed with a dummy ALIGN_TYPE
- * field.  This forces the compiler to make SIZEOF(small_pool_hdr) a multiple
- * of the alignment requirement of ALIGN_TYPE.
 */

-typedef union small_pool_struct * small_pool_ptr;
+typedef struct small_pool_struct * small_pool_ptr;

-typedef union small_pool_struct {
-  struct {
-    small_pool_ptr next;	/* next in list of pools */
-    size_t bytes_used;		/* how many bytes already used within pool */
-    size_t bytes_left;		/* bytes still available in this pool */
-  } hdr;
-  ALIGN_TYPE dummy;		/* included in union to ensure alignment */
+typedef struct small_pool_struct {
+  small_pool_ptr next;	/* next in list of pools */
+  size_t bytes_used;		/* how many bytes already used within pool */
+  size_t bytes_left;		/* bytes still available in this pool */
 } small_pool_hdr;

-typedef union large_pool_struct FAR * large_pool_ptr;
+typedef struct large_pool_struct FAR * large_pool_ptr;

-typedef union large_pool_struct {
-  struct {
-    large_pool_ptr next;	/* next in list of pools */
-    size_t bytes_used;		/* how many bytes already used within pool */
-    size_t bytes_left;		/* bytes still available in this pool */
-  } hdr;
-  ALIGN_TYPE dummy;		/* included in union to ensure alignment */
+typedef struct large_pool_struct {
+  large_pool_ptr next;	/* next in list of pools */
+  size_t bytes_used;		/* how many bytes already used within pool */
+  size_t bytes_left;		/* bytes still available in this pool */
 } large_pool_hdr;

-
 /*
 * Here is the full definition of a memory manager object.
 */
@ -129,7 +122,7 @@ typedef struct {
  jvirt_barray_ptr virt_barray_list;

  /* This counts total space obtained from jpeg_get_small/large */
-  long total_space_allocated;
+  size_t total_space_allocated;

  /* alloc_sarray and alloc_barray set this value for use by virtual
   * array routines.
@ -197,16 +190,16 @@ print_mem_stats (j_common_ptr cinfo, int pool_id)
 	  pool_id, mem->total_space_allocated);

  for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
-       lhdr_ptr = lhdr_ptr->hdr.next) {
+       lhdr_ptr = lhdr_ptr->next) {
    fprintf(stderr, "  Large chunk used %ld\n",
-	    (long) lhdr_ptr->hdr.bytes_used);
+	    (long) lhdr_ptr->bytes_used);
  }

  for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
-       shdr_ptr = shdr_ptr->hdr.next) {
+       shdr_ptr = shdr_ptr->next) {
    fprintf(stderr, "  Small chunk used %ld free %ld\n",
-	    (long) shdr_ptr->hdr.bytes_used,
-	    (long) shdr_ptr->hdr.bytes_left);
+	    (long) shdr_ptr->bytes_used,
+	    (long) shdr_ptr->bytes_left);
  }
 }

@ -236,6 +229,10 @@ out_of_memory (j_common_ptr cinfo, int which)
 * and we also distinguish the first pool of a class from later ones.
 * NOTE: the values given work fairly well on both 16- and 32-bit-int
 * machines, but may be too small if longs are 64 bits or more.
+ *
+ * Since we do not know what alignment malloc() gives us, we have to
+ * allocate ALIGN_SIZE-1 extra space per pool to have room for alignment
+ * adjustment.
 */

 static const size_t first_pool_slop[JPOOL_NUMPOOLS] = 
@ -260,33 +257,36 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
  small_pool_ptr hdr_ptr, prev_hdr_ptr;
  char * data_ptr;
-  size_t odd_bytes, min_request, slop;
+  size_t min_request, slop;
+
+  /*
+   * Round up the requested size to a multiple of ALIGN_SIZE in order
+   * to assure alignment for the next object allocated in the same pool
+   * and so that algorithms can straddle outside the proper area up
+   * to the next alignment.
+   */
+  sizeofobject = jround_up(sizeofobject, ALIGN_SIZE);

  /* Check for unsatisfiable request (do now to ensure no overflow below) */
-  if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(small_pool_hdr)))
+  if ((SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK)
    out_of_memory(cinfo, 1);	/* request exceeds malloc's ability */

-  /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
-  odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
-  if (odd_bytes > 0)
-    sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
-
  /* See if space is available in any existing pool */
  if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */
  prev_hdr_ptr = NULL;
  hdr_ptr = mem->small_list[pool_id];
  while (hdr_ptr != NULL) {
-    if (hdr_ptr->hdr.bytes_left >= sizeofobject)
+    if (hdr_ptr->bytes_left >= sizeofobject)
      break;			/* found pool with enough space */
    prev_hdr_ptr = hdr_ptr;
-    hdr_ptr = hdr_ptr->hdr.next;
+    hdr_ptr = hdr_ptr->next;
  }

  /* Time to make a new pool? */
  if (hdr_ptr == NULL) {
    /* min_request is what we need now, slop is what will be leftover */
-    min_request = sizeofobject + SIZEOF(small_pool_hdr);
+    min_request = SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1;
    if (prev_hdr_ptr == NULL)	/* first pool in class? */
      slop = first_pool_slop[pool_id];
    else
@ -305,20 +305,23 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
    }
    mem->total_space_allocated += min_request + slop;
    /* Success, initialize the new pool header and add to end of list */
-    hdr_ptr->hdr.next = NULL;
-    hdr_ptr->hdr.bytes_used = 0;
-    hdr_ptr->hdr.bytes_left = sizeofobject + slop;
+    hdr_ptr->next = NULL;
+    hdr_ptr->bytes_used = 0;
+    hdr_ptr->bytes_left = sizeofobject + slop;
    if (prev_hdr_ptr == NULL)	/* first pool in class? */
      mem->small_list[pool_id] = hdr_ptr;
    else
-      prev_hdr_ptr->hdr.next = hdr_ptr;
+      prev_hdr_ptr->next = hdr_ptr;
  }

  /* OK, allocate the object from the current pool */
-  data_ptr = (char *) (hdr_ptr + 1); /* point to first data byte in pool */
-  data_ptr += hdr_ptr->hdr.bytes_used; /* point to place for object */
-  hdr_ptr->hdr.bytes_used += sizeofobject;
-  hdr_ptr->hdr.bytes_left -= sizeofobject;
+  data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+  data_ptr += SIZEOF(small_pool_hdr); /* ...by skipping the header... */
+  if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
+    data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
+  data_ptr += hdr_ptr->bytes_used; /* point to place for object */
+  hdr_ptr->bytes_used += sizeofobject;
+  hdr_ptr->bytes_left -= sizeofobject;

  return (void *) data_ptr;
 }
@ -344,37 +347,45 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
 {
  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
  large_pool_ptr hdr_ptr;
-  size_t odd_bytes;
+  char FAR * data_ptr;
+
+  /*
+   * Round up the requested size to a multiple of ALIGN_SIZE so that
+   * algorithms can straddle outside the proper area up to the next
+   * alignment.
+   */
+  sizeofobject = jround_up(sizeofobject, ALIGN_SIZE);

  /* Check for unsatisfiable request (do now to ensure no overflow below) */
-  if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)))
+  if ((SIZEOF(large_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK)
    out_of_memory(cinfo, 3);	/* request exceeds malloc's ability */

-  /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
-  odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
-  if (odd_bytes > 0)
-    sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
-
  /* Always make a new pool */
  if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
    ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id);	/* safety check */

  hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject +
-					    SIZEOF(large_pool_hdr));
+					    SIZEOF(large_pool_hdr) +
+					    ALIGN_SIZE - 1);
  if (hdr_ptr == NULL)
    out_of_memory(cinfo, 4);	/* jpeg_get_large failed */
-  mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr);
+  mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr) + ALIGN_SIZE - 1;

  /* Success, initialize the new pool header and add to list */
-  hdr_ptr->hdr.next = mem->large_list[pool_id];
+  hdr_ptr->next = mem->large_list[pool_id];
  /* We maintain space counts in each pool header for statistical purposes,
   * even though they are not needed for allocation.
   */
-  hdr_ptr->hdr.bytes_used = sizeofobject;
-  hdr_ptr->hdr.bytes_left = 0;
+  hdr_ptr->bytes_used = sizeofobject;
+  hdr_ptr->bytes_left = 0;
  mem->large_list[pool_id] = hdr_ptr;

-  return (void FAR *) (hdr_ptr + 1); /* point to first data byte in pool */
+  data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+  data_ptr += SIZEOF(small_pool_hdr); /* ...by skipping the header... */
+  if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
+    data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
+
+  return (void FAR *) data_ptr;
 }


@ -389,6 +400,10 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
 * this chunking of rows.  The rowsperchunk value is left in the mem manager
 * object so that it can be saved away if this sarray is the workspace for
 * a virtual array.
+ *
+ * Since we are often upsampling with a factor 2, we align the size (not
+ * the start) to 2 * ALIGN_SIZE so that the upsampling routines don't have
+ * to be as careful about size.
 */

 METHODDEF(JSAMPARRAY)
@ -402,6 +417,11 @@ alloc_sarray (j_common_ptr cinfo, int pool_id,
  JDIMENSION rowsperchunk, currow, i;
  long ltemp;

+  /* Make sure each row is properly aligned */
+  if ((ALIGN_SIZE % SIZEOF(JSAMPLE)) != 0)
+    out_of_memory(cinfo, 5);	/* safety check */
+  samplesperrow = (JDIMENSION)jround_up(samplesperrow, (2 * ALIGN_SIZE) / SIZEOF(JSAMPLE));
+
  /* Calculate max # of rows allowed in one allocation chunk */
  ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
 	  ((long) samplesperrow * SIZEOF(JSAMPLE));
@ -450,6 +470,10 @@ alloc_barray (j_common_ptr cinfo, int pool_id,
  JDIMENSION rowsperchunk, currow, i;
  long ltemp;

+  /* Make sure each row is properly aligned */
+  if ((SIZEOF(JBLOCK) % ALIGN_SIZE) != 0)
+    out_of_memory(cinfo, 6);	/* safety check */
+
  /* Calculate max # of rows allowed in one allocation chunk */
  ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
 	  ((long) blocksperrow * SIZEOF(JBLOCK));
@ -584,8 +608,8 @@ realize_virt_arrays (j_common_ptr cinfo)
 /* Allocate the in-memory buffers for any unrealized virtual arrays */
 {
  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
-  long space_per_minheight, maximum_space, avail_mem;
-  long minheights, max_minheights;
+  size_t space_per_minheight, maximum_space, avail_mem;
+  size_t minheights, max_minheights;
  jvirt_sarray_ptr sptr;
  jvirt_barray_ptr bptr;

@ -968,9 +992,9 @@ free_pool (j_common_ptr cinfo, int pool_id)
  mem->large_list[pool_id] = NULL;

  while (lhdr_ptr != NULL) {
-    large_pool_ptr next_lhdr_ptr = lhdr_ptr->hdr.next;
-    space_freed = lhdr_ptr->hdr.bytes_used +
-		  lhdr_ptr->hdr.bytes_left +
+    large_pool_ptr next_lhdr_ptr = lhdr_ptr->next;
+    space_freed = lhdr_ptr->bytes_used +
+		  lhdr_ptr->bytes_left +
 		  SIZEOF(large_pool_hdr);
    jpeg_free_large(cinfo, (void FAR *) lhdr_ptr, space_freed);
    mem->total_space_allocated -= space_freed;
@ -982,9 +1006,9 @@ free_pool (j_common_ptr cinfo, int pool_id)
  mem->small_list[pool_id] = NULL;

  while (shdr_ptr != NULL) {
-    small_pool_ptr next_shdr_ptr = shdr_ptr->hdr.next;
-    space_freed = shdr_ptr->hdr.bytes_used +
-		  shdr_ptr->hdr.bytes_left +
+    small_pool_ptr next_shdr_ptr = shdr_ptr->next;
+    space_freed = shdr_ptr->bytes_used +
+		  shdr_ptr->bytes_left +
 		  SIZEOF(small_pool_hdr);
    jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
    mem->total_space_allocated -= space_freed;
@ -1041,16 +1065,16 @@ jinit_memory_mgr (j_common_ptr cinfo)
   * in common if and only if X is a power of 2, ie has only one one-bit.
   * Some compilers may give an "unreachable code" warning here; ignore it.
   */
-  if ((SIZEOF(ALIGN_TYPE) & (SIZEOF(ALIGN_TYPE)-1)) != 0)
+  if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0)
    ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
  /* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
-   * a multiple of SIZEOF(ALIGN_TYPE).
+   * a multiple of ALIGN_SIZE.
   * Again, an "unreachable code" warning may be ignored here.
   * But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
   */
  test_mac = (size_t) MAX_ALLOC_CHUNK;
  if ((long) test_mac != MAX_ALLOC_CHUNK ||
-      (MAX_ALLOC_CHUNK % SIZEOF(ALIGN_TYPE)) != 0)
+      (MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
    ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);

  max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
--- a/jpeg/jmemname.c
+++ b/jpeg/jmemname.c
@ -1,276 +0,0 @@
-/*
- * jmemname.c
- *
- * Copyright (C) 1992-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file provides a generic implementation of the system-dependent
- * portion of the JPEG memory manager.  This implementation assumes that
- * you must explicitly construct a name for each temp file.
- * Also, the problem of determining the amount of memory available
- * is shoved onto the user.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jmemsys.h"		/* import the system-dependent declarations */
-
-#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
-extern void * malloc JPP((size_t size));
-extern void free JPP((void *ptr));
-#endif
-
-#ifndef SEEK_SET		/* pre-ANSI systems may not define this; */
-#define SEEK_SET  0		/* if not, assume 0 is correct */
-#endif
-
-#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
-#define READ_BINARY	"r"
-#define RW_BINARY	"w+"
-#else
-#ifdef VMS			/* VMS is very nonstandard */
-#define READ_BINARY	"rb", "ctx=stm"
-#define RW_BINARY	"w+b", "ctx=stm"
-#else				/* standard ANSI-compliant case */
-#define READ_BINARY	"rb"
-#define RW_BINARY	"w+b"
-#endif
-#endif
-
-
-/*
- * Selection of a file name for a temporary file.
- * This is system-dependent!
- *
- * The code as given is suitable for most Unix systems, and it is easily
- * modified for most non-Unix systems.  Some notes:
- *  1.  The temp file is created in the directory named by TEMP_DIRECTORY.
- *      The default value is /usr/tmp, which is the conventional place for
- *      creating large temp files on Unix.  On other systems you'll probably
- *      want to change the file location.  You can do this by editing the
- *      #define, or (preferred) by defining TEMP_DIRECTORY in jconfig.h.
- *
- *  2.  If you need to change the file name as well as its location,
- *      you can override the TEMP_FILE_NAME macro.  (Note that this is
- *      actually a printf format string; it must contain %s and %d.)
- *      Few people should need to do this.
- *
- *  3.  mktemp() is used to ensure that multiple processes running
- *      simultaneously won't select the same file names.  If your system
- *      doesn't have mktemp(), define NO_MKTEMP to do it the hard way.
- *      (If you don't have <errno.h>, also define NO_ERRNO_H.)
- *
- *  4.  You probably want to define NEED_SIGNAL_CATCHER so that cjpeg.c/djpeg.c
- *      will cause the temp files to be removed if you stop the program early.
- */
-
-#ifndef TEMP_DIRECTORY		/* can override from jconfig.h or Makefile */
-#define TEMP_DIRECTORY  "/usr/tmp/" /* recommended setting for Unix */
-#endif
-
-static int next_file_num;	/* to distinguish among several temp files */
-
-#ifdef NO_MKTEMP
-
-#ifndef TEMP_FILE_NAME		/* can override from jconfig.h or Makefile */
-#define TEMP_FILE_NAME  "%sJPG%03d.TMP"
-#endif
-
-#ifndef NO_ERRNO_H
-#include <errno.h>		/* to define ENOENT */
-#endif
-
-/* ANSI C specifies that errno is a macro, but on older systems it's more
- * likely to be a plain int variable.  And not all versions of errno.h
- * bother to declare it, so we have to in order to be most portable.  Thus:
- */
-#ifndef errno
-extern int errno;
-#endif
-
-
-LOCAL(void)
-select_file_name (char * fname)
-{
-  FILE * tfile;
-
-  /* Keep generating file names till we find one that's not in use */
-  for (;;) {
-    next_file_num++;		/* advance counter */
-    sprintf(fname, TEMP_FILE_NAME, TEMP_DIRECTORY, next_file_num);
-    if ((tfile = fopen(fname, READ_BINARY)) == NULL) {
-      /* fopen could have failed for a reason other than the file not
-       * being there; for example, file there but unreadable.
-       * If <errno.h> isn't available, then we cannot test the cause.
-       */
-#ifdef ENOENT
-      if (errno != ENOENT)
-	continue;
-#endif
-      break;
-    }
-    fclose(tfile);		/* oops, it's there; close tfile & try again */
-  }
-}
-
-#else /* ! NO_MKTEMP */
-
-/* Note that mktemp() requires the initial filename to end in six X's */
-#ifndef TEMP_FILE_NAME		/* can override from jconfig.h or Makefile */
-#define TEMP_FILE_NAME  "%sJPG%dXXXXXX"
-#endif
-
-LOCAL(void)
-select_file_name (char * fname)
-{
-  next_file_num++;		/* advance counter */
-  sprintf(fname, TEMP_FILE_NAME, TEMP_DIRECTORY, next_file_num);
-  mktemp(fname);		/* make sure file name is unique */
-  /* mktemp replaces the trailing XXXXXX with a unique string of characters */
-}
-
-#endif /* NO_MKTEMP */
-
-
-/*
- * Memory allocation and freeing are controlled by the regular library
- * routines malloc() and free().
- */
-
-GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * "Large" objects are treated the same as "small" ones.
- * NB: although we include FAR keywords in the routine declarations,
- * this file won't actually work in 80x86 small/medium model; at least,
- * you probably won't be able to process useful-size images in only 64KB.
- */
-
-GLOBAL(void FAR *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
-{
-  return (void FAR *) malloc(sizeofobject);
-}
-
-GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
-{
-  free(object);
-}
-
-
-/*
- * This routine computes the total memory space available for allocation.
- * It's impossible to do this in a portable way; our current solution is
- * to make the user tell us (with a default value set at compile time).
- * If you can actually get the available space, it's a good idea to subtract
- * a slop factor of 5% or so.
- */
-
-#ifndef DEFAULT_MAX_MEM		/* so can override from makefile */
-#define DEFAULT_MAX_MEM		1000000L /* default: one megabyte */
-#endif
-
-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
-{
-  return cinfo->mem->max_memory_to_use - already_allocated;
-}
-
-
-/*
- * Backing store (temporary file) management.
- * Backing store objects are only used when the value returned by
- * jpeg_mem_available is less than the total space needed.  You can dispense
- * with these routines if you have plenty of virtual memory; see jmemnobs.c.
- */
-
-
-METHODDEF(void)
-read_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		    void FAR * buffer_address,
-		    long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFREAD(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_READ);
-}
-
-
-METHODDEF(void)
-write_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-		     void FAR * buffer_address,
-		     long file_offset, long byte_count)
-{
-  if (fseek(info->temp_file, file_offset, SEEK_SET))
-    ERREXIT(cinfo, JERR_TFILE_SEEK);
-  if (JFWRITE(info->temp_file, buffer_address, byte_count)
-      != (size_t) byte_count)
-    ERREXIT(cinfo, JERR_TFILE_WRITE);
-}
-
-
-METHODDEF(void)
-close_backing_store (j_common_ptr cinfo, backing_store_ptr info)
-{
-  fclose(info->temp_file);	/* close the file */
-  unlink(info->temp_name);	/* delete the file */
-/* If your system doesn't have unlink(), use remove() instead.
- * remove() is the ANSI-standard name for this function, but if
- * your system was ANSI you'd be using jmemansi.c, right?
- */
-  TRACEMSS(cinfo, 1, JTRC_TFILE_CLOSE, info->temp_name);
-}
-
-
-/*
- * Initial opening of a backing-store object.
- */
-
-GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-			 long total_bytes_needed)
-{
-  select_file_name(info->temp_name);
-  if ((info->temp_file = fopen(info->temp_name, RW_BINARY)) == NULL)
-    ERREXITS(cinfo, JERR_TFILE_CREATE, info->temp_name);
-  info->read_backing_store = read_backing_store;
-  info->write_backing_store = write_backing_store;
-  info->close_backing_store = close_backing_store;
-  TRACEMSS(cinfo, 1, JTRC_TFILE_OPEN, info->temp_name);
-}
-
-
-/*
- * These routines take care of any system-dependent initialization and
- * cleanup required.
- */
-
-GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
-{
-  next_file_num = 0;		/* initialize temp file name generator */
-  return DEFAULT_MAX_MEM;	/* default for max_memory_to_use */
-}
-
-GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
-{
-  /* no work */
-}
--- a/jpeg/jmemnobs.c
+++ b/jpeg/jmemnobs.c
@ -69,9 +69,9 @@ jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
 * Here we always say, "we got all you want bud!"
 */

-GLOBAL(long)
-jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
-		    long max_bytes_needed, long already_allocated)
+GLOBAL(size_t)
+jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
+		    size_t max_bytes_needed, size_t already_allocated)
 {
  return max_bytes_needed;
 }
--- a/jpeg/jmemsys.h
+++ b/jpeg/jmemsys.h
@ -100,10 +100,10 @@ EXTERN(void) jpeg_free_large JPP((j_common_ptr cinfo, void FAR * object,
 * Conversely, zero may be returned to always use the minimum amount of memory.
 */

-EXTERN(long) jpeg_mem_available JPP((j_common_ptr cinfo,
-				     long min_bytes_needed,
-				     long max_bytes_needed,
-				     long already_allocated));
+EXTERN(size_t) jpeg_mem_available JPP((j_common_ptr cinfo,
+				     size_t min_bytes_needed,
+				     size_t max_bytes_needed,
+				     size_t already_allocated));


 /*
--- a/jpeg/jmorecfg.h
+++ b/jpeg/jmorecfg.h
@ -2,6 +2,7 @@
 * jmorecfg.h
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -10,13 +11,7 @@
 * optimizations.  Most users will not need to touch this file.
 */

-/*
- * This file has been modified for the Mozilla/Netscape environment.
- * Modifications are distributed under the mozilla.org tri-license and are
- * Copyright (C) 1998 Netscape Communications Corporation.  All Rights
- * Reserved. See http://www.mozilla.org/MPL/
- */
-
+#include "prtypes.h"

 /*
 * Define BITS_IN_JSAMPLE as either
@ -69,11 +64,11 @@ typedef unsigned char JSAMPLE;
 #else /* not HAVE_UNSIGNED_CHAR */

 typedef char JSAMPLE;
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
 #define GETJSAMPLE(value)  ((int) (value))
 #else
 #define GETJSAMPLE(value)  ((int) (value) & 0xFF)
-#endif /* CHAR_IS_UNSIGNED */
+#endif /* __CHAR_UNSIGNED__ */

 #endif /* HAVE_UNSIGNED_CHAR */

@ -105,24 +100,6 @@ typedef short JSAMPLE;

 typedef short JCOEF;

-/* Defines for MMX/SSE2 support. */
-
-#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__)
-#define HAVE_MMX_INTEL_MNEMONICS 
-
-/* SSE2 code appears broken for some cpus (bug 247437) */
-#define HAVE_SSE2_INTEL_MNEMONICS
-#define HAVE_SSE2_INTRINSICS
-#endif
-
-#if defined(__GNUC__) && defined(__i386__)
-#if defined(XP_MACOSX)
-#define HAVE_SSE2_INTRINSICS
-#endif /* ! XP_MACOSX */
-#endif /* ! GNUC && i386 */
-
-/* Add support for other platforms here */
-

 /* Compressed datastreams are represented as arrays of JOCTET.
 * These must be EXACTLY 8 bits wide, at least once they are written to
@ -138,11 +115,11 @@ typedef unsigned char JOCTET;
 #else /* not HAVE_UNSIGNED_CHAR */

 typedef char JOCTET;
-#ifdef CHAR_IS_UNSIGNED
+#ifdef __CHAR_UNSIGNED__
 #define GETJOCTET(value)  (value)
 #else
 #define GETJOCTET(value)  ((value) & 0xFF)
-#endif /* CHAR_IS_UNSIGNED */
+#endif /* __CHAR_UNSIGNED__ */

 #endif /* HAVE_UNSIGNED_CHAR */

@ -156,39 +133,19 @@ typedef char JOCTET;

 /* UINT8 must hold at least the values 0..255. */

-#ifdef HAVE_UNSIGNED_CHAR
-typedef unsigned char UINT8;
-#else /* not HAVE_UNSIGNED_CHAR */
-#ifdef CHAR_IS_UNSIGNED
-typedef char UINT8;
-#else /* not CHAR_IS_UNSIGNED */
-typedef short UINT8;
-#endif /* CHAR_IS_UNSIGNED */
-#endif /* HAVE_UNSIGNED_CHAR */
+typedef PRUint8 UINT8;

 /* UINT16 must hold at least the values 0..65535. */

-#ifdef HAVE_UNSIGNED_SHORT
-typedef unsigned short UINT16;
-#else /* not HAVE_UNSIGNED_SHORT */
-typedef unsigned int UINT16;
-#endif /* HAVE_UNSIGNED_SHORT */
+typedef PRUint16 UINT16;

 /* INT16 must hold at least the values -32768..32767. */

-#ifndef XMD_H			/* X11/xmd.h correctly defines INT16 */
-typedef short INT16;
-#endif
+typedef PRInt16 INT16;

 /* INT32 must hold at least signed 32-bit values. */

-#ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
-#ifndef _BASETSD_H_		/* basetsd.h correctly defines INT32 */
-#ifndef _BASETSD_H
-typedef long INT32;
-#endif
-#endif
-#endif
+typedef PRInt32 INT32;

 /* Datatype used for image dimensions.  The JPEG standard only supports
 * images up to 64K*64K due to 16-bit fields in SOF markers.  Therefore
@ -209,21 +166,14 @@ typedef unsigned int JDIMENSION;
 * or code profilers that require it.
 */

-/* Mozilla mod: make external functions be DLL-able via JRI_PUBLIC_API(),
- * and supply extern "C" for C++ users of the C-compiled IJG library.
- * (Well, not anymore, but there's still a modification here.)
- */
-#include "prtypes.h"
-
 /* a function called through method pointers: */
 #define METHODDEF(type)		static type
 /* a function used only in its module: */
 #define LOCAL(type)		static type
-
-PR_BEGIN_EXTERN_C
-#define GLOBAL(type) type
-#define EXTERN(type) extern type
-PR_END_EXTERN_C
+/* a function referenced thru EXTERNs: */
+#define GLOBAL(type)		type
+/* a reference to a GLOBAL function: */
+#define EXTERN(type)		extern type


 /* This macro is used to declare a "method", that is, a function pointer.
@ -245,13 +195,11 @@ PR_END_EXTERN_C
 * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
 */

-#ifndef FAR
 #ifdef NEED_FAR_POINTERS
 #define FAR  far
 #else
 #define FAR
 #endif
-#endif


 /*
@ -261,20 +209,8 @@ PR_END_EXTERN_C
 * Defining HAVE_BOOLEAN before including jpeglib.h should make it work.
 */

-/* Mozilla mod: IJG distribution makes boolean = int, but on Windows
- * it's far safer to define boolean = unsigned char.  Easier to switch
- * than fight.
- */
-
-/* For some reason, on SunOS 5.3 HAVE_BOOLEAN gets defined when using
- * gcc, but boolean doesn't.  Even if you use -UHAVE_BOOLEAN, it still
- * gets reset somewhere.
- */
-#if defined(MUST_UNDEF_HAVE_BOOLEAN_AFTER_INCLUDES) && defined(HAVE_BOOLEAN)
-#undef HAVE_BOOLEAN
-#endif
 #ifndef HAVE_BOOLEAN
-typedef unsigned char boolean;
+typedef int boolean;
 #endif
 #ifndef FALSE			/* in case these macros already exist */
 #define FALSE	0		/* values of boolean */
@ -306,22 +242,13 @@ typedef unsigned char boolean;
 * (You may HAVE to do that if your compiler doesn't like null source files.)
 */

-/*
- * Mozilla mods here: undef some features not actually used by the browser.
- * This reduces object code size and more importantly allows us to compile
- * even with broken compilers that crash when fed certain modules of the
- * IJG sources.  Currently we undef:
- * DCT_FLOAT_SUPPORTED INPUT_SMOOTHING_SUPPORTED IDCT_SCALING_SUPPORTED
- * QUANT_1PASS_SUPPORTED QUANT_2PASS_SUPPORTED
- */
-
 /* Arithmetic coding is unsupported for legal reasons.  Complaints to IBM. */

 /* Capability options common to encoder and decoder: */

 #define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
-#undef  DCT_IFAST_SUPPORTED	/* faster, less accurate integer method */
-#undef  DCT_FLOAT_SUPPORTED	/* floating-point: accurate, fast on fast HW */
+#define DCT_IFAST_SUPPORTED	/* faster, less accurate integer method */
+#define DCT_FLOAT_SUPPORTED	/* floating-point: accurate, fast on fast HW */

 /* Encoder capability options: */

@ -337,7 +264,7 @@ typedef unsigned char boolean;
 * The exact same statements apply for progressive JPEG: the default tables
 * don't work for progressive mode.  (This may get fixed, however.)
 */
-#undef  INPUT_SMOOTHING_SUPPORTED   /* Input image smoothing option? */
+#define INPUT_SMOOTHING_SUPPORTED   /* Input image smoothing option? */

 /* Decoder capability options: */

@ -346,11 +273,11 @@ typedef unsigned char boolean;
 #define D_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
 #define SAVE_MARKERS_SUPPORTED	    /* jpeg_save_markers() needed? */
 #define BLOCK_SMOOTHING_SUPPORTED   /* Block smoothing? (Progressive only) */
-#undef  IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
+#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
 #undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
 #define UPSAMPLE_MERGING_SUPPORTED  /* Fast path for sloppy upsampling? */
-#undef  QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
-#undef  QUANT_2PASS_SUPPORTED	    /* 2-pass color quantization? */
+#define QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
+#define QUANT_2PASS_SUPPORTED	    /* 2-pass color quantization? */

 /* more capability options later, no doubt */

@ -375,40 +302,37 @@ typedef unsigned char boolean;
 #define RGB_BLUE	2	/* Offset of Blue */
 #define RGB_PIXELSIZE	3	/* JSAMPLEs per RGB scanline element */

+#define JPEG_NUMCS 12
+
+static const int rgb_red[JPEG_NUMCS] = {
+	-1, -1, RGB_RED, -1, -1, -1, 0, 0, 2, 2, 3, 1
+};
+
+static const int rgb_green[JPEG_NUMCS] = {
+	-1, -1, RGB_GREEN, -1, -1, -1, 1, 1, 1, 1, 2, 2
+};
+
+static const int rgb_blue[JPEG_NUMCS] = {
+	-1, -1, RGB_BLUE, -1, -1, -1, 2, 2, 0, 0, 1, 3
+};
+
+static const int rgb_pixelsize[JPEG_NUMCS] = {
+	-1, -1, RGB_PIXELSIZE, -1, -1, -1, 3, 4, 3, 4, 4, 4
+};

 /* Definitions for speed-related optimizations. */

-
-/* If your compiler supports inline functions, define INLINE
- * as the inline keyword; otherwise define it as empty.
- */
-
-/* Mozilla mods here: add more ways of defining INLINE */
-
-#ifndef INLINE
-#ifdef __GNUC__			/* for instance, GNU C knows about inline */
-#define INLINE __inline__
-#endif
-#if defined( __IBMC__ ) || defined (__IBMCPP__)
-#define INLINE _Inline
-#endif
-#ifndef INLINE
-#ifdef __cplusplus
-#define INLINE inline		/* a C++ compiler should have it too */
-#else
-#define INLINE			/* default is to define it as empty */
-#endif
-#endif
-#endif
-
-
 /* On some machines (notably 68000 series) "int" is 32 bits, but multiplying
 * two 16-bit shorts is faster than multiplying two ints.  Define MULTIPLIER
 * as short on such a machine.  MULTIPLIER must be at least 16 bits wide.
 */

 #ifndef MULTIPLIER
-#define MULTIPLIER  int16		/* type for fastest integer multiply */
+#ifndef WITH_SIMD
+#define MULTIPLIER  int		/* type for fastest integer multiply */
+#else
+#define MULTIPLIER short  /* prefer 16-bit with SIMD for parellelism */
+#endif
 #endif


--- a/jpeg/jos2fig.h
+++ b/jpeg/jos2fig.h
@ -1,45 +0,0 @@
-/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* ***** BEGIN LICENSE BLOCK *****
- * Version: MPL 1.1/GPL 2.0/LGPL 2.1
- *
- * The contents of this file are subject to the Mozilla Public License Version
- * 1.1 (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- * for the specific language governing rights and limitations under the
- * License.
- *
- * The Original Code is mozilla.org code.
- *
- * The Initial Developer of the Original Code is
- * Netscape Communications Corporation.
- * Portions created by the Initial Developer are Copyright (C) 1998
- * the Initial Developer. All Rights Reserved.
- *
- * Contributor(s):
- *
- * Alternatively, the contents of this file may be used under the terms of
- * either the GNU General Public License Version 2 or later (the "GPL"), or
- * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
- * in which case the provisions of the GPL or the LGPL are applicable instead
- * of those above. If you wish to allow use of your version of this file only
- * under the terms of either the GPL or the LGPL, and not to allow others to
- * use your version of this file under the terms of the MPL, indicate your
- * decision by deleting the provisions above and replace them with the notice
- * and other provisions required by the GPL or the LGPL. If you do not delete
- * the provisions above, a recipient may use your version of this file under
- * the terms of any one of the MPL, the GPL or the LGPL.
- *
- * ***** END LICENSE BLOCK ***** */
-
-#ifndef __jos2fig_h__
-#define __jos2fig_h__
-
-/*
-** Place holder for the OS/2 code that might actually make it into the trunk someday.  Maybe.
-*/
-
-#endif  /* __jos2fig_h__ */
--- a/jpeg/jpegcomp.h
+++ b/jpeg/jpegcomp.h
@ -0,0 +1,26 @@
+/*
+ * jpegcomp.h
+ *
+ * Copyright (C) 2010, D. R. Commander
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * JPEG compatibility macros
+ * These declarations are considered internal to the JPEG library; most
+ * applications using the library shouldn't need to include this file.
+ */
+
+#if JPEG_LIB_VERSION >= 70
+#define _DCT_scaled_size DCT_h_scaled_size
+#define _min_DCT_scaled_size min_DCT_h_scaled_size
+#define _min_DCT_h_scaled_size min_DCT_h_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_v_scaled_size
+#define _jpeg_width jpeg_width
+#define _jpeg_height jpeg_height
+#else
+#define _DCT_scaled_size DCT_scaled_size
+#define _min_DCT_scaled_size min_DCT_scaled_size
+#define _min_DCT_h_scaled_size min_DCT_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_scaled_size
+#define _jpeg_width image_width
+#define _jpeg_height image_height
+#endif
--- a/jpeg/jpegint.h
+++ b/jpeg/jpegint.h
@ -2,6 +2,7 @@
 * jpegint.h
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -304,6 +305,7 @@ struct jpeg_color_quantizer {
 #define jinit_forward_dct	jIFDCT
 #define jinit_huff_encoder	jIHEncoder
 #define jinit_phuff_encoder	jIPHEncoder
+#define jinit_arith_encoder	jIAEncoder
 #define jinit_marker_writer	jIMWriter
 #define jinit_master_decompress	jIDMaster
 #define jinit_d_main_controller	jIDMainC
@ -313,6 +315,7 @@ struct jpeg_color_quantizer {
 #define jinit_marker_reader	jIMReader
 #define jinit_huff_decoder	jIHDecoder
 #define jinit_phuff_decoder	jIPHDecoder
+#define jinit_arith_decoder	jIADecoder
 #define jinit_inverse_dct	jIIDCT
 #define jinit_upsampler		jIUpsampler
 #define jinit_color_deconverter	jIDColor
@ -327,6 +330,7 @@ struct jpeg_color_quantizer {
 #define jzero_far		jZeroFar
 #define jpeg_zigzag_order	jZIGTable
 #define jpeg_natural_order	jZAGTable
+#define jpeg_aritab		jAriTab
 #endif /* NEED_SHORT_EXTERNAL_NAMES */


@ -345,6 +349,7 @@ EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_huff_encoder JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_phuff_encoder JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_arith_encoder JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo));
 /* Decompression module initialization routines */
 EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo));
@ -358,6 +363,7 @@ EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_phuff_decoder JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_arith_decoder JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_inverse_dct JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_upsampler JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_color_deconverter JPP((j_decompress_ptr cinfo));
@ -369,7 +375,7 @@ EXTERN(void) jinit_memory_mgr JPP((j_common_ptr cinfo));

 /* Utility routines in jutils.c */
 EXTERN(long) jdiv_round_up JPP((long a, long b));
-EXTERN(long) jround_up JPP((long a, long b));
+EXTERN(size_t) jround_up JPP((size_t a, size_t b));
 EXTERN(void) jcopy_sample_rows JPP((JSAMPARRAY input_array, int source_row,
 				    JSAMPARRAY output_array, int dest_row,
 				    int num_rows, JDIMENSION num_cols));
@ -382,6 +388,9 @@ extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
 #endif
 extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */

+/* Arithmetic coding probability estimation tables in jaricom.c */
+extern const INT32 jpeg_aritab[];
+
 /* Suppress undefined-structure complaints if necessary. */

 #ifdef INCOMPLETE_TYPES_BROKEN
--- a/jpeg/jpeglib.h
+++ b/jpeg/jpeglib.h
@ -2,6 +2,8 @@
 * jpeglib.h
 *
 * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
+ * Copyright (C) 2009-2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -13,23 +15,6 @@
 #ifndef JPEGLIB_H
 #define JPEGLIB_H

-#ifdef XP_OS2
-/*
- * On OS/2, the system will have defined RGB_* so we #undef 'em to avoid warnings
- * from jmorecfg.h.
- */
-#ifdef RGB_RED
-	#undef RGB_RED
-#endif
-#ifdef RGB_GREEN
-	#undef RGB_GREEN
-#endif
-#ifdef RGB_BLUE
-	#undef RGB_BLUE
-#endif
-
-#endif
-
 /*
 * First we include the configuration files that record how this
 * installation of the JPEG library is set up.  jconfig.h can be
@ -43,16 +28,11 @@
 #include "jmorecfg.h"		/* seldom changed options */


-#ifdef HAVE_MMX_INTEL_MNEMONICS
-	extern int MMXAvailable;
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+extern "C" {
+#endif
 #endif
-
-
-/* Version ID for the JPEG library.
- * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
- */
-
-#define JPEG_LIB_VERSION  62	/* Version 6b */


 /* Various constants determining the sizes of things.
@ -166,12 +146,17 @@ typedef struct {
   * Values of 1,2,4,8 are likely to be supported.  Note that different
   * components may receive different IDCT scalings.
   */
+#if JPEG_LIB_VERSION >= 70
+  int DCT_h_scaled_size;
+  int DCT_v_scaled_size;
+#else
  int DCT_scaled_size;
+#endif
  /* The downsampled dimensions are the component's actual, unpadded number
   * of samples at the main buffer (preprocessing/compression interface), thus
   * downsampled_width = ceil(image_width * Hi/Hmax)
   * and similarly for height.  For decompression, IDCT scaling is included, so
-   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_scaled_size/DCTSIZE)
+   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE)
   */
  JDIMENSION downsampled_width;	 /* actual width in samples */
  JDIMENSION downsampled_height; /* actual height in samples */
@ -186,7 +171,7 @@ typedef struct {
  int MCU_width;		/* number of blocks per MCU, horizontally */
  int MCU_height;		/* number of blocks per MCU, vertically */
  int MCU_blocks;		/* MCU_width * MCU_height */
-  int MCU_sample_width;		/* MCU width in samples, MCU_width*DCT_scaled_size */
+  int MCU_sample_width;		/* MCU width in samples, MCU_width*DCT_[h_]scaled_size */
  int last_col_width;		/* # of non-dummy blocks across in last MCU */
  int last_row_height;		/* # of non-dummy blocks down in last MCU */

@ -225,13 +210,22 @@ struct jpeg_marker_struct {

 /* Known color spaces. */

+#define JCS_EXTENSIONS 1
+
 typedef enum {
 	JCS_UNKNOWN,		/* error/unspecified */
 	JCS_GRAYSCALE,		/* monochrome */
-	JCS_RGB,		/* red/green/blue */
+	JCS_RGB,		/* red/green/blue as specified by the RGB_RED, RGB_GREEN,
+                 RGB_BLUE, and RGB_PIXELSIZE macros */
 	JCS_YCbCr,		/* Y/Cb/Cr (also known as YUV) */
 	JCS_CMYK,		/* C/M/Y/K */
-	JCS_YCCK		/* Y/Cb/Cr/K */
+	JCS_YCCK,		/* Y/Cb/Cr/K */
+	JCS_EXT_RGB,		/* red/green/blue */
+	JCS_EXT_RGBX,		/* red/green/blue/x */
+	JCS_EXT_BGR,		/* blue/green/red */
+	JCS_EXT_BGRX,		/* blue/green/red/x */
+	JCS_EXT_XBGR,		/* x/blue/green/red */
+	JCS_EXT_XRGB		/* x/red/green/blue */
 } J_COLOR_SPACE;

 /* DCT/IDCT algorithm options. */
@ -313,6 +307,19 @@ struct jpeg_compress_struct {
   * helper routines to simplify changing parameters.
   */

+#if JPEG_LIB_VERSION >= 70
+  unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+  JDIMENSION jpeg_width;	/* scaled JPEG image width */
+  JDIMENSION jpeg_height;	/* scaled JPEG image height */
+  /* Dimensions of actual JPEG image that will be written to file,
+   * derived from input dimensions by scaling factors above.
+   * These fields are computed by jpeg_start_compress().
+   * You can also use jpeg_calc_jpeg_dimensions() to determine these values
+   * in advance of calling jpeg_start_compress().
+   */
+#endif
+
  int data_precision;		/* bits of precision in image data */

  int num_components;		/* # of color components in JPEG image */
@ -320,14 +327,19 @@ struct jpeg_compress_struct {

  jpeg_component_info * comp_info;
  /* comp_info[i] describes component that appears i'th in SOF */
-  
+
  JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
-  /* ptrs to coefficient quantization tables, or NULL if not defined */
-  
+#if JPEG_LIB_VERSION >= 70
+  int q_scale_factor[NUM_QUANT_TBLS];
+#endif
+  /* ptrs to coefficient quantization tables, or NULL if not defined,
+   * and corresponding scale factors (percentage, initialized 100).
+   */
+
  JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
  JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
  /* ptrs to Huffman coding tables, or NULL if not defined */
-  
+
  UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
  UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
  UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
@ -343,6 +355,9 @@ struct jpeg_compress_struct {
  boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
  boolean optimize_coding;	/* TRUE=optimize entropy encoding parms */
  boolean CCIR601_sampling;	/* TRUE=first samples are cosited */
+#if JPEG_LIB_VERSION >= 70
+  boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
+#endif
  int smoothing_factor;		/* 1..100, or 0 for no input smoothing */
  J_DCT_METHOD dct_method;	/* DCT algorithm selector */

@ -386,6 +401,11 @@ struct jpeg_compress_struct {
  int max_h_samp_factor;	/* largest h_samp_factor */
  int max_v_samp_factor;	/* largest v_samp_factor */

+#if JPEG_LIB_VERSION >= 70
+  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+#endif
+
  JDIMENSION total_iMCU_rows;	/* # of iMCU rows to be input to coef ctlr */
  /* The coefficient controller receives data in units of MCU rows as defined
   * for fully interleaved scans (whether the JPEG file is interleaved or not).
@ -411,6 +431,12 @@ struct jpeg_compress_struct {

  int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */

+#if JPEG_LIB_VERSION >= 80
+  int block_size;		/* the basic DCT block size: 1..16 */
+  const int * natural_order;	/* natural-order position array */
+  int lim_Se;			/* min( Se, DCTSIZE2-1 ) */
+#endif
+
  /*
   * Links to compression subobjects (methods and private variables of modules)
   */
@ -557,6 +583,9 @@ struct jpeg_decompress_struct {
  jpeg_component_info * comp_info;
  /* comp_info[i] describes component that appears i'th in SOF */

+#if JPEG_LIB_VERSION >= 80
+  boolean is_baseline;		/* TRUE if Baseline SOF0 encountered */
+#endif
  boolean progressive_mode;	/* TRUE if SOFn specifies progressive mode */
  boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */

@ -597,7 +626,12 @@ struct jpeg_decompress_struct {
  int max_h_samp_factor;	/* largest h_samp_factor */
  int max_v_samp_factor;	/* largest v_samp_factor */

+#if JPEG_LIB_VERSION >= 70
+  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+#else
  int min_DCT_scaled_size;	/* smallest DCT_scaled_size of any component */
+#endif

  JDIMENSION total_iMCU_rows;	/* # of iMCU rows in image */
  /* The coefficient controller's input and output progress is measured in
@ -605,7 +639,7 @@ struct jpeg_decompress_struct {
   * in fully interleaved JPEG scans, but are used whether the scan is
   * interleaved or not.  We define an iMCU row as v_samp_factor DCT block
   * rows of each component.  Therefore, the IDCT output contains
-   * v_samp_factor*DCT_scaled_size sample rows of a component per iMCU row.
+   * v_samp_factor*DCT_[v_]scaled_size sample rows of a component per iMCU row.
   */

  JSAMPLE * sample_range_limit; /* table for fast range-limiting */
@ -629,6 +663,14 @@ struct jpeg_decompress_struct {

  int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */

+#if JPEG_LIB_VERSION >= 80
+  /* These fields are derived from Se of first SOS marker.
+   */
+  int block_size;		/* the basic DCT block size: 1..16 */
+  const int * natural_order; /* natural-order position array for entropy decode */
+  int lim_Se;			/* min( Se, DCTSIZE2-1 ) for entropy decode */
+#endif
+
  /* This field is shared between entropy decoder and marker parser.
   * It is either zero or the code of a JPEG marker that has been
   * read from the data source, but has not yet been processed.
@ -858,11 +900,18 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_destroy_decompress	jDestDecompress
 #define jpeg_stdio_dest		jStdDest
 #define jpeg_stdio_src		jStdSrc
+#if JPEG_LIB_VERSION >= 80
+#define jpeg_mem_dest		jMemDest
+#define jpeg_mem_src		jMemSrc
+#endif
 #define jpeg_set_defaults	jSetDefaults
 #define jpeg_set_colorspace	jSetColorspace
 #define jpeg_default_colorspace	jDefColorspace
 #define jpeg_set_quality	jSetQuality
 #define jpeg_set_linear_quality	jSetLQuality
+#if JPEG_LIB_VERSION >= 70
+#define jpeg_default_qtables	jDefQTables
+#endif
 #define jpeg_add_quant_table	jAddQuantTable
 #define jpeg_quality_scaling	jQualityScaling
 #define jpeg_simple_progression	jSimProgress
@ -872,6 +921,9 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_start_compress	jStrtCompress
 #define jpeg_write_scanlines	jWrtScanlines
 #define jpeg_finish_compress	jFinCompress
+#if JPEG_LIB_VERSION >= 70
+#define jpeg_calc_jpeg_dimensions	jCjpegDimensions
+#endif
 #define jpeg_write_raw_data	jWrtRawData
 #define jpeg_write_marker	jWrtMarker
 #define jpeg_write_m_header	jWrtMHeader
@ -888,6 +940,9 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_input_complete	jInComplete
 #define jpeg_new_colormap	jNewCMap
 #define jpeg_consume_input	jConsumeInput
+#if JPEG_LIB_VERSION >= 80
+#define jpeg_core_output_dimensions	jCoreDimensions
+#endif
 #define jpeg_calc_output_dimensions	jCalcDimensions
 #define jpeg_save_markers	jSaveMarkers
 #define jpeg_set_marker_processor	jSetMarker
@ -901,9 +956,6 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_resync_to_restart	jResyncRestart
 #endif /* NEED_SHORT_EXTERNAL_NAMES */

-#ifdef __cplusplus
-extern "C" {
-#endif

 /* Default error-management setup */
 EXTERN(struct jpeg_error_mgr *) jpeg_std_error
@ -935,6 +987,16 @@ EXTERN(void) jpeg_destroy_decompress JPP((j_decompress_ptr cinfo));
 EXTERN(void) jpeg_stdio_dest JPP((j_compress_ptr cinfo, FILE * outfile));
 EXTERN(void) jpeg_stdio_src JPP((j_decompress_ptr cinfo, FILE * infile));

+#if JPEG_LIB_VERSION >= 80
+/* Data source and destination managers: memory buffers. */
+EXTERN(void) jpeg_mem_dest JPP((j_compress_ptr cinfo,
+			       unsigned char ** outbuffer,
+			       unsigned long * outsize));
+EXTERN(void) jpeg_mem_src JPP((j_decompress_ptr cinfo,
+			      unsigned char * inbuffer,
+			      unsigned long insize));
+#endif
+
 /* Default parameter setup for compression */
 EXTERN(void) jpeg_set_defaults JPP((j_compress_ptr cinfo));
 /* Compression parameter setup aids */
@ -946,6 +1008,10 @@ EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality,
 EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo,
 					  int scale_factor,
 					  boolean force_baseline));
+#if JPEG_LIB_VERSION >= 70
+EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo,
+				       boolean force_baseline));
+#endif
 EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl,
 				       const unsigned int *basic_table,
 				       int scale_factor,
@ -965,12 +1031,17 @@ EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo,
 					     JDIMENSION num_lines));
 EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo));

+#if JPEG_LIB_VERSION >= 70
+/* Precalculate JPEG dimensions for current compression parameters. */
+EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo));
+#endif
+
 /* Replaces jpeg_write_scanlines when writing raw downsampled data. */
 EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo,
 					    JSAMPIMAGE data,
 					    JDIMENSION num_lines));

-/* Write a special marker.  See libjpeg.doc concerning safe usage. */
+/* Write a special marker.  See libjpeg.txt concerning safe usage. */
 EXTERN(void) jpeg_write_marker
 	JPP((j_compress_ptr cinfo, int marker,
 	     const JOCTET * dataptr, unsigned int datalen));
@ -1024,6 +1095,9 @@ EXTERN(int) jpeg_consume_input JPP((j_decompress_ptr cinfo));
 #define JPEG_SCAN_COMPLETED	4 /* Completed last iMCU row of a scan */

 /* Precalculate output dimensions for current decompression parameters. */
+#if JPEG_LIB_VERSION >= 80
+EXTERN(void) jpeg_core_output_dimensions JPP((j_decompress_ptr cinfo));
+#endif
 EXTERN(void) jpeg_calc_output_dimensions JPP((j_decompress_ptr cinfo));

 /* Control saving of COM and APPn markers into marker_list. */
@ -1062,9 +1136,6 @@ EXTERN(void) jpeg_destroy JPP((j_common_ptr cinfo));
 EXTERN(boolean) jpeg_resync_to_restart JPP((j_decompress_ptr cinfo,
 					    int desired));

-#ifdef __cplusplus
-} /* extern "C" */
-#endif /* __cplusplus */

 /* These marker codes are exported since applications and data source modules
 * are likely to want to use them.
@ -1121,4 +1192,10 @@ struct jpeg_color_quantizer { long dummy; };
 #include "jerror.h"		/* fetch error codes too */
 #endif

+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+}
+#endif
+#endif
+
 #endif /* JPEGLIB_H */
--- a/jpeg/jquant1.c
+++ b/jpeg/jquant1.c
@ -2,6 +2,7 @@
 * jquant1.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -193,7 +194,10 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
  int total_colors, iroot, i, j;
  boolean changed;
  long temp;
-  static const int RGB_order[3] = { RGB_GREEN, RGB_RED, RGB_BLUE };
+  int RGB_order[3] = { RGB_GREEN, RGB_RED, RGB_BLUE };
+  RGB_order[0] = rgb_green[cinfo->out_color_space];
+  RGB_order[1] = rgb_red[cinfo->out_color_space];
+  RGB_order[2] = rgb_blue[cinfo->out_color_space];

  /* We can allocate at least the nc'th root of max_colors per component. */
  /* Compute floor(nc'th root of max_colors). */
--- a/jpeg/jquant2.c
+++ b/jpeg/jquant2.c
@ -2,6 +2,7 @@
 * jquant2.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Copyright (C) 2009, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -74,29 +75,10 @@
 #define G_SCALE 3		/* scale G distances by this much */
 #define B_SCALE 1		/* and B by this much */

-/* Relabel R/G/B as components 0/1/2, respecting the RGB ordering defined
- * in jmorecfg.h.  As the code stands, it will do the right thing for R,G,B
- * and B,G,R orders.  If you define some other weird order in jmorecfg.h,
- * you'll get compile errors until you extend this logic.  In that case
- * you'll probably want to tweak the histogram sizes too.
- */
-
-#if RGB_RED == 0
-#define C0_SCALE R_SCALE
-#endif
-#if RGB_BLUE == 0
-#define C0_SCALE B_SCALE
-#endif
-#if RGB_GREEN == 1
-#define C1_SCALE G_SCALE
-#endif
-#if RGB_RED == 2
-#define C2_SCALE R_SCALE
-#endif
-#if RGB_BLUE == 2
-#define C2_SCALE B_SCALE
-#endif
-
+static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
+#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]
+#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]]
+#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]]

 /*
 * First we have the histogram data structure and routines for creating it.
@ -454,15 +436,16 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
    /* We want to break any ties in favor of green, then red, blue last.
     * This code does the right thing for R,G,B or B,G,R color orders only.
     */
-#if RGB_RED == 0
-    cmax = c1; n = 1;
-    if (c0 > cmax) { cmax = c0; n = 0; }
-    if (c2 > cmax) { n = 2; }
-#else
-    cmax = c1; n = 1;
-    if (c2 > cmax) { cmax = c2; n = 2; }
-    if (c0 > cmax) { n = 0; }
-#endif
+    if (rgb_red[cinfo->out_color_space] == 0) {
+      cmax = c1; n = 1;
+      if (c0 > cmax) { cmax = c0; n = 0; }
+      if (c2 > cmax) { n = 2; }
+    }
+    else {
+      cmax = c1; n = 1;
+      if (c2 > cmax) { cmax = c2; n = 2; }
+      if (c0 > cmax) { n = 0; }
+    }
    /* Choose split point along selected axis, and update box bounds.
     * Current algorithm: split at halfway point.
     * (Since the box has been shrunk to minimum volume,
--- a/jpeg/jsimd.h
+++ b/jpeg/jsimd.h
@ -0,0 +1,90 @@
+/*
+ * jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jsimd_can_rgb_ycc                 jSCanRgbYcc
+#define jsimd_can_ycc_rgb                 jSCanYccRgb
+#define jsimd_rgb_ycc_convert             jSRgbYccConv
+#define jsimd_ycc_rgb_convert             jSYccRgbConv
+#define jsimd_can_h2v2_downsample         jSCanH2V2Down
+#define jsimd_can_h2v1_downsample         jSCanH2V1Down
+#define jsimd_h2v2_downsample             jSH2V2Down
+#define jsimd_h2v1_downsample             jSH2V1Down
+#define jsimd_can_h2v2_upsample           jSCanH2V2Up
+#define jsimd_can_h2v1_upsample           jSCanH2V1Up
+#define jsimd_h2v2_upsample               jSH2V2Up
+#define jsimd_h2v1_upsample               jSH2V1Up
+#define jsimd_can_h2v2_fancy_upsample     jSCanH2V2FUp
+#define jsimd_can_h2v1_fancy_upsample     jSCanH2V1FUp
+#define jsimd_h2v2_fancy_upsample         jSH2V2FUp
+#define jsimd_h2v1_fancy_upsample         jSH2V1FUp
+#define jsimd_can_h2v2_merged_upsample    jSCanH2V2MUp
+#define jsimd_can_h2v1_merged_upsample    jSCanH2V1MUp
+#define jsimd_h2v2_merged_upsample        jSH2V2MUp
+#define jsimd_h2v1_merged_upsample        jSH2V1MUp
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+EXTERN(int) jsimd_can_rgb_ycc JPP((void));
+EXTERN(int) jsimd_can_ycc_rgb JPP((void));
+
+EXTERN(void) jsimd_rgb_ycc_convert
+        JPP((j_compress_ptr cinfo,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_ycc_rgb_convert
+        JPP((j_decompress_ptr cinfo,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+EXTERN(int) jsimd_can_h2v2_downsample JPP((void));
+EXTERN(int) jsimd_can_h2v1_downsample JPP((void));
+
+EXTERN(void) jsimd_h2v2_downsample
+        JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample
+        JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(int) jsimd_can_h2v2_upsample JPP((void));
+EXTERN(int) jsimd_can_h2v1_upsample JPP((void));
+
+EXTERN(void) jsimd_h2v2_upsample
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(int) jsimd_can_h2v2_fancy_upsample JPP((void));
+EXTERN(int) jsimd_can_h2v1_fancy_upsample JPP((void));
+
+EXTERN(void) jsimd_h2v2_fancy_upsample
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample
+        JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(int) jsimd_can_h2v2_merged_upsample JPP((void));
+EXTERN(int) jsimd_can_h2v1_merged_upsample JPP((void));
+
+EXTERN(void) jsimd_h2v2_merged_upsample
+        JPP((j_decompress_ptr cinfo,
+             JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+             JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample
+        JPP((j_decompress_ptr cinfo,
+             JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+             JSAMPARRAY output_buf));
+
--- a/jpeg/jsimd_none.c
+++ b/jpeg/jsimd_none.c
@ -0,0 +1,300 @@
+/*
+ * jsimd_none.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2009 D. R. Commander
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains stubs for when there is no SIMD support available.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jsimd.h"
+#include "jdct.h"
+#include "jsimddct.h"
+
+GLOBAL(int)
+jsimd_can_rgb_ycc (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                       JDIMENSION output_row, int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
+                       JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+                     jpeg_component_info * compptr, 
+                     JSAMPARRAY input_data,
+                     JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+                           jpeg_component_info * compptr, 
+                           JSAMPARRAY input_data,
+                           JSAMPARRAY * output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
+                            JSAMPIMAGE input_buf,
+                            JDIMENSION in_row_group_ctr,
+                            JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(int)
+jsimd_can_convsamp (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+                DCTELEM * workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+                      FAST_FLOAT * workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2 (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4 (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast (void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float (void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
+{
+}
+
--- a/jpeg/jsimddct.h
+++ b/jpeg/jsimddct.h
@ -0,0 +1,102 @@
+/*
+ * jsimddct.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jsimd_can_convsamp                jSCanConv
+#define jsimd_can_convsamp_float          jSCanConvF
+#define jsimd_convsamp                    jSConv
+#define jsimd_convsamp_float              jSConvF
+#define jsimd_can_fdct_islow              jSCanFDCTIS
+#define jsimd_can_fdct_ifast              jSCanFDCTIF
+#define jsimd_can_fdct_float              jSCanFDCTFl
+#define jsimd_fdct_islow                  jSFDCTIS
+#define jsimd_fdct_ifast                  jSFDCTIF
+#define jsimd_fdct_float                  jSFDCTFl
+#define jsimd_can_quantize                jSCanQuant
+#define jsimd_can_quantize_float          jSCanQuantF
+#define jsimd_quantize                    jSQuant
+#define jsimd_quantize_float              jSQuantF
+#define jsimd_can_idct_2x2                jSCanIDCT22
+#define jsimd_can_idct_4x4                jSCanIDCT44
+#define jsimd_idct_2x2                    jSIDCT22
+#define jsimd_idct_4x4                    jSIDCT44
+#define jsimd_can_idct_islow              jSCanIDCTIS
+#define jsimd_can_idct_ifast              jSCanIDCTIF
+#define jsimd_can_idct_float              jSCanIDCTFl
+#define jsimd_idct_islow                  jSIDCTIS
+#define jsimd_idct_ifast                  jSIDCTIF
+#define jsimd_idct_float                  jSIDCTFl
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+EXTERN(int) jsimd_can_convsamp JPP((void));
+EXTERN(int) jsimd_can_convsamp_float JPP((void));
+
+EXTERN(void) jsimd_convsamp JPP((JSAMPARRAY sample_data,
+                                 JDIMENSION start_col,
+                                 DCTELEM * workspace));
+EXTERN(void) jsimd_convsamp_float JPP((JSAMPARRAY sample_data,
+                                       JDIMENSION start_col,
+                                       FAST_FLOAT * workspace));
+
+EXTERN(int) jsimd_can_fdct_islow JPP((void));
+EXTERN(int) jsimd_can_fdct_ifast JPP((void));
+EXTERN(int) jsimd_can_fdct_float JPP((void));
+
+EXTERN(void) jsimd_fdct_islow JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_float JPP((FAST_FLOAT * data));
+
+EXTERN(int) jsimd_can_quantize JPP((void));
+EXTERN(int) jsimd_can_quantize_float JPP((void));
+
+EXTERN(void) jsimd_quantize JPP((JCOEFPTR coef_block,
+                                 DCTELEM * divisors,
+                                 DCTELEM * workspace));
+EXTERN(void) jsimd_quantize_float JPP((JCOEFPTR coef_block,
+                                       FAST_FLOAT * divisors,
+                                       FAST_FLOAT * workspace));
+
+EXTERN(int) jsimd_can_idct_2x2 JPP((void));
+EXTERN(int) jsimd_can_idct_4x4 JPP((void));
+
+EXTERN(void) jsimd_idct_2x2 JPP((j_decompress_ptr cinfo,
+                                 jpeg_component_info * compptr,
+                                 JCOEFPTR coef_block,
+                                 JSAMPARRAY output_buf,
+                                 JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4 JPP((j_decompress_ptr cinfo,
+                                 jpeg_component_info * compptr,
+                                 JCOEFPTR coef_block,
+                                 JSAMPARRAY output_buf,
+                                 JDIMENSION output_col));
+
+EXTERN(int) jsimd_can_idct_islow JPP((void));
+EXTERN(int) jsimd_can_idct_ifast JPP((void));
+EXTERN(int) jsimd_can_idct_float JPP((void));
+
+EXTERN(void) jsimd_idct_islow JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+EXTERN(void) jsimd_idct_float JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+
--- a/jpeg/jutils.c
+++ b/jpeg/jutils.c
@ -77,8 +77,8 @@ jdiv_round_up (long a, long b)
 }


-GLOBAL(long)
-jround_up (long a, long b)
+GLOBAL(size_t)
+jround_up (size_t a, size_t b)
 /* Compute a rounded up to next multiple of b, ie, ceil(a/b)*b */
 /* Assumes a >= 0, b > 0 */
 {
--- a/jpeg/jversion.h
+++ b/jpeg/jversion.h
@ -1,7 +1,8 @@
 /*
 * jversion.h
 *
- * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 2010, D. R. Commander.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@ -9,6 +10,28 @@
 */


+#if JPEG_LIB_VERSION >= 80
+
+#define JVERSION	"8b  16-May-2010"
+
+#define JCOPYRIGHT	"Copyright (C) 2010, Thomas G. Lane, Guido Vollbeding"
+
+#elif JPEG_LIB_VERSION >= 70
+
+#define JVERSION        "7  27-Jun-2009"
+
+#define JCOPYRIGHT      "Copyright (C) 2009, Thomas G. Lane, Guido Vollbeding"
+
+#else
+
 #define JVERSION	"6b  27-Mar-1998"

 #define JCOPYRIGHT	"Copyright (C) 1998, Thomas G. Lane"
+
+#endif
+
+#define LJTCOPYRIGHT	"Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+			"Copyright (C) 2004 Landmark Graphics Corporation\n" \
+			"Copyright (C) 2005-2007 Sun Microsystems, Inc.\n" \
+			"Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
+			"Copyright (C) 2009-2011 D. R. Commander"
--- a/jpeg/jwinfig.h
+++ b/jpeg/jwinfig.h
@ -1,48 +0,0 @@
-/* jconfig.mc6 --- jconfig.h for Microsoft C on MS-DOS, version 6.00A & up. */
-/* see jconfig.doc for explanations */
-
-/* this is a hack */
-#define HAVE_BOOLEAN
-#ifndef __RPCNDR_H__
-typedef unsigned char boolean;
-#endif
-
-#define HAVE_PROTOTYPES
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef CHAR_IS_UNSIGNED
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
-#undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS	/* for small or medium memory model */
-#undef NEED_SHORT_EXTERNAL_NAMES
-#undef INCOMPLETE_TYPES_BROKEN
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#define USE_MSDOS_MEMANSI
-
-#define MAX_ALLOC_CHUNK 65520L	/* Maximum request to malloc() */
-
-#endif /* JPEG_INTERNALS */
-
-#ifdef JPEG_CJPEG_DJPEG
-
-#define BMP_SUPPORTED		/* BMP image file format */
-#define GIF_SUPPORTED		/* GIF image file format */
-#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED		/* Utah RLE image file format */
-#define TARGA_SUPPORTED		/* Targa image file format */
-
-#define TWO_FILE_COMMANDLINE
-#define USE_SETMODE		/* Microsoft has setmode() */
-#define NEED_SIGNAL_CATCHER	/* Define this if you use jmemdos.c */
-#undef DONT_USE_B_MODE
-#undef PROGRESS_REPORT		/* optional */
-
-#endif /* JPEG_CJPEG_DJPEG */
--- a/jpeg/libjpeg.doc
+++ b/jpeg/libjpeg.doc
--- a/jpeg/makefile.gen
+++ b/jpeg/makefile.gen
@ -1,274 +0,0 @@
-# Generated automatically from makefile.cfg by configure.
-# Makefile for Independent JPEG Group's software
-
-# makefile.cfg is edited by configure to produce a custom Makefile.
-
-# Read installation instructions before saying "make" !!
-
-# For compiling with source and object files in different directories.
-srcdir = $(VPATH)
-
-# Where to install the programs and man pages.
-prefix = /usr/local
-exec_prefix = ${prefix}
-bindir = $(exec_prefix)/bin
-libdir = $(exec_prefix)/lib
-includedir = $(prefix)/include
-binprefix =
-manprefix =
-manext = 1
-mandir = $(prefix)/man/man$(manext)
-
-# The name of your C compiler:
-CC= cc
-
-# You may need to adjust these cc options:
-CFLAGS= -O3  -I$(srcdir)
-# Generally, we recommend defining any configuration symbols in jconfig.h,
-# NOT via -D switches here.
-# However, any special defines for ansi2knr.c may be included here:
-ANSI2KNRFLAGS= 
-
-# Link-time cc options:
-LDFLAGS= 
-
-# To link any special libraries, add the necessary -l commands here.
-LDLIBS= 
-
-# Put here the object file name for the correct system-dependent memory
-# manager file.  For Unix this is usually jmemnobs.o, but you may want
-# to use jmemansi.o or jmemname.o if you have limited swap space.
-SYSDEPMEM= jmemnobs.o
-
-# miscellaneous OS-dependent stuff
-SHELL= /bin/sh
-# linker
-LN= $(CC)
-# file deletion command
-RM= rm -f
-# file rename command
-MV= mv
-# library (.a) file creation command
-AR= ar rc
-# second step in .a creation (use "touch" if not needed)
-AR2= ranlib
-# installation program
-INSTALL= cp
-INSTALL_PROGRAM= ${INSTALL}
-INSTALL_DATA= ${INSTALL}
-
-# End of configurable options.
-
-
-# source files: JPEG library proper
-LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
-        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
-        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
-        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
-        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
-        jquant2.c jutils.c jmemmgr.c jmemansi.c jmemname.c jmemnobs.c \
-        jmemdos.c
-# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
-APPSOURCES= cjpeg.c djpeg.c jpegtran.c cdjpeg.c rdcolmap.c rdswitch.c \
-        rdjpgcom.c wrjpgcom.c rdppm.c wrppm.c rdgif.c wrgif.c rdtarga.c \
-        wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
-SOURCES= $(LIBSOURCES) $(APPSOURCES)
-# files included by source files
-INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h
-# documentation, test, and support files
-DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
-        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
-        coderules.doc filelist.doc change.log
-MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
-        makefile.mc6 makefile.dj makefile.wat makcjpeg.st makdjpeg.st \
-        makljpeg.st maktjpeg.st makefile.manx makefile.sas makefile.mms \
-        makefile.vms makvms.opt
-CONFIGFILES= jconfig.cfg jconfig.manx jconfig.sas jconfig.st jconfig.bcc \
-        jconfig.mc6 jconfig.dj jconfig.wat jconfig.vms
-OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
-TESTFILES= testorig.jpg testimg.ppm testimg.gif testimg.jpg testprog.jpg \
-        testimgp.jpg
-DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
-        $(OTHERFILES) $(TESTFILES)
-# library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
-# compression library object files
-CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
-        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
-# decompression library object files
-DLIBOBJECTS= jdapimin.o jdapistd.o jdatasrc.o jdmaster.o \
-        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
-# These objectfiles are included in libjpeg.a
-LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
-# object files for sample applications (excluding library files)
-COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
-        cdjpeg.o
-DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
-        cdjpeg.o
-TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o
-
-all: libjpeg.a
-
-realall:  libjpeg.a cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-
-# This rule causes ansi2knr to be invoked.
-# .c.o:
-# 	./ansi2knr $(srcdir)/$*.c T$*.c
-# 	$(CC) $(CFLAGS) -c T$*.c
-# 	$(RM) T$*.c $*.o
-# 	$(MV) T$*.o $*.o
-
-ansi2knr: ansi2knr.c
-	$(CC) $(CFLAGS) $(ANSI2KNRFLAGS) -o ansi2knr ansi2knr.c
-
-# Decompression-only library
-libjpeg.a:  $(DLIBOBJECTS) $(COMOBJECTS)
-	$(RM) libjpeg.a
-	$(AR) libjpeg.a  $(DLIBOBJECTS) $(COMOBJECTS)
-	$(AR2) libjpeg.a
-
-cjpeg: $(COBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o cjpeg $(COBJECTS) libjpeg.a $(LDLIBS)
-
-djpeg: $(DOBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o djpeg $(DOBJECTS) libjpeg.a $(LDLIBS)
-
-jpegtran: $(TROBJECTS) libjpeg.a
-	$(LN) $(LDFLAGS) -o jpegtran $(TROBJECTS) libjpeg.a $(LDLIBS)
-
-rdjpgcom: rdjpgcom.o
-	$(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.o $(LDLIBS)
-
-wrjpgcom: wrjpgcom.o
-	$(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.o $(LDLIBS)
-
-jconfig.h: jconfig.doc
-	echo You must prepare a system-dependent jconfig.h file.
-	echo Please read the installation directions in install.doc.
-	exit 1
-
-install: cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-	$(INSTALL_PROGRAM) cjpeg $(bindir)/$(binprefix)cjpeg
-	$(INSTALL_PROGRAM) djpeg $(bindir)/$(binprefix)djpeg
-	$(INSTALL_PROGRAM) jpegtran $(bindir)/$(binprefix)jpegtran
-	$(INSTALL_PROGRAM) rdjpgcom $(bindir)/$(binprefix)rdjpgcom
-	$(INSTALL_PROGRAM) wrjpgcom $(bindir)/$(binprefix)wrjpgcom
-	$(INSTALL_DATA) $(srcdir)/cjpeg.1 $(mandir)/$(manprefix)cjpeg.$(manext)
-	$(INSTALL_DATA) $(srcdir)/djpeg.1 $(mandir)/$(manprefix)djpeg.$(manext)
-	$(INSTALL_DATA) $(srcdir)/jpegtran.1 $(mandir)/$(manprefix)jpegtran.$(manext)
-	$(INSTALL_DATA) $(srcdir)/rdjpgcom.1 $(mandir)/$(manprefix)rdjpgcom.$(manext)
-	$(INSTALL_DATA) $(srcdir)/wrjpgcom.1 $(mandir)/$(manprefix)wrjpgcom.$(manext)
-
-install-lib: libjpeg.a install-headers
-	$(INSTALL_DATA) libjpeg.a $(libdir)/$(binprefix)libjpeg.a
-
-install-headers: jconfig.h
-	$(INSTALL_DATA) jconfig.h $(includedir)/jconfig.h
-	$(INSTALL_DATA) $(srcdir)/jpeglib.h $(includedir)/jpeglib.h
-	$(INSTALL_DATA) $(srcdir)/jmorecfg.h $(includedir)/jmorecfg.h
-	$(INSTALL_DATA) $(srcdir)/jerror.h $(includedir)/jerror.h
-
-clean:
-	$(RM) *.o cjpeg djpeg jpegtran libjpeg.a rdjpgcom wrjpgcom
-	$(RM) ansi2knr core testout* config.log config.status
-
-distribute:
-	$(RM) jpegsrc.tar*
-	tar cvf jpegsrc.tar $(DISTFILES)
-	compress -v jpegsrc.tar
-
-test: cjpeg djpeg jpegtran
-	$(RM) testout*
-	./djpeg -dct int -ppm -outfile testout.ppm  $(srcdir)/testorig.jpg
-	./djpeg -dct int -gif -outfile testout.gif  $(srcdir)/testorig.jpg
-	./cjpeg -dct int -outfile testout.jpg  $(srcdir)/testimg.ppm
-	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)/testprog.jpg
-	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)/testimg.ppm
-	./jpegtran -outfile testoutt.jpg $(srcdir)/testprog.jpg
-	cmp $(srcdir)/testimg.ppm testout.ppm
-	cmp $(srcdir)/testimg.gif testout.gif
-	cmp $(srcdir)/testimg.jpg testout.jpg
-	cmp $(srcdir)/testimg.ppm testoutp.ppm
-	cmp $(srcdir)/testimgp.jpg testoutp.jpg
-	cmp $(srcdir)/testorig.jpg testoutt.jpg
-
-check: test
-
-# GNU Make likes to know which target names are not really files to be made:
-.PHONY: all install install-lib install-headers clean distribute test check
-
-
-jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
-jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
-jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
-jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
-cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
-cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
-wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
-rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
-wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
--- a/jpeg/netscape_mods.doc
+++ b/jpeg/netscape_mods.doc
@ -1,52 +0,0 @@
-***** BEGIN LICENSE BLOCK *****
-Version: MPL 1.1/GPL 2.0/LGPL 2.1
-
-The contents of this file are subject to the Mozilla Public License Version 
-1.1 (the "License"); you may not use this file except in compliance with 
-the License. You may obtain a copy of the License at 
-http://www.mozilla.org/MPL/
-
-Software distributed under the License is distributed on an "AS IS" basis,
-WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
-for the specific language governing rights and limitations under the
-License.
-
-The Original Code is mozilla.org code.
-
-The Initial Developer of the Original Code is
-Netscape Communications Corporation
-Portions created by the Initial Developer are Copyright (C) 1998
-the Initial Developer. All Rights Reserved.
-
-Contributor(s):
-
-Alternatively, the contents of this file may be used under the terms of
-either the GNU General Public License Version 2 or later (the "GPL"), or
-the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
-in which case the provisions of the GPL or the LGPL are applicable instead
-of those above. If you wish to allow use of your version of this file only
-under the terms of either the GPL or the LGPL, and not to allow others to
-use your version of this file under the terms of the MPL, indicate your
-decision by deleting the provisions above and replace them with the notice
-and other provisions required by the GPL or the LGPL. If you do not delete
-the provisions above, a recipient may use your version of this file under
-the terms of any one of the MPL, the GPL or the LGPL.
-
-***** END LICENSE BLOCK *****
-
-This directory contains a subset of the IJG JPEG library.  Among other
-omissions, most of the original IJG documentation has been deleted.
-You can find the full IJG distribution at the archive sites mentioned in
-the README file.  Please note that the IJG code does not fall under the
-Netscape NPL, but is freely distributable under its own copyright terms
-(see README).
-
-Several files have been modified to allow incorporation of the IJG code
-into the Netscape environment.  As of this writing, jconfig.h, jmorecfg.h,
-and jerror.c contain Netscape-specific changes.  In addition, we have created
-our own makefiles following Netscape conventions, rather than using any of
-those provided by IJG.
-
-There are some other changes herein, such as MMX-specific optimizations,
-which should eventually make their way back into the standard IJG
-distribution.
--- a/jpeg/simd/Makefile.in
+++ b/jpeg/simd/Makefile.in
@ -0,0 +1,48 @@
+#
+# ***** BEGIN LICENSE BLOCK *****
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is mozilla.org code.
+#
+# The Initial Developer of the Original Code is
+# Mozilla Corporation
+# Portions created by the Initial Developer are Copyright (C) 2010
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+#  Justin Lebar <justin.lebar@gmail.com>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ***** END LICENSE BLOCK *****
+
+DEPTH		= ../..
+topsrcdir	= @top_srcdir@
+srcdir		= @srcdir@
+VPATH		= @srcdir@
+
+include $(DEPTH)/config/autoconf.mk
+
+# empty makefile so this directory gets created in the objdir.
+
+include $(topsrcdir)/config/rules.mk
--- a/jpeg/simd/jcclrmmx.asm
+++ b/jpeg/simd/jcclrmmx.asm
@ -0,0 +1,479 @@
+;
+; jcclrmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
+;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                           JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION img_width
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		8
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [img_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	pushpic	eax
+	push	edx
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	eax
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	xor	eax,eax
+	mov	al, BYTE [esi+ecx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	xor	edx,edx
+	mov	dx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx
+.column_ld4:
+	movd	mmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	mmG, DWORD [esi+ecx]
+	psllq	mmA, DWORD_BIT
+	por	mmA,mmG
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	movq	mmG,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	mov	ecx, SIZEOF_MMWORD
+	jmp	short .rgb_ycc_cnv
+.column_ld16:
+	test	cl, 2*SIZEOF_MMWORD
+	mov	ecx, SIZEOF_MMWORD
+	jz	short .rgb_ycc_cnv
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 01 11 21 02 12)
+	; mmG=(22 03 13 23 04 14 24 05)
+	; mmF=(15 25 06 16 26 07 17 27)
+
+	movq      mmD,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
+	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
+
+	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
+	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
+
+	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
+	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
+
+	movq      mmE,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
+	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
+
+	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
+
+	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
+	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
+
+	pxor      mmH,mmH
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
+
+	movq      mmB,mmE
+	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
+	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
+
+	movq      mmF,mmD
+	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
+	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_MMWORD/8
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_MMWORD/8
+	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_MMWORD/4
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_MMWORD/4
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+	test	cl, SIZEOF_MMWORD/2
+	mov	ecx, SIZEOF_MMWORD
+	jz	short .rgb_ycc_cnv
+	movq	mmD,mmA
+	movq	mmC,mmF
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 30 01 11 21 31)
+	; mmF=(02 12 22 32 03 13 23 33)
+	; mmD=(04 14 24 34 05 15 25 35)
+	; mmC=(06 16 26 36 07 17 27 37)
+
+	movq      mmB,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
+	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
+
+	movq      mmG,mmD
+	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
+	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
+
+	movq      mmE,mmA
+	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
+
+	movq      mmH,mmB
+	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
+	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
+
+	pxor      mmF,mmF
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
+
+	movq      mmD,mmB
+	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
+	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
+
+	movq      mmG,mmE
+	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
+	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
+
+	punpcklbw mmF,mmH
+	punpckhbw mmH,mmH
+	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
+	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movq      MMWORD [wk(0)], mm0	; wk(0)=RE
+	movq      MMWORD [wk(1)], mm1	; wk(1)=RO
+	movq      MMWORD [wk(2)], mm4	; wk(2)=BE
+	movq      MMWORD [wk(3)], mm5	; wk(3)=BO
+
+	movq      mm6,mm1
+	punpcklwd mm1,mm3
+	punpckhwd mm6,mm3
+	movq      mm7,mm1
+	movq      mm4,mm6
+	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movq      MMWORD [wk(4)], mm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movq      MMWORD [wk(5)], mm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      mm1,mm1
+	pxor      mm6,mm6
+	punpcklwd mm1,mm5		; mm1=BOL
+	punpckhwd mm6,mm5		; mm6=BOH
+	psrld     mm1,1			; mm1=BOL*FIX(0.500)
+	psrld     mm6,1			; mm6=BOH*FIX(0.500)
+
+	movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm1
+	paddd     mm4,mm6
+	paddd     mm7,mm5
+	paddd     mm4,mm5
+	psrld     mm7,SCALEBITS		; mm7=CbOL
+	psrld     mm4,SCALEBITS		; mm4=CbOH
+	packssdw  mm7,mm4		; mm7=CbO
+
+	movq      mm1, MMWORD [wk(2)]	; mm1=BE
+
+	movq      mm6,mm0
+	punpcklwd mm0,mm2
+	punpckhwd mm6,mm2
+	movq      mm5,mm0
+	movq      mm4,mm6
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movq      MMWORD [wk(6)], mm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movq      MMWORD [wk(7)], mm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      mm0,mm0
+	pxor      mm6,mm6
+	punpcklwd mm0,mm1		; mm0=BEL
+	punpckhwd mm6,mm1		; mm6=BEH
+	psrld     mm0,1			; mm0=BEL*FIX(0.500)
+	psrld     mm6,1			; mm6=BEH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm5,mm0
+	paddd     mm4,mm6
+	paddd     mm5,mm1
+	paddd     mm4,mm1
+	psrld     mm5,SCALEBITS		; mm5=CbEL
+	psrld     mm4,SCALEBITS		; mm4=CbEH
+	packssdw  mm5,mm4		; mm5=CbE
+
+	psllw     mm7,BYTE_BIT
+	por       mm5,mm7		; mm5=Cb
+	movq      MMWORD [ebx], mm5	; Save Cb
+
+	movq      mm0, MMWORD [wk(3)]	; mm0=BO
+	movq      mm6, MMWORD [wk(2)]	; mm6=BE
+	movq      mm1, MMWORD [wk(1)]	; mm1=RO
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm3
+	punpckhwd mm4,mm3
+	movq      mm7,mm0
+	movq      mm5,mm4
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
+
+	paddd     mm0, MMWORD [wk(4)]
+	paddd     mm4, MMWORD [wk(5)]
+	paddd     mm0,mm3
+	paddd     mm4,mm3
+	psrld     mm0,SCALEBITS		; mm0=YOL
+	psrld     mm4,SCALEBITS		; mm4=YOH
+	packssdw  mm0,mm4		; mm0=YO
+
+	pxor      mm3,mm3
+	pxor      mm4,mm4
+	punpcklwd mm3,mm1		; mm3=ROL
+	punpckhwd mm4,mm1		; mm4=ROH
+	psrld     mm3,1			; mm3=ROL*FIX(0.500)
+	psrld     mm4,1			; mm4=ROH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm3
+	paddd     mm5,mm4
+	paddd     mm7,mm1
+	paddd     mm5,mm1
+	psrld     mm7,SCALEBITS		; mm7=CrOL
+	psrld     mm5,SCALEBITS		; mm5=CrOH
+	packssdw  mm7,mm5		; mm7=CrO
+
+	movq      mm3, MMWORD [wk(0)]	; mm3=RE
+
+	movq      mm4,mm6
+	punpcklwd mm6,mm2
+	punpckhwd mm4,mm2
+	movq      mm1,mm6
+	movq      mm5,mm4
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
+
+	paddd     mm6, MMWORD [wk(6)]
+	paddd     mm4, MMWORD [wk(7)]
+	paddd     mm6,mm2
+	paddd     mm4,mm2
+	psrld     mm6,SCALEBITS		; mm6=YEL
+	psrld     mm4,SCALEBITS		; mm4=YEH
+	packssdw  mm6,mm4		; mm6=YE
+
+	psllw     mm0,BYTE_BIT
+	por       mm6,mm0		; mm6=Y
+	movq      MMWORD [edi], mm6	; Save Y
+
+	pxor      mm2,mm2
+	pxor      mm4,mm4
+	punpcklwd mm2,mm3		; mm2=REL
+	punpckhwd mm4,mm3		; mm4=REH
+	psrld     mm2,1			; mm2=REL*FIX(0.500)
+	psrld     mm4,1			; mm4=REH*FIX(0.500)
+
+	movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+	paddd     mm1,mm2
+	paddd     mm5,mm4
+	paddd     mm1,mm0
+	paddd     mm5,mm0
+	psrld     mm1,SCALEBITS		; mm1=CrEL
+	psrld     mm5,SCALEBITS		; mm5=CrEH
+	packssdw  mm1,mm5		; mm1=CrE
+
+	psllw     mm7,BYTE_BIT
+	por       mm1,mm7		; mm1=Cr
+	movq      MMWORD [edx], mm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_MMWORD
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
+	add	edi, byte SIZEOF_MMWORD			; outptr0
+	add	ebx, byte SIZEOF_MMWORD			; outptr1
+	add	edx, byte SIZEOF_MMWORD			; outptr2
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .column_ld1
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcclrss2-64.asm
+++ b/jpeg/simd/jcclrss2-64.asm
@ -0,0 +1,487 @@
+;
+; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+; r10 = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13 = JDIMENSION output_row
+; r14 = int num_rows
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		8
+
+	align	16
+
+	global	EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	collect_args
+	push	rbx
+
+	mov	rcx, r10
+	test	rcx,rcx
+	jz	near .return
+
+	push	rcx
+
+	mov rsi, r12
+	mov rcx, r13
+	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+	mov	rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+	mov	rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+	pop	rcx
+
+	mov rsi, r11
+	mov	eax, r14d
+	test	rax,rax
+	jle	near .return
+.rowloop:
+	push	rdx
+	push	rbx
+	push	rdi
+	push	rsi
+	push	rcx			; col
+
+	mov	rsi, JSAMPROW [rsi]	; inptr
+	mov	rdi, JSAMPROW [rdi]	; outptr0
+	mov	rbx, JSAMPROW [rbx]	; outptr1
+	mov	rdx, JSAMPROW [rdx]	; outptr2
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	rax
+	push	rdx
+	lea	rcx,[rcx+rcx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	rcx, byte SIZEOF_BYTE
+	movzx	rax, BYTE [rsi+rcx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	rcx, byte SIZEOF_WORD
+	movzx	rdx, WORD [rsi+rcx]
+	shl	rax, WORD_BIT
+	or	rax,rdx
+.column_ld4:
+	movd	xmmA,eax
+	pop	rdx
+	pop	rax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	rcx, byte SIZEOF_DWORD
+	movd	xmmF, XMM_DWORD [rsi+rcx]
+	pslldq	xmmA, SIZEOF_DWORD
+	por	xmmA,xmmF
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	sub	rcx, byte SIZEOF_MMWORD
+	movq	xmmB, XMM_MMWORD [rsi+rcx]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmB
+.column_ld16:
+	test	cl, SIZEOF_XMMWORD
+	jz	short .column_ld32
+	movdqa	xmmF,xmmA
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	mov	rcx, SIZEOF_XMMWORD
+	jmp	short .rgb_ycc_cnv
+.column_ld32:
+	test	cl, 2*SIZEOF_XMMWORD
+	mov	rcx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmB,xmmA
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	movdqa    xmmG,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+	movdqa    xmmD,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+	movdqa    xmmE,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+	pxor      xmmH,xmmH
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmB,xmmE
+	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+	movdqa    xmmF,xmmD
+	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_XMMWORD/16
+	jz	short .column_ld2
+	sub	rcx, byte SIZEOF_XMMWORD/16
+	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_XMMWORD/8
+	jz	short .column_ld4
+	sub	rcx, byte SIZEOF_XMMWORD/8
+	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmE
+.column_ld4:
+	test	cl, SIZEOF_XMMWORD/4
+	jz	short .column_ld8
+	sub	rcx, byte SIZEOF_XMMWORD/4
+	movdqa	xmmE,xmmA
+	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+	test	cl, SIZEOF_XMMWORD/2
+	mov	rcx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmF,xmmA
+	movdqa	xmmH,xmmE
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+	movdqa    xmmC,xmmF
+	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+	movdqa    xmmB,xmmA
+	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+	movdqa    xmmG,xmmD
+	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+	movdqa    xmmE,xmmA
+	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+	movdqa    xmmH,xmmB
+	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+	pxor      xmmF,xmmF
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmD,xmmB
+	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+	movdqa    xmmG,xmmE
+	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+	punpcklbw xmmF,xmmH
+	punpckhbw xmmH,xmmH
+	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
+	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
+	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
+	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
+	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
+
+	movdqa    xmm6,xmm1
+	punpcklwd xmm1,xmm3
+	punpckhwd xmm6,xmm3
+	movdqa    xmm7,xmm1
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      xmm1,xmm1
+	pxor      xmm6,xmm6
+	punpcklwd xmm1,xmm5		; xmm1=BOL
+	punpckhwd xmm6,xmm5		; xmm6=BOH
+	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
+
+	movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm1
+	paddd     xmm4,xmm6
+	paddd     xmm7,xmm5
+	paddd     xmm4,xmm5
+	psrld     xmm7,SCALEBITS	; xmm7=CbOL
+	psrld     xmm4,SCALEBITS	; xmm4=CbOH
+	packssdw  xmm7,xmm4		; xmm7=CbO
+
+	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
+
+	movdqa    xmm6,xmm0
+	punpcklwd xmm0,xmm2
+	punpckhwd xmm6,xmm2
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      xmm0,xmm0
+	pxor      xmm6,xmm6
+	punpcklwd xmm0,xmm1		; xmm0=BEL
+	punpckhwd xmm6,xmm1		; xmm6=BEH
+	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
+
+	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm5,xmm0
+	paddd     xmm4,xmm6
+	paddd     xmm5,xmm1
+	paddd     xmm4,xmm1
+	psrld     xmm5,SCALEBITS	; xmm5=CbEL
+	psrld     xmm4,SCALEBITS	; xmm4=CbEH
+	packssdw  xmm5,xmm4		; xmm5=CbE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm5,xmm7		; xmm5=Cb
+	movdqa    XMMWORD [rbx], xmm5	; Save Cb
+
+	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
+	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
+
+	movdqa    xmm4,xmm0
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm4,xmm3
+	movdqa    xmm7,xmm0
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]
+
+	paddd     xmm0, XMMWORD [wk(4)]
+	paddd     xmm4, XMMWORD [wk(5)]
+	paddd     xmm0,xmm3
+	paddd     xmm4,xmm3
+	psrld     xmm0,SCALEBITS	; xmm0=YOL
+	psrld     xmm4,SCALEBITS	; xmm4=YOH
+	packssdw  xmm0,xmm4		; xmm0=YO
+
+	pxor      xmm3,xmm3
+	pxor      xmm4,xmm4
+	punpcklwd xmm3,xmm1		; xmm3=ROL
+	punpckhwd xmm4,xmm1		; xmm4=ROH
+	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
+
+	movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm3
+	paddd     xmm5,xmm4
+	paddd     xmm7,xmm1
+	paddd     xmm5,xmm1
+	psrld     xmm7,SCALEBITS	; xmm7=CrOL
+	psrld     xmm5,SCALEBITS	; xmm5=CrOH
+	packssdw  xmm7,xmm5		; xmm7=CrO
+
+	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
+
+	movdqa    xmm4,xmm6
+	punpcklwd xmm6,xmm2
+	punpckhwd xmm4,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]
+
+	paddd     xmm6, XMMWORD [wk(6)]
+	paddd     xmm4, XMMWORD [wk(7)]
+	paddd     xmm6,xmm2
+	paddd     xmm4,xmm2
+	psrld     xmm6,SCALEBITS	; xmm6=YEL
+	psrld     xmm4,SCALEBITS	; xmm4=YEH
+	packssdw  xmm6,xmm4		; xmm6=YE
+
+	psllw     xmm0,BYTE_BIT
+	por       xmm6,xmm0		; xmm6=Y
+	movdqa    XMMWORD [rdi], xmm6	; Save Y
+
+	pxor      xmm2,xmm2
+	pxor      xmm4,xmm4
+	punpcklwd xmm2,xmm3		; xmm2=REL
+	punpckhwd xmm4,xmm3		; xmm4=REH
+	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
+
+	movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm1,xmm2
+	paddd     xmm5,xmm4
+	paddd     xmm1,xmm0
+	paddd     xmm5,xmm0
+	psrld     xmm1,SCALEBITS	; xmm1=CrEL
+	psrld     xmm5,SCALEBITS	; xmm5=CrEH
+	packssdw  xmm1,xmm5		; xmm1=CrE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm1,xmm7		; xmm1=Cr
+	movdqa    XMMWORD [rdx], xmm1	; Save Cr
+
+	sub	rcx, byte SIZEOF_XMMWORD
+	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte SIZEOF_XMMWORD		; outptr0
+	add	rbx, byte SIZEOF_XMMWORD		; outptr1
+	add	rdx, byte SIZEOF_XMMWORD		; outptr2
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	rcx,rcx
+	jnz	near .column_ld1
+
+	pop	rcx			; col
+	pop	rsi
+	pop	rdi
+	pop	rbx
+	pop	rdx
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
+	add	rdi, byte SIZEOF_JSAMPROW
+	add	rbx, byte SIZEOF_JSAMPROW
+	add	rdx, byte SIZEOF_JSAMPROW
+	dec	rax				; num_rows
+	jg	near .rowloop
+
+.return:
+	pop	rbx
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcclrss2.asm
+++ b/jpeg/simd/jcclrss2.asm
@ -0,0 +1,505 @@
+;
+; jcclrss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION img_width
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		8
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+
+	global	EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [img_width(eax)]
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	pushpic	eax
+	push	edx
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	eax
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	movzx	eax, BYTE [esi+ecx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	movzx	edx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx
+.column_ld4:
+	movd	xmmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	xmmF, XMM_DWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_DWORD
+	por	xmmA,xmmF
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	sub	ecx, byte SIZEOF_MMWORD
+	movq	xmmB, XMM_MMWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmB
+.column_ld16:
+	test	cl, SIZEOF_XMMWORD
+	jz	short .column_ld32
+	movdqa	xmmF,xmmA
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	mov	ecx, SIZEOF_XMMWORD
+	jmp	short .rgb_ycc_cnv
+.column_ld32:
+	test	cl, 2*SIZEOF_XMMWORD
+	mov	ecx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmB,xmmA
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	movdqa    xmmG,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+	movdqa    xmmD,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+	movdqa    xmmE,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+	pxor      xmmH,xmmH
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmB,xmmE
+	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+	movdqa    xmmF,xmmD
+	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_XMMWORD/16
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_XMMWORD/16
+	movd	xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_XMMWORD/8
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_XMMWORD/8
+	movq	xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmE
+.column_ld4:
+	test	cl, SIZEOF_XMMWORD/4
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_XMMWORD/4
+	movdqa	xmmE,xmmA
+	movdqu	xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+	test	cl, SIZEOF_XMMWORD/2
+	mov	ecx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmF,xmmA
+	movdqa	xmmH,xmmE
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+	movdqu	xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+	movdqa    xmmC,xmmF
+	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+	movdqa    xmmB,xmmA
+	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+	movdqa    xmmG,xmmD
+	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+	movdqa    xmmE,xmmA
+	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+	movdqa    xmmH,xmmB
+	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+	pxor      xmmF,xmmF
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmD,xmmB
+	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+	movdqa    xmmG,xmmE
+	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+	punpcklbw xmmF,xmmH
+	punpckhbw xmmH,xmmH
+	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
+	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
+	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
+	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
+	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
+
+	movdqa    xmm6,xmm1
+	punpcklwd xmm1,xmm3
+	punpckhwd xmm6,xmm3
+	movdqa    xmm7,xmm1
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      xmm1,xmm1
+	pxor      xmm6,xmm6
+	punpcklwd xmm1,xmm5		; xmm1=BOL
+	punpckhwd xmm6,xmm5		; xmm6=BOH
+	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
+
+	movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm1
+	paddd     xmm4,xmm6
+	paddd     xmm7,xmm5
+	paddd     xmm4,xmm5
+	psrld     xmm7,SCALEBITS	; xmm7=CbOL
+	psrld     xmm4,SCALEBITS	; xmm4=CbOH
+	packssdw  xmm7,xmm4		; xmm7=CbO
+
+	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
+
+	movdqa    xmm6,xmm0
+	punpcklwd xmm0,xmm2
+	punpckhwd xmm6,xmm2
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      xmm0,xmm0
+	pxor      xmm6,xmm6
+	punpcklwd xmm0,xmm1		; xmm0=BEL
+	punpckhwd xmm6,xmm1		; xmm6=BEH
+	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
+
+	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm5,xmm0
+	paddd     xmm4,xmm6
+	paddd     xmm5,xmm1
+	paddd     xmm4,xmm1
+	psrld     xmm5,SCALEBITS	; xmm5=CbEL
+	psrld     xmm4,SCALEBITS	; xmm4=CbEH
+	packssdw  xmm5,xmm4		; xmm5=CbE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm5,xmm7		; xmm5=Cb
+	movdqa    XMMWORD [ebx], xmm5	; Save Cb
+
+	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
+	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
+
+	movdqa    xmm4,xmm0
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm4,xmm3
+	movdqa    xmm7,xmm0
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)]	; xmm3=[PD_ONEHALF]
+
+	paddd     xmm0, XMMWORD [wk(4)]
+	paddd     xmm4, XMMWORD [wk(5)]
+	paddd     xmm0,xmm3
+	paddd     xmm4,xmm3
+	psrld     xmm0,SCALEBITS	; xmm0=YOL
+	psrld     xmm4,SCALEBITS	; xmm4=YOH
+	packssdw  xmm0,xmm4		; xmm0=YO
+
+	pxor      xmm3,xmm3
+	pxor      xmm4,xmm4
+	punpcklwd xmm3,xmm1		; xmm3=ROL
+	punpckhwd xmm4,xmm1		; xmm4=ROH
+	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
+
+	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm3
+	paddd     xmm5,xmm4
+	paddd     xmm7,xmm1
+	paddd     xmm5,xmm1
+	psrld     xmm7,SCALEBITS	; xmm7=CrOL
+	psrld     xmm5,SCALEBITS	; xmm5=CrOH
+	packssdw  xmm7,xmm5		; xmm7=CrO
+
+	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
+
+	movdqa    xmm4,xmm6
+	punpcklwd xmm6,xmm2
+	punpckhwd xmm4,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)]	; xmm2=[PD_ONEHALF]
+
+	paddd     xmm6, XMMWORD [wk(6)]
+	paddd     xmm4, XMMWORD [wk(7)]
+	paddd     xmm6,xmm2
+	paddd     xmm4,xmm2
+	psrld     xmm6,SCALEBITS	; xmm6=YEL
+	psrld     xmm4,SCALEBITS	; xmm4=YEH
+	packssdw  xmm6,xmm4		; xmm6=YE
+
+	psllw     xmm0,BYTE_BIT
+	por       xmm6,xmm0		; xmm6=Y
+	movdqa    XMMWORD [edi], xmm6	; Save Y
+
+	pxor      xmm2,xmm2
+	pxor      xmm4,xmm4
+	punpcklwd xmm2,xmm3		; xmm2=REL
+	punpckhwd xmm4,xmm3		; xmm4=REH
+	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
+
+	movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm1,xmm2
+	paddd     xmm5,xmm4
+	paddd     xmm1,xmm0
+	paddd     xmm5,xmm0
+	psrld     xmm1,SCALEBITS	; xmm1=CrEL
+	psrld     xmm5,SCALEBITS	; xmm5=CrEH
+	packssdw  xmm1,xmm5		; xmm1=CrE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm1,xmm7		; xmm1=Cr
+	movdqa    XMMWORD [edx], xmm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_XMMWORD
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
+	add	edi, byte SIZEOF_XMMWORD		; outptr0
+	add	ebx, byte SIZEOF_XMMWORD		; outptr1
+	add	edx, byte SIZEOF_XMMWORD		; outptr2
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .column_ld1
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jccolmmx.asm
+++ b/jpeg/simd/jccolmmx.asm
@ -0,0 +1,120 @@
+;
+; jccolmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337	times 2 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 2 dw  F_0_114, F_0_250
+PW_MF016_MF033	times 2 dw -F_0_168,-F_0_331
+PW_MF008_MF041	times 2 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ	times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
+%include "jcclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
+%include "jcclrmmx.asm"
--- a/jpeg/simd/jccolss2-64.asm
+++ b/jpeg/simd/jccolss2-64.asm
@ -0,0 +1,117 @@
+;
+; jccolss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
+PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331
+PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jcclrss2-64.asm"
--- a/jpeg/simd/jccolss2.asm
+++ b/jpeg/simd/jccolss2.asm
@ -0,0 +1,117 @@
+;
+; jccolss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337	times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250	times 4 dw  F_0_114, F_0_250
+PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331
+PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418
+PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jcclrss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jcclrss2.asm"
--- a/jpeg/simd/jcolsamp.inc
+++ b/jpeg/simd/jcolsamp.inc
@ -0,0 +1,105 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+
+; pseudo-resisters to make ordering of RGB configurable
+;
+%if RGB_RED == 0
+%define  mmA  mm0
+%define  mmB  mm1
+%define xmmA xmm0
+%define xmmB xmm1
+%elif RGB_GREEN == 0
+%define  mmA  mm2
+%define  mmB  mm3
+%define xmmA xmm2
+%define xmmB xmm3
+%elif RGB_BLUE == 0
+%define  mmA  mm4
+%define  mmB  mm5
+%define xmmA xmm4
+%define xmmB xmm5
+%else
+%define  mmA  mm6
+%define  mmB  mm7
+%define xmmA xmm6
+%define xmmB xmm7
+%endif
+
+%if RGB_RED == 1
+%define  mmC  mm0
+%define  mmD  mm1
+%define xmmC xmm0
+%define xmmD xmm1
+%elif RGB_GREEN == 1
+%define  mmC  mm2
+%define  mmD  mm3
+%define xmmC xmm2
+%define xmmD xmm3
+%elif RGB_BLUE == 1
+%define  mmC  mm4
+%define  mmD  mm5
+%define xmmC xmm4
+%define xmmD xmm5
+%else
+%define  mmC  mm6
+%define  mmD  mm7
+%define xmmC xmm6
+%define xmmD xmm7
+%endif
+
+%if RGB_RED == 2
+%define  mmE  mm0
+%define  mmF  mm1
+%define xmmE xmm0
+%define xmmF xmm1
+%elif RGB_GREEN == 2
+%define  mmE  mm2
+%define  mmF  mm3
+%define xmmE xmm2
+%define xmmF xmm3
+%elif RGB_BLUE == 2
+%define  mmE  mm4
+%define  mmF  mm5
+%define xmmE xmm4
+%define xmmF xmm5
+%else
+%define  mmE  mm6
+%define  mmF  mm7
+%define xmmE xmm6
+%define xmmF xmm7
+%endif
+
+%if RGB_RED == 3
+%define  mmG  mm0
+%define  mmH  mm1
+%define xmmG xmm0
+%define xmmH xmm1
+%elif RGB_GREEN == 3
+%define  mmG  mm2
+%define  mmH  mm3
+%define xmmG xmm2
+%define xmmH xmm3
+%elif RGB_BLUE == 3
+%define  mmG  mm4
+%define  mmH  mm5
+%define xmmG xmm4
+%define xmmH xmm5
+%else
+%define  mmG  mm6
+%define  mmH  mm7
+%define xmmG xmm6
+%define xmmH xmm7
+%endif
+
+; --------------------------------------------------------------------------
--- a/jpeg/simd/jcqnt3dn.asm
+++ b/jpeg/simd/jcqnt3dn.asm
@ -0,0 +1,233 @@
+;
+; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                             FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  mm7,mm7
+	psllw    mm7,7
+	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	mm0,mm7				; mm0=(01234567)
+	psubb	mm1,mm7				; mm1=(89ABCDEF)
+
+	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+
+	punpcklwd mm4,mm2			; mm4=(***0***1)
+	punpckhwd mm2,mm2			; mm2=(***2***3)
+	punpcklwd mm5,mm0			; mm5=(***4***5)
+	punpckhwd mm0,mm0			; mm0=(***6***7)
+
+	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+	psrad	mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+	pi2fd	mm4,mm4
+	pi2fd	mm2,mm2
+	psrad	mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+	psrad	mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+	pi2fd	mm5,mm5
+	pi2fd	mm0,mm0
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+	punpcklwd mm6,mm3			; mm6=(***8***9)
+	punpckhwd mm3,mm3			; mm3=(***A***B)
+	punpcklwd mm4,mm1			; mm4=(***C***D)
+	punpckhwd mm1,mm1			; mm1=(***E***F)
+
+	psrad	mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+	psrad	mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+	pi2fd	mm6,mm6
+	pi2fd	mm3,mm3
+	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+	psrad	mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+	pi2fd	mm4,mm4
+	pi2fd	mm1,mm1
+
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+	add	esi, byte 2*SIZEOF_JSAMPROW
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .convloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                             FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov       eax, 0x4B400000	; (float)0x00C00000 (rndint_magic)
+	movd      mm7,eax
+	punpckldq mm7,mm7		; mm7={12582912.0F 12582912.0F}
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16
+	alignx	16,7
+.quantloop:
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+	pfadd	mm0,mm7			; mm0=(00 ** 01 **)
+	pfadd	mm1,mm7			; mm1=(02 ** 03 **)
+	pfadd	mm2,mm7			; mm0=(04 ** 05 **)
+	pfadd	mm3,mm7			; mm1=(06 ** 07 **)
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm1		; mm0=(00 02 ** **)
+	punpckhwd mm4,mm1		; mm4=(01 03 ** **)
+	movq      mm5,mm2
+	punpcklwd mm2,mm3		; mm2=(04 06 ** **)
+	punpckhwd mm5,mm3		; mm5=(05 07 ** **)
+
+	punpcklwd mm0,mm4		; mm0=(00 01 02 03)
+	punpcklwd mm2,mm5		; mm2=(04 05 06 07)
+
+	movq	mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+	pfadd	mm6,mm7			; mm0=(10 ** 11 **)
+	pfadd	mm1,mm7			; mm4=(12 ** 13 **)
+	pfadd	mm3,mm7			; mm0=(14 ** 15 **)
+	pfadd	mm4,mm7			; mm4=(16 ** 17 **)
+
+	movq      mm5,mm6
+	punpcklwd mm6,mm1		; mm6=(10 12 ** **)
+	punpckhwd mm5,mm1		; mm5=(11 13 ** **)
+	movq      mm1,mm3
+	punpcklwd mm3,mm4		; mm3=(14 16 ** **)
+	punpckhwd mm1,mm4		; mm1=(15 17 ** **)
+
+	punpcklwd mm6,mm5		; mm6=(10 11 12 13)
+	punpcklwd mm3,mm1		; mm3=(14 15 16 17)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	near .quantloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcqntmmx.asm
+++ b/jpeg/simd/jcqntmmx.asm
@ -0,0 +1,274 @@
+;
+; jcqntmmx.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	mm6,mm6			; mm6=(all 0's)
+	pcmpeqw	mm7,mm7
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm0=(01234567)
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm2=(GHIJKLMN)
+	movq	mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm3=(OPQRSTUV)
+
+	movq      mm4,mm0
+	punpcklbw mm0,mm6		; mm0=(0123)
+	punpckhbw mm4,mm6		; mm4=(4567)
+	movq      mm5,mm1
+	punpcklbw mm1,mm6		; mm1=(89AB)
+	punpckhbw mm5,mm6		; mm5=(CDEF)
+
+	paddw	mm0,mm7
+	paddw	mm4,mm7
+	paddw	mm1,mm7
+	paddw	mm5,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+	movq      mm0,mm2
+	punpcklbw mm2,mm6		; mm2=(GHIJ)
+	punpckhbw mm0,mm6		; mm0=(KLMN)
+	movq      mm4,mm3
+	punpcklbw mm3,mm6		; mm3=(OPQR)
+	punpckhbw mm4,mm6		; mm4=(STUV)
+
+	paddw	mm2,mm7
+	paddw	mm0,mm7
+	paddw	mm3,mm7
+	paddw	mm4,mm7
+
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+	add	esi, byte 4*SIZEOF_JSAMPROW
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	short .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
+;                     DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	ah, 2
+	alignx	16,7
+.quantloop1:
+	mov	al, DCTSIZE2/8/2
+	alignx	16,7
+.quantloop2:
+	movq	mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+	movq	mm0,mm2
+	movq	mm1,mm3
+
+	psraw	mm2,(WORD_BIT-1)  ; -1 if value < 0, 0 otherwise
+	psraw	mm3,(WORD_BIT-1)
+
+	pxor	mm0,mm2   ; val = -val
+	pxor	mm1,mm3
+	psubw	mm0,mm2
+	psubw	mm1,mm3
+
+	;
+	; MMX is an annoyingly crappy instruction set. It has two
+	; misfeatures that are causing problems here:
+	;
+	; - All multiplications are signed.
+	;
+	; - The second operand for the shifts is not treated as packed.
+	;
+	;
+	; We work around the first problem by implementing this algorithm:
+	;
+	; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+	; {
+	;   enum { SHORT_BIT = 16 };
+	;   signed short sx = (signed short) x;
+	;   signed short sy = (signed short) y;
+	;   signed long sz;
+	; 
+	;   sz = (long) sx * (long) sy;     /* signed multiply */
+	; 
+	;   if (sx < 0) sz += (long) sy << SHORT_BIT;
+	;   if (sy < 0) sz += (long) sx << SHORT_BIT;
+	; 
+	;   return (unsigned long) sz;
+	; }
+	;
+	; (note that a negative sx adds _sy_ and vice versa)
+	;
+	; For the second problem, we replace the shift by a multiplication.
+	; Unfortunately that means we have to deal with the signed issue again.
+	;
+
+	paddw	mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
+	paddw	mm1, MMWORD [CORRECTION(0,1,edx)]
+
+	movq	mm4,mm0   ; store current value for later
+	movq	mm5,mm1
+	pmulhw	mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
+	pmulhw	mm1, MMWORD [RECIPROCAL(0,1,edx)]
+	paddw	mm0,mm4		; reciprocal is always negative (MSB=1),
+	paddw	mm1,mm5   ; so we always need to add the initial value
+	                ; (input value is never negative as we
+	                ; inverted it at the start of this routine)
+
+	; here it gets a bit tricky as both scale
+	; and mm0/mm1 can be negative
+	movq	mm6, MMWORD [SCALE(0,0,edx)]	; scale
+	movq	mm7, MMWORD [SCALE(0,1,edx)]
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pmulhw	mm0,mm6
+	pmulhw	mm1,mm7
+
+	psraw	mm6,(WORD_BIT-1)    ; determine if scale is negative
+	psraw	mm7,(WORD_BIT-1)
+
+	pand	mm6,mm4             ; and add input if it is
+	pand	mm7,mm5
+	paddw	mm0,mm6
+	paddw	mm1,mm7
+
+	psraw	mm4,(WORD_BIT-1)    ; then check if negative input 
+	psraw	mm5,(WORD_BIT-1)
+
+	pand	mm4, MMWORD [SCALE(0,0,edx)]	; and add scale if it is
+	pand	mm5, MMWORD [SCALE(0,1,edx)]
+	paddw	mm0,mm4
+	paddw	mm1,mm5
+
+	pxor	mm0,mm2   ; val = -val
+	pxor	mm1,mm3
+	psubw	mm0,mm2
+	psubw	mm1,mm3
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+	add	esi, byte 8*SIZEOF_DCTELEM
+	add	edx, byte 8*SIZEOF_DCTELEM
+	add	edi, byte 8*SIZEOF_JCOEF
+	dec	al
+	jnz	near .quantloop2
+	dec	ah
+	jnz	near .quantloop1	; to avoid branch misprediction
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcqnts2f-64.asm
+++ b/jpeg/simd/jcqnts2f-64.asm
@ -0,0 +1,158 @@
+;
+; jcqnts2f-64.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT * workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11 = JDIMENSION start_col
+; r12 = FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+	push	rbp
+	mov	rax,rsp
+	mov	rbp,rsp
+	collect_args
+	push	rbx
+
+	pcmpeqw  xmm7,xmm7
+	psllw    xmm7,7
+	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov rsi, r10
+	mov	rax, r11
+	mov rdi, r12
+	mov	rcx, DCTSIZE/2
+.convloop:
+	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
+	movq	xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
+
+	psubb	xmm0,xmm7			; xmm0=(01234567)
+	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
+
+	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
+	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
+
+	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
+	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
+	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
+	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
+
+	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
+	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
+	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
+	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
+	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
+	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
+	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
+	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+	add	rsi, byte 2*SIZEOF_JSAMPROW
+	add	rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	rcx
+	jnz	short .convloop
+
+	pop	rbx
+	uncollect_args
+	pop	rbp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                         FAST_FLOAT * workspace);
+;
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT * divisors
+; r12 = FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+	push	rbp
+	mov	rax,rsp
+	mov	rbp,rsp
+	collect_args
+
+	mov rsi, r12
+	mov rdx, r11
+	mov rdi, r10
+	mov	rax, DCTSIZE2/16
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+	cvtps2dq xmm0,xmm0
+	cvtps2dq xmm1,xmm1
+	cvtps2dq xmm2,xmm2
+	cvtps2dq xmm3,xmm3
+
+	packssdw xmm0,xmm1
+	packssdw xmm2,xmm3
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+	add	rsi, byte 16*SIZEOF_FAST_FLOAT
+	add	rdx, byte 16*SIZEOF_FAST_FLOAT
+	add	rdi, byte 16*SIZEOF_JCOEF
+	dec	rax
+	jnz	short .quantloop
+
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcqnts2f.asm
+++ b/jpeg/simd/jcqnts2f.asm
@ -0,0 +1,171 @@
+;
+; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  xmm7,xmm7
+	psllw    xmm7,7
+	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	xmm0,xmm7			; xmm0=(01234567)
+	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
+
+	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
+	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
+
+	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
+	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
+	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
+	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
+
+	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
+	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
+	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
+	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
+	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
+	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
+	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
+	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+	add	esi, byte 2*SIZEOF_JSAMPROW
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                         FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16
+	alignx	16,7
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	cvtps2dq xmm0,xmm0
+	cvtps2dq xmm1,xmm1
+	cvtps2dq xmm2,xmm2
+	cvtps2dq xmm3,xmm3
+
+	packssdw xmm0,xmm1
+	packssdw xmm2,xmm3
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	short .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcqnts2i-64.asm
+++ b/jpeg/simd/jcqnts2i-64.asm
@ -0,0 +1,187 @@
+;
+; jcqnts2i-64.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM * workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11 = JDIMENSION start_col
+; r12 = DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+	push	rbp
+	mov	rax,rsp
+	mov	rbp,rsp
+	collect_args
+	push	rbx
+
+	pxor	xmm6,xmm6		; xmm6=(all 0's)
+	pcmpeqw	xmm7,xmm7
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	mov rsi, r10
+	mov rax, r11
+	mov rdi, r12
+	mov	rcx, DCTSIZE/4
+.convloop:
+	mov	rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
+	movq	xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
+
+	mov	rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
+	movq	xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
+
+	punpcklbw xmm0,xmm6		; xmm0=(01234567)
+	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
+	paddw     xmm0,xmm7
+	paddw     xmm1,xmm7
+	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
+	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
+	paddw     xmm2,xmm7
+	paddw     xmm3,xmm7
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+	add	rsi, byte 4*SIZEOF_JSAMPROW
+	add	rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	rcx
+	jnz	short .convloop
+
+	pop	rbx
+	uncollect_args
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+;                      DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM * divisors
+; r12 = DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+	push	rbp
+	mov	rax,rsp
+	mov	rbp,rsp
+	collect_args
+
+	mov rsi, r12
+	mov rdx, r11
+	mov rdi, r10
+	mov	rax, DCTSIZE2/32
+.quantloop:
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm5
+	movdqa	xmm2,xmm6
+	movdqa	xmm3,xmm7
+	psraw	xmm4,(WORD_BIT-1)
+	psraw	xmm5,(WORD_BIT-1)
+	psraw	xmm6,(WORD_BIT-1)
+	psraw	xmm7,(WORD_BIT-1)
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
+	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
+	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
+	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
+
+	paddw	xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
+	paddw	xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+	paddw	xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+	paddw	xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
+	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+	pmulhuw	xmm0, XMMWORD [SCALE(0,0,rdx)]	; scale
+	pmulhuw	xmm1, XMMWORD [SCALE(1,0,rdx)]
+	pmulhuw	xmm2, XMMWORD [SCALE(2,0,rdx)]
+	pmulhuw	xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4
+	psubw	xmm1,xmm5
+	psubw	xmm2,xmm6
+	psubw	xmm3,xmm7
+	movdqa	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+	add	rsi, byte 32*SIZEOF_DCTELEM
+	add	rdx, byte 32*SIZEOF_DCTELEM
+	add	rdi, byte 32*SIZEOF_JCOEF
+	dec	rax
+	jnz	near .quantloop
+
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcqnts2i.asm
+++ b/jpeg/simd/jcqnts2i.asm
@ -0,0 +1,200 @@
+;
+; jcqnts2i.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	xmm6,xmm6		; xmm6=(all 0's)
+	pcmpeqw	xmm7,xmm7
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
+	movq	xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
+	movq	xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
+
+	punpcklbw xmm0,xmm6		; xmm0=(01234567)
+	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
+	paddw     xmm0,xmm7
+	paddw     xmm1,xmm7
+	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
+	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
+	paddw     xmm2,xmm7
+	paddw     xmm3,xmm7
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+	add	esi, byte 4*SIZEOF_JSAMPROW
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+;                      DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/32
+	alignx	16,7
+.quantloop:
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm5
+	movdqa	xmm2,xmm6
+	movdqa	xmm3,xmm7
+	psraw	xmm4,(WORD_BIT-1)
+	psraw	xmm5,(WORD_BIT-1)
+	psraw	xmm6,(WORD_BIT-1)
+	psraw	xmm7,(WORD_BIT-1)
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
+	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
+	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
+	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
+
+	paddw	xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+	paddw	xmm1, XMMWORD [CORRECTION(1,0,edx)]
+	paddw	xmm2, XMMWORD [CORRECTION(2,0,edx)]
+	paddw	xmm3, XMMWORD [CORRECTION(3,0,edx)]
+	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+	pmulhuw	xmm0, XMMWORD [SCALE(0,0,edx)]	; scale
+	pmulhuw	xmm1, XMMWORD [SCALE(1,0,edx)]
+	pmulhuw	xmm2, XMMWORD [SCALE(2,0,edx)]
+	pmulhuw	xmm3, XMMWORD [SCALE(3,0,edx)]
+
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4
+	psubw	xmm1,xmm5
+	psubw	xmm2,xmm6
+	psubw	xmm3,xmm7
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+	add	esi, byte 32*SIZEOF_DCTELEM
+	add	edx, byte 32*SIZEOF_DCTELEM
+	add	edi, byte 32*SIZEOF_JCOEF
+	dec	eax
+	jnz	near .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcqntsse.asm
+++ b/jpeg/simd/jcqntsse.asm
@ -0,0 +1,211 @@
+;
+; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  mm7,mm7
+	psllw    mm7,7
+	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/2
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	mm0,mm7				; mm0=(01234567)
+	psubb	mm1,mm7				; mm1=(89ABCDEF)
+
+	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+
+	punpcklwd mm4,mm2			; mm4=(***0***1)
+	punpckhwd mm2,mm2			; mm2=(***2***3)
+	punpcklwd mm5,mm0			; mm5=(***4***5)
+	punpckhwd mm0,mm0			; mm0=(***6***7)
+
+	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+	psrad     mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+	cvtpi2ps  xmm0,mm4			; xmm0=(01**)
+	cvtpi2ps  xmm1,mm2			; xmm1=(23**)
+	psrad     mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+	psrad     mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+	cvtpi2ps  xmm2,mm5			; xmm2=(45**)
+	cvtpi2ps  xmm3,mm0			; xmm3=(67**)
+
+	punpcklwd mm6,mm3			; mm6=(***8***9)
+	punpckhwd mm3,mm3			; mm3=(***A***B)
+	punpcklwd mm4,mm1			; mm4=(***C***D)
+	punpckhwd mm1,mm1			; mm1=(***E***F)
+
+	psrad     mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+	psrad     mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+	cvtpi2ps  xmm4,mm6			; xmm4=(89**)
+	cvtpi2ps  xmm5,mm3			; xmm5=(AB**)
+	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+	psrad     mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+	cvtpi2ps  xmm6,mm4			; xmm6=(CD**)
+	cvtpi2ps  xmm7,mm1			; xmm7=(EF**)
+
+	movlhps   xmm0,xmm1			; xmm0=(0123)
+	movlhps   xmm2,xmm3			; xmm2=(4567)
+	movlhps   xmm4,xmm5			; xmm4=(89AB)
+	movlhps   xmm6,xmm7			; xmm6=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+	add	esi, byte 2*SIZEOF_JSAMPROW
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                           FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16
+	alignx	16,7
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	movhlps  xmm4,xmm0
+	movhlps  xmm5,xmm1
+
+	cvtps2pi mm0,xmm0
+	cvtps2pi mm1,xmm1
+	cvtps2pi mm4,xmm4
+	cvtps2pi mm5,xmm5
+
+	movhlps  xmm6,xmm2
+	movhlps  xmm7,xmm3
+
+	cvtps2pi mm2,xmm2
+	cvtps2pi mm3,xmm3
+	cvtps2pi mm6,xmm6
+	cvtps2pi mm7,xmm7
+
+	packssdw mm0,mm4
+	packssdw mm1,mm5
+	packssdw mm2,mm6
+	packssdw mm3,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	short .quantloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcsammmx.asm
+++ b/jpeg/simd/jcsammmx.asm
@ -0,0 +1,324 @@
+;
+; jcsammmx.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, INT [max_v_samp(ebp)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov       edx, 0x00010000	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={0, 1, 0, 1}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mm2,mm0
+	movq	mm3,mm1
+
+	pand	mm0,mm6
+	psrlw	mm2,BYTE_BIT
+	pand	mm1,mm6
+	psrlw	mm3,BYTE_BIT
+
+	paddw	mm0,mm2
+	paddw	mm1,mm3
+	paddw	mm0,mm7
+	paddw	mm1,mm7
+	psrlw	mm0,1
+	psrlw	mm1,1
+
+	packuswb mm0,mm1
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	short .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, INT [max_v_samp(ebp)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov       edx, 0x00020001	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={1, 2, 1, 2}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+	movq	mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pand	mm0,mm6
+	psrlw	mm4,BYTE_BIT
+	pand	mm1,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm0,mm4
+	paddw	mm1,mm5
+
+	movq	mm4,mm2
+	movq	mm5,mm3
+	pand	mm2,mm6
+	psrlw	mm4,BYTE_BIT
+	pand	mm3,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm2,mm4
+	paddw	mm3,mm5
+
+	paddw	mm0,mm1
+	paddw	mm2,mm3
+	paddw	mm0,mm7
+	paddw	mm2,mm7
+	psrlw	mm0,2
+	psrlw	mm2,2
+
+	packuswb mm0,mm2
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	edx, byte 2*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	near .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcsamss2-64.asm
+++ b/jpeg/simd/jcsamss2-64.asm
@ -0,0 +1,330 @@
+;
+; jcsamss2-64.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+; r10 = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12 = JDIMENSION v_samp_factor
+; r13 = JDIMENSION width_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+	push	rbp
+	mov	rax,rsp
+	mov	rbp,rsp
+	collect_args
+
+	mov rcx, r13
+	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
+	jz	near .return
+
+	mov rdx, r10
+
+	; -- expand_right_edge
+
+	push	rcx
+	shl	rcx,1				; output_cols * 2
+	sub	rcx,rdx
+	jle	short .expand_end
+
+	mov	rax, r11
+	test	rax,rax
+	jle	short .expand_end
+
+	cld
+	mov	rsi, r14	; input_data
+.expandloop:
+	push	rax
+	push	rcx
+
+	mov	rdi, JSAMPROW [rsi]
+	add	rdi,rdx
+	mov	al, JSAMPLE [rdi-1]
+
+	rep stosb
+
+	pop	rcx
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW
+	dec	rax
+	jg	short .expandloop
+
+.expand_end:
+	pop	rcx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	rax, r12	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	rdx, 0x00010000		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	rsi, r14	; input_data
+	mov	rdi, r15	; output_data
+.rowloop:
+	push	rcx
+	push	rdi
+	push	rsi
+
+	mov	rsi, JSAMPROW [rsi]		; inptr
+	mov rdi, JSAMPROW [rdi]		; outptr
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+
+.columnloop_r8:
+	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	pxor	xmm1,xmm1
+	mov	rcx, SIZEOF_XMMWORD
+	jmp	short .downsample
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm2,xmm0
+	movdqa	xmm3,xmm1
+
+	pand	xmm0,xmm6
+	psrlw	xmm2,BYTE_BIT
+	pand	xmm1,xmm6
+	psrlw	xmm3,BYTE_BIT
+
+	paddw	xmm0,xmm2
+	paddw	xmm1,xmm3
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+	psrlw	xmm0,1
+	psrlw	xmm1,1
+
+	packuswb xmm0,xmm1
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+	sub	rcx, byte SIZEOF_XMMWORD	; outcol
+	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	test	rcx,rcx
+	jnz	short .columnloop_r8
+
+	pop	rsi
+	pop	rdi
+	pop	rcx
+
+	add	rsi, byte SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte SIZEOF_JSAMPROW	; output_data
+	dec	rax				; rowctr
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+; r10 = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12 = JDIMENSION v_samp_factor
+; r13 = JDIMENSION width_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+	push	rbp
+	mov	rax,rsp
+	mov	rbp,rsp
+	collect_args
+
+	mov	rcx, r13
+	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
+	jz	near .return
+
+	mov	rdx, r10
+
+	; -- expand_right_edge
+
+	push	rcx
+	shl	rcx,1				; output_cols * 2
+	sub	rcx,rdx
+	jle	short .expand_end
+
+	mov	rax, r11
+	test	rax,rax
+	jle	short .expand_end
+
+	cld
+	mov	rsi, r14	; input_data
+.expandloop:
+	push	rax
+	push	rcx
+
+	mov	rdi, JSAMPROW [rsi]
+	add	rdi,rdx
+	mov	al, JSAMPLE [rdi-1]
+
+	rep stosb
+
+	pop	rcx
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW
+	dec	rax
+	jg	short .expandloop
+
+.expand_end:
+	pop	rcx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	rax, r12	; rowctr
+	test	rax,rax
+	jle	near .return
+
+	mov	rdx, 0x00020001		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	rsi, r14	; input_data
+	mov	rdi, r15	; output_data
+.rowloop:
+	push	rcx
+	push	rdi
+	push	rsi
+
+	mov	rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	rdi, JSAMPROW [rdi]			; outptr
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+
+.columnloop_r8:
+	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	mov	rcx, SIZEOF_XMMWORD
+	jmp	short .downsample
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	pand	xmm0,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm1,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm0,xmm4
+	paddw	xmm1,xmm5
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm5,xmm3
+	pand	xmm2,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm3,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm2,xmm4
+	paddw	xmm3,xmm5
+
+	paddw	xmm0,xmm1
+	paddw	xmm2,xmm3
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm7
+	psrlw	xmm0,2
+	psrlw	xmm2,2
+
+	packuswb xmm0,xmm2
+
+	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+	sub	rcx, byte SIZEOF_XMMWORD	; outcol
+	add	rdx, byte 2*SIZEOF_XMMWORD	; inptr0
+	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr1
+	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	rcx,rcx
+	jnz	near .columnloop_r8
+
+	pop	rsi
+	pop	rdi
+	pop	rcx
+
+	add	rsi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	rdi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	rax				; rowctr
+	jg	near .rowloop
+
+.return:
+	uncollect_args
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jcsamss2.asm
+++ b/jpeg/simd/jcsamss2.asm
@ -0,0 +1,351 @@
+;
+; jcsamss2.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, INT [max_v_samp(ebp)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	edx, 0x00010000		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+.columnloop_r8:
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	pxor	xmm1,xmm1
+	mov	ecx, SIZEOF_XMMWORD
+	jmp	short .downsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm2,xmm0
+	movdqa	xmm3,xmm1
+
+	pand	xmm0,xmm6
+	psrlw	xmm2,BYTE_BIT
+	pand	xmm1,xmm6
+	psrlw	xmm3,BYTE_BIT
+
+	paddw	xmm0,xmm2
+	paddw	xmm1,xmm3
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+	psrlw	xmm0,1
+	psrlw	xmm1,1
+
+	packuswb xmm0,xmm1
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+	sub	ecx, byte SIZEOF_XMMWORD	; outcol
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	test	ecx,ecx
+	jnz	short .columnloop_r8
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define img_width(b)	(b)+8			; JDIMENSION image_width
+%define max_v_samp(b)	(b)+12		; int max_v_samp_factor
+%define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
+%define width_blks(b)	(b)+20		; JDIMENSION width_blocks
+%define input_data(b)	(b)+24		; JSAMPARRAY input_data
+%define output_data(b)	(b)+28	; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, JDIMENSION [width_blks(ebp)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, JDIMENSION [img_width(ebp)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx
+	jle	short .expand_end
+
+	mov	eax, INT [max_v_samp(ebp)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax
+	push	ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx
+	mov	al, JSAMPLE [edi-1]
+
+	rep stosb
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	edx, 0x00020001		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	alignx	16,7
+
+.columnloop_r8:
+	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	mov	ecx, SIZEOF_XMMWORD
+	jmp	short .downsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	pand	xmm0,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm1,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm0,xmm4
+	paddw	xmm1,xmm5
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm5,xmm3
+	pand	xmm2,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm3,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm2,xmm4
+	paddw	xmm3,xmm5
+
+	paddw	xmm0,xmm1
+	paddw	xmm2,xmm3
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm7
+	psrlw	xmm0,2
+	psrlw	xmm2,2
+
+	packuswb xmm0,xmm2
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+	sub	ecx, byte SIZEOF_XMMWORD	; outcol
+	add	edx, byte 2*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .columnloop_r8
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jdclrmmx.asm
+++ b/jpeg/simd/jdclrmmx.asm
@ -0,0 +1,407 @@
+;
+; jdclrmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
+;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                            JSAMPARRAY output_buf, int num_rows)
+;
+
+%define out_width(b)	(b)+8			; JDIMENSION out_width
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	push	eax
+	push	edi
+	push	edx
+	push	ebx
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:
+
+	movq	mm5, MMWORD [ebx]	; mm5=Cb(01234567)
+	movq	mm1, MMWORD [edx]	; mm1=Cr(01234567)
+
+	pcmpeqw	mm4,mm4
+	pcmpeqw	mm7,mm7
+	psrlw	mm4,BYTE_BIT
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+	movq	mm0,mm4			; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	mm4,mm5			; mm4=Cb(0246)=CbE
+	psrlw	mm5,BYTE_BIT		; mm5=Cb(1357)=CbO
+	pand	mm0,mm1			; mm0=Cr(0246)=CrE
+	psrlw	mm1,BYTE_BIT		; mm1=Cr(1357)=CrO
+
+	paddw	mm4,mm7
+	paddw	mm5,mm7
+	paddw	mm0,mm7
+	paddw	mm1,mm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm2,mm4			; mm2=CbE
+	movq	mm3,mm5			; mm3=CbO
+	paddw	mm4,mm4			; mm4=2*CbE
+	paddw	mm5,mm5			; mm5=2*CbO
+	movq	mm6,mm0			; mm6=CrE
+	movq	mm7,mm1			; mm7=CrO
+	paddw	mm0,mm0			; mm0=2*CrE
+	paddw	mm1,mm1			; mm1=2*CrO
+
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbE * -FIX(0.22800))
+	pmulhw	mm5,[GOTOFF(eax,PW_MF0228)]	; mm5=(2*CbO * -FIX(0.22800))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrE * FIX(0.40200))
+	pmulhw	mm1,[GOTOFF(eax,PW_F0402)]	; mm1=(2*CrO * FIX(0.40200))
+
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	paddw	mm5,[GOTOFF(eax,PW_ONE)]
+	psraw	mm4,1			; mm4=(CbE * -FIX(0.22800))
+	psraw	mm5,1			; mm5=(CbO * -FIX(0.22800))
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	paddw	mm1,[GOTOFF(eax,PW_ONE)]
+	psraw	mm0,1			; mm0=(CrE * FIX(0.40200))
+	psraw	mm1,1			; mm1=(CrO * FIX(0.40200))
+
+	paddw	mm4,mm2
+	paddw	mm5,mm3
+	paddw	mm4,mm2			; mm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	mm5,mm3			; mm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	mm0,mm6			; mm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	mm1,mm7			; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(B-Y)E
+	movq	MMWORD [wk(1)], mm5	; wk(1)=(B-Y)O
+
+	movq      mm4,mm2
+	movq      mm5,mm3
+	punpcklwd mm2,mm6
+	punpckhwd mm4,mm6
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm3,mm7
+	punpckhwd mm5,mm7
+	pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm4,SCALEBITS
+	paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm3,SCALEBITS
+	psrad     mm5,SCALEBITS
+
+	packssdw  mm2,mm4	; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  mm3,mm5	; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     mm2,mm6	; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     mm3,mm7	; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movq      mm5, MMWORD [esi]	; mm5=Y(01234567)
+
+	pcmpeqw   mm4,mm4
+	psrlw     mm4,BYTE_BIT		; mm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      mm4,mm5		; mm4=Y(0246)=YE
+	psrlw     mm5,BYTE_BIT		; mm5=Y(1357)=YO
+
+	paddw     mm0,mm4		; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm5		; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm4		; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm5		; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4, MMWORD [wk(0)]	; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+	paddw     mm5, MMWORD [wk(1)]	; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .nextrow
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .nextrow
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:
+	pop	ecx
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jdclrss2-64.asm
+++ b/jpeg/simd/jdclrss2-64.asm
@ -0,0 +1,487 @@
+;
+; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+				
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+;
+
+; r10 = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14 = int num_rows
+
+%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+	push	rbp
+	mov	rax,rsp				; rax = original rbp
+	sub	rsp, byte 4
+	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[rsp],rax
+	mov	rbp,rsp				; rbp = aligned rbp
+	lea	rsp, [wk(0)]
+	collect_args
+	push	rbx
+
+	mov	rcx, r10	; num_cols
+	test	rcx,rcx
+	jz	near .return
+
+	push	rcx
+
+	mov	rdi, r11
+	mov	rcx, r12
+	mov	rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+	mov	rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+	mov	rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+	lea	rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+	lea	rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+	lea	rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+	pop	rcx
+
+	mov	rdi, r13
+	mov	eax, r14d
+	test	rax,rax
+	jle	near .return
+.rowloop:
+	push	rax
+	push	rdi
+	push	rdx
+	push	rbx
+	push	rsi
+	push	rcx			; col
+
+	mov	rsi, JSAMPROW [rsi]	; inptr0
+	mov	rbx, JSAMPROW [rbx]	; inptr1
+	mov	rdx, JSAMPROW [rdx]	; inptr2
+	mov	rdi, JSAMPROW [rdi]	; outptr
+.columnloop:
+
+	movdqa	xmm5, XMMWORD [rbx]	; xmm5=Cb(0123456789ABCDEF)
+	movdqa	xmm1, XMMWORD [rdx]	; xmm1=Cr(0123456789ABCDEF)
+
+	pcmpeqw	xmm4,xmm4
+	pcmpeqw	xmm7,xmm7
+	psrlw	xmm4,BYTE_BIT
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
+	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
+	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
+	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
+
+	paddw	xmm4,xmm7
+	paddw	xmm5,xmm7
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm2,xmm4		; xmm2=CbE
+	movdqa	xmm3,xmm5		; xmm3=CbO
+	paddw	xmm4,xmm4		; xmm4=2*CbE
+	paddw	xmm5,xmm5		; xmm5=2*CbO
+	movdqa	xmm6,xmm0		; xmm6=CrE
+	movdqa	xmm7,xmm1		; xmm7=CrO
+	paddw	xmm0,xmm0		; xmm0=2*CrE
+	paddw	xmm1,xmm1		; xmm1=2*CrO
+
+	pmulhw	xmm4,[rel PW_MF0228]	; xmm4=(2*CbE * -FIX(0.22800))
+	pmulhw	xmm5,[rel PW_MF0228]	; xmm5=(2*CbO * -FIX(0.22800))
+	pmulhw	xmm0,[rel PW_F0402]	; xmm0=(2*CrE * FIX(0.40200))
+	pmulhw	xmm1,[rel PW_F0402]	; xmm1=(2*CrO * FIX(0.40200))
+
+	paddw	xmm4,[rel PW_ONE]
+	paddw	xmm5,[rel PW_ONE]
+	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
+	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
+	paddw	xmm0,[rel PW_ONE]
+	paddw	xmm1,[rel PW_ONE]
+	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
+	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
+
+	paddw	xmm4,xmm2
+	paddw	xmm5,xmm3
+	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm5,xmm3
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm4,xmm6
+	pmaddwd   xmm2,[rel PW_MF0344_F0285]
+	pmaddwd   xmm4,[rel PW_MF0344_F0285]
+	punpcklwd xmm3,xmm7
+	punpckhwd xmm5,xmm7
+	pmaddwd   xmm3,[rel PW_MF0344_F0285]
+	pmaddwd   xmm5,[rel PW_MF0344_F0285]
+
+	paddd     xmm2,[rel PD_ONEHALF]
+	paddd     xmm4,[rel PD_ONEHALF]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm4,SCALEBITS
+	paddd     xmm3,[rel PD_ONEHALF]
+	paddd     xmm5,[rel PD_ONEHALF]
+	psrad     xmm3,SCALEBITS
+	psrad     xmm5,SCALEBITS
+
+	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movdqa    xmm5, XMMWORD [rsi]	; xmm5=Y(0123456789ABCDEF)
+
+	pcmpeqw   xmm4,xmm4
+	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
+	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
+
+	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	rdi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [rdi], xmmF
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	rcx, byte SIZEOF_XMMWORD
+	jz	near .nextrow
+
+	add	rsi, byte SIZEOF_XMMWORD	; inptr0
+	add	rbx, byte SIZEOF_XMMWORD	; inptr1
+	add	rdx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	rcx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF
+	sub	rcx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	rcx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	rax,rcx
+	xor	rcx, byte 0x0F
+	shl	rcx, 2
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH
+	; ----------------
+	mov	rcx,rdi
+	and	rcx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	add	rax,rcx
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,rcx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG
+	por	xmmE,xmmC
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	rcx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	rdi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [rdi], xmmC
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [rdi], xmmH
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	rcx, byte SIZEOF_XMMWORD
+	jz	near .nextrow
+
+	add	rsi, byte SIZEOF_XMMWORD	; inptr0
+	add	rbx, byte SIZEOF_XMMWORD	; inptr1
+	add	rdx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	rcx, byte SIZEOF_XMMWORD/2
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC
+	movdqa	xmmD,xmmH
+	sub	rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	rcx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	rcx, byte SIZEOF_XMMWORD/16
+	jb	near .nextrow
+	mov	rax,rcx
+	xor	rcx, byte 0x03
+	inc	rcx
+	shl	rcx, 4
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF
+	punpcklbw xmmE,xmmE
+	; ----------------
+	mov	rcx,rdi
+	and	rcx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	lea	rax, [rcx+rax*4]	; RGB_PIXELSIZE
+	cmp	rax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	rcx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB
+	por	xmmE,xmmG
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+	pop	rcx
+	pop	rsi
+	pop	rbx
+	pop	rdx
+	pop	rdi
+	pop	rax
+
+	add	rsi, byte SIZEOF_JSAMPROW
+	add	rbx, byte SIZEOF_JSAMPROW
+	add	rdx, byte SIZEOF_JSAMPROW
+	add	rdi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	rax				; num_rows
+	jg	near .rowloop
+
+	sfence		; flush the write buffer
+
+.return:
+	pop	rbx
+	uncollect_args
+	mov	rsp,rbp		; rsp <- aligned rbp
+	pop	rsp		; rsp <- original rbp
+	pop	rbp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jdclrss2.asm
+++ b/jpeg/simd/jdclrss2.asm
@ -0,0 +1,505 @@
+;
+; jdclrss2.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+				
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+;
+
+%define out_width(b)	(b)+8			; JDIMENSION out_width
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	push	eax
+	push	edi
+	push	edx
+	push	ebx
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:
+
+	movdqa	xmm5, XMMWORD [ebx]	; xmm5=Cb(0123456789ABCDEF)
+	movdqa	xmm1, XMMWORD [edx]	; xmm1=Cr(0123456789ABCDEF)
+
+	pcmpeqw	xmm4,xmm4
+	pcmpeqw	xmm7,xmm7
+	psrlw	xmm4,BYTE_BIT
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
+	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
+	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
+	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
+
+	paddw	xmm4,xmm7
+	paddw	xmm5,xmm7
+	paddw	xmm0,xmm7
+	paddw	xmm1,xmm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm2,xmm4		; xmm2=CbE
+	movdqa	xmm3,xmm5		; xmm3=CbO
+	paddw	xmm4,xmm4		; xmm4=2*CbE
+	paddw	xmm5,xmm5		; xmm5=2*CbO
+	movdqa	xmm6,xmm0		; xmm6=CrE
+	movdqa	xmm7,xmm1		; xmm7=CrO
+	paddw	xmm0,xmm0		; xmm0=2*CrE
+	paddw	xmm1,xmm1		; xmm1=2*CrO
+
+	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbE * -FIX(0.22800))
+	pmulhw	xmm5,[GOTOFF(eax,PW_MF0228)]	; xmm5=(2*CbO * -FIX(0.22800))
+	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrE * FIX(0.40200))
+	pmulhw	xmm1,[GOTOFF(eax,PW_F0402)]	; xmm1=(2*CrO * FIX(0.40200))
+
+	paddw	xmm4,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm5,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
+	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
+	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm1,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
+	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
+
+	paddw	xmm4,xmm2
+	paddw	xmm5,xmm3
+	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm5,xmm3
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm4,xmm6
+	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd xmm3,xmm7
+	punpckhwd xmm5,xmm7
+	pmaddwd   xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm4,SCALEBITS
+	paddd     xmm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm3,SCALEBITS
+	psrad     xmm5,SCALEBITS
+
+	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movdqa    xmm5, XMMWORD [esi]	; xmm5=Y(0123456789ABCDEF)
+
+	pcmpeqw   xmm4,xmm4
+	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
+	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
+
+	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .nextrow
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF
+	sub	ecx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	ecx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	eax,ecx
+	xor	ecx, byte 0x0F
+	shl	ecx, 2
+	movd	xmmB,ecx
+	psrlq	xmmH,4
+	pcmpeqb	xmmE,xmmE
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	add	eax,ecx
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG
+	por	xmmE,xmmC
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st32
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .nextrow
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	ecx, byte SIZEOF_XMMWORD/2
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC
+	movdqa	xmmD,xmmH
+	sub	ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD
+	sub	ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	ecx, byte SIZEOF_XMMWORD/16
+	jb	short .nextrow
+	mov	eax,ecx
+	xor	ecx, byte 0x03
+	inc	ecx
+	shl	ecx, 4
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF
+	punpcklbw xmmE,xmmE
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0
+	lea	eax, [ecx+eax*4]	; RGB_PIXELSIZE
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB
+	por	xmmE,xmmG
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:
+	pop	ecx
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	sfence		; flush the write buffer
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+	align	16
--- a/jpeg/simd/jdcolmmx.asm
+++ b/jpeg/simd/jdcolmmx.asm
@ -0,0 +1,117 @@
+;
+; jdcolmmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402	times 4 dw  F_0_402
+PW_MF0228	times 4 dw -F_0_228
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
+PW_ONE		times 4 dw  1
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
+%include "jdclrmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
+%include "jdclrmmx.asm"
--- a/jpeg/simd/jdcolss2-64.asm
+++ b/jpeg/simd/jdcolss2-64.asm
@ -0,0 +1,117 @@
+;
+; jdcolss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402	times 8 dw  F_0_402
+PW_MF0228	times 8 dw -F_0_228
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285
+PW_ONE		times 8 dw  1
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdclrss2-64.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdclrss2-64.asm"
--- a/Показать больше
+++ b/Показать больше