зеркало из https://github.com/mozilla/mozjpeg.git
Use clz/bsr instructions on ARM for bit counting rather than the lookup table (reduces memory footprint and can improve performance in some cases).
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1220 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
Родитель
8bad4251a9
Коммит
0cfc4c17b7
|
@ -39,6 +39,15 @@ OS X 10.6 "Snow Leopard" or later must be used when packaging libjpeg-turbo,
|
|||
although the packages produced can be installed on OS X 10.5 "Leopard" or
|
||||
later. OS X 10.4 "Tiger" is no longer supported.
|
||||
|
||||
[5] The Huffman encoder now uses clz and bsr instructions for bit counting on
|
||||
ARM platforms rather than a lookup table. This reduces the memory footprint
|
||||
by 64k, which may be important for some mobile applications. Out of four
|
||||
Android devices that were tested, two demonstrated a small loss (~3-4% on
|
||||
average) with ARMv6 code and a small gain (also ~3-4%) with ARMv7 code when
|
||||
enabling this new feature, but the other two devices demonstrated a
|
||||
significant performance gain across the board (~10-20%). Actual mileage may
|
||||
vary.
|
||||
|
||||
|
||||
1.3.1
|
||||
=====
|
||||
|
|
34
jchuff.c
34
jchuff.c
|
@ -22,8 +22,36 @@
|
|||
#include "jchuff.h" /* Declarations shared with jcphuff.c */
|
||||
#include <limits.h>
|
||||
|
||||
/*
|
||||
* NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
|
||||
* used for bit counting rather than the lookup table. This will reduce the
|
||||
* memory footprint by 64k, which is important for some mobile applications
|
||||
* that create many isolated instances of libjpeg-turbo (web browsers, for
|
||||
* instance). This may improve performance on some mobile platforms as well.
|
||||
* This feature is enabled by default only on ARM processors, because some x86
|
||||
* chips have a slow implementation of bsr, and the use of clz/bsr cannot be
|
||||
* shown to have a significant performance impact even on the x86 chips that
|
||||
* have a fast implementation of it. When building for ARMv6, you can
|
||||
* explicitly disable the use of clz/bsr by adding -mthumb to the compiler
|
||||
* flags (this defines __thumb__).
|
||||
*/
|
||||
|
||||
/* NOTE: Both GCC and Clang define __GNUC__ */
|
||||
#if defined __GNUC__ && defined __arm__
|
||||
#if !defined __thumb__ || defined __thumb2__
|
||||
#define USE_CLZ_INTRINSIC
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_CLZ_INTRINSIC
|
||||
#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
|
||||
/* Bits needed to represent x; yields 0 for x == 0, where clz is undefined. NOTE: x is evaluated twice. */
#define JPEG_NBITS(x) ((x) ? JPEG_NBITS_NONZERO(x) : 0)
|
||||
#else
|
||||
static unsigned char jpeg_nbits_table[65536];
|
||||
static int jpeg_nbits_table_init = 0;
|
||||
#define JPEG_NBITS(x) (jpeg_nbits_table[x])
|
||||
#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
|
||||
#endif
|
||||
|
||||
#ifndef min
|
||||
#define min(a,b) ((a)<(b)?(a):(b))
|
||||
|
@ -272,6 +300,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
|
|||
dtbl->ehufsi[i] = huffsize[p];
|
||||
}
|
||||
|
||||
#ifndef USE_CLZ_INTRINSIC
|
||||
if(!jpeg_nbits_table_init) {
|
||||
for(i = 0; i < 65536; i++) {
|
||||
int nbits = 0, temp = i;
|
||||
|
@ -280,6 +309,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
|
|||
}
|
||||
jpeg_nbits_table_init = 1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@ -482,7 +512,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
|
|||
temp2 += temp3;
|
||||
|
||||
/* Find the number of bits needed for the magnitude of the coefficient */
|
||||
nbits = jpeg_nbits_table[temp];
|
||||
nbits = JPEG_NBITS(temp);
|
||||
|
||||
/* Emit the Huffman-coded symbol for the number of bits */
|
||||
code = dctbl->ehufco[nbits];
|
||||
|
@ -516,7 +546,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
|
|||
temp ^= temp3; \
|
||||
temp -= temp3; \
|
||||
temp2 += temp3; \
|
||||
nbits = jpeg_nbits_table[temp]; \
|
||||
nbits = JPEG_NBITS_NONZERO(temp); \
|
||||
/* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
|
||||
while (r > 15) { \
|
||||
EMIT_BITS(code_0xf0, size_0xf0) \
|
||||
|
|
Загрузка…
Ссылка в новой задаче