diff --git a/ChangeLog.txt b/ChangeLog.txt
index 4b3db1ee..36e8a5b6 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -39,6 +39,15 @@ OS X 10.6 "Snow Leopard" or later must be used when packaging libjpeg-turbo,
 although the packages produced can be installed on OS X 10.5 "Leopard" or
 later.  OS X 10.4 "Tiger" is no longer supported.
 
+[5] The Huffman encoder now uses clz and bsr instructions for bit counting on
+ARM platforms rather than a lookup table.  This reduces the memory footprint
+by 64k, which may be important for some mobile applications.  Out of four
+Android devices that were tested, two demonstrated a small loss (~3-4% on
+average) with ARMv6 code and a small gain (also ~3-4%) with ARMv7 code when
+enabling this new feature, but the other two devices demonstrated a
+significant performance gain across the board (~10-20%.)  Actual mileage may
+vary.
+
 
 1.3.1
 =====
diff --git a/jchuff.c b/jchuff.c
index 29bf3892..68e4e0e0 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -22,8 +22,36 @@
 #include "jchuff.h"		/* Declarations shared with jcphuff.c */
 #include <limits.h>
 
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table.  This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.)  This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on ARM processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it.  When building for ARMv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if defined __GNUC__ && defined __arm__
+#if !defined __thumb__ || defined __thumb2__
+#define USE_CLZ_INTRINSIC
+#endif
+#endif
+
+#ifdef USE_CLZ_INTRINSIC
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else
 static unsigned char jpeg_nbits_table[65536];
 static int jpeg_nbits_table_init = 0;
+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
+#endif
 
 #ifndef min
  #define min(a,b) ((a)<(b)?(a):(b))
@@ -272,6 +300,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
     dtbl->ehufsi[i] = huffsize[p];
   }
 
+#ifndef USE_CLZ_INTRINSIC
   if(!jpeg_nbits_table_init) {
     for(i = 0; i < 65536; i++) {
       int nbits = 0, temp = i;
@@ -280,6 +309,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
     }
     jpeg_nbits_table_init = 1;
   }
+#endif
 }
 
 
@@ -482,7 +512,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
   temp2 += temp3;
 
   /* Find the number of bits needed for the magnitude of the coefficient */
-  nbits = jpeg_nbits_table[temp];
+  nbits = JPEG_NBITS(temp);
 
   /* Emit the Huffman-coded symbol for the number of bits */
   code = dctbl->ehufco[nbits];
@@ -516,7 +546,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
     temp ^= temp3; \
     temp -= temp3; \
     temp2 += temp3; \
-    nbits = jpeg_nbits_table[temp]; \
+    nbits = JPEG_NBITS_NONZERO(temp); \
     /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
     while (r > 15) { \
       EMIT_BITS(code_0xf0, size_0xf0) \