Faster vp9_regular_quantize_b_8x8.

A couple of scalar optimizations speeding up quantization by about 1.6x. Overall encoder speedup is around 3%. Change-Id: I19981d1ef0b33e4e5732739574f367fe82771a84
2013-02-11 17:43:27 -08:00 · 2013-02-11 17:43:27 -08:00 · 0e4397f0cd
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@ -228,10 +228,11 @@ void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) {
 }

 void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
+  if (!b->skip_block) {
    int i, rc, eob;
    int zbin;
    int x, y, z, sz;
-  int zero_run = 0;
+    int zero_run;
    int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
    int16_t *coeff_ptr  = b->coeff;
    int16_t *zbin_ptr   = b->zbin;
@ -248,23 +249,49 @@ void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {

    eob = -1;

-  if (!b->skip_block) {
-    for (i = 0; i < 64; i++) {
+    // Special case for DC as it is the one triggering access in various
+    // tables: {zbin, quant, quant_shift, dequant}_ptr[rc != 0]
+    {
+      z    = coeff_ptr[0];
+      zbin = (zbin_ptr[0] + zbin_boost_ptr[0] + zbin_oq_value);
+      zero_run = 1;
+
+      sz = (z >> 31);                                // sign of z
+      x  = (z ^ sz) - sz;                            // x = abs(z)
+
+      if (x >= zbin) {
+        x += (round_ptr[0]);
+        y  = ((int)(((int)(x * quant_ptr[0]) >> 16) + x))
+             >> quant_shift_ptr[0];                  // quantize (x)
+        x  = (y ^ sz) - sz;                          // get the sign back
+        qcoeff_ptr[0]  = x;                          // write to destination
+        dqcoeff_ptr[0] = x * dequant_ptr[0];         // dequantized value
+
+        if (y) {
+          eob = 0;                                   // last nonzero coeffs
+          zero_run = 0;
+        }
+      }
+    }
+    for (i = 1; i < 64; i++) {
      rc   = vp9_default_zig_zag1d_8x8[i];
      z    = coeff_ptr[rc];
-      zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value);
-      zero_run += (zero_run < 15);
+      zbin = (zbin_ptr[1] + zbin_boost_ptr[zero_run] + zbin_oq_value);
+      // The original code was incrementing zero_run while keeping it at
+      // maximum 15 by adding "(zero_run < 15)". The same is achieved by
+      // removing the opposite of the sign mask of "(zero_run - 15)".
+      zero_run -= (zero_run - 15) >> 31;

      sz = (z >> 31);                                // sign of z
      x  = (z ^ sz) - sz;                            // x = abs(z)

      if (x >= zbin) {
        x += (round_ptr[rc != 0]);
-        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
-             >> quant_shift_ptr[rc != 0];            // quantize (x)
+        y  = ((int)(((int)(x * quant_ptr[1]) >> 16) + x))
+             >> quant_shift_ptr[1];                  // quantize (x)
        x  = (y ^ sz) - sz;                          // get the sign back
        qcoeff_ptr[rc]  = x;                         // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0];  // dequantized value
+        dqcoeff_ptr[rc] = x * dequant_ptr[1];        // dequantized value

        if (y) {
          eob = i;                                   // last nonzero coeffs
@ -272,8 +299,10 @@ void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
        }
      }
    }
-  }
    d->eob = eob + 1;
+  } else {
+    d->eob = 0;
+  }
 }

 void vp9_quantize_mby_8x8(MACROBLOCK *x) {