32x32 transform for superblocks.

This adds Debargha's DCT/DWT hybrid and a regular 32x32 DCT, and adds
code all over the place to wrap that in the bitstream/encoder/decoder/RD.

Some implementation notes (these probably need careful review):
- token range is extended by 1 bit, since the value range out of this
  transform is [-16384,16383].
- the coefficients coming out of the FDCT are manually scaled back by
  1 bit, or else they won't fit in int16_t (they are 17 bits). Because
  of this, the RD error scoring does not right-shift the MSE score by
  two (unlike for 4x4/8x8/16x16).
- to compensate for this loss in precision, the quantizer is also
  halved. This is currently a little hacky; see the sketch after
  these notes.
- FDCT and IDCT are double-only right now; they need a fixed-point
  implementation.
- There are no default probabilities for the 32x32 transform yet; I'm
  simply using the 16x16 luma ones. A future commit will add newly
  generated probabilities for all transforms.
- No ADST version. I don't think we'll add one for this level; if an
  ADST is desired, transform-size selection can scale back to 16x16
  or lower, and use an ADST at that level.
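
As a rough illustration of the coefficient scaling and quantizer halving
described above (a sketch only -- neither helper exists in the tree; the
real logic lives in the FDCT and quantizer changes below):

    /* A full-precision 32x32 FDCT coefficient needs 17 bits, so drop one
     * bit to fit int16_t; the value range becomes [-16384, 16383]. */
    static int16_t fit_coeff_in_int16(int coeff_17bit) {
      return (int16_t)(coeff_17bit >> 1);
    }

    /* Halve the quantizer step so that coeff / Q stays roughly unchanged
     * despite the halved coefficients. */
    static int quantize_halved(int16_t coeff, int q_step) {
      return coeff / (q_step >> 1);
    }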

Additional notes specific to Debargha's DWT/DCT hybrid:
- the coefficient scale differs between the top-left 16x16 (DCT-over-DWT)
  block and the rest of the block (DWT pixel differences). RD error
  therefore cannot be scaled cleanly between the coefficient and pixel
  domains, so, unfortunately, we compute the RD distortion in the pixel
  domain (sketched below) until we figure out how to scale these
  appropriately.
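
A minimal sketch of the pixel-domain distortion scoring mentioned above
(the helper and buffer names are made up; only vp9_short_idct32x32_c is
part of this change):

    /* Inverse-transform the dequantized coefficients back into the
     * residual domain, then measure SSE against the source residual.
     * (src - pred) vs. (recon - pred) gives the same SSE as comparing
     * source and reconstruction directly. */
    static int64_t sb_pixel_domain_sse(const short *src_diff, short *dqcoeff,
                                       short *recon_diff) {
      int64_t sse = 0;
      int i;
      vp9_short_idct32x32_c(dqcoeff, recon_diff, 64);
      for (i = 0; i < 32 * 32; ++i) {
        const int d = src_diff[i] - recon_diff[i];
        sse += (int64_t)d * d;
      }
      return sse;
    }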

Change-Id: I00386f20f35d7fabb19aba94c8162f8aee64ef2b
Ronald S. Bultje 2012-12-07 14:45:05 -08:00
Parent a36d9a4a15
Commit c456b35fdf
34 changed files with 2512 additions and 156 deletions

configure (vendored)

@ -247,6 +247,8 @@ EXPERIMENT_LIST="
implicit_segmentation
newbintramodes
comp_interintra_pred
tx32x32
dwt32x32hybrid
"
CONFIG_LIST="
external_build

test/dct32x32_test.cc (new file)

@ -0,0 +1,189 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
extern "C" {
#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
void vp9_short_idct32x32_c(short *input, short *output, int pitch);
}
#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
#if !CONFIG_DWT32X32HYBRID
static const double kPi = 3.141592653589793238462643383279502884;
static void reference2_32x32_idct_2d(double *input, double *output) {
double x;
for (int l = 0; l < 32; ++l) {
for (int k = 0; k < 32; ++k) {
double s = 0;
for (int i = 0; i < 32; ++i) {
for (int j = 0; j < 32; ++j) {
x = cos(kPi * j * (l + 0.5) / 32.0) *
cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
if (i != 0)
x *= sqrt(2.0);
if (j != 0)
x *= sqrt(2.0);
s += x;
}
}
output[k * 32 + l] = s / 4;
}
}
}
static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
const double kInvSqrt2 = 0.707106781186547524400844362104;
for (int k = 0; k < 32; k++) {
out[k] = 0.0;
for (int n = 0; n < 32; n++)
out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 64.0);
if (k == 0)
out[k] = out[k] * kInvSqrt2;
}
}
static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
// First transform columns
for (int i = 0; i < 32; ++i) {
double temp_in[32], temp_out[32];
for (int j = 0; j < 32; ++j)
temp_in[j] = input[j*32 + i];
reference_32x32_dct_1d(temp_in, temp_out, 1);
for (int j = 0; j < 32; ++j)
output[j * 32 + i] = temp_out[j];
}
// Then transform rows
for (int i = 0; i < 32; ++i) {
double temp_in[32], temp_out[32];
for (int j = 0; j < 32; ++j)
temp_in[j] = output[j + i*32];
reference_32x32_dct_1d(temp_in, temp_out, 1);
// Scale by some magic number
for (int j = 0; j < 32; ++j)
output[j + i * 32] = temp_out[j] / 4;
}
}
TEST(VP9Idct32x32Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t in[1024], coeff[1024];
int16_t out_c[1024];
double out_r[1024];
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 1024; ++j)
in[j] = rnd.Rand8() - rnd.Rand8();
reference_32x32_dct_2d(in, out_r);
for (int j = 0; j < 1024; j++)
coeff[j] = round(out_r[j]);
vp9_short_idct32x32_c(coeff, out_c, 64);
for (int j = 0; j < 1024; ++j) {
const int diff = out_c[j] - in[j];
const int error = diff * diff;
EXPECT_GE(1, error)
<< "Error: 3x32 IDCT has error " << error
<< " at index " << j;
}
vp9_short_fdct32x32_c(in, out_c, 64);
for (int j = 0; j < 1024; ++j) {
const double diff = coeff[j] - out_c[j];
const double error = diff * diff;
EXPECT_GE(1.0, error)
<< "Error: 32x32 FDCT has error " << error
<< " at index " << j;
}
}
}
#else // CONFIG_DWT32X32HYBRID
// TODO(rbultje/debargha): add DWT-specific tests
#endif // CONFIG_DWT32X32HYBRID
TEST(VP9Fdct32x32Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
unsigned int max_error = 0;
int64_t total_error = 0;
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[1024];
int16_t test_temp_block[1024];
int16_t test_output_block[1024];
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 1024; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
const int pitch = 64;
vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
for (int j = 0; j < 1024; ++j) {
const unsigned diff = test_input_block[j] - test_output_block[j];
const unsigned error = diff * diff;
if (max_error < error)
max_error = error;
total_error += error;
}
}
EXPECT_GE(1u, max_error)
<< "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
EXPECT_GE(count_test_block/10, total_error)
<< "Error: 32x32 FDCT/IDCT has average roundtrip error > 1/10 per block";
}
TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t input_block[1024], input_extreme_block[1024];
int16_t output_block[1024], output_extreme_block[1024];
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 1024; ++j) {
input_block[j] = rnd.Rand8() - rnd.Rand8();
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
if (i == 0)
for (int j = 0; j < 1024; ++j)
input_extreme_block[j] = 255;
const int pitch = 32;
vp9_short_fdct32x32_c(input_block, output_block, pitch);
vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
// The minimum quant value is 4.
for (int j = 0; j < 1024; ++j) {
EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
<< "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
<< "Error: 32x32 FDCT extreme has coefficient larger than "
"4*DCT_MAX_VALUE";
}
}
}
} // namespace


@ -64,6 +64,9 @@ endif
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
#LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_TX32X32),yesyes)
LIBVPX_TEST_SRCS-yes += dct32x32_test.cc
endif
LIBVPX_TEST_SRCS-yes += idct8x8_test.cc
LIBVPX_TEST_SRCS-yes += variance_test.cc
endif # VP9


@ -129,7 +129,13 @@ typedef enum {
TX_4X4, // 4x4 dct transform
TX_8X8, // 8x8 dct transform
TX_16X16, // 16x16 dct transform
TX_SIZE_MAX // Number of different transforms available
TX_SIZE_MAX_MB, // Number of transforms available to MBs
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
TX_32X32 = TX_SIZE_MAX_MB, // 32x32 dct transform
TX_SIZE_MAX_SB, // Number of transforms available to SBs
#else
TX_SIZE_MAX_SB = TX_SIZE_MAX_MB,
#endif
} TX_SIZE;
typedef enum {
@ -302,6 +308,15 @@ typedef struct blockd {
union b_mode_info bmi;
} BLOCKD;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
typedef struct superblockd {
/* 32x32 Y and 16x16 U/V. No 2nd order transform yet. */
DECLARE_ALIGNED(16, short, diff[32*32+16*16*2]);
DECLARE_ALIGNED(16, short, qcoeff[32*32+16*16*2]);
DECLARE_ALIGNED(16, short, dqcoeff[32*32+16*16*2]);
} SUPERBLOCKD;
#endif
typedef struct macroblockd {
DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
DECLARE_ALIGNED(16, unsigned char, predictor[384]);
@ -309,6 +324,10 @@ typedef struct macroblockd {
DECLARE_ALIGNED(16, short, dqcoeff[400]);
DECLARE_ALIGNED(16, unsigned short, eobs[25]);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
SUPERBLOCKD sb_coeff_data;
#endif
/* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
BLOCKD block[25];
int fullpixel_mask;


@ -1375,3 +1375,5 @@ static const vp9_prob
}
}
};
#define default_coef_probs_32x32 default_coef_probs_16x16


@ -132,6 +132,109 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,
};
DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
};
DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66, 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68, 37, 6, 7, 38, 69, 100,
131, 162, 193, 224, 256, 225, 194, 163, 132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195, 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72,
41, 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 13, 44, 75, 106, 137,
168, 199, 230, 261, 292, 323, 354, 385, 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232,
263, 294, 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79, 110, 141, 172, 203,
234, 265, 296, 327, 358, 389, 420, 451, 482, 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, 111, 80, 49, 18, 19, 50,
81, 112, 143, 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546, 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330, 299, 268, 237,
206, 175, 144, 113, 82, 51, 20, 21, 52, 83, 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486, 517, 548, 579, 610, 641, 672, 704, 673, 642,
611, 580, 549, 518, 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, 333, 364,
395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705, 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427, 396, 365, 334, 303, 272, 241, 210, 179,
148, 117, 86, 55, 24, 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397, 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800, 832,
801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460, 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 27, 58, 89, 120, 151, 182,
213, 244, 275, 306, 337, 368, 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771, 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617,
586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215, 246, 277, 308, 339, 370, 401,
432, 463, 494, 525, 556, 587, 618, 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898, 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526,
495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, 372, 403, 434, 465, 496,
527, 558, 589, 620, 651, 682, 713, 744, 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869, 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528,
497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467, 498, 529, 560, 591,
622, 653, 684, 715, 746, 777, 808, 839, 870, 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747, 716, 685, 654, 623, 592, 561, 530, 499, 468, 437,
406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314, 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717, 748, 779, 810,
841, 872, 903, 934, 965, 996, 997, 966, 935, 904, 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501, 470, 439, 408, 377, 346, 315, 284, 253, 222,
191, 223, 254, 285, 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688, 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937, 906, 875,
844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534, 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380, 411, 442, 473, 504, 535, 566, 597, 628,
659, 690, 721, 752, 783, 814, 845, 876, 907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846, 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443, 412,
381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599, 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002, 1003, 972, 941, 910, 879, 848, 817,
786, 755, 724, 693, 662, 631, 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539, 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942,
973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695, 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603, 634, 665, 696, 727, 758, 789,
820, 851, 882, 913, 944, 975, 1006, 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635, 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760,
791, 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854, 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700, 731, 762, 793, 824, 855,
886, 917, 948, 979, 1010, 1011, 980, 949, 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733, 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982,
951, 920, 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890, 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767, 799, 830, 861, 892,
923, 954, 985, 1016, 1017, 986, 955, 924, 893, 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895, 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023,
};
/* Array indices are identical to previously-existing CONTEXT_NODE indices */
@ -160,10 +263,11 @@ static const Prob Pcat2[] = { 165, 145};
static const Prob Pcat3[] = { 173, 148, 140};
static const Prob Pcat4[] = { 176, 155, 140, 135};
static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
static const Prob Pcat6[] =
{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129};
static const Prob Pcat6[] = {
254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
};
static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26];
static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
static void init_bit_tree(vp9_tree_index *p, int n) {
int i = 0;
@ -182,7 +286,7 @@ static void init_bit_trees() {
init_bit_tree(cat3, 3);
init_bit_tree(cat4, 4);
init_bit_tree(cat5, 5);
init_bit_tree(cat6, 13);
init_bit_tree(cat6, 14);
}
vp9_extra_bit_struct vp9_extra_bits[12] = {
@ -196,7 +300,7 @@ vp9_extra_bit_struct vp9_extra_bits[12] = {
{ cat3, Pcat3, 3, 11},
{ cat4, Pcat4, 4, 19},
{ cat5, Pcat5, 5, 35},
{ cat6, Pcat6, 13, 67},
{ cat6, Pcat6, 14, 67},
{ 0, 0, 0, 0}
};
@ -218,6 +322,11 @@ void vp9_default_coef_probs(VP9_COMMON *pc) {
vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
default_hybrid_coef_probs_16x16,
sizeof(pc->fc.hybrid_coef_probs_16x16));
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,
sizeof(pc->fc.coef_probs_32x32));
#endif
}
void vp9_coef_tree_initialize() {
@ -444,4 +553,28 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) {
else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob;
}
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
for (i = 0; i < BLOCK_TYPES_32X32; ++i)
for (j = 0; j < COEF_BANDS; ++j)
for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
continue;
vp9_tree_probs_from_distribution(
MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
coef_probs, branch_ct, cm->fc.coef_counts_32x32[i][j][k], 256, 1);
for (t = 0; t < ENTROPY_NODES; ++t) {
int prob;
count = branch_ct[t][0] + branch_ct[t][1];
count = count > count_sat ? count_sat : count;
factor = (update_factor * count / count_sat);
prob = ((int)cm->fc.pre_coef_probs_32x32[i][j][k][t] *
(256 - factor) +
(int)coef_probs[t] * factor + 128) >> 8;
if (prob <= 0) cm->fc.coef_probs_32x32[i][j][k][t] = 1;
else if (prob > 255) cm->fc.coef_probs_32x32[i][j][k][t] = 255;
else cm->fc.coef_probs_32x32[i][j][k][t] = prob;
}
}
#endif
}


@ -55,7 +55,7 @@ extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */
#define PROB_UPDATE_BASELINE_COST 7
#define MAX_PROB 255
#define DCT_MAX_VALUE 8192
#define DCT_MAX_VALUE 16384
/* Coefficients are predicted via a 3-dimensional probability table. */
@ -66,6 +66,10 @@ extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */
#define BLOCK_TYPES_16X16 4
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
#define BLOCK_TYPES_32X32 4
#endif
/* Middle dimension is a coarsening of the coefficient's
position within the 4x4 DCT. */
@ -73,6 +77,9 @@ extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */
extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]);
extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]);
#endif
/* Inside dimension is 3-valued measure of nearby complexity, that is,
the extent to which nearby coefficients are nonzero. For the first
@ -106,9 +113,13 @@ extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]);
extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]);
extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]);
#endif
void vp9_coef_tree_initialize(void);
extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
void vp9_adapt_coef_probs(struct VP9Common *);
#endif


@ -1774,3 +1774,465 @@ void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
#undef RIGHT_SHIFT
#undef RIGHT_ROUNDING
#endif
#if CONFIG_TX32X32
#if !CONFIG_DWT32X32HYBRID
#define DownshiftMultiplyBy2(x) x * 2
#define DownshiftMultiply(x) x
static void idct16(double *input, double *output, int stride) {
static const double C1 = 0.995184726672197;
static const double C2 = 0.98078528040323;
static const double C3 = 0.956940335732209;
static const double C4 = 0.923879532511287;
static const double C5 = 0.881921264348355;
static const double C6 = 0.831469612302545;
static const double C7 = 0.773010453362737;
static const double C8 = 0.707106781186548;
static const double C9 = 0.634393284163646;
static const double C10 = 0.555570233019602;
static const double C11 = 0.471396736825998;
static const double C12 = 0.38268343236509;
static const double C13 = 0.290284677254462;
static const double C14 = 0.195090322016128;
static const double C15 = 0.098017140329561;
double step[16];
double intermediate[16];
double temp1, temp2;
// step 1 and 2
step[ 0] = input[stride*0] + input[stride*8];
step[ 1] = input[stride*0] - input[stride*8];
temp1 = input[stride*4]*C12;
temp2 = input[stride*12]*C4;
temp1 -= temp2;
temp1 = DownshiftMultiply(temp1);
temp1 *= C8;
step[ 2] = DownshiftMultiplyBy2(temp1);
temp1 = input[stride*4]*C4;
temp2 = input[stride*12]*C12;
temp1 += temp2;
temp1 = DownshiftMultiply(temp1);
temp1 *= C8;
step[ 3] = DownshiftMultiplyBy2(temp1);
temp1 = input[stride*2]*C8;
temp1 = DownshiftMultiplyBy2(temp1);
temp2 = input[stride*6] + input[stride*10];
step[ 4] = temp1 + temp2;
step[ 5] = temp1 - temp2;
temp1 = input[stride*14]*C8;
temp1 = DownshiftMultiplyBy2(temp1);
temp2 = input[stride*6] - input[stride*10];
step[ 6] = temp2 - temp1;
step[ 7] = temp2 + temp1;
// for odd input
temp1 = input[stride*3]*C12;
temp2 = input[stride*13]*C4;
temp1 += temp2;
temp1 = DownshiftMultiply(temp1);
temp1 *= C8;
intermediate[ 8] = DownshiftMultiplyBy2(temp1);
temp1 = input[stride*3]*C4;
temp2 = input[stride*13]*C12;
temp2 -= temp1;
temp2 = DownshiftMultiply(temp2);
temp2 *= C8;
intermediate[ 9] = DownshiftMultiplyBy2(temp2);
intermediate[10] = DownshiftMultiplyBy2(input[stride*9]*C8);
intermediate[11] = input[stride*15] - input[stride*1];
intermediate[12] = input[stride*15] + input[stride*1];
intermediate[13] = DownshiftMultiplyBy2((input[stride*7]*C8));
temp1 = input[stride*11]*C12;
temp2 = input[stride*5]*C4;
temp2 -= temp1;
temp2 = DownshiftMultiply(temp2);
temp2 *= C8;
intermediate[14] = DownshiftMultiplyBy2(temp2);
temp1 = input[stride*11]*C4;
temp2 = input[stride*5]*C12;
temp1 += temp2;
temp1 = DownshiftMultiply(temp1);
temp1 *= C8;
intermediate[15] = DownshiftMultiplyBy2(temp1);
step[ 8] = intermediate[ 8] + intermediate[14];
step[ 9] = intermediate[ 9] + intermediate[15];
step[10] = intermediate[10] + intermediate[11];
step[11] = intermediate[10] - intermediate[11];
step[12] = intermediate[12] + intermediate[13];
step[13] = intermediate[12] - intermediate[13];
step[14] = intermediate[ 8] - intermediate[14];
step[15] = intermediate[ 9] - intermediate[15];
// step 3
output[stride*0] = step[ 0] + step[ 3];
output[stride*1] = step[ 1] + step[ 2];
output[stride*2] = step[ 1] - step[ 2];
output[stride*3] = step[ 0] - step[ 3];
temp1 = step[ 4]*C14;
temp2 = step[ 7]*C2;
temp1 -= temp2;
output[stride*4] = DownshiftMultiply(temp1);
temp1 = step[ 4]*C2;
temp2 = step[ 7]*C14;
temp1 += temp2;
output[stride*7] = DownshiftMultiply(temp1);
temp1 = step[ 5]*C10;
temp2 = step[ 6]*C6;
temp1 -= temp2;
output[stride*5] = DownshiftMultiply(temp1);
temp1 = step[ 5]*C6;
temp2 = step[ 6]*C10;
temp1 += temp2;
output[stride*6] = DownshiftMultiply(temp1);
output[stride*8] = step[ 8] + step[11];
output[stride*9] = step[ 9] + step[10];
output[stride*10] = step[ 9] - step[10];
output[stride*11] = step[ 8] - step[11];
output[stride*12] = step[12] + step[15];
output[stride*13] = step[13] + step[14];
output[stride*14] = step[13] - step[14];
output[stride*15] = step[12] - step[15];
// output 4
step[ 0] = output[stride*0] + output[stride*7];
step[ 1] = output[stride*1] + output[stride*6];
step[ 2] = output[stride*2] + output[stride*5];
step[ 3] = output[stride*3] + output[stride*4];
step[ 4] = output[stride*3] - output[stride*4];
step[ 5] = output[stride*2] - output[stride*5];
step[ 6] = output[stride*1] - output[stride*6];
step[ 7] = output[stride*0] - output[stride*7];
temp1 = output[stride*8]*C7;
temp2 = output[stride*15]*C9;
temp1 -= temp2;
step[ 8] = DownshiftMultiply(temp1);
temp1 = output[stride*9]*C11;
temp2 = output[stride*14]*C5;
temp1 += temp2;
step[ 9] = DownshiftMultiply(temp1);
temp1 = output[stride*10]*C3;
temp2 = output[stride*13]*C13;
temp1 -= temp2;
step[10] = DownshiftMultiply(temp1);
temp1 = output[stride*11]*C15;
temp2 = output[stride*12]*C1;
temp1 += temp2;
step[11] = DownshiftMultiply(temp1);
temp1 = output[stride*11]*C1;
temp2 = output[stride*12]*C15;
temp2 -= temp1;
step[12] = DownshiftMultiply(temp2);
temp1 = output[stride*10]*C13;
temp2 = output[stride*13]*C3;
temp1 += temp2;
step[13] = DownshiftMultiply(temp1);
temp1 = output[stride*9]*C5;
temp2 = output[stride*14]*C11;
temp2 -= temp1;
step[14] = DownshiftMultiply(temp2);
temp1 = output[stride*8]*C9;
temp2 = output[stride*15]*C7;
temp1 += temp2;
step[15] = DownshiftMultiply(temp1);
// step 5
output[stride*0] = step[0] + step[15];
output[stride*1] = step[1] + step[14];
output[stride*2] = step[2] + step[13];
output[stride*3] = step[3] + step[12];
output[stride*4] = step[4] + step[11];
output[stride*5] = step[5] + step[10];
output[stride*6] = step[6] + step[ 9];
output[stride*7] = step[7] + step[ 8];
output[stride*15] = step[0] - step[15];
output[stride*14] = step[1] - step[14];
output[stride*13] = step[2] - step[13];
output[stride*12] = step[3] - step[12];
output[stride*11] = step[4] - step[11];
output[stride*10] = step[5] - step[10];
output[stride*9] = step[6] - step[ 9];
output[stride*8] = step[7] - step[ 8];
}
static void butterfly_32_idct_1d(double *input, double *output, int stride) {
static const double C1 = 0.998795456205; // cos(pi * 1 / 64)
static const double C3 = 0.989176509965; // cos(pi * 3 / 64)
static const double C5 = 0.970031253195; // cos(pi * 5 / 64)
static const double C7 = 0.941544065183; // cos(pi * 7 / 64)
static const double C9 = 0.903989293123; // cos(pi * 9 / 64)
static const double C11 = 0.857728610000; // cos(pi * 11 / 64)
static const double C13 = 0.803207531481; // cos(pi * 13 / 64)
static const double C15 = 0.740951125355; // cos(pi * 15 / 64)
static const double C16 = 0.707106781187; // cos(pi * 16 / 64)
static const double C17 = 0.671558954847; // cos(pi * 17 / 64)
static const double C19 = 0.595699304492; // cos(pi * 19 / 64)
static const double C21 = 0.514102744193; // cos(pi * 21 / 64)
static const double C23 = 0.427555093430; // cos(pi * 23 / 64)
static const double C25 = 0.336889853392; // cos(pi * 25 / 64)
static const double C27 = 0.242980179903; // cos(pi * 27 / 64)
static const double C29 = 0.146730474455; // cos(pi * 29 / 64)
static const double C31 = 0.049067674327; // cos(pi * 31 / 64)
double step1[32];
double step2[32];
step1[ 0] = input[stride*0];
step1[ 1] = input[stride*2];
step1[ 2] = input[stride*4];
step1[ 3] = input[stride*6];
step1[ 4] = input[stride*8];
step1[ 5] = input[stride*10];
step1[ 6] = input[stride*12];
step1[ 7] = input[stride*14];
step1[ 8] = input[stride*16];
step1[ 9] = input[stride*18];
step1[10] = input[stride*20];
step1[11] = input[stride*22];
step1[12] = input[stride*24];
step1[13] = input[stride*26];
step1[14] = input[stride*28];
step1[15] = input[stride*30];
step1[16] = DownshiftMultiplyBy2(input[stride*1]*C16);
step1[17] = (input[stride*3] + input[stride*1]);
step1[18] = (input[stride*5] + input[stride*3]);
step1[19] = (input[stride*7] + input[stride*5]);
step1[20] = (input[stride*9] + input[stride*7]);
step1[21] = (input[stride*11] + input[stride*9]);
step1[22] = (input[stride*13] + input[stride*11]);
step1[23] = (input[stride*15] + input[stride*13]);
step1[24] = (input[stride*17] + input[stride*15]);
step1[25] = (input[stride*19] + input[stride*17]);
step1[26] = (input[stride*21] + input[stride*19]);
step1[27] = (input[stride*23] + input[stride*21]);
step1[28] = (input[stride*25] + input[stride*23]);
step1[29] = (input[stride*27] + input[stride*25]);
step1[30] = (input[stride*29] + input[stride*27]);
step1[31] = (input[stride*31] + input[stride*29]);
idct16(step1, step2, 1);
idct16(step1 + 16, step2 + 16, 1);
step2[16] = DownshiftMultiply(step2[16] / (2*C1));
step2[17] = DownshiftMultiply(step2[17] / (2*C3));
step2[18] = DownshiftMultiply(step2[18] / (2*C5));
step2[19] = DownshiftMultiply(step2[19] / (2*C7));
step2[20] = DownshiftMultiply(step2[20] / (2*C9));
step2[21] = DownshiftMultiply(step2[21] / (2*C11));
step2[22] = DownshiftMultiply(step2[22] / (2*C13));
step2[23] = DownshiftMultiply(step2[23] / (2*C15));
step2[24] = DownshiftMultiply(step2[24] / (2*C17));
step2[25] = DownshiftMultiply(step2[25] / (2*C19));
step2[26] = DownshiftMultiply(step2[26] / (2*C21));
step2[27] = DownshiftMultiply(step2[27] / (2*C23));
step2[28] = DownshiftMultiply(step2[28] / (2*C25));
step2[29] = DownshiftMultiply(step2[29] / (2*C27));
step2[30] = DownshiftMultiply(step2[30] / (2*C29));
step2[31] = DownshiftMultiply(step2[31] / (2*C31));
output[stride* 0] = step2[ 0] + step2[16];
output[stride* 1] = step2[ 1] + step2[17];
output[stride* 2] = step2[ 2] + step2[18];
output[stride* 3] = step2[ 3] + step2[19];
output[stride* 4] = step2[ 4] + step2[20];
output[stride* 5] = step2[ 5] + step2[21];
output[stride* 6] = step2[ 6] + step2[22];
output[stride* 7] = step2[ 7] + step2[23];
output[stride* 8] = step2[ 8] + step2[24];
output[stride* 9] = step2[ 9] + step2[25];
output[stride*10] = step2[10] + step2[26];
output[stride*11] = step2[11] + step2[27];
output[stride*12] = step2[12] + step2[28];
output[stride*13] = step2[13] + step2[29];
output[stride*14] = step2[14] + step2[30];
output[stride*15] = step2[15] + step2[31];
output[stride*16] = step2[15] - step2[(31 - 0)];
output[stride*17] = step2[14] - step2[(31 - 1)];
output[stride*18] = step2[13] - step2[(31 - 2)];
output[stride*19] = step2[12] - step2[(31 - 3)];
output[stride*20] = step2[11] - step2[(31 - 4)];
output[stride*21] = step2[10] - step2[(31 - 5)];
output[stride*22] = step2[ 9] - step2[(31 - 6)];
output[stride*23] = step2[ 8] - step2[(31 - 7)];
output[stride*24] = step2[ 7] - step2[(31 - 8)];
output[stride*25] = step2[ 6] - step2[(31 - 9)];
output[stride*26] = step2[ 5] - step2[(31 - 10)];
output[stride*27] = step2[ 4] - step2[(31 - 11)];
output[stride*28] = step2[ 3] - step2[(31 - 12)];
output[stride*29] = step2[ 2] - step2[(31 - 13)];
output[stride*30] = step2[ 1] - step2[(31 - 14)];
output[stride*31] = step2[ 0] - step2[(31 - 15)];
}
void vp9_short_idct32x32_c(short *input, short *output, int pitch) {
vp9_clear_system_state(); // Make it simd safe : __asm emms;
{
double out[32*32], out2[32*32];
const int short_pitch = pitch >> 1;
int i, j;
// First transform rows
for (i = 0; i < 32; ++i) {
double temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j + i*short_pitch];
butterfly_32_idct_1d(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
out[j + i*32] = temp_out[j];
}
// Then transform columns
for (i = 0; i < 32; ++i) {
double temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = out[j*32 + i];
butterfly_32_idct_1d(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
out2[j*32 + i] = temp_out[j];
}
for (i = 0; i < 32*32; ++i)
output[i] = round(out2[i]/128);
}
vp9_clear_system_state(); // Make it simd safe : __asm emms;
}
#else // CONFIG_DWT32X32HYBRID
#define MAX_BLOCK_LENGTH 64
#define ENH_PRECISION_BITS 1
#define ENH_PRECISION_RND ((1 << ENH_PRECISION_BITS) / 2)
// Note: block length must be even for this implementation
static void synthesis_53_row(int length, short *lowpass, short *highpass,
short *x) {
short r, * a, * b;
int n;
n = length >> 1;
b = highpass;
a = lowpass;
r = *highpass;
while (n--) {
*a++ -= (r + (*b) + 1) >> 1;
r = *b++;
}
n = length >> 1;
b = highpass;
a = lowpass;
while (--n) {
*x++ = ((r = *a++) + 1) >> 1;
*x++ = *b++ + ((r + (*a) + 2) >> 2);
}
*x++ = ((r = *a) + 1)>>1;
*x++ = *b + ((r+1)>>1);
}
static void synthesis_53_col(int length, short *lowpass, short *highpass,
short *x) {
short r, * a, * b;
int n;
n = length >> 1;
b = highpass;
a = lowpass;
r = *highpass;
while (n--) {
*a++ -= (r + (*b) + 1) >> 1;
r = *b++;
}
n = length >> 1;
b = highpass;
a = lowpass;
while (--n) {
*x++ = r = *a++;
*x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1);
}
*x++ = r = *a;
*x++ = ((*b) << 1) + r;
}
// NOTE: Using a 5/3 integer wavelet for now. Explore using a wavelet
// with a better response later
void dyadic_synthesize(int levels, int width, int height, short *c, int pitch_c,
short *x, int pitch_x) {
int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
short buffer[2 * MAX_BLOCK_LENGTH];
th[0] = hh;
tw[0] = hw;
for (i = 1; i <= levels; i++) {
th[i] = (th[i - 1] + 1) >> 1;
tw[i] = (tw[i - 1] + 1) >> 1;
}
for (lv = levels - 1; lv >= 0; lv--) {
nh = th[lv];
nw = tw[lv];
hh = th[lv + 1];
hw = tw[lv + 1];
if ((nh < 2) || (nw < 2)) continue;
for (j = 0; j < nw; j++) {
for (i = 0; i < nh; i++)
buffer[i] = c[i * pitch_c + j];
synthesis_53_col(nh, buffer, buffer + hh, buffer + nh);
for (i = 0; i < nh; i++)
c[i * pitch_c + j] = buffer[i + nh];
}
for (i = 0; i < nh; i++) {
memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
}
}
for (i = 0; i < height; i++)
for (j = 0; j < width; j++)
x[i * pitch_x + j] = (c[i * pitch_c + j] + ENH_PRECISION_RND) >>
ENH_PRECISION_BITS;
}
void vp9_short_idct32x32_c(short *input, short *output, int pitch) {
// assume out is a 32x32 buffer
short buffer[16 * 16];
short buffer2[32 * 32];
const int short_pitch = pitch >> 1;
int i;
// TODO(debargha): Implement more efficiently by adding output pitch
// argument to the idct16x16 function
vp9_short_idct16x16_c(input, buffer, pitch);
for (i = 0; i < 16; ++i) {
vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(short) * 16);
vpx_memcpy(buffer2 + i * 32 + 16, input + i * short_pitch + 16,
sizeof(short) * 16);
}
for (; i < 32; ++i) {
vpx_memcpy(buffer2 + i * 32, input + i * short_pitch,
sizeof(short) * 32);
}
dyadic_synthesize(1, 32, 32, buffer2, 32, output, 32);
}
#endif // CONFIG_DWT32X32HYBRID
#endif // CONFIG_TX32X32


@ -143,3 +143,16 @@ void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) {
vp9_inverse_transform_mby_16x16(xd);
vp9_inverse_transform_mbuv_8x8(xd);
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb) {
vp9_short_idct32x32(xd_sb->dqcoeff, xd_sb->diff, 64);
}
void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb) {
vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1024,
xd_sb->diff + 1024, 32);
vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1280,
xd_sb->diff + 1280, 32);
}
#endif


@ -38,4 +38,9 @@ extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);
extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
extern void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb);
extern void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb);
#endif
#endif // __INC_INVTRANS_H


@ -192,6 +192,9 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
/* Point at base of Mb MODE_INFO list */
const MODE_INFO *mode_info_context = cm->mi;
#if CONFIG_SUPERBLOCKS
const int mis = cm->mode_info_stride;
#endif
/* Initialize the loop filter for this frame. */
vp9_loop_filter_frame_init(cm, xd, cm->filter_level);
@ -226,14 +229,18 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
if (mb_col > 0
#if CONFIG_SUPERBLOCKS
&& !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
mode_info_context[0].mbmi.mb_skip_coeff &&
mode_info_context[-1].mbmi.mb_skip_coeff)
((mode_info_context[0].mbmi.mb_skip_coeff &&
mode_info_context[-1].mbmi.mb_skip_coeff)
#if CONFIG_TX32X32
|| mode_info_context[-1].mbmi.txfm_size == TX_32X32
#endif
))
#endif
)
vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
if (!skip_lf && tx_type != TX_16X16) {
if (!skip_lf && tx_type < TX_16X16) {
if (tx_type == TX_8X8)
vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
@ -247,14 +254,18 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
if (mb_row > 0
#if CONFIG_SUPERBLOCKS
&& !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
mode_info_context[0].mbmi.mb_skip_coeff &&
mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
((mode_info_context[0].mbmi.mb_skip_coeff &&
mode_info_context[-mis].mbmi.mb_skip_coeff)
#if CONFIG_TX32X32
|| mode_info_context[-mis].mbmi.txfm_size == TX_32X32
#endif
))
#endif
)
vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);
if (!skip_lf && tx_type != TX_16X16) {
if (!skip_lf && tx_type < TX_16X16) {
if (tx_type == TX_8X8)
vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
post->uv_stride, &lfi);


@ -58,6 +58,9 @@ typedef struct frame_contexts {
vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_prob coef_probs_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
#endif
nmv_context nmvc;
nmv_context pre_nmvc;
@ -95,6 +98,11 @@ typedef struct frame_contexts {
vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [ENTROPY_NODES];
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_prob pre_coef_probs_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [ENTROPY_NODES];
#endif
unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
@ -110,6 +118,11 @@ typedef struct frame_contexts {
unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
unsigned int coef_counts_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS]
[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
#endif
nmv_context_counts NMVcount;
vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
[VP9_SWITCHABLE_FILTERS - 1];
@ -139,8 +152,11 @@ typedef enum {
ONLY_4X4 = 0,
ALLOW_8X8 = 1,
ALLOW_16X16 = 2,
TX_MODE_SELECT = 3,
NB_TXFM_MODES = 4,
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
ALLOW_32X32 = 3,
#endif
TX_MODE_SELECT = 3 + (CONFIG_TX32X32 && CONFIG_SUPERBLOCKS),
NB_TXFM_MODES = 4 + (CONFIG_TX32X32 && CONFIG_SUPERBLOCKS),
} TXFM_MODE;
typedef struct VP9Common {
@ -268,7 +284,7 @@ typedef struct VP9Common {
vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
// FIXME contextualize
vp9_prob prob_tx[TX_SIZE_MAX - 1];
vp9_prob prob_tx[TX_SIZE_MAX_SB - 1];
vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];


@ -168,6 +168,53 @@ void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
}
}
}
#if CONFIG_TX32X32
void vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
int x, y, stride = xd->block[0].dst_stride;
short *diff = xd->sb_coeff_data.diff;
for (y = 0; y < 32; y++) {
for (x = 0; x < 32; x++) {
int a = dst[x] + diff[x];
if (a < 0)
a = 0;
else if (a > 255)
a = 255;
dst[x] = a;
}
dst += stride;
diff += 32;
}
}
void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
int x, y, stride = xd->block[16].dst_stride;
short *udiff = xd->sb_coeff_data.diff + 1024;
short *vdiff = xd->sb_coeff_data.diff + 1280;
for (y = 0; y < 16; y++) {
for (x = 0; x < 16; x++) {
int u = udst[x] + udiff[x];
int v = vdst[x] + vdiff[x];
if (u < 0)
u = 0;
else if (u > 255)
u = 255;
if (v < 0)
v = 0;
else if (v > 255)
v = 255;
udst[x] = u;
vdst[x] = v;
}
udst += stride;
vdst += stride;
udiff += 16;
vdiff += 16;
}
}
#endif
#endif
void vp9_recon_mby_c(MACROBLOCKD *xd) {


@ -361,6 +361,9 @@ specialize vp9_short_idct16x16
prototype void vp9_short_idct10_16x16 "short *input, short *output, int pitch"
specialize vp9_short_idct10_16x16
prototype void vp9_short_idct32x32 "short *input, short *output, int pitch"
specialize vp9_short_idct32x32
prototype void vp9_ihtllm "const short *input, short *output, int pitch, int tx_type, int tx_dim"
specialize vp9_ihtllm
@ -640,6 +643,9 @@ specialize vp9_short_fdct8x4
prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch"
specialize vp9_short_walsh4x4
prototype void vp9_short_fdct32x32 "short *InputData, short *OutputData, int pitch"
specialize vp9_short_fdct32x32
prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch"
specialize vp9_short_fdct16x16


@ -14,7 +14,7 @@
static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
static const int seg_feature_data_max[SEG_LVL_MAX] =
{ MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX - 1};
{ MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX_SB - 1};
// These functions provide access to new segment level features.
// Eventually these function may be "optimized out" but for the moment,


@ -209,8 +209,17 @@ static void kfread_modes(VP9D_COMP *pbi,
m->mbmi.mode <= I8X8_PRED) {
// FIXME(rbultje) code ternary symbol once all experiments are merged
m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED)
if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED) {
m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
if (m->mbmi.txfm_size != TX_8X8 && m->mbmi.encoded_as_sb)
m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[2]);
#endif
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
} else if (cm->txfm_mode >= ALLOW_32X32 && m->mbmi.encoded_as_sb) {
m->mbmi.txfm_size = TX_32X32;
#endif
} else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
m->mbmi.txfm_size = TX_16X16;
} else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
@ -1219,8 +1228,17 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
// FIXME(rbultje) code ternary symbol once all experiments are merged
mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
mbmi->mode != SPLITMV)
mbmi->mode != SPLITMV) {
mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
if (mbmi->encoded_as_sb && mbmi->txfm_size != TX_8X8)
mbmi->txfm_size += vp9_read(bc, cm->prob_tx[2]);
#endif
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
} else if (mbmi->encoded_as_sb && cm->txfm_mode >= ALLOW_32X32) {
mbmi->txfm_size = TX_32X32;
#endif
} else if (cm->txfm_mode >= ALLOW_16X16 &&
((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
(mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {


@ -693,6 +693,7 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
VP9_COMMON *const pc = &pbi->common;
MODE_INFO *orig_mi = xd->mode_info_context;
const int mis = pc->mode_info_stride;
assert(xd->mode_info_context->mbmi.encoded_as_sb);
@ -733,6 +734,30 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
}
/* dequantization and idct */
#if CONFIG_TX32X32
if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
if (eobtotal == 0) { // skip loopfilter
xd->mode_info_context->mbmi.mb_skip_coeff = 1;
if (mb_col + 1 < pc->mb_cols)
xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
if (mb_row + 1 < pc->mb_rows) {
xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
if (mb_col + 1 < pc->mb_cols)
xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
}
} else {
vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,
xd->dst.y_buffer, xd->dst.y_buffer,
xd->dst.y_stride, xd->dst.y_stride,
xd->eobs[0]);
vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,
xd->block[16].dequant,
xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs + 16);
}
} else {
#endif
for (n = 0; n < 4; n++) {
int x_idx = n & 1, y_idx = n >> 1;
@ -742,7 +767,7 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
xd->above_context = pc->above_context + mb_col + x_idx;
xd->left_context = pc->left_context + y_idx;
xd->mode_info_context = orig_mi + x_idx + y_idx * pc->mode_info_stride;
xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
for (i = 0; i < 25; i++) {
xd->block[i].eob = 0;
xd->eobs[i] = 0;
@ -766,6 +791,9 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
xd->above_context = pc->above_context + mb_col;
xd->left_context = pc->left_context;
xd->mode_info_context = orig_mi;
#if CONFIG_TX32X32
}
#endif
}
#endif
@ -1244,6 +1272,11 @@ static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
read_coef_probs_common(bc, pc->fc.coef_probs_16x16);
read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16);
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
if (pbi->common.txfm_mode > ALLOW_16X16) {
read_coef_probs_common(bc, pc->fc.coef_probs_32x32);
}
#endif
}
int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
@ -1433,9 +1466,16 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
/* Read the loop filter level and type */
pc->txfm_mode = vp9_read_literal(&header_bc, 2);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
if (pc->txfm_mode == 3)
pc->txfm_mode += vp9_read_bit(&header_bc);
#endif
if (pc->txfm_mode == TX_MODE_SELECT) {
pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
pc->prob_tx[2] = vp9_read_literal(&header_bc, 8);
#endif
}
pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
@ -1591,6 +1631,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
pbi->common.fc.coef_probs_16x16);
vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
pbi->common.fc.hybrid_coef_probs_16x16);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_copy(pbi->common.fc.pre_coef_probs_32x32,
pbi->common.fc.coef_probs_32x32);
#endif
vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
#if CONFIG_SUPERBLOCKS
vp9_copy(pbi->common.fc.pre_sb_ymode_prob, pbi->common.fc.sb_ymode_prob);
@ -1610,6 +1654,9 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
vp9_zero(pbi->common.fc.coef_counts_16x16);
vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_zero(pbi->common.fc.coef_counts_32x32);
#endif
vp9_zero(pbi->common.fc.ymode_counts);
#if CONFIG_SUPERBLOCKS
vp9_zero(pbi->common.fc.sb_ymode_counts);


@ -352,3 +352,30 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
}
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
void vp9_dequant_idct_add_32x32(int16_t *input, const int16_t *dq,
uint8_t *pred, uint8_t *dest, int pitch,
int stride, uint16_t eobs) {
short output[1024];
int i;
input[0]= input[0] * dq[0] / 2;
for (i = 1; i < 1024; i++)
input[i] = input[i] * dq[1] / 2;
vp9_short_idct32x32_c(input, output, 64);
vpx_memset(input, 0, 2048);
add_residual(output, pred, pitch, dest, stride, 32, 32);
}
void vp9_dequant_idct_add_uv_block_16x16_c(short *q, const short *dq,
unsigned char *dstu,
unsigned char *dstv,
int stride,
unsigned short *eobs) {
vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride, eobs[0]);
vp9_dequant_idct_add_16x16_c(q + 256, dq,
dstv, dstv, stride, stride, eobs[4]);
}
#endif


@ -55,8 +55,9 @@
#define CAT5_PROB3 157
#define CAT5_PROB4 180
static const unsigned char cat6_prob[14] =
{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
static const unsigned char cat6_prob[15] = {
254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
};
void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
/* Clear entropy contexts */
@ -161,6 +162,12 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
coef_counts = fc->hybrid_coef_counts_16x16[type];
}
break;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
case TX_32X32:
coef_probs = fc->coef_probs_32x32[type];
coef_counts = fc->coef_counts_32x32[type];
break;
#endif
}
VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@ -256,6 +263,54 @@ static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {
return eob;
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
MACROBLOCKD* const xd,
BOOL_DECODER* const bc) {
ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
unsigned short* const eobs = xd->eobs;
const int segment_id = xd->mode_info_context->mbmi.segment_id;
int c, i, eobtotal = 0, seg_eob;
// Luma block
eobs[0] = c = decode_coefs(pbi, xd, bc, A, L, PLANE_TYPE_Y_WITH_DC,
DCT_DCT, get_eob(xd, segment_id, 1024),
xd->sb_coeff_data.qcoeff,
vp9_default_zig_zag1d_32x32,
TX_32X32, vp9_coef_bands_32x32);
A[1] = A[2] = A[3] = A[0];
L[1] = L[2] = L[3] = L[0];
eobtotal += c;
// 16x16 chroma blocks
seg_eob = get_eob(xd, segment_id, 256);
for (i = 16; i < 24; i += 4) {
ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
DCT_DCT, seg_eob,
xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
vp9_default_zig_zag1d_16x16,
TX_16X16, vp9_coef_bands_16x16);
a[1] = a[0];
l[1] = l[0];
eobtotal += c;
}
// no Y2 block
vpx_memset(&A[8], 0, sizeof(A[8]));
vpx_memset(&L[8], 0, sizeof(L[8]));
vpx_memcpy(xd->above_context + 1, xd->above_context,
sizeof(ENTROPY_CONTEXT_PLANES));
vpx_memcpy(xd->left_context + 1, xd->left_context,
sizeof(ENTROPY_CONTEXT_PLANES));
return eobtotal;
}
#endif
static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,
MACROBLOCKD* const xd,


@ -23,6 +23,12 @@ int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
BOOL_DECODER* const);
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
MACROBLOCKD* const xd,
BOOL_DECODER* const bc);
#endif
int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd,
BOOL_DECODER* const bc);


@ -1200,8 +1200,13 @@ static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
TX_SIZE sz = mi->txfm_size;
// FIXME(rbultje) code ternary symbol once all experiments are merged
vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV)
if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) {
vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
if (mi->encoded_as_sb && sz != TX_8X8)
vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
#endif
}
}
#ifdef ENTROPY_STATS
@ -1337,8 +1342,13 @@ static void write_mb_modes_kf(const VP9_COMMON *c,
TX_SIZE sz = m->mbmi.txfm_size;
// FIXME(rbultje) code ternary symbol once all experiments are merged
vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
if (sz != TX_4X4 && ym <= TM_PRED)
if (sz != TX_4X4 && ym <= TM_PRED) {
vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
if (m->mbmi.encoded_as_sb && sz != TX_8X8)
vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
#endif
}
}
}
@ -1551,25 +1561,50 @@ static void build_coeff_contexts(VP9_COMP *cpi) {
}
}
}
}
for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
for (j = 0; j < COEF_BANDS; ++j) {
for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
continue;
vp9_tree_probs_from_distribution(
MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
cpi->frame_hybrid_coef_probs_16x16[i][j][k],
cpi->frame_hybrid_branch_ct_16x16[i][j][k],
cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
for (j = 0; j < COEF_BANDS; ++j) {
for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
continue;
vp9_tree_probs_from_distribution(
MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
cpi->frame_hybrid_coef_probs_16x16[i][j][k],
cpi->frame_hybrid_branch_ct_16x16[i][j][k],
cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
#ifdef ENTROPY_STATS
if (!cpi->dummy_packing)
for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
hybrid_context_counters_16x16[i][j][k][t] += cpi->hybrid_coef_counts_16x16[i][j][k][t];
if (!cpi->dummy_packing)
for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
hybrid_context_counters_16x16[i][j][k][t] +=
cpi->hybrid_coef_counts_16x16[i][j][k][t];
#endif
}
}
}
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
if (cpi->common.txfm_mode > ALLOW_16X16) {
for (i = 0; i < BLOCK_TYPES_32X32; ++i) {
for (j = 0; j < COEF_BANDS; ++j) {
for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
continue;
vp9_tree_probs_from_distribution(
MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
cpi->frame_coef_probs_32x32[i][j][k],
cpi->frame_branch_ct_32x32[i][j][k],
cpi->coef_counts_32x32[i][j][k], 256, 1);
#ifdef ENTROPY_STATS
if (!cpi->dummy_packing)
for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
context_counters_32x32[i][j][k][t] +=
cpi->coef_counts_32x32[i][j][k][t];
#endif
}
}
}
}
#endif
}
static void update_coef_probs_common(
@ -1714,6 +1749,15 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
cpi->common.fc.hybrid_coef_probs_16x16,
cpi->frame_hybrid_branch_ct_16x16);
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
if (cpi->common.txfm_mode > ALLOW_16X16) {
update_coef_probs_common(bc,
cpi->frame_coef_probs_32x32,
cpi->common.fc.coef_probs_32x32,
cpi->frame_branch_ct_32x32);
}
#endif
}
#ifdef PACKET_TESTING
@ -1955,18 +1999,53 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
{
if (pc->txfm_mode == TX_MODE_SELECT) {
pc->prob_tx[0] = get_prob(cpi->txfm_count[0] + cpi->txfm_count_8x8p[0],
cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2] +
cpi->txfm_count_8x8p[0] + cpi->txfm_count_8x8p[1]);
pc->prob_tx[1] = get_prob(cpi->txfm_count[1], cpi->txfm_count[1] + cpi->txfm_count[2]);
pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] +
cpi->txfm_count_16x16p[TX_4X4] +
cpi->txfm_count_8x8p[TX_4X4],
cpi->txfm_count_32x32p[TX_4X4] +
cpi->txfm_count_32x32p[TX_8X8] +
cpi->txfm_count_32x32p[TX_16X16] +
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
cpi->txfm_count_32x32p[TX_32X32] +
#endif
cpi->txfm_count_16x16p[TX_4X4] +
cpi->txfm_count_16x16p[TX_8X8] +
cpi->txfm_count_16x16p[TX_16X16] +
cpi->txfm_count_8x8p[TX_4X4] +
cpi->txfm_count_8x8p[TX_8X8]);
pc->prob_tx[1] = get_prob(cpi->txfm_count_32x32p[TX_8X8] +
cpi->txfm_count_16x16p[TX_8X8],
cpi->txfm_count_32x32p[TX_8X8] +
cpi->txfm_count_32x32p[TX_16X16] +
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
cpi->txfm_count_32x32p[TX_32X32] +
#endif
cpi->txfm_count_16x16p[TX_8X8] +
cpi->txfm_count_16x16p[TX_16X16]);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
pc->prob_tx[2] = get_prob(cpi->txfm_count_32x32p[TX_16X16],
cpi->txfm_count_32x32p[TX_16X16] +
cpi->txfm_count_32x32p[TX_32X32]);
#endif
} else {
pc->prob_tx[0] = 128;
pc->prob_tx[1] = 128;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
pc->prob_tx[2] = 128;
#endif
}
vp9_write_literal(&header_bc, pc->txfm_mode, 2);
vp9_write_literal(&header_bc, pc->txfm_mode <= 3 ? pc->txfm_mode : 3, 2);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
if (pc->txfm_mode > ALLOW_16X16) {
vp9_write_bit(&header_bc, pc->txfm_mode == TX_MODE_SELECT);
}
#endif
if (pc->txfm_mode == TX_MODE_SELECT) {
vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_write_literal(&header_bc, pc->prob_tx[2], 8);
#endif
}
}
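For orientation, the write order above implies a matching read order on the decoder side: a 2-bit literal carrying min(txfm_mode, 3), an extra bit separating ALLOW_32X32 from TX_MODE_SELECT when the 32x32 experiment is enabled, and three 8-bit probabilities if the mode is TX_MODE_SELECT. A toy sketch under assumptions (enum ordering ONLY_4X4=0 .. ALLOW_32X32=3, TX_MODE_SELECT=4; the toy_* helpers are placeholders for illustration, not the real bool-decoder API):

/* Toy bit source standing in for the bool decoder (illustration only). */
typedef struct { const unsigned char *bits; int pos; } toy_reader;
static int toy_bit(toy_reader *r) { return r->bits[r->pos++]; }
static int toy_literal(toy_reader *r, int n) {
  int v = 0;
  while (n--) v = (v << 1) | toy_bit(r);
  return v;
}
/* Mirror of the txfm_mode writes above (enum values assumed, see lead-in). */
static int toy_read_txfm_mode(toy_reader *r) {
  int mode = toy_literal(r, 2);   /* writer sent min(txfm_mode, 3)         */
  if (mode == 3 && toy_bit(r))    /* extra bit separates 32x32 from SELECT */
    mode = 4;                     /* TX_MODE_SELECT; 3 stays ALLOW_32X32   */
  return mode;                    /* if SELECT, three 8-bit probs follow   */
}
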
@ -2150,6 +2229,10 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, cpi->common.fc.hybrid_coef_probs_8x8);
vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);
vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, cpi->common.fc.hybrid_coef_probs_16x16);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
cpi->common.fc.coef_probs_32x32);
#endif
#if CONFIG_SUPERBLOCKS
vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
#endif


@ -36,9 +36,15 @@ typedef struct block {
short *zbin;
short *zbin_8x8;
short *zbin_16x16;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
short *zbin_32x32;
#endif
short *zrun_zbin_boost;
short *zrun_zbin_boost_8x8;
short *zrun_zbin_boost_16x16;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
short *zrun_zbin_boost_32x32;
#endif
short *round;
// Zbin Over Quant value
@ -52,6 +58,9 @@ typedef struct block {
int eob_max_offset;
int eob_max_offset_8x8;
int eob_max_offset_16x16;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
int eob_max_offset_32x32;
#endif
} BLOCK;
typedef struct {
@ -83,6 +92,13 @@ typedef struct {
int64_t txfm_rd_diff[NB_TXFM_MODES];
} PICK_MODE_CONTEXT;
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
typedef struct superblock {
DECLARE_ALIGNED(16, short, src_diff[32*32+16*16*2]);
DECLARE_ALIGNED(16, short, coeff[32*32+16*16*2]);
} SUPERBLOCK;
#endif
typedef struct macroblock {
DECLARE_ALIGNED(16, short, src_diff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
DECLARE_ALIGNED(16, short, coeff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
@ -95,6 +111,10 @@ typedef struct macroblock {
// 1 DC 2nd order block each with 16 entries
BLOCK block[25];
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
SUPERBLOCK sb_coeff_data;
#endif
YV12_BUFFER_CONFIG src;
MACROBLOCKD e_mbd;
@ -153,9 +173,9 @@ typedef struct macroblock {
unsigned char *active_ptr;
unsigned int token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
unsigned int token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][COEF_BANDS]
[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
unsigned int hybrid_token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
unsigned int hybrid_token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][COEF_BANDS]
[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
int optimize;


@ -1330,3 +1330,461 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {
#undef RIGHT_SHIFT
#undef ROUNDING
#endif
#if CONFIG_TX32X32
#if !CONFIG_DWT32X32HYBRID
static void dct32_1d(double *input, double *output, int stride) {
static const double C1 = 0.998795456205; // cos(pi * 1 / 64)
static const double C2 = 0.995184726672; // cos(pi * 2 / 64)
static const double C3 = 0.989176509965; // cos(pi * 3 / 64)
static const double C4 = 0.980785280403; // cos(pi * 4 / 64)
static const double C5 = 0.970031253195; // cos(pi * 5 / 64)
static const double C6 = 0.956940335732; // cos(pi * 6 / 64)
static const double C7 = 0.941544065183; // cos(pi * 7 / 64)
static const double C8 = 0.923879532511; // cos(pi * 8 / 64)
static const double C9 = 0.903989293123; // cos(pi * 9 / 64)
static const double C10 = 0.881921264348; // cos(pi * 10 / 64)
static const double C11 = 0.857728610000; // cos(pi * 11 / 64)
static const double C12 = 0.831469612303; // cos(pi * 12 / 64)
static const double C13 = 0.803207531481; // cos(pi * 13 / 64)
static const double C14 = 0.773010453363; // cos(pi * 14 / 64)
static const double C15 = 0.740951125355; // cos(pi * 15 / 64)
static const double C16 = 0.707106781187; // cos(pi * 16 / 64)
static const double C17 = 0.671558954847; // cos(pi * 17 / 64)
static const double C18 = 0.634393284164; // cos(pi * 18 / 64)
static const double C19 = 0.595699304492; // cos(pi * 19 / 64)
static const double C20 = 0.555570233020; // cos(pi * 20 / 64)
static const double C21 = 0.514102744193; // cos(pi * 21 / 64)
static const double C22 = 0.471396736826; // cos(pi * 22 / 64)
static const double C23 = 0.427555093430; // cos(pi * 23 / 64)
static const double C24 = 0.382683432365; // cos(pi * 24 / 64)
static const double C25 = 0.336889853392; // cos(pi * 25 / 64)
static const double C26 = 0.290284677254; // cos(pi * 26 / 64)
static const double C27 = 0.242980179903; // cos(pi * 27 / 64)
static const double C28 = 0.195090322016; // cos(pi * 28 / 64)
static const double C29 = 0.146730474455; // cos(pi * 29 / 64)
static const double C30 = 0.098017140330; // cos(pi * 30 / 64)
static const double C31 = 0.049067674327; // cos(pi * 31 / 64)
double step[32];
// Stage 1
step[0] = input[stride*0] + input[stride*(32 - 1)];
step[1] = input[stride*1] + input[stride*(32 - 2)];
step[2] = input[stride*2] + input[stride*(32 - 3)];
step[3] = input[stride*3] + input[stride*(32 - 4)];
step[4] = input[stride*4] + input[stride*(32 - 5)];
step[5] = input[stride*5] + input[stride*(32 - 6)];
step[6] = input[stride*6] + input[stride*(32 - 7)];
step[7] = input[stride*7] + input[stride*(32 - 8)];
step[8] = input[stride*8] + input[stride*(32 - 9)];
step[9] = input[stride*9] + input[stride*(32 - 10)];
step[10] = input[stride*10] + input[stride*(32 - 11)];
step[11] = input[stride*11] + input[stride*(32 - 12)];
step[12] = input[stride*12] + input[stride*(32 - 13)];
step[13] = input[stride*13] + input[stride*(32 - 14)];
step[14] = input[stride*14] + input[stride*(32 - 15)];
step[15] = input[stride*15] + input[stride*(32 - 16)];
step[16] = -input[stride*16] + input[stride*(32 - 17)];
step[17] = -input[stride*17] + input[stride*(32 - 18)];
step[18] = -input[stride*18] + input[stride*(32 - 19)];
step[19] = -input[stride*19] + input[stride*(32 - 20)];
step[20] = -input[stride*20] + input[stride*(32 - 21)];
step[21] = -input[stride*21] + input[stride*(32 - 22)];
step[22] = -input[stride*22] + input[stride*(32 - 23)];
step[23] = -input[stride*23] + input[stride*(32 - 24)];
step[24] = -input[stride*24] + input[stride*(32 - 25)];
step[25] = -input[stride*25] + input[stride*(32 - 26)];
step[26] = -input[stride*26] + input[stride*(32 - 27)];
step[27] = -input[stride*27] + input[stride*(32 - 28)];
step[28] = -input[stride*28] + input[stride*(32 - 29)];
step[29] = -input[stride*29] + input[stride*(32 - 30)];
step[30] = -input[stride*30] + input[stride*(32 - 31)];
step[31] = -input[stride*31] + input[stride*(32 - 32)];
// Stage 2
output[stride*0] = step[0] + step[16 - 1];
output[stride*1] = step[1] + step[16 - 2];
output[stride*2] = step[2] + step[16 - 3];
output[stride*3] = step[3] + step[16 - 4];
output[stride*4] = step[4] + step[16 - 5];
output[stride*5] = step[5] + step[16 - 6];
output[stride*6] = step[6] + step[16 - 7];
output[stride*7] = step[7] + step[16 - 8];
output[stride*8] = -step[8] + step[16 - 9];
output[stride*9] = -step[9] + step[16 - 10];
output[stride*10] = -step[10] + step[16 - 11];
output[stride*11] = -step[11] + step[16 - 12];
output[stride*12] = -step[12] + step[16 - 13];
output[stride*13] = -step[13] + step[16 - 14];
output[stride*14] = -step[14] + step[16 - 15];
output[stride*15] = -step[15] + step[16 - 16];
output[stride*16] = step[16];
output[stride*17] = step[17];
output[stride*18] = step[18];
output[stride*19] = step[19];
output[stride*20] = (-step[20] + step[27])*C16;
output[stride*21] = (-step[21] + step[26])*C16;
output[stride*22] = (-step[22] + step[25])*C16;
output[stride*23] = (-step[23] + step[24])*C16;
output[stride*24] = (step[24] + step[23])*C16;
output[stride*25] = (step[25] + step[22])*C16;
output[stride*26] = (step[26] + step[21])*C16;
output[stride*27] = (step[27] + step[20])*C16;
output[stride*28] = step[28];
output[stride*29] = step[29];
output[stride*30] = step[30];
output[stride*31] = step[31];
// Stage 3
step[0] = output[stride*0] + output[stride*(8 - 1)];
step[1] = output[stride*1] + output[stride*(8 - 2)];
step[2] = output[stride*2] + output[stride*(8 - 3)];
step[3] = output[stride*3] + output[stride*(8 - 4)];
step[4] = -output[stride*4] + output[stride*(8 - 5)];
step[5] = -output[stride*5] + output[stride*(8 - 6)];
step[6] = -output[stride*6] + output[stride*(8 - 7)];
step[7] = -output[stride*7] + output[stride*(8 - 8)];
step[8] = output[stride*8];
step[9] = output[stride*9];
step[10] = (-output[stride*10] + output[stride*13])*C16;
step[11] = (-output[stride*11] + output[stride*12])*C16;
step[12] = (output[stride*12] + output[stride*11])*C16;
step[13] = (output[stride*13] + output[stride*10])*C16;
step[14] = output[stride*14];
step[15] = output[stride*15];
step[16] = output[stride*16] + output[stride*23];
step[17] = output[stride*17] + output[stride*22];
step[18] = output[stride*18] + output[stride*21];
step[19] = output[stride*19] + output[stride*20];
step[20] = -output[stride*20] + output[stride*19];
step[21] = -output[stride*21] + output[stride*18];
step[22] = -output[stride*22] + output[stride*17];
step[23] = -output[stride*23] + output[stride*16];
step[24] = -output[stride*24] + output[stride*31];
step[25] = -output[stride*25] + output[stride*30];
step[26] = -output[stride*26] + output[stride*29];
step[27] = -output[stride*27] + output[stride*28];
step[28] = output[stride*28] + output[stride*27];
step[29] = output[stride*29] + output[stride*26];
step[30] = output[stride*30] + output[stride*25];
step[31] = output[stride*31] + output[stride*24];
// Stage 4
output[stride*0] = step[0] + step[3];
output[stride*1] = step[1] + step[2];
output[stride*2] = -step[2] + step[1];
output[stride*3] = -step[3] + step[0];
output[stride*4] = step[4];
output[stride*5] = (-step[5] + step[6])*C16;
output[stride*6] = (step[6] + step[5])*C16;
output[stride*7] = step[7];
output[stride*8] = step[8] + step[11];
output[stride*9] = step[9] + step[10];
output[stride*10] = -step[10] + step[9];
output[stride*11] = -step[11] + step[8];
output[stride*12] = -step[12] + step[15];
output[stride*13] = -step[13] + step[14];
output[stride*14] = step[14] + step[13];
output[stride*15] = step[15] + step[12];
output[stride*16] = step[16];
output[stride*17] = step[17];
output[stride*18] = step[18]*-C8 + step[29]*C24;
output[stride*19] = step[19]*-C8 + step[28]*C24;
output[stride*20] = step[20]*-C24 + step[27]*-C8;
output[stride*21] = step[21]*-C24 + step[26]*-C8;
output[stride*22] = step[22];
output[stride*23] = step[23];
output[stride*24] = step[24];
output[stride*25] = step[25];
output[stride*26] = step[26]*C24 + step[21]*-C8;
output[stride*27] = step[27]*C24 + step[20]*-C8;
output[stride*28] = step[28]*C8 + step[19]*C24;
output[stride*29] = step[29]*C8 + step[18]*C24;
output[stride*30] = step[30];
output[stride*31] = step[31];
// Stage 5
step[0] = (output[stride*0] + output[stride*1]) * C16;
step[1] = (-output[stride*1] + output[stride*0]) * C16;
step[2] = output[stride*2]*C24 + output[stride*3] * C8;
step[3] = output[stride*3]*C24 - output[stride*2] * C8;
step[4] = output[stride*4] + output[stride*5];
step[5] = -output[stride*5] + output[stride*4];
step[6] = -output[stride*6] + output[stride*7];
step[7] = output[stride*7] + output[stride*6];
step[8] = output[stride*8];
step[9] = output[stride*9]*-C8 + output[stride*14]*C24;
step[10] = output[stride*10]*-C24 + output[stride*13]*-C8;
step[11] = output[stride*11];
step[12] = output[stride*12];
step[13] = output[stride*13]*C24 + output[stride*10]*-C8;
step[14] = output[stride*14]*C8 + output[stride*9]*C24;
step[15] = output[stride*15];
step[16] = output[stride*16] + output[stride*19];
step[17] = output[stride*17] + output[stride*18];
step[18] = -output[stride*18] + output[stride*17];
step[19] = -output[stride*19] + output[stride*16];
step[20] = -output[stride*20] + output[stride*23];
step[21] = -output[stride*21] + output[stride*22];
step[22] = output[stride*22] + output[stride*21];
step[23] = output[stride*23] + output[stride*20];
step[24] = output[stride*24] + output[stride*27];
step[25] = output[stride*25] + output[stride*26];
step[26] = -output[stride*26] + output[stride*25];
step[27] = -output[stride*27] + output[stride*24];
step[28] = -output[stride*28] + output[stride*31];
step[29] = -output[stride*29] + output[stride*30];
step[30] = output[stride*30] + output[stride*29];
step[31] = output[stride*31] + output[stride*28];
// Stage 6
output[stride*0] = step[0];
output[stride*1] = step[1];
output[stride*2] = step[2];
output[stride*3] = step[3];
output[stride*4] = step[4]*C28 + step[7]*C4;
output[stride*5] = step[5]*C12 + step[6]*C20;
output[stride*6] = step[6]*C12 + step[5]*-C20;
output[stride*7] = step[7]*C28 + step[4]*-C4;
output[stride*8] = step[8] + step[9];
output[stride*9] = -step[9] + step[8];
output[stride*10] = -step[10] + step[11];
output[stride*11] = step[11] + step[10];
output[stride*12] = step[12] + step[13];
output[stride*13] = -step[13] + step[12];
output[stride*14] = -step[14] + step[15];
output[stride*15] = step[15] + step[14];
output[stride*16] = step[16];
output[stride*17] = step[17]*-C4 + step[30]*C28;
output[stride*18] = step[18]*-C28 + step[29]*-C4;
output[stride*19] = step[19];
output[stride*20] = step[20];
output[stride*21] = step[21]*-C20 + step[26]*C12;
output[stride*22] = step[22]*-C12 + step[25]*-C20;
output[stride*23] = step[23];
output[stride*24] = step[24];
output[stride*25] = step[25]*C12 + step[22]*-C20;
output[stride*26] = step[26]*C20 + step[21]*C12;
output[stride*27] = step[27];
output[stride*28] = step[28];
output[stride*29] = step[29]*C28 + step[18]*-C4;
output[stride*30] = step[30]*C4 + step[17]*C28;
output[stride*31] = step[31];
// Stage 7
step[0] = output[stride*0];
step[1] = output[stride*1];
step[2] = output[stride*2];
step[3] = output[stride*3];
step[4] = output[stride*4];
step[5] = output[stride*5];
step[6] = output[stride*6];
step[7] = output[stride*7];
step[8] = output[stride*8]*C30 + output[stride*15]*C2;
step[9] = output[stride*9]*C14 + output[stride*14]*C18;
step[10] = output[stride*10]*C22 + output[stride*13]*C10;
step[11] = output[stride*11]*C6 + output[stride*12]*C26;
step[12] = output[stride*12]*C6 + output[stride*11]*-C26;
step[13] = output[stride*13]*C22 + output[stride*10]*-C10;
step[14] = output[stride*14]*C14 + output[stride*9]*-C18;
step[15] = output[stride*15]*C30 + output[stride*8]*-C2;
step[16] = output[stride*16] + output[stride*17];
step[17] = -output[stride*17] + output[stride*16];
step[18] = -output[stride*18] + output[stride*19];
step[19] = output[stride*19] + output[stride*18];
step[20] = output[stride*20] + output[stride*21];
step[21] = -output[stride*21] + output[stride*20];
step[22] = -output[stride*22] + output[stride*23];
step[23] = output[stride*23] + output[stride*22];
step[24] = output[stride*24] + output[stride*25];
step[25] = -output[stride*25] + output[stride*24];
step[26] = -output[stride*26] + output[stride*27];
step[27] = output[stride*27] + output[stride*26];
step[28] = output[stride*28] + output[stride*29];
step[29] = -output[stride*29] + output[stride*28];
step[30] = -output[stride*30] + output[stride*31];
step[31] = output[stride*31] + output[stride*30];
// Final stage --- outputs indices are bit-reversed.
output[stride*0] = step[0];
output[stride*16] = step[1];
output[stride*8] = step[2];
output[stride*24] = step[3];
output[stride*4] = step[4];
output[stride*20] = step[5];
output[stride*12] = step[6];
output[stride*28] = step[7];
output[stride*2] = step[8];
output[stride*18] = step[9];
output[stride*10] = step[10];
output[stride*26] = step[11];
output[stride*6] = step[12];
output[stride*22] = step[13];
output[stride*14] = step[14];
output[stride*30] = step[15];
output[stride*1] = step[16]*C31 + step[31]*C1;
output[stride*17] = step[17]*C15 + step[30]*C17;
output[stride*9] = step[18]*C23 + step[29]*C9;
output[stride*25] = step[19]*C7 + step[28]*C25;
output[stride*5] = step[20]*C27 + step[27]*C5;
output[stride*21] = step[21]*C11 + step[26]*C21;
output[stride*13] = step[22]*C19 + step[25]*C13;
output[stride*29] = step[23]*C3 + step[24]*C29;
output[stride*3] = step[24]*C3 + step[23]*-C29;
output[stride*19] = step[25]*C19 + step[22]*-C13;
output[stride*11] = step[26]*C11 + step[21]*-C21;
output[stride*27] = step[27]*C27 + step[20]*-C5;
output[stride*7] = step[28]*C7 + step[19]*-C25;
output[stride*23] = step[29]*C23 + step[18]*-C9;
output[stride*15] = step[30]*C15 + step[17]*-C17;
output[stride*31] = step[31]*C31 + step[16]*-C1;
}
void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
vp9_clear_system_state(); // Make it simd safe : __asm emms;
{
int shortpitch = pitch >> 1;
int i, j;
double output[1024];
// First transform columns
for (i = 0; i < 32; i++) {
double temp_in[32], temp_out[32];
for (j = 0; j < 32; j++)
temp_in[j] = input[j*shortpitch + i];
dct32_1d(temp_in, temp_out, 1);
for (j = 0; j < 32; j++)
output[j*32 + i] = temp_out[j];
}
// Then transform rows
for (i = 0; i < 32; ++i) {
double temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i*32];
dct32_1d(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
output[j + i*32] = temp_out[j];
}
// Scale by some magic number
for (i = 0; i < 1024; i++) {
out[i] = (short)round(output[i]/4);
}
}
vp9_clear_system_state(); // Make it simd safe : __asm emms;
}
#else // CONFIG_DWT32X32HYBRID
#define MAX_BLOCK_LENGTH 64
#define ENH_PRECISION_BITS 1
#define ENH_PRECISION_RND ((1 << ENH_PRECISION_BITS) / 2)
// Note: block length must be even for this implementation
static void analysis_53_row(int length, short *x,
short *lowpass, short *highpass) {
int n;
short r, * a, * b;
n = length >> 1;
b = highpass;
a = lowpass;
while (--n) {
*a++ = (r = *x++) << 1;
*b++ = *x - ((r + x[1] + 1) >> 1);
x++;
}
*a = (r = *x++) << 1;
*b = *x - r;
n = length >> 1;
b = highpass;
a = lowpass;
r = *highpass;
while (n--) {
*a++ += (r + (*b) + 1) >> 1;
r = *b++;
}
}
static void analysis_53_col(int length, short *x,
short *lowpass, short *highpass) {
int n;
short r, * a, * b;
n = length >> 1;
b = highpass;
a = lowpass;
while (--n) {
*a++ = (r = *x++);
*b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2;
x++;
}
*a = (r = *x++);
*b = (*x - r + 1) >> 1;
n = length >> 1;
b = highpass;
a = lowpass;
r = *highpass;
while (n--) {
*a++ += (r + (*b) + 1) >> 1;
r = *b++;
}
}
// NOTE: Using a 5/3 integer wavelet for now. Explore using a wavelet
// with a better response later
static void dyadic_analyze(int levels, int width, int height,
short *x, int pitch_x, short *c, int pitch_c) {
int lv, i, j, nh, nw, hh = height, hw = width;
short buffer[2 * MAX_BLOCK_LENGTH];
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
c[i * pitch_c + j] = x[i * pitch_x + j] << ENH_PRECISION_BITS;
}
}
for (lv = 0; lv < levels; lv++) {
nh = hh;
hh = (hh + 1) >> 1;
nw = hw;
hw = (hw + 1) >> 1;
if ((nh < 2) || (nw < 2)) return;
for (i = 0; i < nh; i++) {
memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
}
for (j = 0; j < nw; j++) {
for (i = 0; i < nh; i++)
buffer[i + nh] = c[i * pitch_c + j];
analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
for (i = 0; i < nh; i++)
c[i * pitch_c + j] = buffer[i];
}
}
}
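The analysis_53_row/analysis_53_col pair above performs one level of a 5/3 (LeGall) integer wavelet via lifting, folding extra scaling and rounding into the row and column passes. As a point of reference only, a minimal standalone sketch of the textbook lifting steps with symmetric edge handling (an assumption-level illustration, not the exact arithmetic used here):

#include <stddef.h>

/* One level of 5/3 lifting on a 1-D signal of even length n: predict the
 * odd samples from their even neighbours, then update the even samples
 * from the new detail coefficients. */
static void lift53_1d(const int *x, size_t n, int *low, int *high) {
  size_t i, half = n / 2;
  for (i = 0; i < half; i++) {
    int right = (2 * i + 2 < n) ? x[2 * i + 2] : x[2 * i];  /* mirror edge */
    high[i] = x[2 * i + 1] - ((x[2 * i] + right) >> 1);     /* predict */
  }
  for (i = 0; i < half; i++) {
    int left = (i > 0) ? high[i - 1] : high[0];             /* mirror edge */
    low[i] = x[2 * i] + ((left + high[i] + 2) >> 2);        /* update */
  }
}
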
void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
// assume out is a 32x32 buffer
short buffer[16 * 16];
int i;
const int short_pitch = pitch >> 1;
dyadic_analyze(1, 32, 32, input, short_pitch, out, 32);
// TODO(debargha): Implement more efficiently by adding output pitch
// argument to the dct16x16 function
vp9_short_fdct16x16_c(out, buffer, 64);
for (i = 0; i < 16; ++i)
vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
}
#endif // CONFIG_DWT32X32HYBRID
#endif // CONFIG_TX32X32


@ -456,6 +456,10 @@ static void update_state(VP9_COMP *cpi, MACROBLOCK *x,
if (xd->mb_to_right_edge >= 0)
vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO));
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
} else {
ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16];
#endif
}
#endif
@ -1487,6 +1491,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_zero(cpi->hybrid_coef_counts_8x8);
vp9_zero(cpi->coef_counts_16x16);
vp9_zero(cpi->hybrid_coef_counts_16x16);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_zero(cpi->coef_counts_32x32);
#endif
vp9_frame_init_quantizer(cpi);
@ -1507,7 +1514,8 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count));
vpx_memset(cpi->txfm_count_32x32p, 0, sizeof(cpi->txfm_count_32x32p));
vpx_memset(cpi->txfm_count_16x16p, 0, sizeof(cpi->txfm_count_16x16p));
vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
{
@ -1700,7 +1708,11 @@ void vp9_encode_frame(VP9_COMP *cpi) {
* keyframe's probabilities as an estimate of what the current keyframe's
* coefficient cost distributions may look like. */
if (frame_type == 0) {
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
txfm_type = ALLOW_32X32;
#else
txfm_type = ALLOW_16X16;
#endif
} else
#if 0
/* FIXME (rbultje)
@ -1731,9 +1743,15 @@ void vp9_encode_frame(VP9_COMP *cpi) {
} else
txfm_type = ALLOW_8X8;
#else
txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >=
cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
ALLOW_32X32 : TX_MODE_SELECT;
#else
txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
ALLOW_16X16 : TX_MODE_SELECT;
#endif
#endif
cpi->common.txfm_mode = txfm_type;
if (txfm_type != TX_MODE_SELECT) {
@ -1753,7 +1771,8 @@ void vp9_encode_frame(VP9_COMP *cpi) {
int64_t pd = cpi->rd_tx_select_diff[i];
int diff;
if (i == TX_MODE_SELECT)
pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0);
pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
2048 * (TX_SIZE_MAX_SB - 1), 0);
diff = (int)(pd / cpi->common.MBs);
cpi->rd_tx_select_threshes[frame_type][i] += diff;
cpi->rd_tx_select_threshes[frame_type][i] /= 2;
@ -1776,19 +1795,37 @@ void vp9_encode_frame(VP9_COMP *cpi) {
}
if (cpi->common.txfm_mode == TX_MODE_SELECT) {
const int count4x4 = cpi->txfm_count[TX_4X4] + cpi->txfm_count_8x8p[TX_4X4];
const int count8x8 = cpi->txfm_count[TX_8X8];
const int count4x4 = cpi->txfm_count_16x16p[TX_4X4] +
cpi->txfm_count_32x32p[TX_4X4] +
cpi->txfm_count_8x8p[TX_4X4];
const int count8x8_lp = cpi->txfm_count_32x32p[TX_8X8] +
cpi->txfm_count_16x16p[TX_8X8];
const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
const int count16x16 = cpi->txfm_count[TX_16X16];
const int count16x16_16x16p = cpi->txfm_count_16x16p[TX_16X16];
const int count16x16_lp = cpi->txfm_count_32x32p[TX_16X16];
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
const int count32x32 = cpi->txfm_count_32x32p[TX_32X32];
#else
const int count32x32 = 0;
#endif
if (count4x4 == 0 && count16x16 == 0) {
if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
count32x32 == 0) {
cpi->common.txfm_mode = ALLOW_8X8;
reset_skip_txfm_size(cpi, TX_8X8);
} else if (count8x8 == 0 && count16x16 == 0 && count8x8_8x8p == 0) {
} else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
cpi->common.txfm_mode = ONLY_4X4;
reset_skip_txfm_size(cpi, TX_4X4);
} else if (count8x8 == 0 && count4x4 == 0) {
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
} else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
cpi->common.txfm_mode = ALLOW_32X32;
#endif
} else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
cpi->common.txfm_mode = ALLOW_16X16;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
reset_skip_txfm_size(cpi, TX_16X16);
#endif
}
}
} else {
@ -2087,6 +2124,7 @@ static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
}
assert(mbmi->txfm_size <= TX_16X16);
if (mbmi->ref_frame == INTRA_FRAME) {
#ifdef ENC_DEBUG
if (enc_debug) {
@ -2266,7 +2304,7 @@ static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
mbmi->mode != SPLITMV) {
cpi->txfm_count[mbmi->txfm_size]++;
cpi->txfm_count_16x16p[mbmi->txfm_size]++;
} else if (mbmi->mode == I8X8_PRED ||
(mbmi->mode == SPLITMV &&
mbmi->partitioning != PARTITIONING_4X4)) {
@ -2308,6 +2346,7 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
MODE_INFO *mi = x->e_mbd.mode_info_context;
unsigned int segment_id = mi->mbmi.segment_id;
ENTROPY_CONTEXT_PLANES ta[4], tl[4];
const int mis = cm->mode_info_stride;
x->skip = 0;
@ -2397,6 +2436,53 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
xd->dst.y_stride, xd->dst.uv_stride);
}
#if CONFIG_TX32X32
if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
dst, dst_y_stride);
vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
usrc, vsrc, src_uv_stride,
udst, vdst, dst_uv_stride);
vp9_transform_sby_32x32(x);
vp9_transform_sbuv_16x16(x);
vp9_quantize_sby_32x32(x);
vp9_quantize_sbuv_16x16(x);
// TODO(rbultje): trellis optimize
vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data);
vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data);
vp9_recon_sby_s_c(&x->e_mbd, dst);
vp9_recon_sbuv_s_c(&x->e_mbd, udst, vdst);
if (!x->skip) {
vp9_tokenize_sb(cpi, &x->e_mbd, t, 0);
} else {
int mb_skip_context =
cpi->common.mb_no_coeff_skip ?
(mi - 1)->mbmi.mb_skip_coeff +
(mi - mis)->mbmi.mb_skip_coeff :
0;
mi->mbmi.mb_skip_coeff = 1;
if (cm->mb_no_coeff_skip) {
cpi->skip_true_count[mb_skip_context]++;
vp9_fix_contexts_sb(xd);
} else {
vp9_stuff_sb(cpi, xd, t, 0);
cpi->skip_false_count[mb_skip_context]++;
}
}
// copy skip flag on all mb_mode_info contexts in this SB
// if this was a skip at this txfm size
if (mb_col < cm->mb_cols - 1)
mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
if (mb_row < cm->mb_rows - 1) {
mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
if (mb_col < cm->mb_cols - 1)
mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
}
skip[0] = skip[2] = skip[1] = skip[3] = mi->mbmi.mb_skip_coeff;
} else {
#endif
for (n = 0; n < 4; n++) {
int x_idx = n & 1, y_idx = n >> 1;
@ -2405,7 +2491,7 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
tp[n] = *t;
xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
xd->mode_info_context = mi + x_idx + y_idx * mis;
vp9_subtract_mby_s_c(x->src_diff,
src + x_idx * 16 + y_idx * 16 * src_y_stride,
@ -2433,7 +2519,7 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
int mb_skip_context =
cpi->common.mb_no_coeff_skip ?
(x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
(x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
(x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff :
0;
xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;
if (cpi->common.mb_no_coeff_skip) {
@ -2450,20 +2536,29 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
xd->mode_info_context = mi;
update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
#if CONFIG_TX32X32
}
#endif
if (cm->txfm_mode == TX_MODE_SELECT &&
!((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
(vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
cpi->txfm_count[mi->mbmi.txfm_size]++;
cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
} else {
TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_16X16 : cm->txfm_mode;
TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?
#if CONFIG_TX32X32
TX_32X32 :
#else
TX_16X16 :
#endif
cm->txfm_mode;
mi->mbmi.txfm_size = sz;
if (mb_col < cm->mb_cols - 1)
mi[1].mbmi.txfm_size = sz;
if (mb_row < cm->mb_rows - 1) {
mi[cm->mode_info_stride].mbmi.txfm_size = sz;
mi[mis].mbmi.txfm_size = sz;
if (mb_col < cm->mb_cols - 1)
mi[cm->mode_info_stride + 1].mbmi.txfm_size = sz;
mi[mis + 1].mbmi.txfm_size = sz;
}
}
}


@ -108,6 +108,52 @@ void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
}
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
void vp9_subtract_sby_s_c(short *diff, const unsigned char *src, int src_stride,
const unsigned char *pred, int dst_stride) {
int r, c;
for (r = 0; r < 32; r++) {
for (c = 0; c < 32; c++) {
diff[c] = src[c] - pred[c];
}
diff += 32;
pred += dst_stride;
src += src_stride;
}
}
void vp9_subtract_sbuv_s_c(short *diff, const unsigned char *usrc,
const unsigned char *vsrc, int src_stride,
const unsigned char *upred,
const unsigned char *vpred, int dst_stride) {
short *udiff = diff + 1024;
short *vdiff = diff + 1024 + 256;
int r, c;
for (r = 0; r < 16; r++) {
for (c = 0; c < 16; c++) {
udiff[c] = usrc[c] - upred[c];
}
udiff += 16;
upred += dst_stride;
usrc += src_stride;
}
for (r = 0; r < 16; r++) {
for (c = 0; c < 16; c++) {
vdiff[c] = vsrc[c] - vpred[c];
}
vdiff += 16;
vpred += dst_stride;
vsrc += src_stride;
}
}
#endif
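These helpers define the layout of the superblock residual buffer: the 32x32 luma residual first, then the two 16x16 chroma residuals. The offsets, written out with illustrative names (the constants below are not part of the patch):

/* Offsets into sb_coeff_data.src_diff / .coeff implied by the code above. */
enum {
  SB_Y_OFFSET  = 0,                      /* 32x32 luma residual           */
  SB_U_OFFSET  = 32 * 32,                /* = 1024, 16x16 U residual      */
  SB_V_OFFSET  = 32 * 32 + 16 * 16,      /* = 1280, 16x16 V residual      */
  SB_DIFF_SIZE = 32 * 32 + 2 * 16 * 16   /* = 1536 shorts in total        */
};
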
void vp9_subtract_mby_c(short *diff, unsigned char *src,
unsigned char *pred, int stride) {
vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
@ -265,6 +311,22 @@ void vp9_transform_mb_16x16(MACROBLOCK *x) {
vp9_transform_mbuv_8x8(x);
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
void vp9_transform_sby_32x32(MACROBLOCK *x) {
SUPERBLOCK * const x_sb = &x->sb_coeff_data;
vp9_short_fdct32x32(x_sb->src_diff, x_sb->coeff, 64);
}
void vp9_transform_sbuv_16x16(MACROBLOCK *x) {
SUPERBLOCK * const x_sb = &x->sb_coeff_data;
vp9_clear_system_state();
x->vp9_short_fdct16x16(x_sb->src_diff + 1024,
x_sb->coeff + 1024, 32);
x->vp9_short_fdct16x16(x_sb->src_diff + 1280,
x_sb->coeff + 1280, 32);
}
#endif
#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
typedef struct vp9_token_state vp9_token_state;


@ -47,6 +47,11 @@ void vp9_transform_mb_16x16(MACROBLOCK *mb);
void vp9_transform_mby_16x16(MACROBLOCK *x);
void vp9_optimize_mby_16x16(MACROBLOCK *x);
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
void vp9_transform_sby_32x32(MACROBLOCK *x);
void vp9_transform_sbuv_16x16(MACROBLOCK *x);
#endif
void vp9_fidct_mb(MACROBLOCK *x);
void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
@ -59,6 +64,14 @@ void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
void vp9_subtract_mby_s_c(short *diff, const unsigned char *src,
int src_stride, const unsigned char *pred,
int dst_stride);
#if CONFIG_TX32X32
void vp9_subtract_sby_s_c(short *diff, const unsigned char *src, int src_stride,
const unsigned char *pred, int dst_stride);
void vp9_subtract_sbuv_s_c(short *diff, const unsigned char *usrc,
const unsigned char *vsrc, int src_stride,
const unsigned char *upred,
const unsigned char *vpred, int dst_stride);
#endif
#endif
#endif


@ -1810,7 +1810,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
#endif
for (i = 0; i < COMP_PRED_CONTEXTS; i++)
cm->prob_comppred[i] = 128;
for (i = 0; i < TX_SIZE_MAX - 1; i++)
for (i = 0; i < TX_SIZE_MAX_SB - 1; i++)
cm->prob_tx[i] = 128;
// Prime the recent reference frame usage counters.
@ -3698,6 +3698,9 @@ static void encode_frame_to_data_rate
vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,
cpi->hybrid_coef_counts_16x16);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32);
#endif
vp9_adapt_coef_probs(&cpi->common);
if (cpi->common.frame_type != KEY_FRAME) {
#if CONFIG_SUPERBLOCKS


@ -109,6 +109,11 @@ typedef struct {
vp9_prob hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_prob coef_probs_32x32[BLOCK_TYPES_32X32]
[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
#endif
#if CONFIG_SUPERBLOCKS
vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
#endif
@ -435,6 +440,15 @@ typedef struct VP9_COMP {
DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
DECLARE_ALIGNED(16, short, Y1zbin_32x32[QINDEX_RANGE][1024]);
DECLARE_ALIGNED(16, short, Y2zbin_32x32[QINDEX_RANGE][1024]);
DECLARE_ALIGNED(16, short, UVzbin_32x32[QINDEX_RANGE][1024]);
DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_32x32[QINDEX_RANGE][1024]);
DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_32x32[QINDEX_RANGE][1024]);
DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_32x32[QINDEX_RANGE][1024]);
#endif
MACROBLOCK mb;
VP9_COMMON common;
VP9_CONFIG oxcf;
@ -483,8 +497,9 @@ typedef struct VP9_COMP {
int comp_pred_count[COMP_PRED_CONTEXTS];
int single_pred_count[COMP_PRED_CONTEXTS];
// FIXME contextualize
int txfm_count[TX_SIZE_MAX];
int txfm_count_8x8p[TX_SIZE_MAX - 1];
int txfm_count_32x32p[TX_SIZE_MAX_SB];
int txfm_count_16x16p[TX_SIZE_MAX_MB];
int txfm_count_8x8p[TX_SIZE_MAX_MB - 1];
int64_t rd_tx_select_diff[NB_TXFM_MODES];
int rd_tx_select_threshes[4][NB_TXFM_MODES];
@ -604,6 +619,12 @@ typedef struct VP9_COMP {
vp9_prob frame_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
unsigned int frame_hybrid_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
unsigned int coef_counts_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */
vp9_prob frame_coef_probs_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
unsigned int frame_branch_ct_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
#endif
int gfu_boost;
int last_boost;
int kf_boost;


@ -323,28 +323,25 @@ void vp9_quantize_mb_16x16(MACROBLOCK *x) {
vp9_quantize_mbuv_8x8(x);
}
void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
static void quantize(short *zbin_boost_orig_ptr,
short *coeff_ptr, int n_coeffs, int max_coeffs,
short *zbin_ptr, short *round_ptr, short *quant_ptr,
unsigned char *quant_shift_ptr,
short *qcoeff_ptr, short *dqcoeff_ptr,
short *dequant_ptr, short zbin_oq_value,
int *eob_ptr, const int *scan, int mul) {
int i, rc, eob;
int zbin;
int x, y, z, sz;
short *zbin_boost_ptr = b->zrun_zbin_boost_16x16;
short *coeff_ptr = b->coeff;
short *zbin_ptr = b->zbin_16x16;
short *round_ptr = b->round;
short *quant_ptr = b->quant;
unsigned char *quant_shift_ptr = b->quant_shift;
short *qcoeff_ptr = d->qcoeff;
short *dqcoeff_ptr = d->dqcoeff;
short *dequant_ptr = d->dequant;
short zbin_oq_value = b->zbin_extra;
short *zbin_boost_ptr = zbin_boost_orig_ptr;
vpx_memset(qcoeff_ptr, 0, 256*sizeof(short));
vpx_memset(dqcoeff_ptr, 0, 256*sizeof(short));
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(short));
vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(short));
eob = -1;
for (i = 0; i < b->eob_max_offset_16x16; i++) {
rc = vp9_default_zig_zag1d_16x16[i];
z = coeff_ptr[rc];
for (i = 0; i < max_coeffs; i++) {
rc = scan[i];
z = coeff_ptr[rc] * mul;
zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);
zbin_boost_ptr ++;
@ -354,22 +351,70 @@ void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
if (x >= zbin) {
x += (round_ptr[rc!=0]);
y = ((int)(((int)(x * quant_ptr[rc!=0]) >> 16) + x))
y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
>> quant_shift_ptr[rc!=0]; // quantize (x)
x = (y ^ sz) - sz; // get the sign back
qcoeff_ptr[rc] = x; // write to destination
dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0]; // dequantized value
dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value
if (y) {
eob = i; // last nonzero coeffs
zbin_boost_ptr = b->zrun_zbin_boost_16x16;
zbin_boost_ptr = zbin_boost_orig_ptr;
}
}
}
d->eob = eob + 1;
*eob_ptr = eob + 1;
}
void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
quantize(b->zrun_zbin_boost_16x16,
b->coeff,
256, b->eob_max_offset_16x16,
b->zbin_16x16, b->round, b->quant, b->quant_shift,
d->qcoeff,
d->dqcoeff,
d->dequant,
b->zbin_extra,
&d->eob, vp9_default_zig_zag1d_16x16, 1);
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
void vp9_quantize_sby_32x32(MACROBLOCK *x) {
x->e_mbd.block[0].eob = 0;
quantize(x->block[0].zrun_zbin_boost_32x32,
x->sb_coeff_data.coeff,
1024, x->block[0].eob_max_offset_32x32,
x->block[0].zbin_32x32,
x->block[0].round, x->block[0].quant, x->block[0].quant_shift,
x->e_mbd.sb_coeff_data.qcoeff,
x->e_mbd.sb_coeff_data.dqcoeff,
x->e_mbd.block[0].dequant,
x->block[0].zbin_extra,
&x->e_mbd.block[0].eob,
vp9_default_zig_zag1d_32x32, 2);
}
void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {
int i;
x->e_mbd.block[16].eob = 0;
x->e_mbd.block[20].eob = 0;
for (i = 16; i < 24; i += 4)
quantize(x->block[i].zrun_zbin_boost_16x16,
x->sb_coeff_data.coeff + 1024 + (i - 16) * 64,
256, x->block[i].eob_max_offset_16x16,
x->block[i].zbin_16x16,
x->block[i].round, x->block[0].quant, x->block[i].quant_shift,
x->e_mbd.sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
x->e_mbd.sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64,
x->e_mbd.block[i].dequant,
x->block[i].zbin_extra,
&x->e_mbd.block[i].eob,
vp9_default_zig_zag1d_16x16, 1);
}
#endif
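The 32x32 luma path calls the shared quantize() helper with mul = 2 (see vp9_quantize_sby_32x32 above), so each coefficient is doubled before the zero-bin test and quantization, and the dequantized value is divided back down. A reduced sketch of where that factor enters and leaves, assuming a simplified quantization step rather than the zbin/round/quant_shift pipeline above:

/* Toy model of the mul factor only; not the real quantizer. */
static int toy_quantize(int coeff, int dequant, int mul, int *dqcoeff) {
  int z = coeff * mul;                   /* mul == 2 on the 32x32 luma path */
  int q = (z >= 0 ? z : -z) / dequant;   /* simplified quantization step    */
  if (z < 0) q = -q;                     /* restore the sign                */
  *dqcoeff = q * dequant / mul;          /* dequantized value, scaled back  */
  return q;
}
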
/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
* these two C functions if corresponding optimized routine is not available.
* NEON optimized version implements currently the fast quantization for pair
@ -427,6 +472,74 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
};
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
static const int zbin_boost_32x32[1024] = {
0, 0, 0, 8, 8, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
};
#endif
int qrounding_factor = 48;
@ -454,7 +567,13 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
cpi->zrun_zbin_boost_y1_8x8[Q][0] =
((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
cpi->zrun_zbin_boost_y1_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
cpi->zrun_zbin_boost_y1_16x16[Q][0] =
((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
cpi->Y1zbin_32x32[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
cpi->zrun_zbin_boost_y1_32x32[Q][0] =
((quant_val * zbin_boost_32x32[0]) + 64) >> 7;
#endif
quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);
@ -468,7 +587,8 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
cpi->zrun_zbin_boost_y2_8x8[Q][0] =
((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
cpi->zrun_zbin_boost_y2_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
cpi->zrun_zbin_boost_y2_16x16[Q][0] =
((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
invert_quant(cpi->UVquant[Q] + 0,
@ -481,7 +601,8 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
cpi->zrun_zbin_boost_uv_8x8[Q][0] =
((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
cpi->zrun_zbin_boost_uv_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
cpi->zrun_zbin_boost_uv_16x16[Q][0] =
((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
// all the 4x4 ac values =;
for (i = 1; i < 16; i++) {
@ -543,16 +664,30 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quant_val = vp9_ac_yquant(Q);
cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
cpi->zrun_zbin_boost_y1_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
cpi->zrun_zbin_boost_y1_16x16[Q][i] =
((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
cpi->zrun_zbin_boost_y2_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
cpi->zrun_zbin_boost_y2_16x16[Q][i] =
((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
cpi->zrun_zbin_boost_uv_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
cpi->zrun_zbin_boost_uv_16x16[Q][i] =
((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
// 32x32 structures. Same comment above applies.
for (i = 1; i < 1024; i++) {
int rc = vp9_default_zig_zag1d_32x32[i];
quant_val = vp9_ac_yquant(Q);
cpi->Y1zbin_32x32[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
cpi->zrun_zbin_boost_y1_32x32[Q][i] =
((quant_val * zbin_boost_32x32[i]) + 64) >> 7;
}
#endif
}
}
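The 1024-entry zbin_boost_32x32 table above ramps from 0 to 48 over its first 26 entries and stays at 48 for the remainder. An equivalent init-time construction, shown only as a sketch (the patch keeps the literal table):

/* Builds the same values as the zbin_boost_32x32 literal above. */
static void build_zbin_boost_32x32(int boost[1024]) {
  int i;
  for (i = 0; i < 1024; i++) {
    if (i < 3)        boost[i] = 0;
    else if (i < 6)   boost[i] = 8;
    else if (i < 26)  boost[i] = 2 * i - 2;   /* 10, 12, ..., 48 */
    else              boost[i] = 48;
  }
}
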
@ -592,11 +727,17 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
x->block[i].zbin = cpi->Y1zbin[QIndex];
x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
x->block[i].zbin_32x32 = cpi->Y1zbin_32x32[QIndex];
#endif
x->block[i].round = cpi->Y1round[QIndex];
x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
x->block[i].zrun_zbin_boost_32x32 = cpi->zrun_zbin_boost_y1_32x32[QIndex];
#endif
x->block[i].zbin_extra = (short)zbin_extra;
// Segment max eob offset feature.
@ -607,10 +748,17 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
x->block[i].eob_max_offset_16x16 =
vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
x->block[i].eob_max_offset_32x32 =
vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
#endif
} else {
x->block[i].eob_max_offset = 16;
x->block[i].eob_max_offset_8x8 = 64;
x->block[i].eob_max_offset_16x16 = 256;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
x->block[i].eob_max_offset_32x32 = 1024;
#endif
}
}
@ -640,9 +788,12 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
x->block[i].eob_max_offset_8x8 =
vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
x->block[i].eob_max_offset_16x16 =
vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
} else {
x->block[i].eob_max_offset = 16;
x->block[i].eob_max_offset_8x8 = 64;
x->block[i].eob_max_offset_16x16 = 256;
}
}


@ -78,6 +78,11 @@ void vp9_quantize_mb_16x16(MACROBLOCK *x);
extern prototype_quantize_block(vp9_quantize_quantb_16x16);
extern prototype_quantize_mb(vp9_quantize_mby_16x16);
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
void vp9_quantize_sby_32x32(MACROBLOCK *x);
void vp9_quantize_sbuv_16x16(MACROBLOCK *x);
#endif
struct VP9_COMP;
extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);


@ -175,6 +175,9 @@ void vp9_save_coding_context(VP9_COMP *cpi) {
vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);
vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32);
#endif
vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
#if CONFIG_COMP_INTERINTRA_PRED
cc->interintra_prob = cm->fc.interintra_prob;
@ -234,6 +237,9 @@ void vp9_restore_coding_context(VP9_COMP *cpi) {
vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);
vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32);
#endif
vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
#if CONFIG_COMP_INTERINTRA_PRED
cm->fc.interintra_prob = cc->interintra_prob;


@ -400,12 +400,18 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
cpi->common.fc.hybrid_coef_probs_16x16,
BLOCK_TYPES_16X16);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
fill_token_costs(
cpi->mb.token_costs[TX_32X32],
(const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_32x32,
BLOCK_TYPES_32X32);
#endif
/*rough estimate for costing*/
cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
vp9_init_mode_costs(cpi);
if (cpi->common.frame_type != KEY_FRAME)
{
if (cpi->common.frame_type != KEY_FRAME) {
vp9_build_nmv_cost_table(
cpi->mb.nmvjointcost,
cpi->mb.e_mbd.allow_high_precision_mv ?
@ -556,7 +562,7 @@ static int cost_coeffs_2x2(MACROBLOCK *mb,
static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
int tx_size) {
TX_SIZE tx_size) {
const int eob = b->eob;
int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
int cost = 0, default_eob, seg_eob;
@ -613,9 +619,24 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
default_eob = 256;
if (type == PLANE_TYPE_Y_WITH_DC) {
tx_type = get_tx_type_16x16(xd, b);
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
} else if (type == PLANE_TYPE_UV) {
int ib = (int)(b - xd->block) - 16;
qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * ib;
#endif
}
break;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
case TX_32X32:
scan = vp9_default_zig_zag1d_32x32;
band = vp9_coef_bands_32x32;
default_eob = 1024;
qcoeff_ptr = xd->sb_coeff_data.qcoeff;
break;
#endif
default:
abort();
break;
}
if (vp9_segfeature_active(&mb->e_mbd, segment_id, SEG_LVL_EOB))
@ -813,23 +834,28 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
}
static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
int r[2][TX_SIZE_MAX], int *rate,
int d[TX_SIZE_MAX], int *distortion,
int s[TX_SIZE_MAX], int *skip,
int64_t txfm_cache[NB_TXFM_MODES]) {
int (*r)[2], int *rate,
int *d, int *distortion,
int *s, int *skip,
int64_t txfm_cache[NB_TXFM_MODES],
TX_SIZE max_txfm_size) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
vp9_prob skip_prob = cm->mb_no_coeff_skip ?
vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
int64_t rd[2][TX_SIZE_MAX];
int n;
int64_t rd[TX_SIZE_MAX_SB][2];
int n, m;
r[1][TX_16X16] = r[0][TX_16X16] + vp9_cost_one(cm->prob_tx[0]) +
vp9_cost_one(cm->prob_tx[1]);
r[1][TX_8X8] = r[0][TX_8X8] + vp9_cost_one(cm->prob_tx[0]) +
vp9_cost_zero(cm->prob_tx[1]);
r[1][TX_4X4] = r[0][TX_4X4] + vp9_cost_zero(cm->prob_tx[0]);
for (n = TX_4X4; n <= max_txfm_size; n++) {
r[n][1] = r[n][0];
for (m = 0; m <= n - (n == max_txfm_size); m++) {
if (m == n)
r[n][1] += vp9_cost_zero(cm->prob_tx[m]);
else
r[n][1] += vp9_cost_one(cm->prob_tx[m]);
}
}
if (cm->mb_no_coeff_skip) {
int s0, s1;
@ -838,64 +864,82 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
s0 = vp9_cost_bit(skip_prob, 0);
s1 = vp9_cost_bit(skip_prob, 1);
for (n = TX_4X4; n <= TX_16X16; n++) {
for (n = TX_4X4; n <= max_txfm_size; n++) {
if (s[n]) {
rd[0][n] = rd[1][n] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
} else {
rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n] + s0, d[n]);
rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n] + s0, d[n]);
rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
}
}
} else {
for (n = TX_4X4; n <= TX_16X16; n++) {
rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n], d[n]);
rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n], d[n]);
for (n = TX_4X4; n <= max_txfm_size; n++) {
rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0], d[n]);
rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1], d[n]);
}
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
if (max_txfm_size == TX_32X32 &&
(cm->txfm_mode == ALLOW_32X32 ||
(cm->txfm_mode == TX_MODE_SELECT &&
rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
rd[TX_32X32][1] < rd[TX_4X4][1]))) {
mbmi->txfm_size = TX_32X32;
} else
#endif
if ( cm->txfm_mode == ALLOW_16X16 ||
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
(max_txfm_size == TX_16X16 && cm->txfm_mode == ALLOW_32X32) ||
#endif
(cm->txfm_mode == TX_MODE_SELECT &&
rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])) {
rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])) {
mbmi->txfm_size = TX_16X16;
} else if (cm->txfm_mode == ALLOW_8X8 ||
(cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_8X8] < rd[1][TX_4X4])) {
(cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
mbmi->txfm_size = TX_8X8;
} else {
assert(cm->txfm_mode == ONLY_4X4 ||
(cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_4X4] <= rd[1][TX_8X8]));
assert(cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT);
mbmi->txfm_size = TX_4X4;
}
*distortion = d[mbmi->txfm_size];
*rate = r[cm->txfm_mode == TX_MODE_SELECT][mbmi->txfm_size];
*rate = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT];
*skip = s[mbmi->txfm_size];
txfm_cache[ONLY_4X4] = rd[0][TX_4X4];
txfm_cache[ALLOW_8X8] = rd[0][TX_8X8];
txfm_cache[ALLOW_16X16] = rd[0][TX_16X16];
if (rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])
txfm_cache[TX_MODE_SELECT] = rd[1][TX_16X16];
txfm_cache[ONLY_4X4] = rd[TX_4X4][0];
txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];
txfm_cache[ALLOW_16X16] = rd[TX_16X16][0];
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
txfm_cache[ALLOW_32X32] = rd[max_txfm_size][0];
if (max_txfm_size == TX_32X32 &&
rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
rd[TX_32X32][1] < rd[TX_4X4][1])
txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
else
txfm_cache[TX_MODE_SELECT] = rd[1][TX_4X4] < rd[1][TX_8X8] ?
rd[1][TX_4X4] : rd[1][TX_8X8];
#endif
if (rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
else
txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
rd[TX_4X4][1] : rd[TX_8X8][1];
}
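The nested loop near the top of this function replaces the hard-coded per-size rate offsets: size n is signalled with one vp9_cost_one(cm->prob_tx[m]) for each smaller size m and a final vp9_cost_zero, the latter omitted when n is already the largest selectable size. A standalone sketch of that signalling cost, assuming cost_zero/cost_one hold the precomputed bit costs for prob_tx[0..2]:

/* Extra rate for signalling transform size n under TX_MODE_SELECT
 * (mirrors the r[n][1] accumulation above). */
static int tx_size_signal_cost(int n, int max_txfm_size,
                               const int cost_zero[3],
                               const int cost_one[3]) {
  int m, cost = 0;
  for (m = 0; m <= n - (n == max_txfm_size); m++)
    cost += (m == n) ? cost_zero[m] : cost_one[m];
  return cost;
}
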
static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
int *distortion, int *skippable,
int64_t txfm_cache[NB_TXFM_MODES]) {
MACROBLOCKD *const xd = &x->e_mbd;
int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX];
int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];
vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
x->block[0].src_stride);
macro_block_yrd_16x16(x, &r[0][TX_16X16], &d[TX_16X16],
&s[TX_16X16], 1);
macro_block_yrd_8x8(x, &r[0][TX_8X8], &d[TX_8X8], &s[TX_8X8], 1);
macro_block_yrd_4x4(x, &r[0][TX_4X4], &d[TX_4X4], &s[TX_4X4], 1);
macro_block_yrd_16x16(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], 1);
macro_block_yrd_8x8(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], 1);
macro_block_yrd_4x4(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], 1);
choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
txfm_cache);
txfm_cache, TX_16X16);
}
static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
@ -908,25 +952,91 @@ static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
}
#if CONFIG_SUPERBLOCKS
#if CONFIG_TX32X32
static int rdcost_sby_32x32(MACROBLOCK *x) {
MACROBLOCKD * const xd = &x->e_mbd;
ENTROPY_CONTEXT_PLANES t_above, t_left;
ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above,
*tl = (ENTROPY_CONTEXT *) &t_left;
vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
return cost_coeffs(x, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
}
static int vp9_sb_block_error_c(short *coeff, short *dqcoeff, int block_size) {
int i;
int64_t error = 0;
for (i = 0; i < block_size; i++) {
unsigned int this_diff = coeff[i] - dqcoeff[i];
error += this_diff * this_diff;
}
return error > INT_MAX ? INT_MAX : error;
}
#define DEBUG_ERROR 0
static void super_block_yrd_32x32(MACROBLOCK *x,
int *rate, int *distortion, int *skippable) {
SUPERBLOCK * const x_sb = &x->sb_coeff_data;
MACROBLOCKD * const xd = &x->e_mbd;
SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;
#if DEBUG_ERROR || CONFIG_DWT32X32HYBRID
short out[1024];
#endif
vp9_transform_sby_32x32(x);
vp9_quantize_sby_32x32(x);
#if DEBUG_ERROR || CONFIG_DWT32X32HYBRID
vp9_short_idct32x32(xd_sb->dqcoeff, out, 64);
#endif
#if !CONFIG_DWT32X32HYBRID
*distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024);
#else
*distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4;
#endif
#if DEBUG_ERROR
printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion);
#endif
*rate = rdcost_sby_32x32(x);
*skippable = vp9_sby_is_skippable_32x32(&x->e_mbd);
}
#endif
static void super_block_yrd(VP9_COMP *cpi,
MACROBLOCK *x, int *rate, int *distortion,
int *skip,
int64_t txfm_cache[NB_TXFM_MODES]) {
MACROBLOCKD *const xd = &x->e_mbd;
int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;
const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_MB][2],
*orig_above = xd->above_context;
ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_MB][2],
*orig_left = xd->left_context;
for (n = TX_4X4; n < TX_SIZE_MAX_MB; n++) {
vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
r[n][0] = 0;
d[n] = 0;
s[n] = 1;
}
#if CONFIG_TX32X32
vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
dst, dst_y_stride);
super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
#endif
#if DEBUG_ERROR
int err[3] = { 0, 0, 0 };
#endif
for (n = 0; n < 4; n++) {
int x_idx = n & 1, y_idx = n >> 1;
int r_tmp, d_tmp, s_tmp;
@ -941,25 +1051,42 @@ static void super_block_yrd(VP9_COMP *cpi,
xd->left_context = &t_left[TX_16X16][y_idx];
macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
d[TX_16X16] += d_tmp;
r[TX_16X16][0] += r_tmp;
s[TX_16X16] = s[TX_16X16] && s_tmp;
#if DEBUG_ERROR
vp9_inverse_transform_mby_16x16(xd);
err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);
#endif
xd->above_context = &t_above[TX_4X4][x_idx];
xd->left_context = &t_left[TX_4X4][y_idx];
macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
d[TX_4X4] += d_tmp;
r[TX_4X4][0] += r_tmp;
s[TX_4X4] = s[TX_4X4] && s_tmp;
#if DEBUG_ERROR
vp9_inverse_transform_mby_4x4(xd);
err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);
#endif
xd->above_context = &t_above[TX_8X8][x_idx];
xd->left_context = &t_left[TX_8X8][y_idx];
macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
d[TX_8X8] += d_tmp;
r[TX_8X8][0] += r_tmp;
s[TX_8X8] = s[TX_8X8] && s_tmp;
#if DEBUG_ERROR
vp9_inverse_transform_mby_8x8(xd);
err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);
#endif
}
#if DEBUG_ERROR
printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);
printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);
printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);
#endif
choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
TX_SIZE_MAX_SB - 1);
xd->above_context = orig_above;
xd->left_context = orig_left;
@ -1632,14 +1759,59 @@ static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
}
#if CONFIG_SUPERBLOCKS
#if CONFIG_TX32X32
static int rd_cost_sbuv_16x16(MACROBLOCK *x) {
int b;
int cost = 0;
MACROBLOCKD *const xd = &x->e_mbd;
ENTROPY_CONTEXT_PLANES t_above, t_left;
ENTROPY_CONTEXT *ta, *tl;
vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *) &t_above;
tl = (ENTROPY_CONTEXT *) &t_left;
for (b = 16; b < 24; b += 4)
cost += cost_coeffs(x, xd->block + b, PLANE_TYPE_UV,
ta + vp9_block2above_8x8[b],
tl + vp9_block2left_8x8[b], TX_16X16);
return cost;
}
static void rd_inter32x32_uv_16x16(MACROBLOCK *x, int *rate,
int *distortion, int *skip) {
MACROBLOCKD *const xd = &x->e_mbd;
vp9_transform_sbuv_16x16(x);
vp9_quantize_sbuv_16x16(x);
*rate = rd_cost_sbuv_16x16(x);
*distortion = vp9_block_error_c(x->sb_coeff_data.coeff + 1024,
xd->sb_coeff_data.dqcoeff + 1024, 512) >> 2;
*skip = vp9_sbuv_is_skippable_16x16(xd);
}
#endif
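(Editorial note.) The "+ 1024" offsets above, together with the "+ 1024 + 256 * uv_idx" offset in the tokenizer changes further down, suggest that the superblock coefficient buffers are laid out as 1024 luma coefficients followed by 256 U and 256 V coefficients. A small sketch of that assumed layout; the macro and function names are illustrative only.

#include <stdint.h>

/* Assumed layout of a 32x32 superblock coefficient buffer:
 * [0, 1024)    32x32 luma coefficients
 * [1024, 1280) 16x16 U coefficients
 * [1280, 1536) 16x16 V coefficients
 * (matches the "+ 1024" and "+ 1024 + 256 * uv_idx" offsets seen in this diff) */
#define SB_Y_OFFSET   0
#define SB_Y_COEFFS   1024
#define SB_UV_OFFSET  1024
#define SB_UV_COEFFS  256

static int16_t *sb_plane_coeffs(int16_t *coeff, int plane /* 0=Y, 1=U, 2=V */) {
  return plane == 0 ? coeff + SB_Y_OFFSET
                    : coeff + SB_UV_OFFSET + (plane - 1) * SB_UV_COEFFS;
}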
static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
int *distortion, int fullpixel, int *skip) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
#if CONFIG_TX32X32
if (mbmi->txfm_size == TX_32X32) {
vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
usrc, vsrc, src_uv_stride,
udst, vdst, dst_uv_stride);
rd_inter32x32_uv_16x16(x, rate, distortion, skip);
} else {
#endif
int n, r = 0, d = 0;
int skippable = 1;
ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
@ -1680,8 +1852,11 @@ static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
xd->above_context = ta;
memcpy(xd->above_context, t_above, sizeof(t_above));
memcpy(xd->left_context, t_left, sizeof(t_left));
#if CONFIG_TX32X32
}
#endif
return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
}
#endif
@ -1818,15 +1993,26 @@ static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
}
#if CONFIG_SUPERBLOCKS
// TODO(rbultje) very similar to rd_inter32x32_uv(), merge?
static void super_block_uvrd(MACROBLOCK *x,
int *rate,
int *distortion,
int *skippable) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
#if CONFIG_TX32X32
if (mbmi->txfm_size == TX_32X32) {
vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
usrc, vsrc, src_uv_stride,
udst, vdst, dst_uv_stride);
rd_inter32x32_uv_16x16(x, rate, distortion, skippable);
} else {
#endif
int d = 0, r = 0, n, s = 1;
ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
@ -1844,9 +2030,15 @@ static void super_block_uvrd_8x8(MACROBLOCK *x,
udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
dst_uv_stride);
if (mbmi->txfm_size == TX_4X4) {
vp9_transform_mbuv_4x4(x);
vp9_quantize_mbuv_4x4(x);
s &= vp9_mbuv_is_skippable_4x4(xd);
} else {
vp9_transform_mbuv_8x8(x);
vp9_quantize_mbuv_8x8(x);
s &= vp9_mbuv_is_skippable_8x8(xd);
}
d += vp9_mbuverror(x) >> 2;
xd->above_context = ta + x_idx;
@ -1864,6 +2056,9 @@ static void super_block_uvrd_8x8(MACROBLOCK *x,
xd->above_context = ta;
memcpy(xd->above_context, t_above, sizeof(t_above));
memcpy(xd->left_context, t_left, sizeof(t_left));
#if CONFIG_TX32X32
}
#endif
}
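(Editorial summary of the branches above.) The chroma transform size follows the luma size: 32x32 luma takes the whole-superblock 16x16 chroma path, 4x4 luma uses 4x4 chroma, and the remaining sizes use 8x8 chroma. A one-function sketch of that mapping; the enum and function names are stand-ins, not identifiers from the commit.

/* Stand-in transform-size enum for this sketch. */
typedef enum { SK_TX_4X4, SK_TX_8X8, SK_TX_16X16, SK_TX_32X32 } sk_tx_size;

/* Chroma transform size implied by the luma transform size in this commit:
 * 32x32 luma -> 16x16 chroma (whole-superblock path),
 * 4x4 luma   -> 4x4 chroma,
 * 8x8/16x16  -> 8x8 chroma. */
static sk_tx_size uv_tx_size_for(sk_tx_size y_tx) {
  if (y_tx == SK_TX_32X32) return SK_TX_16X16;
  if (y_tx == SK_TX_4X4)   return SK_TX_4X4;
  return SK_TX_8X8;
}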
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
@ -1882,8 +2077,8 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
super_block_uvrd(x, &this_rate_tokenonly,
&this_distortion, &s);
this_rate = this_rate_tokenonly +
x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@ -4141,8 +4336,6 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int y_skip, uv_skip;
int64_t txfm_cache[NB_TXFM_MODES];
xd->mode_info_context->mbmi.txfm_size = TX_8X8;
error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
&dist_y, &y_skip, txfm_cache);
error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
@ -4362,6 +4555,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0;
MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV;
int switchable_filter_index = 0;
#if CONFIG_TX32X32
int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0;
int dist_uv_16x16 = 0, uv_skip_16x16 = 0;
MB_PREDICTION_MODE mode_uv_16x16;
#endif
x->skip = 0;
xd->mode_info_context->mbmi.segment_id = segment_id;
@ -4397,6 +4595,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
&dist_uv_8x8, &uv_skip_8x8);
mode_uv_8x8 = mbmi->uv_mode;
}
#if CONFIG_TX32X32
if (cm->txfm_mode >= ALLOW_32X32) {
mbmi->txfm_size = TX_32X32;
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16, &rate_uv_tokenonly_16x16,
&dist_uv_16x16, &uv_skip_16x16);
mode_uv_16x16 = mbmi->uv_mode;
}
#endif
for (mode_index = 0; mode_index < MAX_MODES;
mode_index += (!switchable_filter_index)) {
@ -4524,6 +4730,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
distortion_uv = dist_uv_4x4;
skippable = skippable && uv_skip_4x4;
mbmi->uv_mode = mode_uv_4x4;
#if CONFIG_TX32X32
} else if (mbmi->txfm_size == TX_32X32) {
rate_uv = rate_uv_16x16;
distortion_uv = dist_uv_16x16;
skippable = skippable && uv_skip_16x16;
mbmi->uv_mode = mode_uv_16x16;
#endif
} else {
rate_uv = rate_uv_8x8;
distortion_uv = dist_uv_8x8;

View file

@ -117,7 +117,7 @@ static void tokenize_b(VP9_COMP *cpi,
int dry_run) {
int pt; /* near block/prev token context index */
int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
int eob = b->eob; /* one beyond last nonzero coeff */
TOKENEXTRA *t = *tp; /* store tokens starting here */
const short *qcoeff_ptr = b->qcoeff;
int seg_eob;
@ -177,7 +177,23 @@ static void tokenize_b(VP9_COMP *cpi,
counts = cpi->coef_counts_16x16;
probs = cpi->common.fc.coef_probs_16x16;
}
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
if (type == PLANE_TYPE_UV) {
int uv_idx = (((int) (b - xd->block)) - 16) >> 2;
qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 256 * uv_idx;
}
#endif
break;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
case TX_32X32:
seg_eob = 1024;
bands = vp9_coef_bands_32x32;
scan = vp9_default_zig_zag1d_32x32;
counts = cpi->coef_counts_32x32;
probs = cpi->common.fc.coef_probs_32x32;
qcoeff_ptr = xd->sb_coeff_data.qcoeff;
break;
#endif
}
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
@ -283,6 +299,79 @@ static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) {
int skip = 1;
skip &= !xd->block[0].eob;
return skip;
}
int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) {
return (!xd->block[16].eob) & (!xd->block[20].eob);
}
static int sb_is_skippable_32x32(MACROBLOCKD *xd) {
return vp9_sby_is_skippable_32x32(xd) &&
vp9_sbuv_is_skippable_16x16(xd);
}
void vp9_tokenize_sb(VP9_COMP *cpi,
MACROBLOCKD *xd,
TOKENEXTRA **t,
int dry_run) {
VP9_COMMON * const cm = &cpi->common;
MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
TOKENEXTRA *t_backup = *t;
ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) (xd->above_context + 0),
(ENTROPY_CONTEXT *) (xd->above_context + 1), };
ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0),
(ENTROPY_CONTEXT *) (xd->left_context + 1), };
const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
const int segment_id = mbmi->segment_id;
const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
(vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0);
int b;
mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);
if (mbmi->mb_skip_coeff) {
if (!dry_run)
cpi->skip_true_count[mb_skip_context] += skip_inc;
if (!cm->mb_no_coeff_skip) {
vp9_stuff_sb(cpi, xd, t, dry_run);
} else {
vp9_fix_contexts_sb(xd);
}
if (dry_run)
*t = t_backup;
return;
}
if (!dry_run)
cpi->skip_false_count[mb_skip_context] += skip_inc;
tokenize_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
A[0], L[0], TX_32X32, dry_run);
A[0][1] = A[0][2] = A[0][3] = A[0][0];
L[0][1] = L[0][2] = L[0][3] = L[0][0];
for (b = 16; b < 24; b += 4) {
tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
A[0] + vp9_block2above_8x8[b], L[0] + vp9_block2left_8x8[b],
TX_16X16, dry_run);
A[0][vp9_block2above_8x8[b] + 1] = A[0][vp9_block2above_8x8[b]];
L[0][vp9_block2left_8x8[b] + 1] = L[0][vp9_block2left_8x8[b]];
}
vpx_memset(&A[0][8], 0, sizeof(A[0][8]));
vpx_memset(&L[0][8], 0, sizeof(L[0][8]));
vpx_memcpy(A[1], A[0], sizeof(ENTROPY_CONTEXT_PLANES));
vpx_memcpy(L[1], L[0], sizeof(ENTROPY_CONTEXT_PLANES));
if (dry_run)
*t = t_backup;
}
#endif
void vp9_tokenize_mb(VP9_COMP *cpi,
MACROBLOCKD *xd,
TOKENEXTRA **t,
@ -717,6 +806,13 @@ static __inline void stuff_b(VP9_COMP *cpi,
probs = cpi->common.fc.coef_probs_16x16;
}
break;
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
case TX_32X32:
bands = vp9_coef_bands_32x32;
counts = cpi->coef_counts_32x32;
probs = cpi->common.fc.coef_probs_32x32;
break;
#endif
}
band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
t->Token = DCT_EOB_TOKEN;
@ -775,7 +871,8 @@ static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
A[1] = A[2] = A[3] = A[0];
L[1] = L[2] = L[3] = L[0];
for (b = 16; b < 24; b += 4) {
stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
A + vp9_block2above_8x8[b],
L + vp9_block2left_8x8[b], TX_8X8, dry_run);
A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
@ -869,6 +966,43 @@ void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
}
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
static void stuff_sb_32x32(VP9_COMP *cpi, MACROBLOCKD *xd,
TOKENEXTRA **t, int dry_run) {
ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) (xd->above_context + 0),
(ENTROPY_CONTEXT *) (xd->above_context + 1), };
ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0),
(ENTROPY_CONTEXT *) (xd->left_context + 1), };
int b;
stuff_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
A[0], L[0], TX_32X32, dry_run);
A[0][1] = A[0][2] = A[0][3] = A[0][0];
L[0][1] = L[0][2] = L[0][3] = L[0][0];
for (b = 16; b < 24; b += 4) {
stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
A[0] + vp9_block2above_8x8[b],
L[0] + vp9_block2left_8x8[b], TX_16X16, dry_run);
A[0][vp9_block2above_8x8[b] + 1] = A[0][vp9_block2above_8x8[b]];
L[0][vp9_block2left_8x8[b] + 1] = L[0][vp9_block2left_8x8[b]];
}
vpx_memset(&A[0][8], 0, sizeof(A[0][8]));
vpx_memset(&L[0][8], 0, sizeof(L[0][8]));
vpx_memcpy(A[1], A[0], sizeof(ENTROPY_CONTEXT_PLANES));
vpx_memcpy(L[1], L[0], sizeof(ENTROPY_CONTEXT_PLANES));
}
void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
TOKENEXTRA * const t_backup = *t;
stuff_sb_32x32(cpi, xd, t, dry_run);
if (dry_run) {
*t = t_backup;
}
}
#endif
void vp9_fix_contexts(MACROBLOCKD *xd) {
/* Clear entropy contexts for blocks */
if ((xd->mode_info_context->mbmi.mode != B_PRED
@ -885,3 +1019,10 @@ void vp9_fix_contexts(MACROBLOCKD *xd) {
xd->left_context->y2 = 1;
}
}
#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
void vp9_fix_contexts_sb(MACROBLOCKD *xd) {
vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
}
#endif

View file

@ -34,16 +34,29 @@ extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
extern int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);
extern int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);
#endif
struct VP9_COMP;
extern void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
TOKENEXTRA **t, int dry_run);
extern void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
TOKENEXTRA **t, int dry_run);
extern void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
TOKENEXTRA **t, int dry_run);
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
extern void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
TOKENEXTRA **t, int dry_run);
#endif
extern void vp9_fix_contexts(MACROBLOCKD *xd);
#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
extern void vp9_fix_contexts_sb(MACROBLOCKD *xd);
#endif
#ifdef ENTROPY_STATS
void init_context_counters();