Make fwd txfm and quantizer high-bitdepth (HBD) and low-bitdepth (LBD) data paths co-exist

Change-Id: Iaae46d0735539b8b8daf9faac81c2a3434838020
2017-06-27 16:07:28 -07:00 · 2017-06-27 16:07:28 -07:00 · 0f4195c218
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@ -290,10 +290,10 @@ DSP_SRCS-yes            += quantize.c
 DSP_SRCS-yes            += quantize.h

 DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/highbd_quantize_intrin_avx2.c
-endif
+
 ifeq ($(ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
 DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@ -525,13 +525,12 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") {

    add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";

-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+    add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";

-      add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+    add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+
+    add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";

-      add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
-    }  # CONFIG_HIGHBITDEPTH
  }  # CONFIG_AV1_ENCODER
 } else {
  if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
@ -543,15 +542,14 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") {

    add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-      specialize qw/aom_highbd_quantize_b sse2 avx2/;
+    add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/aom_highbd_quantize_b sse2 avx2/;

-      add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-      specialize qw/aom_highbd_quantize_b_32x32 sse2/;
+    add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/aom_highbd_quantize_b_32x32 sse2/;
+
+    add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-      add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    }  # CONFIG_HIGHBITDEPTH
  }  # CONFIG_AV1_ENCODER
 } # CONFIG_AOM_QM
 if (aom_config("CONFIG_AV1") eq "yes") {
--- a/aom_dsp/quantize.c
+++ b/aom_dsp/quantize.c
@ -256,7 +256,6 @@ void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
 }
 #endif  // CONFIG_TX64X64

-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                            int skip_block, const int16_t *round_ptr,
                            const int16_t quant, tran_low_t *qcoeff_ptr,
@ -523,7 +522,6 @@ void aom_highbd_quantize_b_64x64_c(
  *eob_ptr = eob + 1;
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_HIGHBITDEPTH

 #else  // CONFIG_AOM_QM

@ -602,7 +600,6 @@ void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
 }
 #endif  // CONFIG_TX64X64

-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                            int skip_block, const int16_t *round_ptr,
                            const int16_t quant, tran_low_t *qcoeff_ptr,
@ -825,5 +822,4 @@ void aom_highbd_quantize_b_64x64_c(
  *eob_ptr = eob + 1;
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_HIGHBITDEPTH
 #endif  // CONFIG_AOM_QM
--- a/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@ -15,7 +15,6 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"

-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                int skip_block, const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
@ -152,4 +151,3 @@ void aom_highbd_quantize_b_32x32_sse2(
  }
  *eob_ptr = eob + 1;
 }
-#endif
--- a/av1/av1_cx.mk
+++ b/av1/av1_cx.mk
@ -125,10 +125,10 @@ endif
 AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/av1_quantize_sse2.c
 AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/av1_quantize_avx2.c
 AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+
 AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c
 AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/av1_highbd_quantize_avx2.c
-endif
+

 AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
@ -140,10 +140,10 @@ endif
 AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
 AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
 AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/hybrid_fwd_txfm_avx2.c
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
+
 AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/av1_highbd_quantize_sse4.c
+
 AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
-endif

 ifeq ($(CONFIG_EXT_INTER),yes)
 AV1_CX_SRCS-yes += encoder/wedge_utils.c
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@ -417,25 +417,23 @@ if (aom_config("CONFIG_DPCM_INTRA") eq "yes") {
  }
 }

-if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  #fwd txfm
-  add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
-  add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
-  add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
-  add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
-  add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
-  specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
-}
+#fwd txfm
+add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
+add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
+add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
+add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
+add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;

 #
 # Motion search
@ -480,33 +478,34 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
  add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
  specialize qw/av1_highbd_block_error sse2/;

-  if (aom_config("CONFIG_AOM_QM") eq "yes") {
-    add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
-
-    add_proto qw/void av1_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
-
-    if (aom_config("CONFIG_TX64X64") eq "yes") {
-      add_proto qw/void av1_highbd_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
-    }
-
-    add_proto qw/void av1_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
-  } else {
-    add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
-    specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
-
-    add_proto qw/void av1_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
-  }
-
  # fdct functions
  if (aom_config("CONFIG_TX64X64") eq "yes") {
    add_proto qw/void av1_highbd_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  }

-  add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-
  add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";

 }
+
+if (aom_config("CONFIG_AOM_QM") eq "yes") {
+  add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+
+  add_proto qw/void av1_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+
+  if (aom_config("CONFIG_TX64X64") eq "yes") {
+    add_proto qw/void av1_highbd_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+  }
+
+  add_proto qw/void av1_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+} else {
+  add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+  specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
+
+  add_proto qw/void av1_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+}
+
+add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+
 # End av1_high encoder functions

 if (aom_config("CONFIG_EXT_INTER") eq "yes") {
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@ -845,7 +845,6 @@ void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 }
 #endif  // CONFIG_NEW_QUANT

-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                   tran_low_t *qcoeff_ptr,
@ -951,7 +950,6 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
  }
 }

-#if CONFIG_HIGHBITDEPTH
 static INLINE void highbd_quantize_dc(
    const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
    const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr,
@ -980,7 +978,6 @@ static INLINE void highbd_quantize_dc(
  }
  *eob_ptr = eob + 1;
 }
-#endif  // CONFIG_HIGHBITDEPTH

 void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
@ -1519,9 +1516,7 @@ void av1_highbd_quantize_dc_nuq_facade(
  }
 }
 #endif  // CONFIG_NEW_QUANT
-#endif  // CONFIG_HIGHBITDEPTH

-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
                              int skip_block, const int16_t *zbin_ptr,
                              const int16_t *round_ptr,
@ -1579,8 +1574,6 @@ void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
  *eob_ptr = eob + 1;
 }

-#endif  // CONFIG_HIGHBITDEPTH
-
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
  uint32_t t;
  int l, m;
--- a/av1/encoder/av1_quantize.h
+++ b/av1/encoder/av1_quantize.h
@ -146,7 +146,6 @@ void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const QUANT_PARAM *qparam);
 #endif  // CONFIG_NEW_QUANT

-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                   tran_low_t *qcoeff_ptr,
@ -190,7 +189,6 @@ void av1_highbd_quantize_dc_nuq_facade(
    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
    const QUANT_PARAM *qparam);
 #endif  // CONFIG_NEW_QUANT
-#endif  // CONFIG_HIGHBITDEPTH

 #ifdef __cplusplus
 }  // extern "C"
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@ -2020,12 +2020,10 @@ void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
  }
 }

-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
                          int stride) {
  av1_fwht4x4_c(input, output, stride);
 }
-#endif  // CONFIG_HIGHBITDEPTH

 void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
                    int tx_type) {
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@ -494,7 +494,6 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
 }

 #if !CONFIG_PVQ
-#if CONFIG_HIGHBITDEPTH
 typedef enum QUANT_FUNC {
  QUANT_FUNC_LOWBD = 0,
  QUANT_FUNC_HIGHBD = 1,
@ -514,29 +513,12 @@ static AV1_QUANT_FACADE
 #endif  // !CONFIG_NEW_QUANT
      { NULL, NULL }
    };
+#endif  // !CONFIG_PVQ

-#else
-
-typedef enum QUANT_FUNC {
-  QUANT_FUNC_LOWBD = 0,
-  QUANT_FUNC_TYPES = 1
-} QUANT_FUNC;
-
-static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES]
-                                       [QUANT_FUNC_TYPES] = {
-#if !CONFIG_NEW_QUANT
-                                         { av1_quantize_fp_facade },
-                                         { av1_quantize_b_facade },
-                                         { av1_quantize_dc_facade },
-#else   // !CONFIG_NEW_QUANT
-                                         { av1_quantize_fp_nuq_facade },
-                                         { av1_quantize_b_nuq_facade },
-                                         { av1_quantize_dc_nuq_facade },
-#endif  // !CONFIG_NEW_QUANT
-                                         { NULL }
-                                       };
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_PVQ
+typedef void (*fwdTxfmFunc)(const int16_t *diff, tran_low_t *coeff, int stride,
+                            FWD_TXFM_PARAM *param);
+static const fwdTxfmFunc fwd_txfm_func[2] = { av1_fwd_txfm,
+                                              av1_highbd_fwd_txfm };

 void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
                     int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
@ -668,29 +650,13 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
  fwd_txfm_param.lossless = xd->lossless[mbmi->segment_id];

 #if !CONFIG_PVQ
-#if CONFIG_HIGHBITDEPTH
  fwd_txfm_param.bd = xd->bd;
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-    if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
-      if (LIKELY(!x->skip_block)) {
-        quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
-            coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam);
-      } else {
-        av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
-      }
-    }
-#if CONFIG_LV_MAP
-    p->txb_entropy_ctx[block] =
-        (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
-#endif  // CONFIG_LV_MAP
-    return;
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-  av1_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+  const int is_hbd = get_bitdepth_data_path_index(xd);
+  fwd_txfm_func[is_hbd](src_diff, coeff, diff_stride, &fwd_txfm_param);
+
  if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
    if (LIKELY(!x->skip_block)) {
-      quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
+      quant_func_list[xform_quant_idx][is_hbd](
          coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam);
    } else {
      av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
@ -700,7 +666,8 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
  p->txb_entropy_ctx[block] =
      (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
 #endif  // CONFIG_LV_MAP
-#else   // #if !CONFIG_PVQ
+  return;
+#else  // CONFIG_PVQ
  (void)xform_quant_idx;
 #if CONFIG_HIGHBITDEPTH
  fwd_txfm_param.bd = xd->bd;
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@ -203,7 +203,6 @@ static void fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff,
 }
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT

-#if CONFIG_HIGHBITDEPTH
 #if CONFIG_CHROMA_2X2
 static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff,
                                int diff_stride, TX_TYPE tx_type, int lossless,
@ -237,6 +236,7 @@ static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff,
 static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                                int diff_stride, TX_TYPE tx_type, int lossless,
                                const int bd) {
+  int32_t *dst_coeff = (int32_t *)coeff;
  if (lossless) {
    assert(tx_type == DCT_DCT);
    av1_highbd_fwht4x4(src_diff, coeff, diff_stride);
@ -248,7 +248,7 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
    case DCT_ADST:
    case ADST_ADST:
      // fallthrough intended
-      av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
 #if CONFIG_EXT_TX
    case FLIPADST_DCT:
@ -257,7 +257,7 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      // fallthrough intended
-      av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
    // use the c version for anything including identity for now
    case V_DCT:
@ -268,7 +268,7 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
    case H_FLIPADST:
    case IDTX:
      // fallthrough intended
-      av1_fwd_txfm2d_4x4_c(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_4x4_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
 #endif  // CONFIG_EXT_TX
    default: assert(0);
@ -279,47 +279,54 @@ static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
                                int diff_stride, TX_TYPE tx_type,
                                FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
  (void)fwd_txfm_opt;
-  av1_fwd_txfm2d_4x8_c(src_diff, coeff, diff_stride, tx_type, bd);
+  int32_t *dst_coeff = (int32_t *)coeff;
+  av1_fwd_txfm2d_4x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }

 static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
                                int diff_stride, TX_TYPE tx_type,
                                FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
  (void)fwd_txfm_opt;
-  av1_fwd_txfm2d_8x4_c(src_diff, coeff, diff_stride, tx_type, bd);
+  int32_t *dst_coeff = (int32_t *)coeff;
+  av1_fwd_txfm2d_8x4_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }

 static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TX_TYPE tx_type,
                                 FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
  (void)fwd_txfm_opt;
-  av1_fwd_txfm2d_8x16_c(src_diff, coeff, diff_stride, tx_type, bd);
+  int32_t *dst_coeff = (int32_t *)coeff;
+  av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }

 static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TX_TYPE tx_type,
                                 FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
  (void)fwd_txfm_opt;
-  av1_fwd_txfm2d_16x8_c(src_diff, coeff, diff_stride, tx_type, bd);
+  int32_t *dst_coeff = (int32_t *)coeff;
+  av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }

 static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TX_TYPE tx_type,
                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
  (void)fwd_txfm_opt;
-  av1_fwd_txfm2d_16x32_c(src_diff, coeff, diff_stride, tx_type, bd);
+  int32_t *dst_coeff = (int32_t *)coeff;
+  av1_fwd_txfm2d_16x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }

 static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TX_TYPE tx_type,
                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
  (void)fwd_txfm_opt;
-  av1_fwd_txfm2d_32x16_c(src_diff, coeff, diff_stride, tx_type, bd);
+  int32_t *dst_coeff = (int32_t *)coeff;
+  av1_fwd_txfm2d_32x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }

 static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                                int diff_stride, TX_TYPE tx_type,
                                FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  int32_t *dst_coeff = (int32_t *)coeff;
  (void)fwd_txfm_opt;
  switch (tx_type) {
    case DCT_DCT:
@ -327,7 +334,7 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
    case DCT_ADST:
    case ADST_ADST:
      // fallthrough intended
-      av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
 #if CONFIG_EXT_TX
    case FLIPADST_DCT:
@ -336,7 +343,7 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      // fallthrough intended
-      av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
    // use the c version for anything including identity for now
    case V_DCT:
@ -347,7 +354,7 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
    case H_FLIPADST:
    case IDTX:
      // fallthrough intended
-      av1_fwd_txfm2d_8x8_c(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_8x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
 #endif  // CONFIG_EXT_TX
    default: assert(0);
@ -357,6 +364,7 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
 static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TX_TYPE tx_type,
                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  int32_t *dst_coeff = (int32_t *)coeff;
  (void)fwd_txfm_opt;
  switch (tx_type) {
    case DCT_DCT:
@ -364,7 +372,7 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
    case DCT_ADST:
    case ADST_ADST:
      // fallthrough intended
-      av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
 #if CONFIG_EXT_TX
    case FLIPADST_DCT:
@ -373,7 +381,7 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      // fallthrough intended
-      av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
    // use the c version for anything including identity for now
    case V_DCT:
@ -384,7 +392,7 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
    case H_FLIPADST:
    case IDTX:
      // fallthrough intended
-      av1_fwd_txfm2d_16x16_c(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_16x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
 #endif  // CONFIG_EXT_TX
    default: assert(0);
@ -394,6 +402,7 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
 static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TX_TYPE tx_type,
                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  int32_t *dst_coeff = (int32_t *)coeff;
  (void)fwd_txfm_opt;
  switch (tx_type) {
    case DCT_DCT:
@ -401,7 +410,7 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
    case DCT_ADST:
    case ADST_ADST:
      // fallthrough intended
-      av1_fwd_txfm2d_32x32(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
 #if CONFIG_EXT_TX
    case FLIPADST_DCT:
@ -410,7 +419,7 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      // fallthrough intended
-      av1_fwd_txfm2d_32x32(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
    // use the c version for anything including identity for now
    case V_DCT:
@ -421,7 +430,7 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
    case H_FLIPADST:
    case IDTX:
      // fallthrough intended
-      av1_fwd_txfm2d_32x32_c(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_32x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
 #endif  // CONFIG_EXT_TX
    default: assert(0);
@ -432,11 +441,12 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
 static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TX_TYPE tx_type,
                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  int32_t *dst_coeff = (int32_t *)coeff;
  (void)fwd_txfm_opt;
  (void)bd;
  switch (tx_type) {
    case DCT_DCT:
-      av1_fwd_txfm2d_64x64(src_diff, coeff, diff_stride, tx_type, bd);
+      av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, tx_type, bd);
      break;
 #if CONFIG_EXT_TX
    case ADST_DCT:
@ -459,7 +469,7 @@ static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
      // in a later change. This shouldn't impact performance since
      // DCT_DCT is the only extended type currently allowed for 64x64,
      // as dictated by get_ext_tx_set_type in blockd.h.
-      av1_fwd_txfm2d_64x64_c(src_diff, coeff, diff_stride, DCT_DCT, bd);
+      av1_fwd_txfm2d_64x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
      break;
    case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break;
 #endif  // CONFIG_EXT_TX
@ -467,7 +477,6 @@ static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
  }
 }
 #endif  // CONFIG_TX64X64
-#endif  // CONFIG_HIGHBITDEPTH

 void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
                  FWD_TXFM_PARAM *fwd_txfm_param) {
@ -534,7 +543,6 @@ void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
  }
 }

-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
                         int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param) {
  const int fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
@ -596,4 +604,3 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
    default: assert(0); break;
  }
 }
-#endif  // CONFIG_HIGHBITDEPTH
--- a/av1/encoder/hybrid_fwd_txfm.h
+++ b/av1/encoder/hybrid_fwd_txfm.h
@ -20,9 +20,7 @@ typedef struct FWD_TXFM_PARAM {
  TX_TYPE tx_type;
  TX_SIZE tx_size;
  int lossless;
-#if CONFIG_HIGHBITDEPTH
  int bd;
-#endif  // CONFIG_HIGHBITDEPTH
 } FWD_TXFM_PARAM;

 #ifdef __cplusplus
@ -32,10 +30,8 @@ extern "C" {
 void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
                  FWD_TXFM_PARAM *fwd_txfm_param);

-#if CONFIG_HIGHBITDEPTH
 void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
                         int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param);
-#endif  // CONFIG_HIGHBITDEPTH

 #ifdef __cplusplus
 }  // extern "C"
--- a/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@ -113,7 +113,7 @@ static void fdct4x4_sse4_1(__m128i *in, int bit) {
  in[3] = _mm_unpackhi_epi64(v1, v3);
 }

-static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
@ -404,7 +404,7 @@ static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
  in[15] = _mm_srai_epi32(in[15], shift);
 }

-static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
+static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
@ -1788,7 +1788,7 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) {
  col_txfm_8x8_rounding(&in[48], shift);
 }

-static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
+static void write_buffer_16x16(const __m128i *in, int32_t *output) {
  const int size_8x8 = 16 * 4;
  write_buffer_8x8(&in[0], output);
  output += size_8x8;