Mirror of https://github.com/microsoft/LQ-Nets.git
381 lines
15 KiB
Python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: learned_quantization.py

import tensorflow as tf
from tensorflow.contrib.framework import add_model_variable
from tensorflow.python.training import moving_averages
from tensorpack.models import *
from tensorpack.tfutils.tower import get_current_tower_context

MOVING_AVERAGES_FACTOR = 0.9
EPS = 0.0001
NORM_PPF_0_75 = 0.6745
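# Editor's note (not in the upstream file): NORM_PPF_0_75 is the 75th-percentile
# point of the standard normal distribution, i.e. scipy.stats.norm.ppf(0.75) ~= 0.6745;
# it is used below to set the scale of the initial quantization basis.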


@layer_register()
def QuantizedActiv(x, nbit=2):
    """
    Quantize activation.

    Args:
        x (tf.Tensor): a 4D tensor.
        nbit (int): number of bits of quantized activation. Defaults to 2.

    Returns:
        tf.Tensor with attribute `variables`.

    Variable Names:

    * ``basis``: basis of quantized activation.

    Note:
        About multi-GPU training: moving averages across GPUs are not aggregated.
        Batch statistics are computed by the main training tower. This is consistent with most frameworks.
    """
    init_basis = [(NORM_PPF_0_75 * 2 / (2 ** nbit - 1)) * (2. ** i) for i in range(nbit)]
    init_basis = tf.constant_initializer(init_basis)
    bit_dims = [nbit, 1]
    num_levels = 2 ** nbit
    # initialize level multiplier
    init_level_multiplier = []
    for i in range(0, num_levels):
        level_multiplier_i = [0. for j in range(nbit)]
        level_number = i
        for j in range(nbit):
            level_multiplier_i[j] = float(level_number % 2)
            level_number = level_number // 2
        init_level_multiplier.append(level_multiplier_i)
    # initialize threshold multiplier
    init_thrs_multiplier = []
    for i in range(1, num_levels):
        thrs_multiplier_i = [0. for j in range(num_levels)]
        thrs_multiplier_i[i - 1] = 0.5
        thrs_multiplier_i[i] = 0.5
        init_thrs_multiplier.append(thrs_multiplier_i)
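    # Editor's illustration (not in the upstream file): for nbit=2 the loops above
    # produce the binary level codes (least-significant bit first) and the
    # threshold-averaging matrix
    #   init_level_multiplier = [[0., 0.], [1., 0.], [0., 1.], [1., 1.]]
    #   init_thrs_multiplier  = [[0.5, 0.5, 0., 0.], [0., 0.5, 0.5, 0.], [0., 0., 0.5, 0.5]]
    # so each threshold is the midpoint of two adjacent quantization levels.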

    with tf.variable_scope('ActivationQuantization'):
        basis = tf.get_variable(
            'basis', bit_dims, tf.float32,
            initializer=init_basis,
            trainable=False)

        ctx = get_current_tower_context()  # current tower context
        # calculate levels and sort
        level_codes = tf.constant(init_level_multiplier)
        levels = tf.matmul(level_codes, basis)
        levels, sort_id = tf.nn.top_k(tf.transpose(levels, [1, 0]), num_levels)
        levels = tf.reverse(levels, [-1])
        sort_id = tf.reverse(sort_id, [-1])
        levels = tf.transpose(levels, [1, 0])
        sort_id = tf.transpose(sort_id, [1, 0])
        # calculate threshold
        thrs_multiplier = tf.constant(init_thrs_multiplier)
        thrs = tf.matmul(thrs_multiplier, levels)
        # calculate output y and its binary code
        y = tf.zeros_like(x)  # output
        reshape_x = tf.reshape(x, [-1])
        zero_dims = tf.stack([tf.shape(reshape_x)[0], nbit])
        bits_y = tf.fill(zero_dims, 0.)
        zero_y = tf.zeros_like(x)
        zero_bits_y = tf.fill(zero_dims, 0.)
        for i in range(num_levels - 1):
            g = tf.greater(x, thrs[i])
            y = tf.where(g, zero_y + levels[i + 1], y)
            bits_y = tf.where(tf.reshape(g, [-1]), zero_bits_y + level_codes[sort_id[i + 1][0]], bits_y)
        # training
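        # Editor's note (not in the upstream file): the block below refits the basis by
        # least squares. With B the {0, 1} bit matrix selected above and x the flattened
        # activations, it forms the normal equations and solves
        #     new_basis = (B^T B)^{-1} B^T x
        # then tracks the solution with a moving average instead of backpropagating
        # through the (non-trainable) basis variable.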
        if ctx.is_main_training_tower:
            BT = tf.matrix_transpose(bits_y)
            # calculate BTxB
            BTxB = []
            for i in range(nbit):
                for j in range(nbit):
                    BTxBij = tf.multiply(BT[i], BT[j])
                    BTxBij = tf.reduce_sum(BTxBij)
                    BTxB.append(BTxBij)
            BTxB = tf.reshape(tf.stack(values=BTxB), [nbit, nbit])
            BTxB_inv = tf.matrix_inverse(BTxB)
            # calculate BTxX
            BTxX = []
            for i in range(nbit):
                BTxXi0 = tf.multiply(BT[i], reshape_x)
                BTxXi0 = tf.reduce_sum(BTxXi0)
                BTxX.append(BTxXi0)
            BTxX = tf.reshape(tf.stack(values=BTxX), [nbit, 1])

            new_basis = tf.matmul(BTxB_inv, BTxX)  # calculate new basis
            # create moving averages op
            update_moving_basis = moving_averages.assign_moving_average(
                basis, new_basis, MOVING_AVERAGES_FACTOR)
            add_model_variable(basis)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_moving_basis)

            for i in range(nbit):
                tf.summary.scalar('basis%d' % i, new_basis[i][0])

        x_clip = tf.minimum(x, levels[num_levels - 1])  # gradient clip
        y = x_clip + tf.stop_gradient(-x_clip) + tf.stop_gradient(y)  # gradient: y = clip(x) (straight-through estimator)
        y.variables = VariableHolder(basis=basis)
        return y
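
# Usage sketch (editorial, not part of the upstream file): QuantizedActiv is a
# registered tensorpack layer, so it is called with a scope name as the first
# argument, as BNReLUQuant below does:
#
#     act = QuantizedActiv('quant', feature_map, nbit=2)
#
# `feature_map` is a placeholder name for any 4D activation tensor.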


def QuantizedWeight(name, x, n, nbit=2):
    """
    Quantize weight.

    Args:
        x (tf.Tensor): a 4D tensor.
            Must have known number of channels, but can have other unknown dimensions.
        name (str): operator's name.
        n (int or float): fan-in of the layer, used to scale the initial basis
            (the weights are assumed to be initialized with variance 2 / n).
        nbit (int): number of bits of quantized weight. Defaults to 2.

    Returns:
        tf.Tensor with attribute `variables`.

    Variable Names:

    * ``basis``: basis of quantized weight.

    Note:
        About multi-GPU training: moving averages across GPUs are not aggregated.
        Batch statistics are computed by the main training tower. This is consistent with most frameworks.
    """
    num_filters = x.get_shape().as_list()[-1]
    init_basis = []
    base = NORM_PPF_0_75 * ((2. / n) ** 0.5) / (2 ** (nbit - 1))
    for j in range(nbit):
        init_basis.append([(2 ** j) * base for i in range(num_filters)])
    init_basis = tf.constant_initializer(init_basis)
    bit_dims = [nbit, num_filters]
    num_levels = 2 ** nbit
    delta = EPS
    # initialize level multiplier
    init_level_multiplier = []
    for i in range(num_levels):
        level_multiplier_i = [0. for j in range(nbit)]
        level_number = i
        for j in range(nbit):
            binary_code = level_number % 2
            if binary_code == 0:
                binary_code = -1
            level_multiplier_i[j] = float(binary_code)
            level_number = level_number // 2
        init_level_multiplier.append(level_multiplier_i)
    # initialize threshold multiplier
    init_thrs_multiplier = []
    for i in range(1, num_levels):
        thrs_multiplier_i = [0. for j in range(num_levels)]
        thrs_multiplier_i[i - 1] = 0.5
        thrs_multiplier_i[i] = 0.5
        init_thrs_multiplier.append(thrs_multiplier_i)
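    # Editor's illustration (not in the upstream file): unlike QuantizedActiv, the
    # weight level codes are signed; for nbit=2 the loop above yields (LSB first)
    #   init_level_multiplier = [[-1., -1.], [1., -1.], [-1., 1.], [1., 1.]]
    # so the 2**nbit levels are the symmetric combinations +/- basis_0 +/- basis_1.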

    with tf.variable_scope(name):
        basis = tf.get_variable(
            'basis', bit_dims, tf.float32,
            initializer=init_basis,
            trainable=False)
        level_codes = tf.constant(init_level_multiplier)
        thrs_multiplier = tf.constant(init_thrs_multiplier)
        sum_multiplier = tf.constant(1., shape=[1, tf.reshape(x, [-1, num_filters]).get_shape()[0]])
        sum_multiplier_basis = tf.constant(1., shape=[1, nbit])

        ctx = get_current_tower_context()  # current tower context
        # calculate levels and sort
        levels = tf.matmul(level_codes, basis)
        levels, sort_id = tf.nn.top_k(tf.transpose(levels, [1, 0]), num_levels)
        levels = tf.reverse(levels, [-1])
        sort_id = tf.reverse(sort_id, [-1])
        levels = tf.transpose(levels, [1, 0])
        sort_id = tf.transpose(sort_id, [1, 0])
        # calculate threshold
        thrs = tf.matmul(thrs_multiplier, levels)
        # calculate level codes per channel
        reshape_x = tf.reshape(x, [-1, num_filters])
        level_codes_channelwise_dims = tf.stack([num_levels * num_filters, nbit])
        level_codes_channelwise = tf.fill(level_codes_channelwise_dims, 0.)
        for i in range(num_levels):
            eq = tf.equal(sort_id, i)
            level_codes_channelwise = tf.where(tf.reshape(eq, [-1]), level_codes_channelwise + level_codes[i], level_codes_channelwise)
        level_codes_channelwise = tf.reshape(level_codes_channelwise, [num_levels, num_filters, nbit])
        # calculate output y and its binary code
        y = tf.zeros_like(x) + levels[0]  # output
        zero_dims = tf.stack([tf.shape(reshape_x)[0] * num_filters, nbit])
        bits_y = tf.fill(zero_dims, -1.)
        zero_y = tf.zeros_like(x)
        zero_bits_y = tf.fill(zero_dims, 0.)
        zero_bits_y = tf.reshape(zero_bits_y, [-1, num_filters, nbit])
        for i in range(num_levels - 1):
            g = tf.greater(x, thrs[i])
            y = tf.where(g, zero_y + levels[i + 1], y)
            bits_y = tf.where(tf.reshape(g, [-1]), tf.reshape(zero_bits_y + level_codes_channelwise[i + 1], [-1, nbit]), bits_y)
        bits_y = tf.reshape(bits_y, [-1, num_filters, nbit])
        # training
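        # Editor's note (not in the upstream file): the block below refits the basis
        # per output channel. For each channel it forms B^T B and B^T x with the
        # {-1, +1} bit tensor B, adds EPS on the diagonal so the system stays
        # invertible, solves for the new basis (with a closed-form 2x2 inverse when
        # nbit == 2), and folds the result into the moving average of `basis`.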
        if ctx.is_main_training_tower:
            BT = tf.transpose(bits_y, [2, 0, 1])
            # calculate BTxB
            BTxB = []
            for i in range(nbit):
                for j in range(nbit):
                    BTxBij = tf.multiply(BT[i], BT[j])
                    BTxBij = tf.matmul(sum_multiplier, BTxBij)
                    if i == j:
                        mat_one = tf.ones([1, num_filters])
                        BTxBij = BTxBij + (delta * mat_one)  # + E
                    BTxB.append(BTxBij)
            BTxB = tf.reshape(tf.stack(values=BTxB), [nbit, nbit, num_filters])
            # calculate inverse of BTxB
            if nbit > 2:
                BTxB_transpose = tf.transpose(BTxB, [2, 0, 1])
                BTxB_inv = tf.matrix_inverse(BTxB_transpose)
                BTxB_inv = tf.transpose(BTxB_inv, [1, 2, 0])
            elif nbit == 2:
                det = tf.multiply(BTxB[0][0], BTxB[1][1]) - tf.multiply(BTxB[0][1], BTxB[1][0])
                inv = []
                inv.append(BTxB[1][1] / det)
                inv.append(-BTxB[0][1] / det)
                inv.append(-BTxB[1][0] / det)
                inv.append(BTxB[0][0] / det)
                BTxB_inv = tf.reshape(tf.stack(values=inv), [nbit, nbit, num_filters])
            elif nbit == 1:
                BTxB_inv = tf.reciprocal(BTxB)
            # calculate BTxX
            BTxX = []
            for i in range(nbit):
                BTxXi0 = tf.multiply(BT[i], reshape_x)
                BTxXi0 = tf.matmul(sum_multiplier, BTxXi0)
                BTxX.append(BTxXi0)
            BTxX = tf.reshape(tf.stack(values=BTxX), [nbit, num_filters])
            BTxX = BTxX + (delta * basis)  # + basis
            # calculate new basis
            new_basis = []
            for i in range(nbit):
                new_basis_i = tf.multiply(BTxB_inv[i], BTxX)
                new_basis_i = tf.matmul(sum_multiplier_basis, new_basis_i)
                new_basis.append(new_basis_i)
            new_basis = tf.reshape(tf.stack(values=new_basis), [nbit, num_filters])
            # create moving averages op
            update_moving_basis = moving_averages.assign_moving_average(
                basis, new_basis, MOVING_AVERAGES_FACTOR)
            add_model_variable(basis)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_moving_basis)

        y = x + tf.stop_gradient(-x) + tf.stop_gradient(y)  # gradient: y = x (straight-through estimator)
        y.variables = VariableHolder(basis=basis)
        return y
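
# Usage sketch (editorial, not part of the upstream file): QuantizedWeight takes the
# fan-in `n` of the layer whose kernel is being quantized; Conv2DQuant below does
# exactly this, e.g.
#
#     w_q = QuantizedWeight('weight_quant', W, n=k * k * out_channel, nbit=2)
#
# where W is the full-precision kernel and k the (square) kernel size.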


@layer_register()
def Conv2DQuant(x, out_channel, kernel_shape,
                padding='SAME', stride=1,
                W_init=None, b_init=None,
                nl=tf.identity, split=1, use_bias=True,
                data_format='NHWC', is_quant=True, nbit=1, fc=False):
    """
    2D convolution on 4D inputs, with optionally quantized weights.

    Args:
        x (tf.Tensor): a 4D tensor.
            Must have known number of channels, but can have other unknown dimensions.
        out_channel (int): number of output channels.
        kernel_shape (int): size of the square convolution kernel.
        stride (int): stride of the convolution.
        padding (str): 'valid' or 'same'. Case insensitive.
        split (int): split channels as used in AlexNet. Defaults to 1 (no split).
        W_init: initializer for W. Defaults to `variance_scaling_initializer`.
        b_init: initializer for b. Defaults to zero.
        nl: a nonlinearity function.
        use_bias (bool): whether to use bias.
        data_format (str): 'NHWC' or 'NCHW'. Defaults to 'NHWC'.
        is_quant (bool): whether to quantize the weight. Defaults to True.
        nbit (int): number of bits of quantized weight. Defaults to 1.
        fc (bool): whether to treat the layer as fully connected, i.e. reshape the
            input to 1x1 spatial size before the convolution. Defaults to False.

    Returns:
        tf.Tensor named ``output`` with attribute `variables`.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    n = kernel_shape * kernel_shape * out_channel
    in_shape = x.get_shape().as_list()
    channel_axis = 3 if data_format == 'NHWC' else 1
    in_channel = in_shape[channel_axis]
    assert in_channel is not None, "[Conv2DQuant] Input cannot have unknown channel!"
    assert in_channel % split == 0
    assert out_channel % split == 0

    if fc:
        x = tf.reshape(x, [-1, in_channel, 1, 1])

    kernel_shape = [kernel_shape, kernel_shape]
    padding = padding.upper()
    filter_shape = kernel_shape + [in_channel // split, out_channel]

    if data_format == 'NCHW':
        stride = [1, 1, stride, stride]
    else:
        stride = [1, stride, stride, 1]

    if W_init is None:
        W_init = tf.contrib.layers.variance_scaling_initializer()
    if b_init is None:
        b_init = tf.constant_initializer()

    W = tf.get_variable('W', filter_shape, initializer=W_init)

    kernel_in = W * 1  # copy of the kernel that feeds the quantizer
    tf.summary.scalar('weight', tf.reduce_mean(tf.abs(W)))
    if is_quant:
        quantized_weight = QuantizedWeight('weight_quant', kernel_in, n, nbit=nbit)
    else:
        quantized_weight = kernel_in

    if use_bias:
        b = tf.get_variable('b', [out_channel], initializer=b_init)

    if split == 1:
        conv = tf.nn.conv2d(x, quantized_weight, stride, padding, data_format=data_format)
    else:
        inputs = tf.split(x, split, channel_axis)
        kernels = tf.split(quantized_weight, split, 3)
        outputs = [tf.nn.conv2d(i, k, stride, padding, data_format=data_format)
                   for i, k in zip(inputs, kernels)]
        conv = tf.concat(outputs, channel_axis)

    ret = nl(tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv, name='output')
    ret.variables = VariableHolder(W=W)
    if use_bias:
        ret.variables.b = b
    if fc:
        ret = tf.reshape(ret, [-1, out_channel])
    return ret
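
# Usage sketch (editorial, not part of the upstream file): inside a tensorpack
# model, a 2-bit quantized 3x3 convolution with 64 output channels could be built as
#
#     x = Conv2DQuant('conv1', x, 64, 3, nbit=2, is_quant=True)
#
# argument order follows the signature above; the scope name 'conv1' is arbitrary.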


@layer_register(log_shape=False, use_scope=None)
def BNReLUQuant(x):
    """
    A shorthand of BatchNormalization + ReLU + QuantizedActiv.
    """
    x = BatchNorm('bn', x)
    x = tf.nn.relu(x)
    x = QuantizedActiv('quant', x)
    return x


def getBNReLUQuant(x, name=None):
    """
    A shorthand of BatchNormalization + ReLU + QuantizedActiv.
    """
    x = BatchNorm('bn', x)
    x = tf.nn.relu(x, name=name)
    x = QuantizedActiv('quant', x)
    return x


def getfcBNReLUQuant(x, name=None):
    """
    A shorthand of BatchNormalization + ReLU + QuantizedActiv after a fully-connected layer.
    """
    x = BatchNorm('bn', x, data_format='NHWC', use_scale=False, use_bias=False)
    x = tf.nn.relu(x, name=name)
    x = QuantizedActiv('quant', x)
    return x


def getfcBNReLU(x, name=None):
    """
    A shorthand of BatchNormalization + ReLU after a fully-connected layer.
    """
    x = BatchNorm('bn', x, data_format='NHWC', use_scale=False, use_bias=False)
    x = tf.nn.relu(x, name=name)
    return x
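
# Usage sketch (editorial, not part of the upstream file): the get* helpers above
# take the (x, name) signature expected by the `nl` argument of Conv2DQuant, so
# BatchNorm, ReLU and activation quantization can follow a quantized convolution:
#
#     x = Conv2DQuant('conv2', x, 128, 3, nl=getBNReLUQuant, nbit=2)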