460 строки
17 KiB
Python
460 строки
17 KiB
Python
# -*- coding: utf-8 -*-
|
|
# File: conv2d.py
|
|
|
|
|
|
from ..compat import tfv1 as tf # this should be avoided first in model code
|
|
|
|
from ..tfutils.common import get_tf_version_tuple
|
|
from ..utils.argtools import get_data_format, shape2d, shape4d, log_once
|
|
from .common import VariableHolder, layer_register
|
|
from .tflayer import convert_to_tflayer_args, rename_get_variable
|
|
|
|
# Names exported by ``from <this module> import *``.
__all__ = ['Conv2D', 'Deconv2D', 'Conv2DTranspose', 'GroupedConv2D', 'ResizeImages', 'SeparableConv2D']
|
|
|
|
|
|
@layer_register(log_shape=True)
@convert_to_tflayer_args(
    args_names=['filters', 'kernel_size'],
    name_mapping={
        'out_channel': 'filters',
        'kernel_shape': 'kernel_size',
        'stride': 'strides',
    })
def Conv2D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=None,
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1):
    """
    A wrapper around `tf.layers.Conv2D`.
    Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group conv. Note that this is not efficient.

    When ``split == 1`` and there is no dilation, the call is delegated to
    `tf.layers.Conv2D`. Otherwise the variables are created directly and
    `tf.nn.conv2d` is used; regularizers are not supported on that path.

    Returns:
        tf.Tensor named ``output`` with attributes ``variables`` (holding
        ``W`` and, if ``use_bias``, ``b``) and ``info`` (holding ``flops``).

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    if split == 1 and dilation_rate == [1, 1]:
        # tf.layers.Conv2D has bugs with dilations (https://github.com/tensorflow/tensorflow/issues/26797)
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv2D(
                filters,
                kernel_size,
                strides=strides,
                padding=padding,
                data_format=data_format,
                dilation_rate=dilation_rate,
                activation=activation,
                use_bias=use_bias,
                kernel_initializer=kernel_initializer,
                bias_initializer=bias_initializer,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer,
                activity_regularizer=activity_regularizer,
                _reuse=tf.get_variable_scope().reuse)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias

        # compute the flops of the conv
        in_shape = inputs.get_shape().as_list()
        channel_axis = 3 if data_format == 'channels_last' else 1
        h_dim = 1 if data_format == 'channels_last' else 2
        w_dim = h_dim + 1
        in_channel = in_shape[channel_axis]
        out_channel = filters
        kernel_shape = shape2d(kernel_size)
        stride = shape4d(strides, data_format=data_format)
        flops = 1.0 * in_channel * out_channel * \
            kernel_shape[0] * kernel_shape[1] / stride[h_dim] / stride[w_dim]
        # Only scale by the spatial size when both height and width are
        # statically known; an unknown width would otherwise raise TypeError.
        if in_shape[h_dim] is not None and in_shape[w_dim] is not None and in_shape[h_dim] > 0:
            flops *= in_shape[h_dim] * in_shape[w_dim]
        ret.info = VariableHolder(flops=flops)

    else:
        # group conv implementation
        data_format = get_data_format(data_format, keras_mode=False)
        in_shape = inputs.get_shape().as_list()
        channel_axis = -1 if data_format == 'NHWC' else 1
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv2D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Not supported by group conv now!"

        out_channel = filters
        assert out_channel % split == 0
        assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), 'TF>=1.5 required for dilated conv.'

        kernel_shape = shape2d(kernel_size)
        # Integer division: the divisibility assert above guarantees an exact
        # result, and '/' would produce a float dimension on Python 3.
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        stride = shape4d(strides, data_format=data_format)

        kwargs = dict(data_format=data_format)
        if get_tf_version_tuple() >= (1, 5):
            kwargs['dilations'] = shape4d(dilation_rate, data_format=data_format)

        W = tf.get_variable(
            'W', filter_shape, initializer=kernel_initializer)

        if use_bias:
            b = tf.get_variable('b', [out_channel], initializer=bias_initializer)

        conv = None
        if get_tf_version_tuple() >= (1, 13):
            # Try a single conv2d call first; fall back to the per-group loop
            # below if TensorFlow rejects it with ValueError.
            try:
                conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
            except ValueError:
                conv = None
                log_once("CUDNN group convolution support is only available with "
                         "https://github.com/tensorflow/tensorflow/pull/25818 . "
                         "Will fall back to a loop-based slow implementation instead!", 'warn')
        if conv is None:
            # Slow path: convolve each input group with its kernel slice and
            # concatenate the results along the channel axis.
            inputs = tf.split(inputs, split, channel_axis)
            kernels = tf.split(W, split, 3)
            outputs = [tf.nn.conv2d(i, k, stride, padding.upper(), **kwargs)
                       for i, k in zip(inputs, kernels)]
            conv = tf.concat(outputs, channel_axis)

        if activation is None:
            activation = tf.identity
        ret = activation(tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv, name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b

        h_dim = 1 if data_format == 'NHWC' else 2
        w_dim = h_dim + 1
        flops = 1.0 * in_channel * out_channel * \
            kernel_shape[0] * kernel_shape[1] / stride[h_dim] / stride[w_dim] / split
        # Same guard as the non-grouped path: skip the spatial factor unless
        # both height and width are statically known.
        if in_shape[h_dim] is not None and in_shape[w_dim] is not None and in_shape[h_dim] > 0:
            flops *= in_shape[h_dim] * in_shape[w_dim]
        ret.info = VariableHolder(flops=flops)
    return ret
|
|
|
|
|
|
@layer_register(log_shape=True)
@convert_to_tflayer_args(
    args_names=['filters', 'kernel_size', 'strides'],
    name_mapping={
        'out_channel': 'filters',
        'kernel_shape': 'kernel_size',
        'stride': 'strides',
    })
def Conv2DTranspose(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        activation=None,
        use_bias=True,
        kernel_initializer=None,
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        dyn_hw=None):
    """
    A wrapper around `tf.layers.Conv2DTranspose`.
    Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'

    On TF <= 1.12 the call is delegated to `tf.layers.Conv2DTranspose`.
    On newer versions the variables are created directly and
    `tf.nn.conv2d_transpose` is used; regularizers are not supported there.

    ``dyn_hw`` is accepted but not used by this implementation.

    Returns:
        tf.Tensor named ``output`` with attribute ``variables`` (holding
        ``W`` and, if ``use_bias``, ``b``).

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')

    if get_tf_version_tuple() <= (1, 12):
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv2DTranspose(
                filters,
                kernel_size,
                strides=strides,
                padding=padding,
                data_format=data_format,
                activation=activation,
                use_bias=use_bias,
                kernel_initializer=kernel_initializer,
                bias_initializer=bias_initializer,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer,
                activity_regularizer=activity_regularizer,
                _reuse=tf.get_variable_scope().reuse)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')
        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias
    else:
        # Our own implementation, to avoid Keras bugs. https://github.com/tensorflow/tensorflow/issues/25946
        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Unsupported arguments due to Keras bug in TensorFlow 1.13"
        data_format = get_data_format(data_format, keras_mode=False)
        shape_dyn = tf.shape(inputs)
        strides2d = shape2d(strides)
        # NOTE(review): the output spatial size below is computed as
        # input size * stride regardless of ``padding`` -- confirm this is
        # intended for padding other than 'same'.
        if data_format == 'NCHW':
            channels_in = inputs.shape[1]
            out_shape_dyn = tf.stack(
                [shape_dyn[0], filters,
                 shape_dyn[2] * strides2d[0],
                 shape_dyn[3] * strides2d[1]])
            out_shape3_sta = [filters,
                              None if inputs.shape[2] is None else inputs.shape[2] * strides2d[0],
                              None if inputs.shape[3] is None else inputs.shape[3] * strides2d[1]]
        else:
            channels_in = inputs.shape[-1]
            out_shape_dyn = tf.stack(
                [shape_dyn[0],
                 shape_dyn[1] * strides2d[0],
                 shape_dyn[2] * strides2d[1],
                 filters])
            out_shape3_sta = [None if inputs.shape[1] is None else inputs.shape[1] * strides2d[0],
                              None if inputs.shape[2] is None else inputs.shape[2] * strides2d[1],
                              filters]

        kernel_shape = shape2d(kernel_size)
        W = tf.get_variable('W', kernel_shape + [filters, channels_in], initializer=kernel_initializer)
        if use_bias:
            b = tf.get_variable('b', [filters], initializer=bias_initializer)
        conv = tf.nn.conv2d_transpose(
            inputs, W, out_shape_dyn,
            shape4d(strides, data_format=data_format),
            padding=padding.upper(),
            data_format=data_format)
        # Restore the static shape; the batch dimension is left unknown.
        conv.set_shape(tf.TensorShape([None] + out_shape3_sta))
        # ``activation`` defaults to None, which is not callable; use identity
        # so the default call still produces a tensor named 'output'.
        if activation is None:
            activation = tf.identity
        ret = activation(tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv, name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b
    return ret
|
|
|
|
|
|
# Alias: ``Deconv2D`` is bound to the same registered layer as ``Conv2DTranspose``.
Deconv2D = Conv2DTranspose
|
|
|
|
|
|
@layer_register(log_shape=True)
def ResizeImages(
        images,
        size,
        method=tf.image.ResizeMethod.BILINEAR,
        align_corners=True,
        data_format='channels_last'):
    """
    Resize a feature map with `tf.image.resize_images`.

    `resize_images` only accepts 'channels_last' input, so a
    'channels_first' tensor is transposed before the resize and transposed
    back afterwards.

    Args:
        images: tensor representing the feature map to resize.
        size: 2D int32 tensor of the new shape (h, w).
        method: resize method, see `tf.image.ResizeMethod`.
        align_corners: forwarded to `tf.image.resize_images`.
        data_format: current data format of ``images``,
            'channels_first' or 'channels_last'.

    Returns:
        tf.Tensor named ``output`` in the same data format as ``images``.
    """
    is_channels_first = data_format == 'channels_first'

    # Move channels to the last axis if needed.
    feature_map = tf.transpose(images, [0, 2, 3, 1]) if is_channels_first else images
    feature_map = tf.image.resize_images(feature_map, size, method, align_corners)
    if is_channels_first:
        # Move channels back to axis 1.
        feature_map = tf.transpose(feature_map, [0, 3, 1, 2])

    return tf.identity(feature_map, name='output')
|
|
|
|
|
|
@layer_register(log_shape=True)
def GroupedConv2D(x, num_paths, path_ch_out, kernel_shape,
                  sum_paths=False, padding='SAME', stride=1,
                  W_init=None, b_init=None, nl=tf.identity,
                  use_bias=False, data_format='NHWC'):
    """
    Grouped conv 2d for ResNeXt. Uses depthwise conv 2d and reshape and sum.

    Args:
        x : 4D tensor of data_format
        num_paths : number of groups
        path_ch_out : number of ch_out per group
        kernel_shape : (h,w) tuple or an int
        sum_paths : whether the groups are summed together (if True)
            or concatenated (if False (default))
        padding, W_init, b_init, nl, use_bias, data_format : see Conv2D

    Returns:
        tf.Tensor named ``output`` with attribute `variables`.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    # Use the same keyword as the other layers in this file when normalizing
    # the data format to 'NHWC' / 'NCHW'.
    data_format = get_data_format(data_format, keras_mode=False)

    in_shape = x.get_shape().as_list()
    ch_dim = 3 if data_format == 'NHWC' else 1
    ch_in = in_shape[ch_dim]
    assert ch_in is not None, "[GroupedConv2D] Input cannot have unknown channel!"
    assert ch_in % num_paths == 0, "Grouped conv requires n_groups to divide ch_in"
    ch_in_per_path = ch_in // num_paths
    ch_out = path_ch_out if sum_paths else num_paths * path_ch_out

    kernel_shape = shape2d(kernel_shape)
    padding = padding.upper()
    # Depthwise filter: every input channel gets ``path_ch_out`` outputs.
    filter_shape = kernel_shape + [ch_in, path_ch_out]
    stride = shape4d(stride, data_format=data_format)

    if W_init is None:
        W_init = tf.contrib.layers.variance_scaling_initializer(2.0)
    if b_init is None:
        b_init = tf.constant_initializer()

    W = tf.get_variable('W', filter_shape, initializer=W_init)
    if use_bias:
        b = tf.get_variable('b', [ch_out], initializer=b_init)

    x = tf.nn.depthwise_conv2d(x, W, stride, padding, rate=None, data_format=data_format)
    out_shape = x.get_shape().as_list()

    # First reshape to expose the dimension by input channels
    shape_depthwise = [num_paths, ch_in_per_path, path_ch_out]
    if data_format == 'NHWC':
        x = tf.reshape(x, [-1, out_shape[1], out_shape[2]] + shape_depthwise)
    else:
        x = tf.reshape(x, [-1] + shape_depthwise + [out_shape[2], out_shape[3]])

    # Then reduce sum to remove the input channel leaving output dim and (path dim)
    if sum_paths:
        sum_axis = [ch_dim, ch_dim + 1]
    else:
        sum_axis = ch_dim + 1
    x = tf.reduce_sum(x, sum_axis)

    # reshape to output shape if path dim did not collapse
    if not sum_paths:
        if data_format == 'NHWC':
            x = tf.reshape(x, [-1, out_shape[1], out_shape[2], ch_out])
        else:
            x = tf.reshape(x, [-1, ch_out, out_shape[2], out_shape[3]])

    ret = nl(tf.nn.bias_add(x, b, data_format=data_format) if use_bias else x, name='output')
    ret.variables = VariableHolder(W=W)
    if use_bias:
        ret.variables.b = b
    return ret
|
|
|
|
|
|
|
|
@layer_register(log_shape=True)
@convert_to_tflayer_args(
    args_names=['filters', 'kernel_size'],
    name_mapping={
        'out_channel': 'filters',
        'kernel_shape': 'kernel_size',
        'stride': 'strides',
    })
def SeparableConv2D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1),
        depth_multiplier=1,
        activation=None,
        use_bias=True,
        depthwise_initializer=tf.contrib.layers.variance_scaling_initializer(2.0),
        pointwise_initializer=tf.contrib.layers.variance_scaling_initializer(2.0),
        bias_initializer=tf.zeros_initializer(),
        depthwise_regularizer=None,
        pointwise_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        depthwise_constraint=None,
        pointwise_constraint=None,
        bias_constraint=None,
        trainable=True,
        name=None,
        reuse=None):
    """
    A wrapper around `tf.layers.SeparableConv2D`.

    Differences from the wrapped layer that are visible here:

    1. Default depthwise/pointwise initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.

    ``name`` and ``reuse`` are accepted but not used by this implementation.

    NOTE(review): the default initializers reference ``tf.contrib`` in the
    signature, so they are evaluated when the module is imported -- confirm
    this is acceptable for the TensorFlow versions being targeted.

    Returns:
        tf.Tensor named ``output`` with attributes ``variables`` (holding
        ``Wd``, ``Wp`` and, if ``use_bias``, ``b``) and ``info`` (holding
        ``flops``, the sum of the depthwise and pointwise estimates).

    Variable Names:

    * ``Wd``: depthwise weights
    * ``Wp``: pointwise weights
    * ``b``: bias
    """
    with rename_get_variable({'depthwise_kernel': 'Wd', 'pointwise_kernel': 'Wp', 'bias': 'b'}):
        layer = tf.layers.SeparableConv2D(
            filters,
            kernel_size,
            strides=strides,
            padding=padding,
            data_format=data_format,
            dilation_rate=dilation_rate,
            activation=activation,
            use_bias=use_bias,
            depth_multiplier=depth_multiplier,
            depthwise_initializer=depthwise_initializer,
            pointwise_initializer=pointwise_initializer,
            bias_initializer=bias_initializer,
            activity_regularizer=activity_regularizer,
            depthwise_regularizer=depthwise_regularizer,
            pointwise_regularizer=pointwise_regularizer,
            bias_regularizer=bias_regularizer,
            # Forward these so they are no longer silently ignored.
            depthwise_constraint=depthwise_constraint,
            pointwise_constraint=pointwise_constraint,
            bias_constraint=bias_constraint,
            trainable=trainable)
        ret = layer.apply(inputs, scope=tf.get_variable_scope())
        ret = tf.identity(ret, name='output')

    ret.variables = VariableHolder(Wd=layer.depthwise_kernel, Wp=layer.pointwise_kernel)
    if use_bias:
        ret.variables.b = layer.bias

    # compute the flops of the conv
    in_shape = inputs.get_shape().as_list()
    channel_axis = 3 if data_format == 'channels_last' else 1
    h_dim = 1 if data_format == 'channels_last' else 2
    w_dim = h_dim + 1
    in_channel = in_shape[channel_axis]
    out_channel = filters
    kernel_shape = shape2d(kernel_size)
    stride = shape4d(strides, data_format=data_format)
    flops = 1.0 * in_channel * depth_multiplier * \
        kernel_shape[0] * kernel_shape[1] / stride[h_dim] / stride[w_dim]
    # since pointwise is on the result of depthwise, the stride is carried over.
    pt_flops = 1.0 * (in_channel * depth_multiplier) * out_channel \
        / stride[h_dim] / stride[w_dim]
    # Only scale by the spatial size when both height and width are statically
    # known; an unknown width would otherwise raise TypeError.
    if in_shape[h_dim] is not None and in_shape[w_dim] is not None and in_shape[h_dim] > 0:
        H_times_W = in_shape[h_dim] * in_shape[w_dim]
        flops *= H_times_W
        pt_flops *= H_times_W
    ret.info = VariableHolder(flops=flops + pt_flops)
    return ret
|