This commit is contained in:
Yu Deng 2020-12-04 15:36:24 +08:00
Parent 05d0786aea
Commit 3002ea2d52
18 changed files with 1870 additions and 99 deletions

Binary data
BFM/BFM_model_front.mat Normal file

Binary file not shown.

76
data_loader.py Normal file
View file

@@ -0,0 +1,76 @@
import tensorflow as tf
from tensorflow.contrib.data import prefetch_to_device, shuffle_and_repeat, map_and_batch
import os
import glob
import numpy as np
import cv2
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
###############################################################################################
# data loader for training stage
###############################################################################################
def _parse_function(image_path,lm_path,mask_path):
# input image
x = tf.read_file(image_path)
img = tf.image.decode_png(x, channels=3)
img = tf.cast(img,tf.float32)
img = img[:,:,::-1]
# ground truth landmark
x2 = tf.read_file(lm_path)
lm = tf.decode_raw(x2,tf.float64)
lm = tf.cast(lm,tf.float32)
lm = tf.reshape(lm,[68,2])
# skin mask
x3 = tf.read_file(mask_path)
mask = tf.image.decode_png(x3, channels=3)
mask = tf.cast(mask,tf.float32)
return img,lm,mask
def check_lm_bin(dataset,lm_path):
if not os.path.isdir(os.path.join(dataset,'lm_bin')):
os.makedirs(os.path.join(dataset,'lm_bin'))
for i in range(len(lm_path)):
lm = np.loadtxt(lm_path[i])
lm = np.reshape(lm,[-1])
lm.tofile(os.path.join(dataset,'lm_bin',lm_path[i].split('/')[-1].replace('txt','bin')))
def load_dataset(opt,train=True):
if train:
data_path = opt.data_path
else:
data_path = opt.val_data_path
image_path_all = []
lm_path_all = []
mask_path_all = []
for dataset in data_path:
image_path = glob.glob(dataset + '/' + '*.png')
image_path.sort()
lm_path_ = [os.path.join(dataset,'lm',f.split('/')[-1].replace('png','txt')) for f in image_path]
lm_path_.sort()
mask_path = [os.path.join(dataset,'mask',f.split('/')[-1]) for f in image_path]
mask_path.sort()
# check if landmark binary files exist
check_lm_bin(dataset,lm_path_)
lm_path = [os.path.join(dataset,'lm_bin',f.split('/')[-1].replace('png','bin')) for f in image_path]
lm_path.sort()
image_path_all += image_path
mask_path_all += mask_path
lm_path_all += lm_path
dataset_num = len(image_path_all)
dataset = tf.data.Dataset.from_tensor_slices((image_path_all,lm_path_all,mask_path_all))
dataset = dataset. \
apply(shuffle_and_repeat(dataset_num)). \
apply(map_and_batch(_parse_function, opt.batch_size, num_parallel_batches=4, drop_remainder=True)). \
apply(prefetch_to_device('/gpu:0', None)) # When using dataset.prefetch, use buffer_size=None to let it detect optimal buffer size
inputs_iterator = dataset.make_one_shot_iterator()
return inputs_iterator
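For context only (not part of this commit): a minimal TF 1.x sketch of consuming the iterator returned by load_dataset, reusing the imports above and assuming an opt object shaped like options.Option (data_path, batch_size, config).
# Illustrative driver, not in the repository.
def peek_one_batch(opt):
    iterator = load_dataset(opt, train=True)
    imgs, lms, masks = iterator.get_next()  # aligned BGR images, [B,68,2] landmarks, skin masks
    with tf.Session(config=opt.config) as sess:
        img_np, lm_np, mask_np = sess.run([imgs, lms, masks])
        print(img_np.shape, lm_np.shape, mask_np.shape)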

50
demo.py
View file

@@ -7,18 +7,25 @@ import cv2
import platform
from scipy.io import loadmat,savemat
from preprocess_img import Preprocess
from load_data import *
from preprocess_img import align_img
from utils import *
from face_decoder import Face3D
from options import Option
is_windows = platform.system() == "Windows"
def load_graph(graph_filename):
with tf.gfile.GFile(graph_filename,'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
def restore_weights(sess,opt):
var_list = tf.trainable_variables()
g_list = tf.global_variables()
return graph_def
# add batch normalization params into trainable variables
bn_moving_vars = [g for g in g_list if 'moving_mean' in g.name]
bn_moving_vars += [g for g in g_list if 'moving_variance' in g.name]
var_list +=bn_moving_vars
# create saver to save and restore weights
saver = tf.train.Saver(var_list = var_list)
saver.restore(sess,opt.pretrain_weights)
def demo():
# input and output folder
@@ -36,22 +43,32 @@ def demo():
# read standard landmarks for preprocessing images
lm3D = load_lm3d()
batchsize = 1
n = 0
# build reconstruction model
with tf.Graph().as_default() as graph,tf.device('/cpu:0'):
opt = Option()
opt.batch_size = 1
opt.is_train = False
FaceReconstructor = Face3D()
images = tf.placeholder(name = 'input_imgs', shape = [batchsize,224,224,3], dtype = tf.float32)
graph_def = load_graph('network/FaceReconModel.pb')
tf.import_graph_def(graph_def,name='resnet',input_map={'input_imgs:0': images})
images = tf.placeholder(name = 'input_imgs', shape = [opt.batch_size,224,224,3], dtype = tf.float32)
# output coefficients of R-Net (dim = 257)
coeff = graph.get_tensor_by_name('resnet/coeff:0')
if opt.use_pb and os.path.isfile('network/FaceReconModel.pb'):
print('Using pre-trained .pb file.')
use_pb = True
graph_def = load_graph('network/FaceReconModel.pb')
tf.import_graph_def(graph_def,name='resnet',input_map={'input_imgs:0': images})
# output coefficients of R-Net (dim = 257)
coeff = graph.get_tensor_by_name('resnet/coeff:0')
else:
print('Using pre-trained .ckpt file: %s'%opt.pretrain_weights)
use_pb = False
import networks
coeff = networks.R_Net(images,is_training=False)
# reconstructing faces
FaceReconstructor.Reconstruction_Block(coeff,batchsize)
FaceReconstructor.Reconstruction_Block(coeff,opt)
face_shape = FaceReconstructor.face_shape_t
face_texture = FaceReconstructor.face_texture
face_color = FaceReconstructor.face_color
@@ -61,6 +78,9 @@ def demo():
with tf.Session() as sess:
if not use_pb:
restore_weights(sess,opt)
print('reconstructing...')
for file in img_list:
n += 1
@@ -68,7 +88,7 @@ def demo():
# load images and corresponding 5 facial landmarks
img,lm = load_img(file,file.replace('png','txt').replace('jpg','txt'))
# preprocess input image
input_img,lm_new,transform_params = Preprocess(img,lm,lm3D)
input_img,lm_new,transform_params = align_img(img,lm,lm3D)
coeff_,face_shape_,face_texture_,face_color_,landmarks_2d_,recon_img_,tri_ = sess.run([coeff,\
face_shape,face_texture,face_color,landmarks_2d,recon_img,tri],feed_dict = {images: input_img})
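Reader note (illustrative; the slice boundaries match Split_coeff in face_decoder.py below): the 257-dimensional R-Net output referenced here splits as
# identity     coeff[:,   0: 80]   80
# expression   coeff[:,  80:144]   64
# texture      coeff[:, 144:224]   80
# pose angles  coeff[:, 224:227]    3
# lighting     coeff[:, 227:254]   27
# translation  coeff[:, 254:257]    3
#                          total   257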

View file

@@ -7,40 +7,42 @@ import platform
is_windows = platform.system() == "Windows"
if not is_windows:
import mesh_renderer
from renderer import mesh_renderer
###############################################################################################
# Reconstruct 3D face based on output coefficients and facemodel
###############################################################################################
# BFM 3D face model
class BFM():
def __init__(self,model_path = 'BFM/BFM_model_front.mat'):
def __init__(self,model_path = './BFM/BFM_model_front.mat'):
model = loadmat(model_path)
self.meanshape = tf.constant(model['meanshape']) # mean face shape. [3*N,1]
self.idBase = tf.constant(model['idBase']) # identity basis. [3*N,80]
self.exBase = tf.constant(model['exBase'].astype(np.float32)) # expression basis. [3*N,64]
self.meantex = tf.constant(model['meantex']) # mean face texture. [3*N,1] (0-255)
self.texBase = tf.constant(model['texBase']) # texture basis. [3*N,80]
self.point_buf = tf.constant(model['point_buf']) # triangle indices for each vertex that lies in. starts from 1. [N,8]
self.face_buf = tf.constant(model['tri']) # vertex indices in each triangle. starts from 1. [F,3]
self.keypoints = tf.squeeze(tf.constant(model['keypoints'])) # vertex indices of 68 facial landmarks. starts from 1. [68,1]
self.point_buf = tf.constant(model['point_buf']) # face indices for each vertex that lies in. starts from 1. [N,8]
self.face_buf = tf.constant(model['tri']) # vertex indices for each face. starts from 1. [F,3]
self.front_mask_render = tf.squeeze(tf.constant(model['frontmask2_idx'])) # vertex indices for small face region to compute photometric error. starts from 1.
self.mask_face_buf = tf.constant(model['tri_mask2']) # vertex indices for each face from small face region. starts from 1. [f,3]
self.skin_mask = tf.squeeze(tf.constant(model['skinmask'])) # vertex indices for pre-defined skin region to compute reflectance loss
self.keypoints = tf.squeeze(tf.constant(model['keypoints'])) # vertex indices for 68 landmarks. starts from 1. [68,1]
# Analytic 3D face reconstructor
# Analytic 3D face
class Face3D():
def __init__(self):
facemodel = BFM()
self.facemodel = facemodel
# analytic 3D face reconstructions with coefficients from R-Net
def Reconstruction_Block(self,coeff,batchsize):
def Reconstruction_Block(self,coeff,opt):
#coeff: [batchsize,257] reconstruction coefficients
id_coeff,ex_coeff,tex_coeff,angles,translation,gamma = self.Split_coeff(coeff)
id_coeff,ex_coeff,tex_coeff,angles,translation,gamma,camera_scale,f_scale = self.Split_coeff(coeff)
# [batchsize,N,3] canonical face shape in BFM space
face_shape = self.Shape_formation_block(id_coeff,ex_coeff,self.facemodel)
# [batchsize,N,3] vertex texture (in RGB order)
face_texture = self.Texture_formation_block(tex_coeff,self.facemodel)
self.face_texture = face_texture
# [batchsize,3,3] rotation matrix for face shape
rotation = self.Compute_rotation_matrix(angles)
# [batchsize,N,3] vertex normal
@@ -49,38 +51,44 @@ class Face3D():
# do rigid transformation for face shape using predicted rotation and translation
face_shape_t = self.Rigid_transform_block(face_shape,rotation,translation)
self.face_shape_t = face_shape_t
# compute 2d landmark projections
# landmark_p: [batchsize,68,2]
face_landmark_t = self.Compute_landmark(face_shape_t,self.facemodel)
landmark_p = self.Projection_block(face_landmark_t) # 256*256 image
landmark_p = tf.stack([landmark_p[:,:,0],223. - landmark_p[:,:,1]],axis = 2)
self.landmark_p = landmark_p
landmark_p = self.Projection_block(face_landmark_t,camera_scale,f_scale)
# [batchsize,N,3] vertex color (in RGB order)
face_color = self.Illumination_block(face_texture, norm_r, gamma)
# reconstruction images and region masks for computing photometric loss
render_imgs,img_mask,img_mask_crop = self.Render_block(face_shape_t,norm_r,face_color,camera_scale,f_scale,self.facemodel,opt.batch_size,opt.is_train)
self.id_coeff = id_coeff
self.ex_coeff = ex_coeff
self.tex_coeff = tex_coeff
self.f_scale = f_scale
self.gamma = gamma
self.face_shape = face_shape
self.face_shape_t = face_shape_t
self.face_texture = face_texture
self.face_color = face_color
self.landmark_p = landmark_p
self.render_imgs = render_imgs
self.img_mask = img_mask
self.img_mask_crop = img_mask_crop
# reconstruction images
if not is_windows:
render_imgs = self.Render_block(face_shape_t,norm_r,face_color,self.facemodel,batchsize)
render_imgs = tf.clip_by_value(render_imgs,0,255)
render_imgs = tf.cast(render_imgs,tf.float32)
self.render_imgs = render_imgs
else:
self.render_imgs = []
######################################################################################################
#----------------------------------------------------------------------------------------------
def Split_coeff(self,coeff):
id_coeff = coeff[:,:80] #identity
ex_coeff = coeff[:,80:144] #expression
tex_coeff = coeff[:,144:224] #texture
angles = coeff[:,224:227] #euler angles for pose
gamma = coeff[:,227:254] #lighting
translation = coeff[:,254:257] #translation
return id_coeff,ex_coeff,tex_coeff,angles,translation,gamma
id_coeff = coeff[:,:80]
ex_coeff = coeff[:,80:144]
tex_coeff = coeff[:,144:224]
angles = coeff[:,224:227]
gamma = coeff[:,227:254]
translation = coeff[:,254:257]
camera_scale = tf.ones([tf.shape(coeff)[0],1])
f_scale = tf.ones([tf.shape(coeff)[0],1])
return id_coeff,ex_coeff,tex_coeff,angles,translation,gamma,camera_scale,f_scale
def Shape_formation_block(self,id_coeff,ex_coeff,facemodel):
face_shape = tf.einsum('ij,aj->ai',facemodel.idBase,id_coeff) + \
@@ -170,31 +178,27 @@ class Face3D():
# R = RzRyRx
rotation = tf.matmul(tf.matmul(rotation_Z,rotation_Y),rotation_X)
# our face shape is stored as N*3, so we use the transpose of R; rotated shapes can then be computed as face_shape*R
rotation = tf.transpose(rotation, perm = [0,2,1])
return rotation
def Projection_block(self,face_shape,focal=1015.0,half_image_width=112.):
def Projection_block(self,face_shape,camera_scale,f_scale):
# pre-defined camera focal length for perspective projection
focal = tf.constant(focal)
# focal = tf.constant(400.0)
focal = tf.constant(1015.0)
focal = focal*f_scale
focal = tf.reshape(focal,[-1,1])
batchsize = tf.shape(face_shape)[0]
# center = tf.constant(112.0)
batchsize = tf.shape(focal)[0]
# define camera position
camera_pos = tf.reshape(tf.constant([0.0,0.0,10.0]),[1,1,3])
camera_pos = tf.reshape(tf.constant([0.0,0.0,10.0]),[1,1,3])*tf.reshape(camera_scale,[-1,1,1])
reverse_z = tf.tile(tf.reshape(tf.constant([1.0,0,0,0,1,0,0,0,-1.0]),[1,3,3]),[tf.shape(face_shape)[0],1,1])
# compute projection matrix
p_matrix = tf.concat([focal*tf.ones([batchsize,1]),tf.zeros([batchsize,1]),half_image_width*tf.ones([batchsize,1]),tf.zeros([batchsize,1]),\
focal*tf.ones([batchsize,1]),half_image_width*tf.ones([batchsize,1]),tf.zeros([batchsize,2]),tf.ones([batchsize,1])],axis = 1)
# p_matrix = tf.tile(tf.reshape(p_matrix,[1,3,3]),[tf.shape(face_shape)[0],1,1])
p_matrix = tf.concat([focal,tf.zeros([batchsize,1]),112.*tf.ones([batchsize,1]),tf.zeros([batchsize,1]),focal,112.*tf.ones([batchsize,1]),tf.zeros([batchsize,2]),tf.ones([batchsize,1])],axis = 1)
p_matrix = tf.reshape(p_matrix,[-1,3,3])
# convert z in canonical space to the distance to camera
reverse_z = tf.tile(tf.reshape(tf.constant([1.0,0,0,0,1,0,0,0,-1.0]),[1,3,3]),[tf.shape(face_shape)[0],1,1])
# convert z in world space to the distance to camera
face_shape = tf.matmul(face_shape,reverse_z) + camera_pos
aug_projection = tf.matmul(face_shape,tf.transpose(p_matrix,[0,2,1]))
@@ -256,51 +260,84 @@ class Face3D():
return face_shape_t
def Render_block(self,face_shape,face_norm,face_color,facemodel,batchsize):
def Render_block(self,face_shape,face_norm,face_color,camera_scale,f_scale,facemodel,batchsize,is_train=True):
if is_train and is_windows:
raise ValueError('Training is not supported in a Windows environment.')
if is_windows:
return [],[],[]
# render reconstruction images
n_vex = int(facemodel.idBase.shape[0].value/3)
fov_y = 2*tf.atan(112/(1015.))*180./m.pi + tf.zeros([batchsize])
fov_y = 2*tf.atan(112./(1015.*f_scale))*180./m.pi
fov_y = tf.reshape(fov_y,[batchsize])
# full face region
face_shape = tf.reshape(face_shape,[batchsize,n_vex,3])
face_norm = tf.reshape(face_norm,[batchsize,n_vex,3])
face_color = tf.reshape(face_color,[batchsize,n_vex,3])
# camera settings
# same as in Projection_block
camera_position = tf.constant([[0,0,10.0]]) + tf.zeros([batchsize,3])
camera_lookat = tf.constant([[0,0,0.0]]) + tf.zeros([batchsize,3])
camera_up = tf.constant([[0,1.0,0]]) + tf.zeros([batchsize,3])
# pre-defined cropped face region
mask_face_shape = tf.gather(face_shape,tf.cast(facemodel.front_mask_render-1,tf.int32),axis = 1)
mask_face_norm = tf.gather(face_norm,tf.cast(facemodel.front_mask_render-1,tf.int32),axis = 1)
mask_face_color = tf.gather(face_color,tf.cast(facemodel.front_mask_render-1,tf.int32),axis = 1)
# set light source position (intensities are set to 0 because the vertex colors are already computed)
light_positions = tf.reshape(tf.constant([0,0,1e5]),[1,1,3]) + tf.zeros([batchsize,1,3])
light_intensities = tf.reshape(tf.constant([0.0,0.0,0.0]),[1,1,3])+tf.zeros([batchsize,1,3])
ambient_color = tf.reshape(tf.constant([1.0,1,1]),[1,3])+ tf.zeros([batchsize,3])
# camera settings
camera_position = tf.constant([[0,0,10.0]])*tf.reshape(camera_scale,[-1,1])
camera_lookat = tf.constant([0,0,0.0])
camera_up = tf.constant([0,1.0,0])
# set light source position (intensities are set to 0 because the vertex colors are already computed)
light_positions = tf.tile(tf.reshape(tf.constant([0,0,1e5]),[1,1,3]),[batchsize,1,1])
light_intensities = tf.tile(tf.reshape(tf.constant([0.0,0.0,0.0]),[1,1,3]),[batchsize,1,1])
ambient_color = tf.tile(tf.reshape(tf.constant([1.0,1,1]),[1,3]),[batchsize,1])
near_clip = 0.01*tf.ones([batchsize])
far_clip = 50*tf.ones([batchsize])
#using tf_mesh_renderer for rasterization (https://github.com/google/tf_mesh_renderer)
# img: [batchsize,224,224,4] images in RGBA order (0-255)
if not is_windows:
with tf.device('/cpu:0'):
img = mesh_renderer.mesh_renderer(face_shape,
tf.cast(facemodel.face_buf-1,tf.int32),
face_norm,
face_color,
camera_position = camera_position,
camera_lookat = camera_lookat,
camera_up = camera_up,
light_positions = light_positions,
light_intensities = light_intensities,
image_width = 224,
image_height = 224,
fov_y = fov_y, #12.5936
ambient_color = ambient_color,
near_clip = near_clip,
far_clip = far_clip)
return img
else:
return np.zeros([224, 224], dtype=np.int32)
# img: [batchsize,224,224,3] images in RGB order (0-255)
# mask:[batchsize,224,224,1] transparency for img ({0,1} value)
img_rgba = mesh_renderer.mesh_renderer(face_shape,
tf.cast(facemodel.face_buf-1,tf.int32),
face_norm,
face_color,
camera_position = camera_position,
camera_lookat = camera_lookat,
camera_up = camera_up,
light_positions = light_positions,
light_intensities = light_intensities,
image_width = 224,
image_height = 224,
fov_y = fov_y,
near_clip = 0.01,
far_clip = 50.0,
ambient_color = ambient_color)
img = img_rgba[:,:,:,:3]
mask = img_rgba[:,:,:,3:]
img = tf.cast(img[:,:,:,::-1],tf.float32) # convert RGB to BGR
mask = tf.cast(mask,tf.float32) # full face region
if is_train:
# compute mask for small face region
img_crop_rgba = mesh_renderer.mesh_renderer(mask_face_shape,
tf.cast(facemodel.mask_face_buf-1,tf.int32),
mask_face_norm,
mask_face_color,
camera_position = camera_position,
camera_lookat = camera_lookat,
camera_up = camera_up,
light_positions = light_positions,
light_intensities = light_intensities,
image_width = 224,
image_height = 224,
fov_y = fov_y,
near_clip = 0.01,
far_clip = 50.0,
ambient_color = ambient_color)
mask_f = img_crop_rgba[:,:,:,3:]
mask_f = tf.cast(mask_f,tf.float32) # small face region
return img,mask,mask_f
img_rgba = tf.cast(tf.clip_by_value(img_rgba,0,255),tf.float32)
return img_rgba,mask,mask
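A quick numeric check of the field of view used above (the 12.5936 noted in the removed code): with the fixed focal length of 1015 and half image size of 112 pixels,
import math
fov_y = 2 * math.atan(112. / 1015.) * 180. / math.pi  # f_scale = 1
print(fov_y)  # ~12.5936 degrees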

247
inception_resnet_v1.py Normal file
View file

@@ -0,0 +1,247 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains the definition of the Inception Resnet V1 architecture.
As described in http://arxiv.org/abs/1602.07261.
Inception-v4, Inception-ResNet and the Impact of Residual Connections
on Learning
Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import tensorflow.contrib.slim as slim
# Inception-Resnet-A
def block35(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
"""Builds the 35x35 resnet block."""
with tf.variable_scope(scope, 'Block35', [net], reuse=reuse):
with tf.variable_scope('Branch_0'):
tower_conv = slim.conv2d(net, 32, 1, scope='Conv2d_1x1')
with tf.variable_scope('Branch_1'):
tower_conv1_0 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1')
tower_conv1_1 = slim.conv2d(tower_conv1_0, 32, 3, scope='Conv2d_0b_3x3')
with tf.variable_scope('Branch_2'):
tower_conv2_0 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1')
tower_conv2_1 = slim.conv2d(tower_conv2_0, 32, 3, scope='Conv2d_0b_3x3')
tower_conv2_2 = slim.conv2d(tower_conv2_1, 32, 3, scope='Conv2d_0c_3x3')
mixed = tf.concat([tower_conv, tower_conv1_1, tower_conv2_2], 3)
up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
activation_fn=None, scope='Conv2d_1x1')
net += scale * up
if activation_fn:
net = activation_fn(net)
return net
# Inception-Resnet-B
def block17(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
"""Builds the 17x17 resnet block."""
with tf.variable_scope(scope, 'Block17', [net], reuse=reuse):
with tf.variable_scope('Branch_0'):
tower_conv = slim.conv2d(net, 128, 1, scope='Conv2d_1x1')
with tf.variable_scope('Branch_1'):
tower_conv1_0 = slim.conv2d(net, 128, 1, scope='Conv2d_0a_1x1')
tower_conv1_1 = slim.conv2d(tower_conv1_0, 128, [1, 7],
scope='Conv2d_0b_1x7')
tower_conv1_2 = slim.conv2d(tower_conv1_1, 128, [7, 1],
scope='Conv2d_0c_7x1')
mixed = tf.concat([tower_conv, tower_conv1_2], 3)
up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
activation_fn=None, scope='Conv2d_1x1')
net += scale * up
if activation_fn:
net = activation_fn(net)
return net
# Inception-Resnet-C
def block8(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
"""Builds the 8x8 resnet block."""
with tf.variable_scope(scope, 'Block8', [net], reuse=reuse):
with tf.variable_scope('Branch_0'):
tower_conv = slim.conv2d(net, 192, 1, scope='Conv2d_1x1')
with tf.variable_scope('Branch_1'):
tower_conv1_0 = slim.conv2d(net, 192, 1, scope='Conv2d_0a_1x1')
tower_conv1_1 = slim.conv2d(tower_conv1_0, 192, [1, 3],
scope='Conv2d_0b_1x3')
tower_conv1_2 = slim.conv2d(tower_conv1_1, 192, [3, 1],
scope='Conv2d_0c_3x1')
mixed = tf.concat([tower_conv, tower_conv1_2], 3)
up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
activation_fn=None, scope='Conv2d_1x1')
net += scale * up
if activation_fn:
net = activation_fn(net)
return net
def reduction_a(net, k, l, m, n):
with tf.variable_scope('Branch_0'):
tower_conv = slim.conv2d(net, n, 3, stride=2, padding='VALID',
scope='Conv2d_1a_3x3')
with tf.variable_scope('Branch_1'):
tower_conv1_0 = slim.conv2d(net, k, 1, scope='Conv2d_0a_1x1')
tower_conv1_1 = slim.conv2d(tower_conv1_0, l, 3,
scope='Conv2d_0b_3x3')
tower_conv1_2 = slim.conv2d(tower_conv1_1, m, 3,
stride=2, padding='VALID',
scope='Conv2d_1a_3x3')
with tf.variable_scope('Branch_2'):
tower_pool = slim.max_pool2d(net, 3, stride=2, padding='VALID',
scope='MaxPool_1a_3x3')
net = tf.concat([tower_conv, tower_conv1_2, tower_pool], 3)
return net
def reduction_b(net):
with tf.variable_scope('Branch_0'):
tower_conv = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
tower_conv_1 = slim.conv2d(tower_conv, 384, 3, stride=2,
padding='VALID', scope='Conv2d_1a_3x3')
with tf.variable_scope('Branch_1'):
tower_conv1 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
tower_conv1_1 = slim.conv2d(tower_conv1, 256, 3, stride=2,
padding='VALID', scope='Conv2d_1a_3x3')
with tf.variable_scope('Branch_2'):
tower_conv2 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
tower_conv2_1 = slim.conv2d(tower_conv2, 256, 3,
scope='Conv2d_0b_3x3')
tower_conv2_2 = slim.conv2d(tower_conv2_1, 256, 3, stride=2,
padding='VALID', scope='Conv2d_1a_3x3')
with tf.variable_scope('Branch_3'):
tower_pool = slim.max_pool2d(net, 3, stride=2, padding='VALID',
scope='MaxPool_1a_3x3')
net = tf.concat([tower_conv_1, tower_conv1_1,
tower_conv2_2, tower_pool], 3)
return net
def inference(images, keep_probability, phase_train=True,
bottleneck_layer_size=128, weight_decay=0.0, reuse=None):
batch_norm_params = {
# Decay for the moving averages.
'decay': 0.995,
# epsilon to prevent 0s in variance.
'epsilon': 0.001,
# force in-place updates of mean and variance estimates
'updates_collections': None,
# Moving averages ends up in the trainable variables collection
'variables_collections': [ tf.GraphKeys.TRAINABLE_VARIABLES ],
}
with slim.arg_scope([slim.conv2d, slim.fully_connected],
weights_initializer=slim.initializers.xavier_initializer(),
weights_regularizer=slim.l2_regularizer(weight_decay),
normalizer_fn=slim.batch_norm,
normalizer_params=batch_norm_params):
return inception_resnet_v1(images, is_training=phase_train,
dropout_keep_prob=keep_probability, bottleneck_layer_size=bottleneck_layer_size, reuse=reuse)
def inception_resnet_v1(inputs, is_training=True,
dropout_keep_prob=0.8,
bottleneck_layer_size=128,
reuse=None,
scope='InceptionResnetV1'):
"""Creates the Inception Resnet V1 model.
Args:
inputs: a 4-D tensor of size [batch_size, height, width, 3].
num_classes: number of predicted classes.
is_training: whether is training or not.
dropout_keep_prob: float, the fraction to keep before final layer.
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
scope: Optional variable_scope.
Returns:
logits: the logits outputs of the model.
end_points: the set of end_points from the inception model.
"""
end_points = {}
with tf.variable_scope(scope, 'InceptionResnetV1', [inputs], reuse=reuse):
with slim.arg_scope([slim.batch_norm, slim.dropout],
is_training=is_training):
with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
stride=1, padding='SAME'):
# 149 x 149 x 32
net = slim.conv2d(inputs, 32, 3, stride=2, padding='VALID',
scope='Conv2d_1a_3x3')
end_points['Conv2d_1a_3x3'] = net
# 147 x 147 x 32
net = slim.conv2d(net, 32, 3, padding='VALID',
scope='Conv2d_2a_3x3')
end_points['Conv2d_2a_3x3'] = net
# 147 x 147 x 64
net = slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3')
end_points['Conv2d_2b_3x3'] = net
# 73 x 73 x 64
net = slim.max_pool2d(net, 3, stride=2, padding='VALID',
scope='MaxPool_3a_3x3')
end_points['MaxPool_3a_3x3'] = net
# 73 x 73 x 80
net = slim.conv2d(net, 80, 1, padding='VALID',
scope='Conv2d_3b_1x1')
end_points['Conv2d_3b_1x1'] = net
# 71 x 71 x 192
net = slim.conv2d(net, 192, 3, padding='VALID',
scope='Conv2d_4a_3x3')
end_points['Conv2d_4a_3x3'] = net
# 35 x 35 x 256
net = slim.conv2d(net, 256, 3, stride=2, padding='VALID',
scope='Conv2d_4b_3x3')
end_points['Conv2d_4b_3x3'] = net
# 5 x Inception-resnet-A
net = slim.repeat(net, 5, block35, scale=0.17)
end_points['Mixed_5a'] = net
# Reduction-A
with tf.variable_scope('Mixed_6a'):
net = reduction_a(net, 192, 192, 256, 384)
end_points['Mixed_6a'] = net
# 10 x Inception-Resnet-B
net = slim.repeat(net, 10, block17, scale=0.10)
end_points['Mixed_6b'] = net
# Reduction-B
with tf.variable_scope('Mixed_7a'):
net = reduction_b(net)
end_points['Mixed_7a'] = net
# 5 x Inception-Resnet-C
net = slim.repeat(net, 5, block8, scale=0.20)
end_points['Mixed_8a'] = net
net = block8(net, activation_fn=None)
end_points['Mixed_8b'] = net
with tf.variable_scope('Logits'):
end_points['PrePool'] = net
#pylint: disable=no-member
net = slim.avg_pool2d(net, net.get_shape()[1:3], padding='VALID',
scope='AvgPool_1a_8x8')
net = slim.flatten(net)
net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
scope='Dropout')
end_points['PreLogitsFlatten'] = net
net = slim.fully_connected(net, bottleneck_layer_size, activation_fn=None,
scope='Bottleneck', reuse=False)
return net, end_points
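An illustrative call of this backbone under TF 1.x (the placeholder is hypothetical; in this commit the network is actually wrapped by Perceptual_Net in networks.py):
imgs = tf.placeholder(tf.float32, [None, 224, 224, 3], name='face_imgs')  # hypothetical input
feat, end_points = inception_resnet_v1(imgs, is_training=False, bottleneck_layer_size=128)
# feat: [batch, 128] bottleneck feature used as the identity embedding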

76
losses.py Normal file
View file

@@ -0,0 +1,76 @@
import tensorflow as tf
from scipy.io import loadmat,savemat
###############################################################################################
# Define losses for training
###############################################################################################
# photometric loss
# input_imgs and render_imgs are [batchsize,h,w,3] BGR images
# img_mask are [batchsize,h,w,1] attention masks
def Photo_loss(input_imgs,render_imgs,img_mask):
input_imgs = tf.cast(input_imgs,tf.float32)
# img_mask = tf.squeeze(img_mask,3)
img_mask = tf.stop_gradient(img_mask[:,:,:,0])
# photo loss with skin attention
photo_loss = tf.sqrt(tf.reduce_sum(tf.square(input_imgs - render_imgs),axis = 3))*img_mask/255
photo_loss = tf.reduce_sum(photo_loss) / tf.maximum(tf.reduce_sum(img_mask),1.0)
return photo_loss
# perceptual loss
# id_feature and id_label are [batchsize, c] identity features for reconstruction images and input images
def Perceptual_loss(id_feature,id_label):
id_feature = tf.nn.l2_normalize(id_feature, dim = 1)
id_label = tf.nn.l2_normalize(id_label, dim = 1)
# cosine similarity
sim = tf.reduce_sum(id_feature*id_label,1)
loss = tf.reduce_sum(tf.maximum(0.0,1.0 - sim))/tf.cast(tf.shape(id_feature)[0],tf.float32)
return loss
# landmark loss
# landmark_p and landmark_label are [batchsize, 68, 2] landmark projections for reconstruction images and input images
def Landmark_loss(landmark_p,landmark_label):
# we set higher weights for landmarks around the mouth and nose regions
landmark_weight = tf.concat([tf.ones([1,28]),20*tf.ones([1,3]),tf.ones([1,29]),20*tf.ones([1,8])],axis = 1)
landmark_weight = tf.tile(landmark_weight,[tf.shape(landmark_p)[0],1])
landmark_loss = tf.reduce_sum(tf.reduce_sum(tf.square(landmark_p-landmark_label),2)*landmark_weight)/(68.0*tf.cast(tf.shape(landmark_p)[0],tf.float32))
return landmark_loss
# coefficient regularization to ensure plausible 3d faces
def Regulation_loss(id_coeff,ex_coeff,tex_coeff,opt):
w_ex = opt.w_ex
w_tex = opt.w_tex
regulation_loss = tf.nn.l2_loss(id_coeff) + w_ex * tf.nn.l2_loss(ex_coeff) + w_tex * tf.nn.l2_loss(tex_coeff)
regulation_loss = 2*regulation_loss/ tf.cast(tf.shape(id_coeff)[0],tf.float32)
return regulation_loss
# albedo regularization to encourage a uniform skin albedo
def Reflectance_loss(face_texture,facemodel):
skin_mask = facemodel.skin_mask
skin_mask = tf.reshape(skin_mask,[1,tf.shape(skin_mask)[0],1])
texture_mean = tf.reduce_sum(face_texture*skin_mask,1)/tf.reduce_sum(skin_mask)
texture_mean = tf.expand_dims(texture_mean,1)
# minimize texture variance for pre-defined skin region
reflectance_loss = tf.reduce_sum(tf.square((face_texture - texture_mean)*skin_mask/255.0))/(tf.cast(tf.shape(face_texture)[0],tf.float32)*tf.reduce_sum(skin_mask))
return reflectance_loss
# gamma regularization to ensure a nearly-monochromatic light
def Gamma_loss(gamma):
gamma = tf.reshape(gamma,[-1,3,9])
gamma_mean = tf.reduce_mean(gamma,1, keep_dims = True)
gamma_loss = tf.reduce_mean(tf.square(gamma - gamma_mean))
return gamma_loss
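For reference, the terms above are combined with the weights defined in options.py; a minimal sketch of the weighted sum (the same combination appears in reconstruction_model.py below):
def total_loss(opt, photo, lm, id_sim, reg, refl, gamma):
    # w_photo, w_lm, w_id, w_reg, w_ref, w_gamma are fields of Option
    return (opt.w_photo * photo + opt.w_lm * lm + opt.w_id * id_sim
            + opt.w_reg * reg + opt.w_ref * refl + opt.w_gamma * gamma)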

View file

87
networks.py Normal file
View file

@@ -0,0 +1,87 @@
import tensorflow as tf
from tensorflow.contrib.slim.nets import resnet_v1
slim = tf.contrib.slim
from inception_resnet_v1 import inception_resnet_v1
###############################################################################################
#Define R-Net and Perceptual-Net for 3D face reconstruction
###############################################################################################
def R_Net(inputs,is_training=True):
#input: [Batchsize,H,W,C], 0-255, BGR image
inputs = tf.cast(inputs,tf.float32)
# standard ResNet-50 backbone (without the final classification FC layer)
with slim.arg_scope(resnet_v1.resnet_arg_scope()):
net,end_points = resnet_v1.resnet_v1_50(inputs,is_training = is_training ,reuse = tf.AUTO_REUSE)
# Modified FC layer with 257 channels for reconstruction coefficients
net_id = slim.conv2d(net, 80, [1, 1],
activation_fn=None,
normalizer_fn=None,
weights_initializer = tf.zeros_initializer(),
scope='fc-id')
net_ex = slim.conv2d(net, 64, [1, 1],
activation_fn=None,
normalizer_fn=None,
weights_initializer = tf.zeros_initializer(),
scope='fc-ex')
net_tex = slim.conv2d(net, 80, [1, 1],
activation_fn=None,
normalizer_fn=None,
weights_initializer = tf.zeros_initializer(),
scope='fc-tex')
net_angles = slim.conv2d(net, 3, [1, 1],
activation_fn=None,
normalizer_fn=None,
weights_initializer = tf.zeros_initializer(),
scope='fc-angles')
net_gamma = slim.conv2d(net, 27, [1, 1],
activation_fn=None,
normalizer_fn=None,
weights_initializer = tf.zeros_initializer(),
scope='fc-gamma')
net_t_xy = slim.conv2d(net, 2, [1, 1],
activation_fn=None,
normalizer_fn=None,
weights_initializer = tf.zeros_initializer(),
scope='fc-XY')
net_t_z = slim.conv2d(net, 1, [1, 1],
activation_fn=None,
normalizer_fn=None,
weights_initializer = tf.zeros_initializer(),
scope='fc-Z')
net_id = tf.squeeze(net_id, [1,2], name='fc-id/squeezed')
net_ex = tf.squeeze(net_ex, [1,2], name='fc-ex/squeezed')
net_tex = tf.squeeze(net_tex, [1,2],name='fc-tex/squeezed')
net_angles = tf.squeeze(net_angles,[1,2], name='fc-angles/squeezed')
net_gamma = tf.squeeze(net_gamma,[1,2], name='fc-gamma/squeezed')
net_t_xy = tf.squeeze(net_t_xy,[1,2], name='fc-XY/squeezed')
net_t_z = tf.squeeze(net_t_z,[1,2], name='fc-Z/squeezed')
net_ = tf.concat([net_id,net_ex,net_tex,net_angles,net_gamma,net_t_xy,net_t_z], axis = 1)
return net_
def Perceptual_Net(input_imgs):
#input_imgs: [Batchsize,H,W,C], 0-255, BGR image
input_imgs = tf.reshape(input_imgs,[-1,224,224,3])
input_imgs = tf.cast(input_imgs,tf.float32)
input_imgs = tf.clip_by_value(input_imgs,0,255)
input_imgs = (input_imgs - 127.5)/128.0
# standard FaceNet backbone
batch_norm_params = {
'decay': 0.995,
'epsilon': 0.001,
'updates_collections': None}
with slim.arg_scope([slim.conv2d, slim.fully_connected],weights_initializer=slim.initializers.xavier_initializer(),
weights_regularizer=slim.l2_regularizer(0.0),
normalizer_fn=slim.batch_norm,
normalizer_params=batch_norm_params):
feature_128,_ = inception_resnet_v1(input_imgs, bottleneck_layer_size=128, is_training=False, reuse=tf.AUTO_REUSE)
# output the last FC layer feature(before classification) as identity feature
return feature_128
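A minimal usage sketch (TF 1.x; placeholder name illustrative), mirroring the .ckpt branch added to demo.py:
images = tf.placeholder(tf.float32, [1, 224, 224, 3], name='input_imgs')
coeff = R_Net(images, is_training=False)  # [1, 257] reconstruction coefficients
id_feat = Perceptual_Net(images)          # [1, 128] identity feature for the perceptual loss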

60
options.py Normal file
View file

@@ -0,0 +1,60 @@
import numpy as np
import tensorflow as tf
import os
# training options
class Option():
def __init__(self):
#--------------------------------------------------------------------------------------
self.model_dir = 'result'
self.model_name = 'model_test2'
self.data_path = ['./processed_data']
self.val_data_path = ['./processed_data']
self.model_save_path = os.path.join(self.model_dir,self.model_name)
if not os.path.exists(self.model_save_path):
os.makedirs(self.model_save_path)
self.summary_dir = os.path.join(self.model_save_path,'summary')
self.train_summary_path = os.path.join(self.summary_dir, 'train')
self.val_summary_path = os.path.join(self.summary_dir, 'val')
#---------------------------------------------------------------------------------------
# visible gpu settings
self.config = tf.ConfigProto()
self.config.gpu_options.visible_device_list = '0'
self.is_train = True
self.use_pb = True
#---------------------------------------------------------------------------------------
# training parameters
self.w_photo = 1.92
self.w_lm = 1.6e-3
self.w_id = 0.2
self.w_reg = 3.0e-4
self.w_ref = 5.0
self.w_gamma = 10.0
self.w_ex = 0.8
self.w_tex = 1.7e-2
self.batch_size = 16
self.boundaries = [100000]
lr = [1e-4,2e-5]
self.global_step = tf.Variable(0,name='global_step',trainable = False)
self.lr = tf.train.piecewise_constant(self.global_step,self.boundaries,lr)
self.augment = True
self.train_maxiter = 200000
self.train_summary_iter = 50
self.image_summary_iter = 200
self.val_summary_iter = 1000
self.save_iter = 10000
#---------------------------------------------------------------------------------------
# initial weights for resnet and facenet
self.R_net_weights = os.path.join('./weights/resnet','resnet_v1_50.ckpt')
self.Perceptual_net_weights = './weights/id_net/model-20170512-110547.ckpt-250000'
self.pretrain_weights = os.path.join('train/model_test2','iter_100000.ckpt')
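At inference time the same options object is reused with a few fields overridden, as demo.py does above; for example:
opt = Option()
opt.batch_size = 1     # single-image reconstruction
opt.is_train = False   # Render_block then skips the extra face-region mask pass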

View file

@@ -1,6 +1,14 @@
import numpy as np
from scipy.io import loadmat,savemat
from PIL import Image
from skin import skinmask
import argparse
from utils import *
import os
import glob
import tensorflow as tf
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# solve a least-squares problem for pose (translation and scale)
def POS(xp,x):
@@ -27,7 +35,8 @@ def POS(xp,x):
return t,s
def process_img(img,lm,t,s,target_size = 224.):
# resize and crop images
def resize_n_crop_img(img,lm,t,s,target_size = 224.):
w0,h0 = img.size
w = (w0/s*102).astype(np.int32)
h = (h0/s*102).astype(np.int32)
@@ -49,7 +58,7 @@ def process_img(img,lm,t,s,target_size = 224.):
# resize and crop input images before sending to the R-Net
def Preprocess(img,lm,lm3D):
def align_img(img,lm,lm3D):
w0,h0 = img.size
@@ -60,9 +69,83 @@ def Preprocess(img,lm,lm3D):
t,s = POS(lm.transpose(),lm3D.transpose())
# processing the image
img_new,lm_new = process_img(img,lm,t,s)
img_new,lm_new = resize_n_crop_img(img,lm,t,s)
lm_new = np.stack([lm_new[:,0],223 - lm_new[:,1]], axis = 1)
trans_params = np.array([w0,h0,102.0/s,t[0],t[1]])
return img_new,lm_new,trans_params
# detect 68 face landmarks for aligned images
def get_68landmark(img,detector,sess):
input_img = detector.get_tensor_by_name('input_imgs:0')
lm = detector.get_tensor_by_name('landmark:0')
landmark = sess.run(lm,feed_dict={input_img:img})
landmark = np.reshape(landmark,[68,2])
landmark = np.stack([landmark[:,1],223-landmark[:,0]],axis=1)
return landmark
# get skin attention mask for aligned images
def get_skinmask(img):
img = np.squeeze(img,0)
skin_img = skinmask(img)
return skin_img
def parse_args():
desc = "Data preprocessing for Deep3DRecon."
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('--img_path', type=str, default='./input', help='original images folder')
parser.add_argument('--save_path', type=str, default='./processed_data', help='custom path to save processed images and labels')
return parser.parse_args()
# training data pre-processing
def preprocessing():
args = parse_args()
image_path = args.img_path
save_path = args.save_path
if not os.path.isdir(save_path):
os.makedirs(save_path)
if not os.path.isdir(os.path.join(save_path,'lm')):
os.makedirs(os.path.join(save_path,'lm'))
if not os.path.isdir(os.path.join(save_path,'lm_bin')):
os.makedirs(os.path.join(save_path,'lm_bin'))
if not os.path.isdir(os.path.join(save_path,'mask')):
os.makedirs(os.path.join(save_path,'mask'))
img_list = sorted(glob.glob(image_path + '/' + '*.png'))
img_list += sorted(glob.glob(image_path + '/' + '*.jpg'))
lm3D = load_lm3d()
with tf.Graph().as_default() as graph, tf.device('/gpu:0'):
lm_detector = load_graph(os.path.join('network','landmark68_detector.pb'))
tf.import_graph_def(lm_detector,name='')
sess = tf.InteractiveSession()
for file in img_list:
print(file)
name = file.split('/')[-1].replace('.png','').replace('.jpg','')
img,lm5p = load_img(file,file.replace('png','txt').replace('jpg','txt'))
img_align,_,_ = align_img(img,lm5p,lm3D) # [1,224,224,3] BGR image
lm68p = get_68landmark(img_align,graph,sess)
lm68p = lm68p.astype(np.float64)
skin_mask = get_skinmask(img_align)
Image.fromarray(img_align.squeeze(0)[:,:,::-1].astype(np.uint8),'RGB').save(os.path.join(save_path,name+'.png'))
Image.fromarray(skin_mask.astype(np.uint8)).save(os.path.join(save_path,'mask',name+'.png'))
np.savetxt(os.path.join(save_path,'lm',name+'.txt'),lm68p)
lm_bin = np.reshape(lm68p,[-1])
lm_bin.tofile(os.path.join(save_path,'lm_bin',name+'.bin'))
if __name__ == '__main__':
preprocessing()

86
reconstruction_model.py Normal file
View file

@@ -0,0 +1,86 @@
import tensorflow as tf
import face_decoder
import networks
import losses
from utils import *
###############################################################################################
# model for single image face reconstruction
###############################################################################################
class Reconstruction_model():
# initialization
def __init__(self,opt):
self.Face3D = face_decoder.Face3D() #analytic 3D face object
self.opt = opt # training options
self.Optimizer = tf.train.AdamOptimizer(learning_rate = opt.lr) # optimizer
# load input data from queue
def set_input(self,input_iterator):
self.imgs,self.lm_labels,self.attention_masks = input_iterator.get_next()
# forward process of the model
def forward(self,is_train = True):
with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
self.coeff = networks.R_Net(self.imgs,is_training=is_train)
self.Face3D.Reconstruction_Block(self.coeff,self.opt)
self.id_labels = networks.Perceptual_Net(self.imgs)
self.id_features = networks.Perceptual_Net(self.Face3D.render_imgs)
self.photo_loss = losses.Photo_loss(self.imgs,self.Face3D.render_imgs,self.Face3D.img_mask_crop*self.attention_masks)
self.landmark_loss = losses.Landmark_loss(self.Face3D.landmark_p,self.lm_labels)
self.perceptual_loss = losses.Perceptual_loss(self.id_features,self.id_labels)
self.reg_loss = losses.Regulation_loss(self.Face3D.id_coeff,self.Face3D.ex_coeff,self.Face3D.tex_coeff,self.opt)
self.reflect_loss = losses.Reflectance_loss(self.Face3D.face_texture,self.Face3D.facemodel)
self.gamma_loss = losses.Gamma_loss(self.Face3D.gamma)
self.loss = self.opt.w_photo*self.photo_loss + self.opt.w_lm*self.landmark_loss + self.opt.w_id*self.perceptual_loss\
+ self.opt.w_reg*self.reg_loss + self.opt.w_ref*self.reflect_loss + self.opt.w_gamma*self.gamma_loss
# backward process
def backward(self,is_train = True):
if is_train:
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
var_list = tf.trainable_variables()
update_var_list = [v for v in var_list if 'resnet_v1_50' in v.name or 'fc-' in v.name]
grads = tf.gradients(self.loss,update_var_list)
# get train_op with update_ops to ensure updating for bn parameters
with tf.control_dependencies(update_ops):
self.train_op = self.Optimizer.apply_gradients(zip(grads,update_var_list),global_step = self.opt.global_step)
# if not training stage, avoid updating variables
else:
pass
# forward and backward
def step(self, is_train = True):
with tf.variable_scope(tf.get_variable_scope()) as scope:
self.forward(is_train = is_train)
self.backward(is_train = is_train)
# statistics summarization
def summarize(self):
# scalar and histogram stats
stat = [
tf.summary.scalar('reflect_error',self.reflect_loss),
tf.summary.scalar('gamma_error',self.gamma_loss),
tf.summary.scalar('id_sim_error',self.perceptual_loss),
tf.summary.scalar('lm_error',tf.sqrt(self.landmark_loss)),
tf.summary.scalar('photo_error',self.photo_loss),
tf.summary.scalar('train_error',self.loss),
tf.summary.histogram('id_coeff',self.Face3D.id_coeff),
tf.summary.histogram('ex_coeff',self.Face3D.ex_coeff),
tf.summary.histogram('tex_coeff',self.Face3D.tex_coeff)]
self.summary_stat = tf.summary.merge(stat)
# combine face region of reconstruction images with input images
render_imgs = self.Face3D.render_imgs[:,:,:,::-1]*self.Face3D.img_mask + tf.cast(self.imgs[:,:,:,::-1],tf.float32)*(1-self.Face3D.img_mask)
render_imgs = tf.clip_by_value(render_imgs,0,255)
render_imgs = tf.cast(render_imgs,tf.uint8)
# image stats
img_stat = [tf.summary.image('imgs',tf.concat([tf.cast(self.imgs[:,:,:,::-1],tf.uint8),render_imgs],axis = 2), max_outputs = 8)]
self.summary_img = tf.summary.merge(img_stat)
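The training driver is not shown in this section; from the class API above, a typical TF 1.x loop would look roughly like the sketch below (file and variable names are illustrative assumptions, and a real driver would also restore the initial R_net_weights / Perceptual_net_weights before training):
# Illustrative training loop, not part of this diff.
import tensorflow as tf
from data_loader import load_dataset
from options import Option
from reconstruction_model import Reconstruction_model

opt = Option()
model = Reconstruction_model(opt)
model.set_input(load_dataset(opt, train=True))
model.step(is_train=True)   # builds the forward graph, losses and train_op
model.summarize()

with tf.Session(config=opt.config) as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(opt.train_summary_path, sess.graph)
    for it in range(opt.train_maxiter):
        _, loss_val, summary = sess.run([model.train_op, model.loss, model.summary_stat])
        if it % opt.train_summary_iter == 0:
            writer.add_summary(summary, it)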

1
renderer/__init__.py Normal file
View file

@@ -0,0 +1 @@
#.

152
renderer/camera_utils.py Normal file
View file

@@ -0,0 +1,152 @@
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Collection of TF functions for managing 3D camera matrices."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
def perspective(aspect_ratio, fov_y, near_clip, far_clip):
"""Computes perspective transformation matrices.
Functionality mimes gluPerspective (third_party/GL/glu/include/GLU/glu.h).
Args:
aspect_ratio: float value specifying the image aspect ratio (width/height).
fov_y: 1-D float32 Tensor with shape [batch_size] specifying output vertical
field of views in degrees.
near_clip: 1-D float32 Tensor with shape [batch_size] specifying near
clipping plane distance.
far_clip: 1-D float32 Tensor with shape [batch_size] specifying far clipping
plane distance.
Returns:
A [batch_size, 4, 4] float tensor that maps from right-handed points in eye
space to left-handed points in clip space.
"""
# The multiplication of fov_y by pi/360.0 simultaneously converts to radians
# and adds the half-angle factor of .5.
focal_lengths_y = 1.0 / tf.tan(fov_y * (math.pi / 360.0))
depth_range = far_clip - near_clip
p_22 = -(far_clip + near_clip) / depth_range
p_23 = -2.0 * (far_clip * near_clip / depth_range)
zeros = tf.zeros_like(p_23, dtype=tf.float32)
# pyformat: disable
perspective_transform = tf.concat(
[
focal_lengths_y / aspect_ratio, zeros, zeros, zeros,
zeros, focal_lengths_y, zeros, zeros,
zeros, zeros, p_22, p_23,
zeros, zeros, -tf.ones_like(p_23, dtype=tf.float32), zeros
], axis=0)
# pyformat: enable
perspective_transform = tf.reshape(perspective_transform, [4, 4, -1])
return tf.transpose(perspective_transform, [2, 0, 1])
def look_at(eye, center, world_up):
"""Computes camera viewing matrices.
Functionality mimes gluLookAt (third_party/GL/glu/include/GLU/glu.h).
Args:
eye: 2-D float32 tensor with shape [batch_size, 3] containing the XYZ world
space position of the camera.
center: 2-D float32 tensor with shape [batch_size, 3] containing a position
along the center of the camera's gaze.
world_up: 2-D float32 tensor with shape [batch_size, 3] specifying the
world's up direction; the output camera will have no tilt with respect
to this direction.
Returns:
A [batch_size, 4, 4] float tensor containing a right-handed camera
extrinsics matrix that maps points from world space to points in eye space.
"""
batch_size = center.shape[0].value
vector_degeneracy_cutoff = 1e-6
forward = center - eye
forward_norm = tf.norm(forward, ord='euclidean', axis=1, keep_dims=True)
# tf.assert_greater(
# forward_norm,
# vector_degeneracy_cutoff,
# message='Camera matrix is degenerate because eye and center are close.')
forward = tf.divide(forward, forward_norm)
to_side = tf.cross(forward, world_up)
to_side_norm = tf.norm(to_side, ord='euclidean', axis=1, keep_dims=True)
# tf.assert_greater(
# to_side_norm,
# vector_degeneracy_cutoff,
# message='Camera matrix is degenerate because up and gaze are close or'
# 'because up is degenerate.')
to_side = tf.divide(to_side, to_side_norm)
cam_up = tf.cross(to_side, forward)
w_column = tf.constant(
batch_size * [[0., 0., 0., 1.]], dtype=tf.float32) # [batch_size, 4]
w_column = tf.reshape(w_column, [batch_size, 4, 1])
view_rotation = tf.stack(
[to_side, cam_up, -forward,
tf.zeros_like(to_side, dtype=tf.float32)],
axis=1) # [batch_size, 4, 3] matrix
view_rotation = tf.concat(
[view_rotation, w_column], axis=2) # [batch_size, 4, 4]
identity_batch = tf.tile(tf.expand_dims(tf.eye(3), 0), [batch_size, 1, 1])
view_translation = tf.concat([identity_batch, tf.expand_dims(-eye, 2)], 2)
view_translation = tf.concat(
[view_translation,
tf.reshape(w_column, [batch_size, 1, 4])], 1)
camera_matrices = tf.matmul(view_rotation, view_translation)
return camera_matrices
def euler_matrices(angles):
"""Computes a XYZ Tait-Bryan (improper Euler angle) rotation.
Returns 4x4 matrices for convenient multiplication with other transformations.
Args:
angles: a [batch_size, 3] tensor containing X, Y, and Z angles in radians.
Returns:
a [batch_size, 4, 4] tensor of matrices.
"""
s = tf.sin(angles)
c = tf.cos(angles)
# Rename variables for readability in the matrix definition below.
c0, c1, c2 = (c[:, 0], c[:, 1], c[:, 2])
s0, s1, s2 = (s[:, 0], s[:, 1], s[:, 2])
zeros = tf.zeros_like(s[:, 0])
ones = tf.ones_like(s[:, 0])
# pyformat: disable
flattened = tf.concat(
[
c2 * c1, c2 * s1 * s0 - c0 * s2, s2 * s0 + c2 * c0 * s1, zeros,
c1 * s2, c2 * c0 + s2 * s1 * s0, c0 * s2 * s1 - c2 * s0, zeros,
-s1, c1 * s0, c1 * c0, zeros,
zeros, zeros, zeros, ones
],
axis=0)
# pyformat: enable
reshaped = tf.reshape(flattened, [4, 4, -1])
return tf.transpose(reshaped, [2, 0, 1])
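A small usage sketch for the two camera helpers above (TF 1.x; the camera values mirror the ones used in face_decoder.py):
eye = tf.constant([[0.0, 0.0, 10.0]])    # camera position
center = tf.constant([[0.0, 0.0, 0.0]])  # look-at target
world_up = tf.constant([[0.0, 1.0, 0.0]])
extrinsics = look_at(eye, center, world_up)               # [1, 4, 4] world -> eye
proj = perspective(aspect_ratio=1.0,
                   fov_y=tf.constant([12.5936]),
                   near_clip=tf.constant([0.01]),
                   far_clip=tf.constant([50.0]))          # [1, 4, 4] eye -> clip
clip_from_world = tf.matmul(proj, extrinsics)             # full camera transform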

404
renderer/mesh_renderer.py Normal file
View file

@@ -0,0 +1,404 @@
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Differentiable 3-D rendering of a triangle mesh."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from renderer import camera_utils
from renderer import rasterize_triangles
def phong_shader(normals,
alphas,
pixel_positions,
light_positions,
light_intensities,
diffuse_colors=None,
camera_position=None,
specular_colors=None,
shininess_coefficients=None,
ambient_color=None):
"""Computes pixelwise lighting from rasterized buffers with the Phong model.
Args:
normals: a 4D float32 tensor with shape [batch_size, image_height,
image_width, 3]. The inner dimension is the world space XYZ normal for
the corresponding pixel. Should be already normalized.
alphas: a 3D float32 tensor with shape [batch_size, image_height,
image_width]. The inner dimension is the alpha value (transparency)
for the corresponding pixel.
pixel_positions: a 4D float32 tensor with shape [batch_size, image_height,
image_width, 3]. The inner dimension is the world space XYZ position for
the corresponding pixel.
light_positions: a 3D tensor with shape [batch_size, light_count, 3]. The
XYZ position of each light in the scene. In the same coordinate space as
pixel_positions.
light_intensities: a 3D tensor with shape [batch_size, light_count, 3]. The
RGB intensity values for each light. Intensities may be above one.
diffuse_colors: a 4D float32 tensor with shape [batch_size, image_height,
image_width, 3]. The inner dimension is the diffuse RGB coefficients at
a pixel in the range [0, 1].
camera_position: a 1D tensor with shape [batch_size, 3]. The XYZ camera
position in the scene. If supplied, specular reflections will be
computed. If not supplied, specular_colors and shininess_coefficients
are expected to be None. In the same coordinate space as
pixel_positions.
specular_colors: a 4D float32 tensor with shape [batch_size, image_height,
image_width, 3]. The inner dimension is the specular RGB coefficients at
a pixel in the range [0, 1]. If None, assumed to be tf.zeros()
shininess_coefficients: A 3D float32 tensor that is broadcasted to shape
[batch_size, image_height, image_width]. The inner dimension is the
shininess coefficient for the object at a pixel. Dimensions that are
constant can be given length 1, so [batch_size, 1, 1] and [1, 1, 1] are
also valid input shapes.
ambient_color: a 2D tensor with shape [batch_size, 3]. The RGB ambient
color, which is added to each pixel before tone mapping. If None, it is
assumed to be tf.zeros().
Returns:
A 4D float32 tensor of shape [batch_size, image_height, image_width, 4]
containing the lit RGBA color values for each image at each pixel. Colors
are in the range [0,1].
Raises:
ValueError: An invalid argument to the method is detected.
"""
batch_size, image_height, image_width = [s.value for s in normals.shape[:-1]]
light_count = light_positions.shape[1].value
pixel_count = image_height * image_width
# Reshape all values to easily do pixelwise computations:
normals = tf.reshape(normals, [batch_size, -1, 3])
alphas = tf.reshape(alphas, [batch_size, -1, 1])
diffuse_colors = tf.reshape(diffuse_colors, [batch_size, -1, 3])
if camera_position is not None:
specular_colors = tf.reshape(specular_colors, [batch_size, -1, 3])
# Ambient component
output_colors = tf.zeros([batch_size, image_height * image_width, 3])
if ambient_color is not None:
ambient_reshaped = tf.expand_dims(ambient_color, axis=1)
output_colors = tf.add(output_colors, ambient_reshaped * diffuse_colors)
# Diffuse component
pixel_positions = tf.reshape(pixel_positions, [batch_size, -1, 3])
per_light_pixel_positions = tf.stack(
[pixel_positions] * light_count,
axis=1) # [batch_size, light_count, pixel_count, 3]
directions_to_lights = tf.nn.l2_normalize(
tf.expand_dims(light_positions, axis=2) - per_light_pixel_positions,
dim=3) # [batch_size, light_count, pixel_count, 3]
# The specular component should only contribute when the light and normal
# face one another (i.e. the dot product is nonnegative):
normals_dot_lights = tf.clip_by_value(
tf.reduce_sum(
tf.expand_dims(normals, axis=1) * directions_to_lights, axis=3), 0.0,
1.0) # [batch_size, light_count, pixel_count]
diffuse_output = tf.expand_dims(
diffuse_colors, axis=1) * tf.expand_dims(
normals_dot_lights, axis=3) * tf.expand_dims(
light_intensities, axis=2)
diffuse_output = tf.reduce_sum(
diffuse_output, axis=1) # [batch_size, pixel_count, 3]
output_colors = tf.add(output_colors, diffuse_output)
# Specular component
if camera_position is not None:
camera_position = tf.reshape(camera_position, [batch_size, 1, 3])
mirror_reflection_direction = tf.nn.l2_normalize(
2.0 * tf.expand_dims(normals_dot_lights, axis=3) * tf.expand_dims(
normals, axis=1) - directions_to_lights,
dim=3)
direction_to_camera = tf.nn.l2_normalize(
camera_position - pixel_positions, dim=2)
reflection_direction_dot_camera_direction = tf.reduce_sum(
tf.expand_dims(direction_to_camera, axis=1) *
mirror_reflection_direction,
axis=3)
# The specular component should only contribute when the reflection is
# external:
reflection_direction_dot_camera_direction = tf.clip_by_value(
tf.nn.l2_normalize(reflection_direction_dot_camera_direction, dim=2),
0.0, 1.0)
# The specular component should also only contribute when the diffuse
# component contributes:
reflection_direction_dot_camera_direction = tf.where(
normals_dot_lights != 0.0, reflection_direction_dot_camera_direction,
tf.zeros_like(
reflection_direction_dot_camera_direction, dtype=tf.float32))
# Reshape to support broadcasting the shininess coefficient, which rarely
# varies per-vertex:
reflection_direction_dot_camera_direction = tf.reshape(
reflection_direction_dot_camera_direction,
[batch_size, light_count, image_height, image_width])
shininess_coefficients = tf.expand_dims(shininess_coefficients, axis=1)
specularity = tf.reshape(
tf.pow(reflection_direction_dot_camera_direction,
shininess_coefficients),
[batch_size, light_count, pixel_count, 1])
specular_output = tf.expand_dims(
specular_colors, axis=1) * specularity * tf.expand_dims(
light_intensities, axis=2)
specular_output = tf.reduce_sum(specular_output, axis=1)
output_colors = tf.add(output_colors, specular_output)
rgb_images = tf.reshape(output_colors,
[batch_size, image_height, image_width, 3])
alpha_images = tf.reshape(alphas, [batch_size, image_height, image_width, 1])
valid_rgb_values = tf.concat(3 * [alpha_images > 0.5], axis=3)
rgb_images = tf.where(valid_rgb_values, rgb_images,
tf.zeros_like(rgb_images, dtype=tf.float32))
return tf.reverse(tf.concat([rgb_images, alpha_images], axis=3), axis=[1])
def tone_mapper(image, gamma):
"""Applies gamma correction to the input image.
Tone maps the input image batch in order to make scenes with a high dynamic
range viewable. The gamma correction factor is computed separately per image,
but is shared between all provided channels. The exact function computed is:
image_out = A*image_in^gamma, where A is an image-wide constant computed so
that the maximum image value is approximately 1. The correction is applied
to all channels.
Args:
image: 4-D float32 tensor with shape [batch_size, image_height,
image_width, channel_count]. The batch of images to tone map.
gamma: 0-D float32 nonnegative tensor. Values of gamma below one compress
relative contrast in the image, and values above one increase it. A
value of 1 is equivalent to scaling the image to have a maximum value
of 1.
Returns:
4-D float32 tensor with shape [batch_size, image_height, image_width,
channel_count]. Contains the gamma-corrected images, clipped to the range
[0, 1].
"""
batch_size = image.shape[0].value
corrected_image = tf.pow(image, gamma)
image_max = tf.reduce_max(
tf.reshape(corrected_image, [batch_size, -1]), axis=1)
scaled_image = tf.divide(corrected_image,
tf.reshape(image_max, [batch_size, 1, 1, 1]))
return tf.clip_by_value(scaled_image, 0.0, 1.0)
def mesh_renderer(vertices,
triangles,
normals,
diffuse_colors,
camera_position,
camera_lookat,
camera_up,
light_positions,
light_intensities,
image_width,
image_height,
specular_colors=None,
shininess_coefficients=None,
ambient_color=None,
fov_y=40.0,
near_clip=0.01,
far_clip=50.0):
"""Renders an input scene using phong shading, and returns an output image.
Args:
vertices: 3-D float32 tensor with shape [batch_size, vertex_count, 3]. Each
triplet is an xyz position in world space.
triangles: 2-D int32 tensor with shape [triangle_count, 3]. Each triplet
should contain vertex indices describing a triangle such that the
triangle's normal points toward the viewer if the forward order of the
triplet defines a clockwise winding of the vertices. Gradients with
respect to this tensor are not available.
normals: 3-D float32 tensor with shape [batch_size, vertex_count, 3]. Each
triplet is the xyz vertex normal for its corresponding vertex. Each
vector is assumed to be already normalized.
diffuse_colors: 3-D float32 tensor with shape [batch_size,
vertex_count, 3]. The RGB diffuse reflection in the range [0,1] for
each vertex.
camera_position: 2-D tensor with shape [batch_size, 3] or 1-D tensor with
shape [3] specifying the XYZ world space camera position.
camera_lookat: 2-D tensor with shape [batch_size, 3] or 1-D tensor with
shape [3] containing an XYZ point along the center of the camera's gaze.
camera_up: 2-D tensor with shape [batch_size, 3] or 1-D tensor with shape
[3] containing the up direction for the camera. The camera will have no
tilt with respect to this direction.
light_positions: a 3-D tensor with shape [batch_size, light_count, 3]. The
XYZ position of each light in the scene. In the same coordinate space as
pixel_positions.
light_intensities: a 3-D tensor with shape [batch_size, light_count, 3]. The
RGB intensity values for each light. Intensities may be above one.
image_width: int specifying desired output image width in pixels.
image_height: int specifying desired output image height in pixels.
specular_colors: 3-D float32 tensor with shape [batch_size,
vertex_count, 3]. The RGB specular reflection in the range [0, 1] for
each vertex. If supplied, specular reflections will be computed, and
both specular_colors and shininess_coefficients are expected.
shininess_coefficients: a 0D-2D float32 tensor with maximum shape
[batch_size, vertex_count]. The phong shininess coefficient of each
vertex. A 0D tensor or float gives a constant shininess coefficient
across all batches and images. A 1D tensor must have shape [batch_size],
and a single shininess coefficient per image is used.
ambient_color: a 2D tensor with shape [batch_size, 3]. The RGB ambient
color, which is added to each pixel in the scene. If None, it is
assumed to be black.
fov_y: float, 0D tensor, or 1D tensor with shape [batch_size] specifying
desired output image y field of view in degrees.
near_clip: float, 0D tensor, or 1D tensor with shape [batch_size] specifying
near clipping plane distance.
far_clip: float, 0D tensor, or 1D tensor with shape [batch_size] specifying
far clipping plane distance.
Returns:
A 4-D float32 tensor of shape [batch_size, image_height, image_width, 4]
containing the lit RGBA color values for each image at each pixel. RGB
colors are the intensity values before tonemapping and can be in the range
[0, infinity]. Clipping to the range [0,1] with tf.clip_by_value is likely
reasonable for both viewing and training most scenes. More complex scenes
with multiple lights should tone map color values for display only. One
simple tonemapping approach is to rescale color values as x/(1+x); gamma
compression is another common technique. Alpha values are zero for
background pixels and near one for mesh pixels.
Raises:
ValueError: An invalid argument to the method is detected.
"""
if len(vertices.shape) != 3:
raise ValueError('Vertices must have shape [batch_size, vertex_count, 3].')
batch_size = vertices.shape[0].value
# print(batch_size)
if len(normals.shape) != 3:
raise ValueError('Normals must have shape [batch_size, vertex_count, 3].')
if len(light_positions.shape) != 3:
raise ValueError(
'Light_positions must have shape [batch_size, light_count, 3].')
if len(light_intensities.shape) != 3:
raise ValueError(
'Light_intensities must have shape [batch_size, light_count, 3].')
if len(diffuse_colors.shape) != 3:
raise ValueError(
'vertex_diffuse_colors must have shape [batch_size, vertex_count, 3].')
if (ambient_color is not None and
ambient_color.get_shape().as_list() != [batch_size, 3]):
raise ValueError('Ambient_color must have shape [batch_size, 3].')
if camera_position.get_shape().as_list() == [3]:
camera_position = tf.tile(
tf.expand_dims(camera_position, axis=0), [batch_size, 1])
elif camera_position.get_shape().as_list() != [batch_size, 3]:
raise ValueError('Camera_position must have shape [batch_size, 3]')
if camera_lookat.get_shape().as_list() == [3]:
camera_lookat = tf.tile(
tf.expand_dims(camera_lookat, axis=0), [batch_size, 1])
elif camera_lookat.get_shape().as_list() != [batch_size, 3]:
raise ValueError('Camera_lookat must have shape [batch_size, 3]')
if camera_up.get_shape().as_list() == [3]:
camera_up = tf.tile(tf.expand_dims(camera_up, axis=0), [batch_size, 1])
elif camera_up.get_shape().as_list() != [batch_size, 3]:
raise ValueError('Camera_up must have shape [batch_size, 3]')
if isinstance(fov_y, float):
fov_y = tf.constant(batch_size * [fov_y], dtype=tf.float32)
elif not fov_y.get_shape().as_list():
fov_y = tf.tile(tf.expand_dims(fov_y, 0), [batch_size])
elif fov_y.get_shape().as_list() != [batch_size]:
raise ValueError('Fov_y must be a float, a 0D tensor, or a 1D tensor with '
'shape [batch_size]')
if isinstance(near_clip, float):
near_clip = tf.constant(batch_size * [near_clip], dtype=tf.float32)
elif not near_clip.get_shape().as_list():
near_clip = tf.tile(tf.expand_dims(near_clip, 0), [batch_size])
elif near_clip.get_shape().as_list() != [batch_size]:
raise ValueError('Near_clip must be a float, a 0D tensor, or a 1D tensor '
'with shape [batch_size]')
if isinstance(far_clip, float):
far_clip = tf.constant(batch_size * [far_clip], dtype=tf.float32)
elif not far_clip.get_shape().as_list():
far_clip = tf.tile(tf.expand_dims(far_clip, 0), [batch_size])
elif far_clip.get_shape().as_list() != [batch_size]:
raise ValueError('Far_clip must be a float, a 0D tensor, or a 1D tensor '
'with shape [batch_size]')
if specular_colors is not None and shininess_coefficients is None:
raise ValueError(
'Specular colors were supplied without shininess coefficients.')
if shininess_coefficients is not None and specular_colors is None:
raise ValueError(
'Shininess coefficients were supplied without specular colors.')
if specular_colors is not None:
# Since a 0-D float32 tensor is accepted, also accept a float.
if isinstance(shininess_coefficients, float):
shininess_coefficients = tf.constant(
shininess_coefficients, dtype=tf.float32)
if len(specular_colors.shape) != 3:
raise ValueError('The specular colors must have shape [batch_size, '
'vertex_count, 3].')
if len(shininess_coefficients.shape) > 2:
raise ValueError('The shininess coefficients must have shape at most '
'[batch_size, vertex_count].')
# If we don't have per-vertex coefficients, we can just reshape the
# input shininess to broadcast later, rather than interpolating an
# additional vertex attribute:
if len(shininess_coefficients.shape) < 2:
vertex_attributes = tf.concat(
[normals, vertices, diffuse_colors, specular_colors], axis=2)
else:
vertex_attributes = tf.concat(
[
normals, vertices, diffuse_colors, specular_colors,
tf.expand_dims(shininess_coefficients, axis=2)
],
axis=2)
else:
vertex_attributes = tf.concat([normals, vertices, diffuse_colors], axis=2)
camera_matrices = camera_utils.look_at(camera_position, camera_lookat,
camera_up)
perspective_transforms = camera_utils.perspective(image_width / image_height,
fov_y, near_clip, far_clip)
clip_space_transforms = tf.matmul(perspective_transforms, camera_matrices)
pixel_attributes,alphas = rasterize_triangles.rasterize_triangles(
vertices, vertex_attributes, triangles, clip_space_transforms,
image_width, image_height, [-1] * vertex_attributes.shape[2].value)
# Extract the interpolated vertex attributes from the pixel buffer and
# supply them to the shader:
pixel_normals = tf.nn.l2_normalize(pixel_attributes[:, :, :, 0:3], dim=3)
pixel_positions = pixel_attributes[:, :, :, 3:6]
diffuse_colors = pixel_attributes[:, :, :, 6:9]
if specular_colors is not None:
specular_colors = pixel_attributes[:, :, :, 9:12]
# Retrieve the interpolated shininess coefficients if necessary, or just
# reshape our input for broadcasting:
if len(shininess_coefficients.shape) == 2:
shininess_coefficients = pixel_attributes[:, :, :, 12]
else:
shininess_coefficients = tf.reshape(shininess_coefficients, [-1, 1, 1])
# pixel_mask = tf.cast(tf.reduce_any(diffuse_colors >= 0, axis=3), tf.float32)
renders = phong_shader(
normals=pixel_normals,
alphas=alphas,
pixel_positions=pixel_positions,
light_positions=light_positions,
light_intensities=light_intensities,
diffuse_colors=diffuse_colors,
camera_position=camera_position if specular_colors is not None else None,
specular_colors=specular_colors,
shininess_coefficients=shininess_coefficients,
ambient_color=ambient_color)
return renders
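For orientation (not part of the commit), a minimal usage sketch of mesh_renderer and tone_mapper follows. It assumes a TF 1.x session, that the renderer package (camera_utils, rasterize_triangles and the compiled kernel .so) is importable, and uses a made-up single-triangle scene; every value below is illustrative only.

import tensorflow as tf

# One triangle, batch size 1; winding [0, 2, 1] chosen so the normal faces the
# camera per the convention described in the docstring above.
vertices = tf.constant([[[-1.0, -1.0, 0.0], [1.0, -1.0, 0.0], [0.0, 1.0, 0.0]]], dtype=tf.float32)
triangles = tf.constant([[0, 2, 1]], dtype=tf.int32)
normals = tf.constant([[[0.0, 0.0, 1.0]] * 3], dtype=tf.float32)
diffuse_colors = tf.constant([[[0.8, 0.2, 0.2]] * 3], dtype=tf.float32)
renders = mesh_renderer(
    vertices, triangles, normals, diffuse_colors,
    camera_position=tf.constant([0.0, 0.0, 3.0]),
    camera_lookat=tf.constant([0.0, 0.0, 0.0]),
    camera_up=tf.constant([0.0, 1.0, 0.0]),
    light_positions=tf.constant([[[0.0, 0.0, 3.0]]]),
    light_intensities=tf.constant([[[1.0, 1.0, 1.0]]]),
    image_width=256, image_height=256)
display = tone_mapper(renders[..., 0:3], 0.7)  # gamma-compress the RGB channels for viewing
with tf.Session() as sess:
    rgba, viewable = sess.run([renders, display])  # rgba has shape [1, 256, 256, 4]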

View file

@@ -0,0 +1,190 @@
# Copyright 2017 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Differentiable triangle rasterizer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tensorflow as tf
# rasterize_triangles_module = tf.load_op_library(
# os.path.join(os.environ['TEST_SRCDIR'],
# 'tf_mesh_renderer/mesh_renderer/kernels/rasterize_triangles_kernel.so'))
rasterize_triangles_module = tf.load_op_library('./renderer/rasterize_triangles_kernel_1.so')
# This epsilon should be smaller than any valid barycentric reweighting factor
# (i.e. the per-pixel reweighting factor used to correct for the effects of
# perspective-incorrect barycentric interpolation). It is necessary primarily
# because the reweighting factor will be 0 for factors outside the mesh, and we
# need to ensure the image color and gradient outside the region of the mesh are
# 0.
_MINIMUM_REWEIGHTING_THRESHOLD = 1e-6
# This epsilon is the minimum absolute value of a homogeneous coordinate before
# it is clipped. It should be sufficiently large such that the output of
# the perspective divide step with this denominator still has good working
# precision with 32 bit arithmetic, and sufficiently small so that in practice
# vertices are almost never close enough to a clipping plane to be thresholded.
_MINIMUM_PERSPECTIVE_DIVIDE_THRESHOLD = 1e-6
def rasterize_triangles(vertices, attributes, triangles, projection_matrices,
image_width, image_height, background_value):
"""Rasterizes the input scene and computes interpolated vertex attributes.
NOTE: the rasterizer does no triangle clipping. Triangles that lie outside the
viewing frustum (esp. behind the camera) may be drawn incorrectly.
Args:
vertices: 3-D float32 tensor with shape [batch_size, vertex_count, 3]. Each
triplet is an xyz position in model space.
attributes: 3-D float32 tensor with shape [batch_size, vertex_count,
attribute_count]. Each vertex attribute is interpolated
across the triangle using barycentric interpolation.
triangles: 2-D int32 tensor with shape [triangle_count, 3]. Each triplet
should contain vertex indices describing a triangle such that the
triangle's normal points toward the viewer if the forward order of the
triplet defines a clockwise winding of the vertices. Gradients with
respect to this tensor are not available.
projection_matrices: 3-D float tensor with shape [batch_size, 4, 4]
containing model-view-perspective projection matrices.
image_width: int specifying desired output image width in pixels.
image_height: int specifying desired output image height in pixels.
background_value: a 1-D float32 tensor with shape [attribute_count]. Pixels
that lie outside all triangles take this value.
Returns:
A two-element tuple (attribute_images, alphas):
attribute_images: 4-D float32 tensor with shape [batch_size, image_height,
image_width, attribute_count], containing the interpolated vertex attributes at
each pixel, composited over background_value outside the rasterized geometry.
alphas: 4-D float32 tensor with shape [batch_size, image_height, image_width, 1],
approximately one where geometry was rasterized and exactly zero elsewhere.
Raises:
ValueError: An invalid argument to the method is detected.
"""
if not image_width > 0:
raise ValueError('Image width must be > 0.')
if not image_height > 0:
raise ValueError('Image height must be > 0.')
if len(vertices.shape) != 3:
raise ValueError('The vertex buffer must be 3D.')
batch_size = vertices.shape[0].value
vertex_count = vertices.shape[1].value
# We map the coordinates to normalized device coordinates before passing
# the scene to the rendering kernel to keep as many ops in tensorflow as
# possible.
homogeneous_coord = tf.ones([batch_size, vertex_count, 1], dtype=tf.float32)
vertices_homogeneous = tf.concat([vertices, homogeneous_coord], 2)
# Vertices are given in row-major order, but the transformation pipeline is
# column major:
clip_space_points = tf.matmul(
vertices_homogeneous, projection_matrices, transpose_b=True)
# Perspective divide, first thresholding the homogeneous coordinate to avoid
# the possibility of NaNs:
clip_space_points_w = tf.maximum(
tf.abs(clip_space_points[:, :, 3:4]),
_MINIMUM_PERSPECTIVE_DIVIDE_THRESHOLD) * tf.sign(
clip_space_points[:, :, 3:4])
normalized_device_coordinates = (
clip_space_points[:, :, 0:3] / clip_space_points_w)
per_image_uncorrected_barycentric_coordinates = []
per_image_vertex_ids = []
for im in range(vertices.shape[0]):
barycentric_coords, triangle_ids, _ = (
rasterize_triangles_module.rasterize_triangles(
normalized_device_coordinates[im, :, :], triangles, image_width,
image_height))
per_image_uncorrected_barycentric_coordinates.append(
tf.reshape(barycentric_coords, [-1, 3]))
# Gathers the vertex indices now because the indices don't contain a batch
# identifier, and reindexes the vertex ids so they point into the flattened
# (batch * vertex_count) attribute array built below:
vertex_ids = tf.gather(triangles, tf.reshape(triangle_ids, [-1]))
reindexed_ids = tf.add(vertex_ids, im * vertices.shape[1].value)
per_image_vertex_ids.append(reindexed_ids)
uncorrected_barycentric_coordinates = tf.concat(
per_image_uncorrected_barycentric_coordinates, axis=0)
vertex_ids = tf.concat(per_image_vertex_ids, axis=0)
# Indexes with each pixel's clip-space triangle's extrema (the pixel's
# 'corner points') ids to get the relevant properties for deferred shading.
flattened_vertex_attributes = tf.reshape(attributes,
[batch_size * vertex_count, -1])
corner_attributes = tf.gather(flattened_vertex_attributes, vertex_ids)
# Barycentric interpolation is linear in the reciprocal of the homogeneous
# W coordinate, so we use these weights to correct for the effects of
# perspective distortion after rasterization.
perspective_distortion_weights = tf.reciprocal(
tf.reshape(clip_space_points_w, [-1]))
corner_distortion_weights = tf.gather(perspective_distortion_weights,
vertex_ids)
# Apply perspective correction to the barycentric coordinates. This step is
# required since the rasterizer receives normalized-device coordinates (i.e.,
# after perspective division), so it can't apply perspective correction to the
# interpolated values.
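# Written out, the correction below computes, for screen-space barycentric
# weights b_i and clip-space w_i of the three corner vertices,
#   b_i' = (b_i / w_i) / sum_j (b_j / w_j),
# which is what the multiply / reduce_sum / divide ops implement.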
weighted_barycentric_coordinates = tf.multiply(
uncorrected_barycentric_coordinates, corner_distortion_weights)
barycentric_reweighting_factor = tf.reduce_sum(
weighted_barycentric_coordinates, axis=1)
corrected_barycentric_coordinates = tf.divide(
weighted_barycentric_coordinates,
tf.expand_dims(
tf.maximum(barycentric_reweighting_factor,
_MINIMUM_REWEIGHTING_THRESHOLD),
axis=1))
# Computes the pixel attributes by interpolating the known attributes at the
# corner points of the triangle interpolated with the barycentric coordinates.
weighted_vertex_attributes = tf.multiply(
corner_attributes,
tf.expand_dims(corrected_barycentric_coordinates, axis=2))
summed_attributes = tf.reduce_sum(weighted_vertex_attributes, axis=1)
attribute_images = tf.reshape(summed_attributes,
[batch_size, image_height, image_width, -1])
# Barycentric coordinates should approximately sum to one where there is
# rendered geometry, but be exactly zero where there is not.
alphas = tf.clip_by_value(
tf.reduce_sum(2.0 * corrected_barycentric_coordinates, axis=1), 0.0, 1.0)
alphas = tf.reshape(alphas, [batch_size, image_height, image_width, 1])
attributes_with_background = (
alphas * attribute_images + (1.0 - alphas) * background_value)
return attributes_with_background,alphas
@tf.RegisterGradient('RasterizeTriangles')
def _rasterize_triangles_grad(op, df_dbarys, df_dids, df_dz):
# Gradients are only supported for barycentric coordinates. Gradients for the
# z-buffer are possible as well but not currently implemented.
del df_dids, df_dz
return rasterize_triangles_module.rasterize_triangles_grad(
op.inputs[0], op.inputs[1], op.outputs[0], op.outputs[1], df_dbarys,
op.get_attr('image_width'), op.get_attr('image_height')), None
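As a quick illustration (again not part of the commit), the rasterizer can be exercised on its own with an identity projection, so the vertices below are read directly as normalized device coordinates. The geometry and attribute values are made up, and the compiled kernel .so referenced above must be importable.

import tensorflow as tf

vertices = tf.constant([[[-0.5, -0.5, 0.0], [0.5, -0.5, 0.0], [0.0, 0.5, 0.0]]], dtype=tf.float32)
attributes = tf.constant([[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]], dtype=tf.float32)
triangles = tf.constant([[0, 1, 2]], dtype=tf.int32)
projection = tf.eye(4, batch_shape=[1])  # identity: vertices are already in clip space
images, alphas = rasterize_triangles(
    vertices, attributes, triangles, projection,
    image_width=64, image_height=64, background_value=[0.0, 0.0, 0.0])
with tf.Session() as sess:
    rgb, a = sess.run([images, alphas])  # [1, 64, 64, 3] and [1, 64, 64, 1]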

103
skin.py Normal file
View file

@@ -0,0 +1,103 @@
import math
import numpy as np
class GMM:
def __init__(self, dim, num, w, mu, cov, cov_det, cov_inv):
self.dim = dim # feature dimension
self.num = num # number of Gaussian components
self.w = w # weights of Gaussian components (a list of scalars)
self.mu= mu # mean of Gaussian components (a list of 1xdim vectors)
self.cov = cov # covariance matrix of Gaussian components (a list of dimxdim matrices)
self.cov_det = cov_det # pre-computed determinants of the covariance matrices (a list of scalars)
self.cov_inv = cov_inv # pre-computed inverse covariance matrices (a list of dimxdim matrices)
self.factor = [0]*num
for i in range(self.num):
self.factor[i] = (2*math.pi)**(self.dim/2) * self.cov_det[i]**0.5
def likelihood(self, data):
assert(data.shape[1] == self.dim)
N = data.shape[0]
lh = np.zeros(N)
for i in range(self.num):
data_ = data - self.mu[i]
tmp = np.matmul(data_,self.cov_inv[i]) * data_
tmp = np.sum(tmp,axis=1)
power = -0.5 * tmp
p = np.exp(power) / self.factor[i] # vectorized over the N samples; equivalent to the per-element math.exp loop
lh += p*self.w[i]
return lh
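# In formula form, likelihood() returns, for each row x of data,
#   p(x) = sum_i w[i] * exp(-0.5 * (x - mu[i])^T cov_inv[i] (x - mu[i]))
#                     / ((2*pi)**(dim/2) * sqrt(cov_det[i])),
# i.e. the density of the Gaussian mixture with the precomputed parameters.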
def _rgb2ycbcr(rgb):
m = np.array([[65.481, 128.553, 24.966],
[-37.797, -74.203, 112],
[112, -93.786, -18.214]])
shape = rgb.shape
rgb = rgb.reshape((shape[0] * shape[1], 3))
ycbcr = np.dot(rgb, m.transpose() / 255.)
ycbcr[:, 0] += 16.
ycbcr[:, 1:] += 128.
return ycbcr.reshape(shape)
def _bgr2ycbcr(bgr):
rgb = bgr[..., ::-1]
return _rgb2ycbcr(rgb)
gmm_skin_w = [0.24063933, 0.16365987, 0.26034665, 0.33535415]
gmm_skin_mu = [np.array([113.71862, 103.39613, 164.08226]),
np.array([150.19858, 105.18467, 155.51428]),
np.array([183.92976, 107.62468, 152.71820]),
np.array([114.90524, 113.59782, 151.38217])]
gmm_skin_cov_det = [5692842.5, 5851930.5, 2329131., 1585971.]
gmm_skin_cov_inv = [np.array([[0.0019472069, 0.0020450759, -0.00060243998],[0.0020450759, 0.017700525, 0.0051420014],[-0.00060243998, 0.0051420014, 0.0081308950]]),
np.array([[0.0027110141, 0.0011036990, 0.0023122299],[0.0011036990, 0.010707724, 0.010742856],[0.0023122299, 0.010742856, 0.017481629]]),
np.array([[0.0048026871, 0.00022935172, 0.0077668377],[0.00022935172, 0.011729696, 0.0081661865],[0.0077668377, 0.0081661865, 0.025374353]]),
np.array([[0.0011989699, 0.0022453172, -0.0010748957],[0.0022453172, 0.047758564, 0.020332102],[-0.0010748957, 0.020332102, 0.024502251]])]
gmm_skin = GMM(3, 4, gmm_skin_w, gmm_skin_mu, [], gmm_skin_cov_det, gmm_skin_cov_inv)
gmm_nonskin_w = [0.12791070, 0.31130761, 0.34245777, 0.21832393]
gmm_nonskin_mu = [np.array([99.200851, 112.07533, 140.20602]),
np.array([110.91392, 125.52969, 130.19237]),
np.array([129.75864, 129.96107, 126.96808]),
np.array([112.29587, 128.85121, 129.05431])]
gmm_nonskin_cov_det = [458703648., 6466488., 90611376., 133097.63]
gmm_nonskin_cov_inv = [np.array([[0.00085371657, 0.00071197288, 0.00023958916],[0.00071197288, 0.0025935620, 0.00076557708],[0.00023958916, 0.00076557708, 0.0015042332]]),
np.array([[0.00024650150, 0.00045542428, 0.00015019422],[0.00045542428, 0.026412144, 0.018419769],[0.00015019422, 0.018419769, 0.037497383]]),
np.array([[0.00037054974, 0.00038146760, 0.00040408765],[0.00038146760, 0.0085505722, 0.0079136286],[0.00040408765, 0.0079136286, 0.010982352]]),
np.array([[0.00013709733, 0.00051228428, 0.00012777430],[0.00051228428, 0.28237113, 0.10528370],[0.00012777430, 0.10528370, 0.23468947]])]
gmm_nonskin = GMM(3, 4, gmm_nonskin_w, gmm_nonskin_mu, [], gmm_nonskin_cov_det, gmm_nonskin_cov_inv)
prior_skin = 0.8
prior_nonskin = 1 - prior_skin
# calculate skin attention mask
def skinmask(imbgr):
im = _bgr2ycbcr(imbgr)
data = im.reshape((-1,3))
lh_skin = gmm_skin.likelihood(data)
lh_nonskin = gmm_nonskin.likelihood(data)
tmp1 = prior_skin * lh_skin
tmp2 = prior_nonskin * lh_nonskin
post_skin = tmp1 / (tmp1+tmp2) # posterior probability
post_skin = post_skin.reshape((im.shape[0],im.shape[1]))
post_skin = np.round(post_skin*255)
post_skin = post_skin.astype(np.uint8)
post_skin = np.tile(np.expand_dims(post_skin,2),[1,1,3]) # tile the single-channel mask to H x W x 3
return post_skin
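A minimal usage sketch (not part of the commit); the file names are hypothetical, and skinmask expects a BGR uint8 image as produced by cv2.imread.

import cv2

img_bgr = cv2.imread('example_face.png')        # H x W x 3, uint8, BGR order
mask = skinmask(img_bgr)                        # H x W x 3, uint8, higher values = more likely skin
cv2.imwrite('example_face_skin_mask.png', mask)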

140
train.py Normal file
View file

@@ -0,0 +1,140 @@
import tensorflow as tf
import numpy as np
import os
from options import Option
from reconstruction_model import *
from data_loader import *
from utils import *
import argparse
###############################################################################################
# training stage
###############################################################################################
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# training data and validation data
def parse_args():
desc = "Data preprocessing for Deep3DRecon."
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('--data_path', type=str, default='./processed_data', help='training data folder')
parser.add_argument('--val_data_path', type=str, default='./processed_data', help='validation data folder')
return parser.parse_args()
# initialize weights for resnet and facenet
def restore_weights_and_initialize(opt):
var_list = tf.trainable_variables()
g_list = tf.global_variables()
# add batch normalization params into trainable variables
bn_moving_vars = [g for g in g_list if 'moving_mean' in g.name]
bn_moving_vars += [g for g in g_list if 'moving_variance' in g.name]
var_list +=bn_moving_vars
# create saver to save and restore weights
resnet_vars = [v for v in var_list if 'resnet_v1_50' in v.name]
facenet_vars = [v for v in var_list if 'InceptionResnetV1' in v.name]
saver_resnet = tf.train.Saver(var_list = resnet_vars)
saver_facenet = tf.train.Saver(var_list = facenet_vars)
saver = tf.train.Saver(var_list = resnet_vars + [v for v in var_list if 'fc-' in v.name],max_to_keep = 50)
# create session
sess = tf.InteractiveSession(config = opt.config)
# create summary op
train_writer = tf.summary.FileWriter(opt.train_summary_path, sess.graph)
val_writer = tf.summary.FileWriter(opt.val_summary_path, sess.graph)
# initialization
tf.global_variables_initializer().run()
tf.local_variables_initializer().run()
saver_resnet.restore(sess,opt.R_net_weights)
saver_facenet.restore(sess,opt.Perceptual_net_weights)
return saver, train_writer,val_writer, sess
# main function for training
def train():
# read BFM face model
# transfer original BFM model to our model
if not os.path.isfile('./BFM/BFM_model_front.mat'):
transferBFM09()
with tf.Graph().as_default() as graph:
# training options
args = parse_args()
opt = Option()
opt.data_path = [args.data_path]
opt.val_data_path = [args.val_data_path]
# load training data into queue
train_iterator = load_dataset(opt)
# create reconstruction model
model = Reconstruction_model(opt)
# send training data to the model
model.set_input(train_iterator)
# update model variables with training data
model.step(is_train = True)
# summarize training statistics
model.summarize()
# several training statistics to be saved
train_stat = model.summary_stat
train_img_stat = model.summary_img
train_op = model.train_op
photo_error = model.photo_loss
lm_error = model.landmark_loss
id_error = model.perceptual_loss
# load validation data into queue
val_iterator = load_dataset(opt,train=False)
# send validation data to the model
model.set_input(val_iterator)
# only do a forward pass without updating model variables
model.step(is_train = False)
# summarize validation statistics
model.summarize()
val_stat = model.summary_stat
val_img_stat = model.summary_img
# initialization
saver, train_writer,val_writer, sess = restore_weights_and_initialize(opt)
# freeze the graph to ensure no new op will be added during training
sess.graph.finalize()
# training loop
for i in range(opt.train_maxiter):
_,ph_loss,lm_loss,id_loss = sess.run([train_op,photo_error,lm_error,id_error])
print('Iter: %d; lm_loss: %f ; photo_loss: %f; id_loss: %f\n'%(i,np.sqrt(lm_loss),ph_loss,id_loss))
# summarize training stats every <train_summary_iter> iterations
if np.mod(i,opt.train_summary_iter) == 0:
train_summary = sess.run(train_stat)
train_writer.add_summary(train_summary,i)
# summarize image stats every <image_summary_iter> iterations
if np.mod(i,opt.image_summary_iter) == 0:
train_img_summary = sess.run(train_img_stat)
train_writer.add_summary(train_img_summary,i)
# summarize validation stats every <val_summary_iter> iterations
if np.mod(i,opt.val_summary_iter) == 0:
val_summary,val_img_summary = sess.run([val_stat,val_img_stat])
val_writer.add_summary(val_summary,i)
val_writer.add_summary(val_img_summary,i)
# save model variables every <save_iter> iterations
if np.mod(i,opt.save_iter) == 0:
saver.save(sess,os.path.join(opt.model_save_path,'iter_%d.ckpt'%i))
if __name__ == '__main__':
train()
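Assuming processed images, landmarks and skin masks have been prepared under the folder passed via --data_path (both flags default to ./processed_data per parse_args above), and that the BFM model plus the pretrained R-Net and FaceNet weights referenced in Option are in place, training would presumably be launched simply as: python train.py --data_path <train_folder> --val_data_path <val_folder>.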

View file

@@ -1,4 +1,5 @@
import numpy as np
import tensorflow as tf
from PIL import Image
from scipy.io import loadmat,savemat
from array import array
@@ -125,4 +126,12 @@ def save_obj(path,v,f,c):
for i in range(len(f)):
file.write('f %d %d %d\n'%(f[i,0],f[i,1],f[i,2]))
file.close()
file.close()
# load .pb file into tensorflow graph
def load_graph(graph_filename):
with tf.gfile.GFile(graph_filename,'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
return graph_def
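# Usage sketch (hypothetical path): the returned GraphDef can be imported into a
# graph with tf.import_graph_def, e.g.
#   graph_def = load_graph('./model.pb')
#   with tf.Graph().as_default():
#       tf.import_graph_def(graph_def, name='')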