first commit
This commit is contained in:
Parent
a8a8eefebd
Commit
1068687f83
|
@ -0,0 +1,11 @@
|
|||
Thanks a lot for your interest in our work.
|
||||
I quickly wrapped up a multi-GPU version of the code. (Note that in the submission phase we used a single-GPU version of the code. Please let me know if there is anything we can improve.)
|
||||
|
||||
Training demo code: example.sh
|
||||
|
||||
Inference demo code: batch_test_script_mainbody.sh
|
||||
|
||||
The data and the checkpoint are available at:
|
||||
https://www.dropbox.com/sh/fpnvtcmyj4mul2s/AAB4wvsxoS8pf7ExnZYe4VV1a?dl=0
|
||||
|
||||
You need to download them and put them in the working directory. An example is given in ``example.sh''.
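For reference, the demo scripts assume a working-directory layout roughly like the following (folder names are taken from the scripts; the exact contents depend on what the Dropbox folder provides):

./example.sh
./cifar10_data/    # passed as --data_dir to monitor.py and the test scripts
./checkpoints/     # model_dir holding the params_<epoch>uidx<step>.ckpt.* files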
|
|
@ -0,0 +1,40 @@
|
|||
export PATH=/usr/anaconda2/bin:$PATH
|
||||
#export LD_LIBRARY_PATH=~/Downloads/cuda/lib64:"$LD_LIBRARY_PATH"
|
||||
export CUDA_VISIBLE_DEVICES=6
|
||||
|
||||
model_dir=checkpoints
|
||||
|
||||
for (( e=345;e<=345;e+=2 ));do
|
||||
filename=$(ls "$model_dir" | grep -o 'params_'${e}'uidx[^\.]*\.ckpt\.index')
|
||||
filename=${filename:0:-6}
|
||||
|
||||
python monitor_checkleft.py --data_dir=./cifar10_data --save_dir=$model_dir --batch_size=12 --show_interval=100 --load_params=${filename} --mode=I2L --useSoftLabel=0
|
||||
|
||||
done
|
||||
|
||||
for (( e=345;e<=345;e+=2 ));do
|
||||
filename=$(ls "$model_dir" | grep -o 'params_'${e}'uidx[^\.]*\.ckpt\.index')
|
||||
filename=${filename:0:-6}
|
||||
|
||||
python monitor_checkleft.py --data_dir=./cifar10_data --save_dir=$model_dir --batch_size=12 --show_interval=100 --load_params=${filename} --mode=L2I --useSoftLabel=0
|
||||
|
||||
done
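# Note: the loops above rely on TensorFlow writing each checkpoint as a
# params_<epoch>uidx<step>.ckpt.index / .ckpt.data-* pair. grep picks out the
# .index file and ${filename:0:-6} strips the six-character ".index" suffix so
# that --load_params receives the bare checkpoint prefix. A minimal sketch
# (checkpoint name taken from example.sh; adjust to your own checkpoint dir):
#   f=$(ls checkpoints | grep -o 'params_345uidx[^\.]*\.ckpt\.index')
#   echo "${f:0:-6}"    # -> params_345uidx480248.ckpt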
|
||||
|
||||
|
||||
|
||||
|
||||
: <<'VIRTUAL_ENV'
|
||||
source ~/virtual_py/bin/activate
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
|
||||
model_dir=debug_room
|
||||
|
||||
python monitor.py --data_dir=./cifar10_data --save_dir=$model_dir --batch_size=12 --show_interval=100 --load_params=params_9uidx13880.ckpt --mode=L2I --useSoftLabel=0
|
||||
|
||||
deactivate
|
||||
|
||||
VIRTUAL_ENV
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
export PATH=/usr/anaconda2/bin:$PATH
|
||||
#export LD_LIBRARY_PATH=~/Downloads/cuda/lib64:"$LD_LIBRARY_PATH"
|
||||
export CUDA_VISIBLE_DEVICES=6
|
||||
|
||||
model_dir=checkpoints
|
||||
|
||||
for (( e=345;e<=345;e+=2 ));do
|
||||
filename=$(ls "$model_dir" | grep -o 'params_'${e}'uidx[^\.]*\.ckpt\.index')
|
||||
filename=${filename:0:-6}
|
||||
|
||||
python monitor.py --data_dir=./cifar10_data --save_dir=$model_dir --batch_size=12 --show_interval=100 --load_params=${filename} --mode=I2L --useSoftLabel=0
|
||||
|
||||
done
|
||||
|
||||
for (( e=345;e<=345;e+=2 ));do
|
||||
filename=$(ls "$model_dir" | grep -o 'params_'${e}'uidx[^\.]*\.ckpt\.index')
|
||||
filename=${filename:0:-6}
|
||||
|
||||
python monitor.py --data_dir=./cifar10_data --save_dir=$model_dir --batch_size=12 --show_interval=100 --load_params=${filename} --mode=L2I --useSoftLabel=0
|
||||
|
||||
done
|
||||
|
||||
|
||||
# When using "--oneside" in training mode, you should also add the
|
||||
# corresponding "--oneside" in the inference phase.
|
||||
|
||||
|
||||
: <<'VIRTUAL_ENV'
|
||||
source ~/virtual_py/bin/activate
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
|
||||
model_dir=debug_room
|
||||
|
||||
python monitor.py --data_dir=./cifar10_data --save_dir=$model_dir --batch_size=12 --show_interval=100 --load_params=params_9uidx13880.ckpt --mode=L2I --useSoftLabel=0
|
||||
|
||||
deactivate
|
||||
|
||||
VIRTUAL_ENV
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,117 @@
|
|||
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
"""CIFAR dataset input module.
|
||||
"""
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def build_input(dataset, data_path, batch_size, mode):
|
||||
"""Build CIFAR image and labels.
|
||||
|
||||
Args:
|
||||
dataset: Either 'cifar10' or 'cifar100'.
|
||||
data_path: Filename for data.
|
||||
batch_size: Input batch size.
|
||||
mode: Either 'train' or 'eval'.
|
||||
Returns:
|
||||
images: Batches of images. [batch_size, image_size, image_size, 3]
|
||||
labels: Batches of labels. [batch_size, num_classes]
|
||||
Raises:
|
||||
ValueError: when the specified dataset is not supported.
|
||||
"""
|
||||
image_size = 32
|
||||
if dataset == 'cifar10':
|
||||
label_bytes = 1
|
||||
label_offset = 0
|
||||
num_classes = 10
|
||||
elif dataset == 'cifar100':
|
||||
label_bytes = 1
|
||||
label_offset = 1
|
||||
num_classes = 100
|
||||
else:
|
||||
raise ValueError('Unsupported dataset %s' % dataset)
|
||||
|
||||
depth = 3
|
||||
image_bytes = image_size * image_size * depth
|
||||
record_bytes = label_bytes + label_offset + image_bytes
|
||||
|
||||
data_files = tf.gfile.Glob(data_path)
|
||||
file_queue = tf.train.string_input_producer(data_files, shuffle=True)
|
||||
# Read examples from files in the filename queue.
|
||||
reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
|
||||
_, value = reader.read(file_queue)
|
||||
|
||||
# Convert these examples to dense labels and processed images.
|
||||
record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
|
||||
|
||||
label = tf.cast(tf.slice(record, [label_offset], [label_bytes]), tf.int32)
|
||||
# Convert from string to [depth * height * width] to [depth, height, width].
|
||||
depth_major = tf.reshape(tf.slice(record, [label_bytes], [image_bytes]),
|
||||
[depth, image_size, image_size])
|
||||
# Convert from [depth, height, width] to [height, width, depth].
|
||||
image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)
|
||||
|
||||
if mode == 'train':
|
||||
image = tf.image.resize_image_with_crop_or_pad(
|
||||
image, image_size+4, image_size+4)
|
||||
image = tf.random_crop(image, [image_size, image_size, 3])
|
||||
image = tf.image.random_flip_left_right(image)
|
||||
# Brightness/saturation/contrast jittering provides small gains of 0.2%-0.5% on CIFAR.
|
||||
# image = tf.image.random_brightness(image, max_delta=63. / 255.)
|
||||
# image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
|
||||
# image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
|
||||
image = tf.image.per_image_standardization(image)
|
||||
|
||||
example_queue = tf.RandomShuffleQueue(
|
||||
capacity=16 * batch_size,
|
||||
min_after_dequeue=8 * batch_size,
|
||||
dtypes=[tf.float32, tf.int32],
|
||||
shapes=[[image_size, image_size, depth], [1]])
|
||||
num_threads = 16
|
||||
else:
|
||||
image = tf.image.resize_image_with_crop_or_pad(
|
||||
image, image_size, image_size)
|
||||
image = tf.image.per_image_standardization(image)  # same op as in the 'train' branch above
|
||||
|
||||
example_queue = tf.FIFOQueue(
|
||||
3 * batch_size,
|
||||
dtypes=[tf.float32, tf.int32],
|
||||
shapes=[[image_size, image_size, depth], [1]])
|
||||
num_threads = 1
|
||||
|
||||
example_enqueue_op = example_queue.enqueue([image, label])
|
||||
tf.train.add_queue_runner(tf.train.queue_runner.QueueRunner(
|
||||
example_queue, [example_enqueue_op] * num_threads))
|
||||
|
||||
# Read 'batch' labels + images from the example queue.
|
||||
images, labels = example_queue.dequeue_many(batch_size)
|
||||
labels = tf.reshape(labels, [batch_size, 1])
|
||||
indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
|
||||
labels = tf.sparse_to_dense(
|
||||
tf.concat(1, [indices, labels]),
|
||||
[batch_size, num_classes], 1.0, 0.0)
|
||||
|
||||
assert len(images.get_shape()) == 4
|
||||
assert images.get_shape()[0] == batch_size
|
||||
assert images.get_shape()[-1] == 3
|
||||
assert len(labels.get_shape()) == 2
|
||||
assert labels.get_shape()[0] == batch_size
|
||||
assert labels.get_shape()[1] == num_classes
|
||||
|
||||
# Display the training images in the visualizer.
|
||||
tf.image_summary('images', images)
|
||||
return images, labels
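# Minimal usage sketch of the pipeline above; the data path is an assumption --
# point it at the CIFAR-10 binary batches (e.g. ./cifar10_data/cifar-10-batches-bin/data_batch*).
if __name__ == '__main__':
  images, labels = build_input('cifar10',
                               './cifar10_data/cifar-10-batches-bin/data_batch*',
                               batch_size=12, mode='train')
  with tf.Session() as sess:
    # the queue runners registered by build_input must be started explicitly
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    x, y = sess.run([images, labels])
    print(x.shape, y.shape)  # (12, 32, 32, 3) and (12, 10) one-hot labels
    coord.request_stop()
    coord.join(threads)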
|
|
@ -0,0 +1,129 @@
|
|||
"""
|
||||
Utilities for downloading and unpacking the CIFAR-10 dataset, originally published
|
||||
by Krizhevsky et al. and hosted here: https://www.cs.toronto.edu/~kriz/cifar.html
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tarfile
|
||||
from six.moves import urllib
|
||||
import numpy as np
|
||||
|
||||
def maybe_download_and_extract(data_dir, url='http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'):
|
||||
if not os.path.exists(os.path.join(data_dir, 'cifar-10-batches-py')):
|
||||
if not os.path.exists(data_dir):
|
||||
os.makedirs(data_dir)
|
||||
filename = url.split('/')[-1]
|
||||
filepath = os.path.join(data_dir, filename)
|
||||
if not os.path.exists(filepath):
|
||||
def _progress(count, block_size, total_size):
|
||||
sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
|
||||
float(count * block_size) / float(total_size) * 100.0))
|
||||
sys.stdout.flush()
|
||||
filepath, _ = urllib.request.urlretrieve(url, filepath, _progress)
|
||||
print()
|
||||
statinfo = os.stat(filepath)
|
||||
print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
|
||||
tarfile.open(filepath, 'r:gz').extractall(data_dir)
|
||||
|
||||
def unpickle(file):
|
||||
fo = open(file, 'rb')
|
||||
if (sys.version_info >= (3, 0)):
|
||||
import pickle
|
||||
d = pickle.load(fo, encoding='latin1')
|
||||
else:
|
||||
import cPickle
|
||||
d = cPickle.load(fo)
|
||||
fo.close()
|
||||
return {'x': d['data'].reshape((10000,3,32,32)), 'y': np.array(d['labels']).astype(np.uint8)}
|
||||
|
||||
def load(data_dir, subset='train'):
|
||||
maybe_download_and_extract(data_dir)
|
||||
if subset=='train':
|
||||
train_data = [unpickle(os.path.join(data_dir,'cifar-10-batches-py','data_batch_' + str(i))) for i in range(1,6)]
|
||||
trainx = np.concatenate([d['x'] for d in train_data],axis=0)
|
||||
trainy = np.concatenate([d['y'] for d in train_data],axis=0)
|
||||
return trainx, trainy
|
||||
elif subset=='test':
|
||||
test_data = unpickle(os.path.join(data_dir,'cifar-10-batches-py','test_batch'))
|
||||
testx = test_data['x']
|
||||
testy = test_data['y']
|
||||
return testx, testy
|
||||
else:
|
||||
raise NotImplementedError('subset should be either train or test')
|
||||
|
||||
class DataLoader(object):
|
||||
""" an object that generates batches of CIFAR-10 data for training """
|
||||
|
||||
def __init__(self, data_dir, subset, batch_size, rng=None, shuffle=False, return_labels=False, filter_labels=None):
|
||||
"""
|
||||
- data_dir is location where to store files
|
||||
- subset is train|test
|
||||
- batch_size is int, of #examples to load at once
|
||||
- rng is np.random.RandomState object for reproducibility
|
||||
"""
|
||||
|
||||
self.data_dir = data_dir
|
||||
self.batch_size = batch_size
|
||||
self.shuffle = shuffle
|
||||
self.return_labels = return_labels
|
||||
|
||||
# create temporary storage for the data, if not yet created
|
||||
if not os.path.exists(data_dir):
|
||||
print('creating folder', data_dir)
|
||||
os.makedirs(data_dir)
|
||||
|
||||
# load CIFAR-10 training data to RAM
|
||||
self.data, self.labels = load(os.path.join(data_dir,'cifar-10-python'), subset=subset)
|
||||
|
||||
if filter_labels is not None:
|
||||
selected_idx = self.labels == filter_labels
|
||||
self.data = self.data[selected_idx]
|
||||
self.labels = self.labels[selected_idx]
|
||||
print('There are %d samples left' % self.labels.size)
|
||||
|
||||
self.data = np.transpose(self.data, (0,2,3,1)) # (N,3,32,32) -> (N,32,32,3)
|
||||
|
||||
self.p = 0 # pointer to where we are in iteration
|
||||
self.rng = np.random.RandomState(1) if rng is None else rng
|
||||
|
||||
def get_observation_size(self):
|
||||
return self.data.shape[1:]
|
||||
|
||||
def get_num_labels(self):
|
||||
return np.amax(self.labels) + 1
|
||||
|
||||
def reset(self):
|
||||
self.p = 0
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self, n=None):
|
||||
""" n is the number of examples to fetch """
|
||||
if n is None: n = self.batch_size
|
||||
|
||||
# on first iteration lazily permute all data
|
||||
if self.p == 0 and self.shuffle:
|
||||
inds = self.rng.permutation(self.data.shape[0])
|
||||
self.data = self.data[inds]
|
||||
self.labels = self.labels[inds]
|
||||
|
||||
# on last iteration reset the counter and raise StopIteration
|
||||
if self.p + n > self.data.shape[0]:
|
||||
self.reset() # reset for next time we get called
|
||||
raise StopIteration
|
||||
|
||||
# on intermediate iterations fetch the next batch
|
||||
x = self.data[self.p : self.p + n]
|
||||
y = self.labels[self.p : self.p + n]
|
||||
self.p += self.batch_size
|
||||
|
||||
if self.return_labels:
|
||||
return x,y
|
||||
else:
|
||||
return x
|
||||
|
||||
next = __next__ # Python 2 compatibility (https://stackoverflow.com/questions/29578469/how-to-make-an-object-both-a-python2-and-python3-iterator)
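# Minimal usage sketch: fetch one training batch. Run from this file's directory
# (elsewhere the module is imported as data.cifar10_data); the shapes follow from
# the (N,3,32,32) -> (N,32,32,3) transpose above.
if __name__ == '__main__':
    loader = DataLoader('./cifar10_data', 'train', batch_size=12,
                        rng=np.random.RandomState(1), shuffle=True, return_labels=True)
    x, y = next(iter(loader))
    print(x.shape, x.dtype, y.shape)  # (12, 32, 32, 3) uint8 (12,)
    loader.reset()  # rewind the pointer before reusing the loader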
|
||||
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
import cifar10_data
|
||||
import argparse
|
||||
import plotting
|
||||
import numpy as np
|
||||
|
||||
data_dir = '/home/tim/data'
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--save_dir', type=str, default='./log')
|
||||
parser.add_argument('--data_dir', type=str, default='/home/tim/data')
|
||||
parser.add_argument('--plot_title', type=str, default=None)
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
data_dir = args.data_dir
|
||||
|
||||
trainx, trainy = cifar10_data.load(data_dir)
|
||||
|
||||
ids = [[] for i in range(10)]
|
||||
for i in range(trainx.shape[0]):
|
||||
if len(ids[trainy[i]]) < 10:
|
||||
ids[trainy[i]].append(i)
|
||||
if np.alltrue(np.asarray([len(_ids) >= 10 for _ids in ids])):
|
||||
break
|
||||
|
||||
images = np.zeros((10*10,32,32,3),dtype='uint8')
|
||||
for i in range(len(ids)):
|
||||
for j in range(len(ids[i])):
|
||||
images[10*j+i] = trainx[ids[i][j]].transpose([1,2,0])
|
||||
print(ids)
|
||||
|
||||
img_tile = plotting.img_tile(images, aspect_ratio=1.0, border_color=1.0, stretch=True)
|
||||
img = plotting.plot_img(img_tile, title=args.plot_title if args.plot_title != 'None' else None)
|
||||
plotting.plt.savefig(args.save_dir + '/cifar10_orig_images.png')
|
||||
plotting.plt.close('all')
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
"""
|
||||
Utilities for loading the small ImageNet dataset used in Oord et al.
|
||||
use scripts/png_to_npz.py to create the npz files
|
||||
|
||||
The code here currently assumes that the preprocessing was done manually.
|
||||
TODO: make automatic and painless
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tarfile
|
||||
from six.moves import urllib
|
||||
|
||||
import numpy as np
|
||||
from scipy.misc import imread
|
||||
|
||||
def fetch(url, filepath):
|
||||
filename = url.split('/')[-1]
|
||||
def _progress(count, block_size, total_size):
|
||||
sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
|
||||
float(count * block_size) / float(total_size) * 100.0))
|
||||
sys.stdout.flush()
|
||||
print(url)
|
||||
filepath, headers = urllib.request.urlretrieve(url, filepath, _progress)
|
||||
print()
|
||||
statinfo = os.stat(filepath)
|
||||
print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
|
||||
|
||||
def maybe_download_and_extract(data_dir):
|
||||
# more info on the dataset at http://image-net.org/small/download.php
|
||||
# downloads and extracts the two tar files for train/val
|
||||
|
||||
train_dir = os.path.join(data_dir, 'train_32x32')
|
||||
if not os.path.exists(train_dir):
|
||||
train_url = 'http://image-net.org/small/train_32x32.tar' # 4GB
|
||||
filepath = os.path.join(data_dir, 'train_32x32.tar')
|
||||
fetch(train_url, filepath)
|
||||
print('unpacking the tar file', filepath)
|
||||
tarfile.open(filepath, 'r').extractall(data_dir) # creates the train_32x32 folder
|
||||
|
||||
test_dir = os.path.join(data_dir, 'valid_32x32')
|
||||
if not os.path.exists(test_dir):
|
||||
test_url = 'http://image-net.org/small/valid_32x32.tar' # 154MB
|
||||
filepath = os.path.join(data_dir, 'valid_32x32.tar')
|
||||
fetch(test_url, filepath)
|
||||
print('unpacking the tar file', filepath)
|
||||
tarfile.open(filepath, 'r').extractall(data_dir) # creates the valid_32x32 folder
|
||||
|
||||
def maybe_preprocess(data_dir):
|
||||
|
||||
npz_file = os.path.join(data_dir, 'imgnet_32x32.npz')
|
||||
if os.path.exists(npz_file):
|
||||
return # all good
|
||||
|
||||
trainx = []
|
||||
train_dir = os.path.join(data_dir, 'train_32x32')
|
||||
for f in os.listdir(train_dir):
|
||||
if f.endswith('.png'):
|
||||
print('reading', f)
|
||||
filepath = os.path.join(train_dir, f)
|
||||
trainx.append(imread(filepath).reshape((1,32,32,3)))
|
||||
trainx = np.concatenate(trainx, axis=0)
|
||||
|
||||
testx = []
|
||||
test_dir = os.path.join(data_dir, 'valid_32x32')
|
||||
for f in os.listdir(test_dir):
|
||||
if f.endswith('.png'):
|
||||
print('reading', f)
|
||||
filepath = os.path.join(test_dir, f)
|
||||
testx.append(imread(filepath).reshape((1,32,32,3)))
|
||||
testx = np.concatenate(testx, axis=0)
|
||||
|
||||
np.savez(npz_file, trainx=trainx, testx=testx)
|
||||
|
||||
|
||||
def load(data_dir, subset='train'):
|
||||
if not os.path.exists(data_dir):
|
||||
print('creating folder', data_dir)
|
||||
os.makedirs(data_dir)
|
||||
maybe_download_and_extract(data_dir)
|
||||
maybe_preprocess(data_dir)
|
||||
imagenet_data = np.load(os.path.join(data_dir,'imgnet_32x32.npz'))
|
||||
return imagenet_data['trainx'] if subset == 'train' else imagenet_data['testx']
|
||||
|
||||
|
||||
|
||||
class DataLoader(object):
|
||||
""" an object that generates batches of CIFAR-10 data for training """
|
||||
|
||||
def __init__(self, data_dir, subset, batch_size, rng=None, shuffle=False):
|
||||
"""
|
||||
- data_dir is location where the files are stored
|
||||
- subset is train|test
|
||||
- batch_size is int, of #examples to load at once
|
||||
- rng is np.random.RandomState object for reproducibility
|
||||
"""
|
||||
|
||||
self.data_dir = data_dir
|
||||
self.batch_size = batch_size
|
||||
self.shuffle = shuffle
|
||||
|
||||
self.data = load(os.path.join(data_dir,'small_imagenet'), subset=subset)
|
||||
|
||||
self.p = 0 # pointer to where we are in iteration
|
||||
self.rng = np.random.RandomState(1) if rng is None else rng
|
||||
|
||||
def get_observation_size(self):
|
||||
return self.data.shape[1:]
|
||||
|
||||
def reset(self):
|
||||
self.p = 0
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self, n=None):
|
||||
""" n is the number of examples to fetch """
|
||||
if n is None: n = self.batch_size
|
||||
|
||||
# on first iteration lazily permute all data
|
||||
if self.p == 0 and self.shuffle:
|
||||
inds = self.rng.permutation(self.data.shape[0])
|
||||
self.data = self.data[inds]
|
||||
|
||||
# on last iteration reset the counter and raise StopIteration
|
||||
if self.p + n > self.data.shape[0]:
|
||||
self.reset() # reset for next time we get called
|
||||
raise StopIteration
|
||||
|
||||
# on intermediate iterations fetch the next batch
|
||||
x = self.data[self.p : self.p + n]
|
||||
self.p += self.batch_size
|
||||
|
||||
return x
|
||||
|
||||
next = __next__ # Python 2 compatibility (https://stackoverflow.com/questions/29578469/how-to-make-an-object-both-a-python2-and-python3-iterator)
|
||||
|
Binary file not shown.
After | Width: | Height: | Size: 577 KiB |
|
@ -0,0 +1,131 @@
|
|||
"""
|
||||
Utilities for downloading and unpacking the CIFAR-10 dataset, originally published
|
||||
by Krizhevsky et al. and hosted here: https://www.cs.toronto.edu/~kriz/cifar.html
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tarfile
|
||||
from six.moves import urllib
|
||||
import numpy as np
|
||||
|
||||
def maybe_download_and_extract(data_dir, url='http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'):
|
||||
if not os.path.exists(os.path.join(data_dir, 'cifar-10-batches-py')):
|
||||
if not os.path.exists(data_dir):
|
||||
os.makedirs(data_dir)
|
||||
filename = url.split('/')[-1]
|
||||
filepath = os.path.join(data_dir, filename)
|
||||
if not os.path.exists(filepath):
|
||||
def _progress(count, block_size, total_size):
|
||||
sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
|
||||
float(count * block_size) / float(total_size) * 100.0))
|
||||
sys.stdout.flush()
|
||||
filepath, _ = urllib.request.urlretrieve(url, filepath, _progress)
|
||||
print()
|
||||
statinfo = os.stat(filepath)
|
||||
print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
|
||||
tarfile.open(filepath, 'r:gz').extractall(data_dir)
|
||||
|
||||
def unpickle(file):
|
||||
fo = open(file, 'rb')
|
||||
if (sys.version_info >= (3, 0)):
|
||||
import pickle
|
||||
d = pickle.load(fo, encoding='latin1')
|
||||
else:
|
||||
import cPickle
|
||||
d = cPickle.load(fo)
|
||||
fo.close()
|
||||
return {'x': d['data'].reshape((10000,3,32,32)), 'y': np.array(d['labels']).astype(np.uint8)}
|
||||
|
||||
def load(data_dir, subset='train'):
|
||||
maybe_download_and_extract(data_dir)
|
||||
if subset=='train':
|
||||
train_data = [unpickle(os.path.join(data_dir,'cifar-10-batches-py','data_batch_' + str(i))) for i in range(1,6)]
|
||||
trainx = np.concatenate([d['x'] for d in train_data],axis=0)
|
||||
trainy = np.concatenate([d['y'] for d in train_data],axis=0)
|
||||
return trainx, trainy
|
||||
elif subset=='test':
|
||||
test_data = unpickle(os.path.join(data_dir,'cifar-10-batches-py','test_batch'))
|
||||
testx = test_data['x']
|
||||
testy = test_data['y']
|
||||
return testx, testy
|
||||
else:
|
||||
raise NotImplementedError('subset should be either train or test')
|
||||
|
||||
class DataLoader(object):
|
||||
""" an object that generates batches of CIFAR-10 data for training """
|
||||
|
||||
def __init__(self, data_dir, subset, batch_size, LMscore=None, rng=None, shuffle=False, return_labels=False):
|
||||
"""
|
||||
- data_dir is location where to store files
|
||||
- subset is train|test
|
||||
- batch_size is int, of #examples to load at once
|
||||
- rng is np.random.RandomState object for reproducibility
|
||||
"""
|
||||
|
||||
self.data_dir = data_dir
|
||||
self.batch_size = batch_size
|
||||
self.shuffle = shuffle
|
||||
self.return_labels = return_labels
|
||||
|
||||
# create temporary storage for the data, if not yet created
|
||||
if not os.path.exists(data_dir):
|
||||
print('creating folder', data_dir)
|
||||
os.makedirs(data_dir)
|
||||
|
||||
# load CIFAR-10 training data to RAM
|
||||
self.data, self.labels = load(os.path.join(data_dir,'cifar-10-python'), subset=subset)
|
||||
self.data = np.transpose(self.data, (0,2,3,1)) # (N,3,32,32) -> (N,32,32,3)
|
||||
|
||||
if subset == 'train':
|
||||
self.LM = np.load(LMscore + '.train.npz')['arr_0']
|
||||
elif subset == 'test':
|
||||
self.LM = np.load(LMscore + '.test.npz')['arr_0']  # index 'arr_0' to match the train branch above
|
||||
else:
|
||||
raise ValueError('No proper LMscore file found for subset %r' % subset)
|
||||
|
||||
self.p = 0 # pointer to where we are in iteration
|
||||
self.rng = np.random.RandomState(1) if rng is None else rng
|
||||
|
||||
def get_observation_size(self):
|
||||
return self.data.shape[1:]
|
||||
|
||||
def get_num_labels(self):
|
||||
return np.amax(self.labels) + 1
|
||||
|
||||
def reset(self):
|
||||
self.p = 0
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self, n=None):
|
||||
""" n is the number of examples to fetch """
|
||||
if n is None: n = self.batch_size
|
||||
|
||||
# on first iteration lazily permute all data
|
||||
if self.p == 0 and self.shuffle:
|
||||
inds = self.rng.permutation(self.data.shape[0])
|
||||
self.data = self.data[inds]
|
||||
self.labels = self.labels[inds]
|
||||
self.LM = self.LM[inds]
|
||||
|
||||
# on last iteration reset the counter and raise StopIteration
|
||||
if self.p + n > self.data.shape[0]:
|
||||
self.reset() # reset for next time we get called
|
||||
raise StopIteration
|
||||
|
||||
# on intermediate iterations fetch the next batch
|
||||
x = self.data[self.p : self.p + n]
|
||||
y = self.labels[self.p : self.p + n]
|
||||
lmscore = self.LM[self.p : self.p + n]
|
||||
self.p += self.batch_size
|
||||
|
||||
if self.return_labels:
|
||||
return x,y, lmscore
|
||||
else:
|
||||
return x, lmscore
|
||||
|
||||
next = __next__ # Python 2 compatibility (https://stackoverflow.com/questions/29578469/how-to-make-an-object-both-a-python2-and-python3-iterator)
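# Note (interpretation): this loader expects a precomputed per-image score file
# '<LMscore>.train.npz' holding one float per training image under numpy's default
# key 'arr_0', in the same order as the CIFAR-10 training set (monitor.py passes
# LMscore='./cifar10_data/cifar10-LMscore').
if __name__ == '__main__':
    # write a placeholder file with the expected layout (dummy zeros, not real scores;
    # the file name is hypothetical so as not to clobber a real score file)
    np.savez('./cifar10_data/dummy-LMscore.train.npz', np.zeros(50000, dtype=np.float32))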
|
||||
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
import cifar10_data
|
||||
import argparse
|
||||
import plotting
|
||||
import numpy as np
|
||||
|
||||
data_dir = '/home/tim/data'
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--save_dir', type=str, default='./log')
|
||||
parser.add_argument('--data_dir', type=str, default='/home/tim/data')
|
||||
parser.add_argument('--plot_title', type=str, default=None)
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
data_dir = args.data_dir
|
||||
|
||||
trainx, trainy = cifar10_data.load(data_dir)
|
||||
|
||||
ids = [[] for i in range(10)]
|
||||
for i in range(trainx.shape[0]):
|
||||
if len(ids[trainy[i]]) < 10:
|
||||
ids[trainy[i]].append(i)
|
||||
if np.alltrue(np.asarray([len(_ids) >= 10 for _ids in ids])):
|
||||
break
|
||||
|
||||
images = np.zeros((10*10,32,32,3),dtype='uint8')
|
||||
for i in range(len(ids)):
|
||||
for j in range(len(ids[i])):
|
||||
images[10*j+i] = trainx[ids[i][j]].transpose([1,2,0])
|
||||
print(ids)
|
||||
|
||||
img_tile = plotting.img_tile(images, aspect_ratio=1.0, border_color=1.0, stretch=True)
|
||||
img = plotting.plot_img(img_tile, title=args.plot_title if args.plot_title != 'None' else None)
|
||||
plotting.plt.savefig(args.save_dir + '/cifar10_orig_images.png')
|
||||
plotting.plt.close('all')
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
"""
|
||||
Utilities for loading the small ImageNet dataset used in Oord et al.
|
||||
use scripts/png_to_npz.py to create the npz files
|
||||
|
||||
The code here currently assumes that the preprocessing was done manually.
|
||||
TODO: make automatic and painless
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tarfile
|
||||
from six.moves import urllib
|
||||
|
||||
import numpy as np
|
||||
from scipy.misc import imread
|
||||
|
||||
def fetch(url, filepath):
|
||||
filename = url.split('/')[-1]
|
||||
def _progress(count, block_size, total_size):
|
||||
sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
|
||||
float(count * block_size) / float(total_size) * 100.0))
|
||||
sys.stdout.flush()
|
||||
print(url)
|
||||
filepath, headers = urllib.request.urlretrieve(url, filepath, _progress)
|
||||
print()
|
||||
statinfo = os.stat(filepath)
|
||||
print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
|
||||
|
||||
def maybe_download_and_extract(data_dir):
|
||||
# more info on the dataset at http://image-net.org/small/download.php
|
||||
# downloads and extracts the two tar files for train/val
|
||||
|
||||
train_dir = os.path.join(data_dir, 'train_32x32')
|
||||
if not os.path.exists(train_dir):
|
||||
train_url = 'http://image-net.org/small/train_32x32.tar' # 4GB
|
||||
filepath = os.path.join(data_dir, 'train_32x32.tar')
|
||||
fetch(train_url, filepath)
|
||||
print('unpacking the tar file', filepath)
|
||||
tarfile.open(filepath, 'r').extractall(data_dir) # creates the train_32x32 folder
|
||||
|
||||
test_dir = os.path.join(data_dir, 'valid_32x32')
|
||||
if not os.path.exists(test_dir):
|
||||
test_url = 'http://image-net.org/small/valid_32x32.tar' # 154MB
|
||||
filepath = os.path.join(data_dir, 'valid_32x32.tar')
|
||||
fetch(test_url, filepath)
|
||||
print('unpacking the tar file', filepath)
|
||||
tarfile.open(filepath, 'r').extractall(data_dir) # creates the valid_32x32 folder
|
||||
|
||||
def maybe_preprocess(data_dir):
|
||||
|
||||
npz_file = os.path.join(data_dir, 'imgnet_32x32.npz')
|
||||
if os.path.exists(npz_file):
|
||||
return # all good
|
||||
|
||||
trainx = []
|
||||
train_dir = os.path.join(data_dir, 'train_32x32')
|
||||
for f in os.listdir(train_dir):
|
||||
if f.endswith('.png'):
|
||||
print('reading', f)
|
||||
filepath = os.path.join(train_dir, f)
|
||||
trainx.append(imread(filepath).reshape((1,32,32,3)))
|
||||
trainx = np.concatenate(trainx, axis=0)
|
||||
|
||||
testx = []
|
||||
test_dir = os.path.join(data_dir, 'valid_32x32')
|
||||
for f in os.listdir(test_dir):
|
||||
if f.endswith('.png'):
|
||||
print('reading', f)
|
||||
filepath = os.path.join(test_dir, f)
|
||||
testx.append(imread(filepath).reshape((1,32,32,3)))
|
||||
testx = np.concatenate(testx, axis=0)
|
||||
|
||||
np.savez(npz_file, trainx=trainx, testx=testx)
|
||||
|
||||
|
||||
def load(data_dir, subset='train'):
|
||||
if not os.path.exists(data_dir):
|
||||
print('creating folder', data_dir)
|
||||
os.makedirs(data_dir)
|
||||
maybe_download_and_extract(data_dir)
|
||||
maybe_preprocess(data_dir)
|
||||
imagenet_data = np.load(os.path.join(data_dir,'imgnet_32x32.npz'))
|
||||
return imagenet_data['trainx'] if subset == 'train' else imagenet_data['testx']
|
||||
|
||||
|
||||
|
||||
class DataLoader(object):
|
||||
""" an object that generates batches of CIFAR-10 data for training """
|
||||
|
||||
def __init__(self, data_dir, subset, batch_size, rng=None, shuffle=False):
|
||||
"""
|
||||
- data_dir is location where the files are stored
|
||||
- subset is train|test
|
||||
- batch_size is int, of #examples to load at once
|
||||
- rng is np.random.RandomState object for reproducibility
|
||||
"""
|
||||
|
||||
self.data_dir = data_dir
|
||||
self.batch_size = batch_size
|
||||
self.shuffle = shuffle
|
||||
|
||||
self.data = load(os.path.join(data_dir,'small_imagenet'), subset=subset)
|
||||
|
||||
self.p = 0 # pointer to where we are in iteration
|
||||
self.rng = np.random.RandomState(1) if rng is None else rng
|
||||
|
||||
def get_observation_size(self):
|
||||
return self.data.shape[1:]
|
||||
|
||||
def reset(self):
|
||||
self.p = 0
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self, n=None):
|
||||
""" n is the number of examples to fetch """
|
||||
if n is None: n = self.batch_size
|
||||
|
||||
# on first iteration lazily permute all data
|
||||
if self.p == 0 and self.shuffle:
|
||||
inds = self.rng.permutation(self.data.shape[0])
|
||||
self.data = self.data[inds]
|
||||
|
||||
# on last iteration reset the counter and raise StopIteration
|
||||
if self.p + n > self.data.shape[0]:
|
||||
self.reset() # reset for next time we get called
|
||||
raise StopIteration
|
||||
|
||||
# on intermediate iterations fetch the next batch
|
||||
x = self.data[self.p : self.p + n]
|
||||
self.p += self.batch_size
|
||||
|
||||
return x
|
||||
|
||||
next = __next__ # Python 2 compatibility (https://stackoverflow.com/questions/29578469/how-to-make-an-object-both-a-python2-and-python3-iterator)
|
||||
|
Binary file not shown.
After | Width: | Height: | Size: 577 KiB |
|
@ -0,0 +1,133 @@
|
|||
"""
|
||||
Utilities for downloading and unpacking the CIFAR-10 dataset, originally published
|
||||
by Krizhevsky et al. and hosted here: https://www.cs.toronto.edu/~kriz/cifar.html
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tarfile
|
||||
from six.moves import urllib
|
||||
import numpy as np
|
||||
|
||||
def maybe_download_and_extract(data_dir, url='http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'):
|
||||
if not os.path.exists(os.path.join(data_dir, 'cifar-10-batches-py')):
|
||||
if not os.path.exists(data_dir):
|
||||
os.makedirs(data_dir)
|
||||
filename = url.split('/')[-1]
|
||||
filepath = os.path.join(data_dir, filename)
|
||||
if not os.path.exists(filepath):
|
||||
def _progress(count, block_size, total_size):
|
||||
sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
|
||||
float(count * block_size) / float(total_size) * 100.0))
|
||||
sys.stdout.flush()
|
||||
filepath, _ = urllib.request.urlretrieve(url, filepath, _progress)
|
||||
print()
|
||||
statinfo = os.stat(filepath)
|
||||
print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
|
||||
tarfile.open(filepath, 'r:gz').extractall(data_dir)
|
||||
|
||||
def unpickle(file):
|
||||
fo = open(file, 'rb')
|
||||
if (sys.version_info >= (3, 0)):
|
||||
import pickle
|
||||
d = pickle.load(fo, encoding='latin1')
|
||||
else:
|
||||
import cPickle
|
||||
d = cPickle.load(fo)
|
||||
fo.close()
|
||||
return {'x': d['data'].reshape((10000,3,32,32)), 'y': np.array(d['labels']).astype(np.uint8)}
|
||||
|
||||
def load(data_dir, subset='train'):
|
||||
maybe_download_and_extract(data_dir)
|
||||
if subset=='train':
|
||||
train_data = [unpickle(os.path.join(data_dir,'cifar-10-batches-py','data_batch_' + str(i))) for i in range(1,6)]
|
||||
trainx = np.concatenate([d['x'] for d in train_data],axis=0)
|
||||
trainy = np.concatenate([d['y'] for d in train_data],axis=0)
|
||||
return trainx, trainy
|
||||
elif subset=='test':
|
||||
test_data = unpickle(os.path.join(data_dir,'cifar-10-batches-py','test_batch'))
|
||||
testx = test_data['x']
|
||||
testy = test_data['y']
|
||||
return testx, testy
|
||||
else:
|
||||
raise NotImplementedError('subset should be either train or test')
|
||||
|
||||
class DataLoader(object):
|
||||
""" an object that generates batches of CIFAR-10 data for training """
|
||||
|
||||
def __init__(self, data_dir, subset, batch_size, rng=None, shuffle=False, return_labels=False, filter_labels=None,final=8):
|
||||
"""
|
||||
- data_dir is location where to store files
|
||||
- subset is train|test
|
||||
- batch_size is int, of #examples to load at once
|
||||
- rng is np.random.RandomState object for reproducibility
|
||||
"""
|
||||
|
||||
self.data_dir = data_dir
|
||||
self.batch_size = batch_size
|
||||
self.shuffle = shuffle
|
||||
self.return_labels = return_labels
|
||||
|
||||
# create temporary storage for the data, if not yet created
|
||||
if not os.path.exists(data_dir):
|
||||
print('creating folder', data_dir)
|
||||
os.makedirs(data_dir)
|
||||
|
||||
# load CIFAR-10 training data to RAM
|
||||
self.data, self.labels = load(os.path.join(data_dir,'cifar-10-python'), subset=subset)
|
||||
if final > 0:
|
||||
self.data = np.tile(self.data[-final:],[3,1,1,1])
|
||||
self.labels = np.tile(self.labels[-final:],[3])
|
||||
|
||||
|
||||
if filter_labels is not None:
|
||||
selected_idx = self.labels == filter_labels
|
||||
self.data = self.data[selected_idx]
|
||||
self.labels = self.labels[selected_idx]
|
||||
print('There are %d samples left' % self.labels.size)
|
||||
|
||||
self.data = np.transpose(self.data, (0,2,3,1)) # (N,3,32,32) -> (N,32,32,3)
|
||||
|
||||
self.p = 0 # pointer to where we are in iteration
|
||||
self.rng = np.random.RandomState(1) if rng is None else rng
|
||||
|
||||
def get_observation_size(self):
|
||||
return self.data.shape[1:]
|
||||
|
||||
def get_num_labels(self):
|
||||
return np.amax(self.labels) + 1
|
||||
|
||||
def reset(self):
|
||||
self.p = 0
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self, n=None):
|
||||
""" n is the number of examples to fetch """
|
||||
if n is None: n = self.batch_size
|
||||
|
||||
# on first iteration lazily permute all data
|
||||
if self.p == 0 and self.shuffle:
|
||||
inds = self.rng.permutation(self.data.shape[0])
|
||||
self.data = self.data[inds]
|
||||
self.labels = self.labels[inds]
|
||||
|
||||
# on last iteration reset the counter and raise StopIteration
|
||||
if self.p + n > self.data.shape[0]:
|
||||
self.reset() # reset for next time we get called
|
||||
raise StopIteration
|
||||
|
||||
# on intermediate iterations fetch the next batch
|
||||
x = self.data[self.p : self.p + n]
|
||||
y = self.labels[self.p : self.p + n]
|
||||
self.p += self.batch_size
|
||||
|
||||
if self.return_labels:
|
||||
return x,y
|
||||
else:
|
||||
return x
|
||||
|
||||
next = __next__ # Python 2 compatibility (https://stackoverflow.com/questions/29578469/how-to-make-an-object-both-a-python2-and-python3-iterator)
|
||||
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
import cifar10_data
|
||||
import argparse
|
||||
import plotting
|
||||
import numpy as np
|
||||
|
||||
data_dir = '/home/tim/data'
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--save_dir', type=str, default='./log')
|
||||
parser.add_argument('--data_dir', type=str, default='/home/tim/data')
|
||||
parser.add_argument('--plot_title', type=str, default=None)
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
data_dir = args.data_dir
|
||||
|
||||
trainx, trainy = cifar10_data.load(data_dir)
|
||||
|
||||
ids = [[] for i in range(10)]
|
||||
for i in range(trainx.shape[0]):
|
||||
if len(ids[trainy[i]]) < 10:
|
||||
ids[trainy[i]].append(i)
|
||||
if np.alltrue(np.asarray([len(_ids) >= 10 for _ids in ids])):
|
||||
break
|
||||
|
||||
images = np.zeros((10*10,32,32,3),dtype='uint8')
|
||||
for i in range(len(ids)):
|
||||
for j in range(len(ids[i])):
|
||||
images[10*j+i] = trainx[ids[i][j]].transpose([1,2,0])
|
||||
print(ids)
|
||||
|
||||
img_tile = plotting.img_tile(images, aspect_ratio=1.0, border_color=1.0, stretch=True)
|
||||
img = plotting.plot_img(img_tile, title=args.plot_title if args.plot_title != 'None' else None)
|
||||
plotting.plt.savefig(args.save_dir + '/cifar10_orig_images.png')
|
||||
plotting.plt.close('all')
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
"""
|
||||
Utilities for loading the small ImageNet dataset used in Oord et al.
|
||||
use scripts/png_to_npz.py to create the npz files
|
||||
|
||||
The code here currently assumes that the preprocessing was done manually.
|
||||
TODO: make automatic and painless
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tarfile
|
||||
from six.moves import urllib
|
||||
|
||||
import numpy as np
|
||||
from scipy.misc import imread
|
||||
|
||||
def fetch(url, filepath):
|
||||
filename = url.split('/')[-1]
|
||||
def _progress(count, block_size, total_size):
|
||||
sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
|
||||
float(count * block_size) / float(total_size) * 100.0))
|
||||
sys.stdout.flush()
|
||||
print(url)
|
||||
filepath, headers = urllib.request.urlretrieve(url, filepath, _progress)
|
||||
print()
|
||||
statinfo = os.stat(filepath)
|
||||
print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
|
||||
|
||||
def maybe_download_and_extract(data_dir):
|
||||
# more info on the dataset at http://image-net.org/small/download.php
|
||||
# downloads and extracts the two tar files for train/val
|
||||
|
||||
train_dir = os.path.join(data_dir, 'train_32x32')
|
||||
if not os.path.exists(train_dir):
|
||||
train_url = 'http://image-net.org/small/train_32x32.tar' # 4GB
|
||||
filepath = os.path.join(data_dir, 'train_32x32.tar')
|
||||
fetch(train_url, filepath)
|
||||
print('unpacking the tar file', filepath)
|
||||
tarfile.open(filepath, 'r').extractall(data_dir) # creates the train_32x32 folder
|
||||
|
||||
test_dir = os.path.join(data_dir, 'valid_32x32')
|
||||
if not os.path.exists(test_dir):
|
||||
test_url = 'http://image-net.org/small/valid_32x32.tar' # 154MB
|
||||
filepath = os.path.join(data_dir, 'valid_32x32.tar')
|
||||
fetch(test_url, filepath)
|
||||
print('unpacking the tar file', filepath)
|
||||
tarfile.open(filepath, 'r').extractall(data_dir) # creates the valid_32x32 folder
|
||||
|
||||
def maybe_preprocess(data_dir):
|
||||
|
||||
npz_file = os.path.join(data_dir, 'imgnet_32x32.npz')
|
||||
if os.path.exists(npz_file):
|
||||
return # all good
|
||||
|
||||
trainx = []
|
||||
train_dir = os.path.join(data_dir, 'train_32x32')
|
||||
for f in os.listdir(train_dir):
|
||||
if f.endswith('.png'):
|
||||
print('reading', f)
|
||||
filepath = os.path.join(train_dir, f)
|
||||
trainx.append(imread(filepath).reshape((1,32,32,3)))
|
||||
trainx = np.concatenate(trainx, axis=0)
|
||||
|
||||
testx = []
|
||||
test_dir = os.path.join(data_dir, 'valid_32x32')
|
||||
for f in os.listdir(test_dir):
|
||||
if f.endswith('.png'):
|
||||
print('reading', f)
|
||||
filepath = os.path.join(test_dir, f)
|
||||
testx.append(imread(filepath).reshape((1,32,32,3)))
|
||||
testx = np.concatenate(testx, axis=0)
|
||||
|
||||
np.savez(npz_file, trainx=trainx, testx=testx)
|
||||
|
||||
|
||||
def load(data_dir, subset='train'):
|
||||
if not os.path.exists(data_dir):
|
||||
print('creating folder', data_dir)
|
||||
os.makedirs(data_dir)
|
||||
maybe_download_and_extract(data_dir)
|
||||
maybe_preprocess(data_dir)
|
||||
imagenet_data = np.load(os.path.join(data_dir,'imgnet_32x32.npz'))
|
||||
return imagenet_data['trainx'] if subset == 'train' else imagenet_data['testx']
|
||||
|
||||
|
||||
|
||||
class DataLoader(object):
|
||||
""" an object that generates batches of CIFAR-10 data for training """
|
||||
|
||||
def __init__(self, data_dir, subset, batch_size, rng=None, shuffle=False):
|
||||
"""
|
||||
- data_dir is location where the files are stored
|
||||
- subset is train|test
|
||||
- batch_size is int, of #examples to load at once
|
||||
- rng is np.random.RandomState object for reproducibility
|
||||
"""
|
||||
|
||||
self.data_dir = data_dir
|
||||
self.batch_size = batch_size
|
||||
self.shuffle = shuffle
|
||||
|
||||
self.data = load(os.path.join(data_dir,'small_imagenet'), subset=subset)
|
||||
|
||||
self.p = 0 # pointer to where we are in iteration
|
||||
self.rng = np.random.RandomState(1) if rng is None else rng
|
||||
|
||||
def get_observation_size(self):
|
||||
return self.data.shape[1:]
|
||||
|
||||
def reset(self):
|
||||
self.p = 0
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self, n=None):
|
||||
""" n is the number of examples to fetch """
|
||||
if n is None: n = self.batch_size
|
||||
|
||||
# on first iteration lazily permute all data
|
||||
if self.p == 0 and self.shuffle:
|
||||
inds = self.rng.permutation(self.data.shape[0])
|
||||
self.data = self.data[inds]
|
||||
|
||||
# on last iteration reset the counter and raise StopIteration
|
||||
if self.p + n > self.data.shape[0]:
|
||||
self.reset() # reset for next time we get called
|
||||
raise StopIteration
|
||||
|
||||
# on intermediate iterations fetch the next batch
|
||||
x = self.data[self.p : self.p + n]
|
||||
self.p += self.batch_size
|
||||
|
||||
return x
|
||||
|
||||
next = __next__ # Python 2 compatibility (https://stackoverflow.com/questions/29578469/how-to-make-an-object-both-a-python2-and-python3-iterator)
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
export PATH=/usr/anaconda2/bin:$PATH
|
||||
#export LD_LIBRARY_PATH=~/Downloads/cuda/lib64:"$LD_LIBRARY_PATH"
|
||||
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
||||
|
||||
# train both models jointly (tested with 4 GPUs)
|
||||
python monitor.py --data_dir=./cifar10_data --save_dir=./checkpoints_All --batch_size=12 --show_interval=10 --learning_rate=1e-4 --load_params=params_345uidx480248.ckpt --learning_rate_I2L=2e-4 --trade_off_I2L=30. --trade_off_L2I=1.5 --save_interval=1 --bias=0.02 --valid_interval=8 --lr_decay=1. --nr_gpu=4
|
||||
|
||||
# train the image classifier only (tested with a single GPU)
|
||||
# python monitor.py --data_dir=./cifar10_data --save_dir=./checkpoints_I2L --batch_size=12 --show_interval=10 --learning_rate=1e-4 --load_params=params_345uidx480248.ckpt --learning_rate_I2L=2e-4 --trade_off_I2L=30. --trade_off_L2I=1.5 --save_interval=1 --bias=0.02 --valid_interval=8 --lr_decay=1. --nr_gpu=1 --oneside=I2L
|
||||
|
||||
# train the image generator only (tested with 2 GPUs)
|
||||
# python monitor.py --data_dir=./cifar10_data --save_dir=./checkpoints_L2I --batch_size=12 --show_interval=10 --learning_rate=1e-4 --load_params=params_345uidx480248.ckpt --learning_rate_I2L=2e-4 --trade_off_I2L=30. --trade_off_L2I=1.5 --save_interval=1 --bias=0.02 --valid_interval=8 --lr_decay=1. --nr_gpu=2 --oneside=L2I
|
|
@ -0,0 +1,462 @@
|
|||
import time
|
||||
import sys
|
||||
import os
|
||||
|
||||
import cifar_input
|
||||
import numpy as np
|
||||
import resnet_model_basic as resnet_model
|
||||
import tensorflow as tf
|
||||
import data.cifar10_data as cifar10_data
|
||||
import data2.cifar10_data as cifar_10data2
|
||||
|
||||
import json
|
||||
|
||||
from worker_I2L import worker_I2L, lr_I2L
|
||||
from worker_L2I import worker_L2I
|
||||
import argparse
|
||||
import time
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
parser = argparse.ArgumentParser()
|
||||
# data I/O
|
||||
parser.add_argument('-i', '--data_dir', type=str, default='/tmp/pxpp/data', help='Location for the dataset')
|
||||
parser.add_argument('-o', '--save_dir', type=str, default='/tmp/pxpp/save', help='Location for parameter checkpoints and samples')
|
||||
parser.add_argument('-d', '--data_set', type=str, default='cifar', help='Can be either cifar|imagenet')
|
||||
parser.add_argument('-t', '--save_interval', type=int, default=2, help='Every how many epochs to write checkpoint/samples?')
|
||||
parser.add_argument('--valid_interval', type=int, default=1, help='Every how many epochs to valid?')
|
||||
parser.add_argument('-r', '--load_params', type=str, default=None, help='The detailed model name')
|
||||
|
||||
|
||||
# model
|
||||
parser.add_argument('-q', '--nr_resnet', type=int, default=5, help='Number of residual blocks per stage of the model')
|
||||
parser.add_argument('-n', '--nr_filters', type=int, default=160, help='Number of filters to use across the model. Higher = larger model.')
|
||||
parser.add_argument('-m', '--nr_logistic_mix', type=int, default=10, help='Number of logistic components in the mixture. Higher = more flexible model')
|
||||
parser.add_argument('-z', '--resnet_nonlinearity', type=str, default='concat_elu', help='Which nonlinearity to use in the ResNet layers. One of "concat_elu", "elu", "relu" ')
|
||||
parser.add_argument('-c', '--class_conditional', dest='class_conditional', action='store_true', help='Condition generative model on labels?')
|
||||
parser.add_argument('--trade_off_I2L', type=float, default=5e-3, help='the consistency trade-off weight on the I2L side')
|
||||
parser.add_argument('--trade_off_L2I', type=float, default=0.3, help='the consistency trade-off weight on the L2I side')
|
||||
parser.add_argument('-w', '--use_wide_resnet', dest='use_wide_resnet', action='store_true', help='Use a wide ResNet for the image classifier?')
|
||||
parser.add_argument('--show_interval', type=int, default=100, help='How many training iterations between progress printouts')
|
||||
parser.add_argument('--steal_params_L2I', type=str, default=None, help='Provide the file, which stores the warm values of L2I')
|
||||
parser.add_argument('--steal_params_I2L', type=str, default=None, help='Provide the file, which stores the warm values of I2L')
|
||||
parser.add_argument('--oneside', dest='oneside', type=str, default=None, help='None | I2L | L2I')
|
||||
|
||||
# optimization
|
||||
parser.add_argument('--learning_rate_I2L', type=float, default=0.001, help='Base learning rate')
|
||||
parser.add_argument('-l', '--learning_rate', type=float, default=0.001, help='Base learning rate')
|
||||
parser.add_argument('-e', '--lr_decay', type=float, default=0.999995, help='Learning rate decay, applied every step of the optimization')
|
||||
parser.add_argument('-b', '--batch_size', type=int, default=12, help='Batch size during training per GPU')
|
||||
parser.add_argument('-a', '--init_batch_size', type=int, default=100, help='How much data to use for data-dependent initialization.')
|
||||
parser.add_argument('-p', '--dropout_p', type=float, default=0.5, help='Dropout strength (i.e. 1 - keep_prob). 0 = No dropout, higher = more dropout.')
|
||||
parser.add_argument('-x', '--max_epochs', type=int, default=5000, help='How many epochs to run in total?')
|
||||
parser.add_argument('-g', '--nr_gpu', type=int, default=1, help='How many GPUs to distribute the training across?')
|
||||
parser.add_argument('--num_classes', type=int, default=10, help='number of classes')
|
||||
parser.add_argument('--L2I_normalization', dest='L2I_normalization', action='store_true', help='Use L2I normalization')
|
||||
parser.add_argument('--L2IuseSGD', dest='L2IuseSGD', action='store_true', help='Whether to use pure SGD to tune L2I')
|
||||
parser.add_argument('--useSoftLabel', type=int, default=0, help='0: no use | 1: use | 2: -0.1')
|
||||
# Activate "useSoftLabel" or not does not make significant differences. So my suggestion is that we do not need it. Also, I did not test useSoftLabel under multiple GPU settings
|
||||
parser.add_argument('--bias', type=float, default=0.0, help='introduce the bias')
|
||||
|
||||
|
||||
# evaluation
|
||||
parser.add_argument('--polyak_decay', type=float, default=0.9995, help='Exponential decay rate of the sum of previous model iterates during Polyak averaging')
|
||||
parser.add_argument('--mode', type=str, default='train', help='train | I2L | L2I | ImgGen' )
|
||||
|
||||
# reproducibility
|
||||
parser.add_argument('-s', '--seed', type=int, default=1, help='Random seed to use')
|
||||
args = parser.parse_args()
|
||||
print('input args:\n', json.dumps(vars(args), indent=4, separators=(',',':'))) # pretty print args
|
||||
|
||||
DataLoader = cifar10_data.DataLoader
|
||||
DataLoader_train = cifar_10data2.DataLoader
|
||||
rng = np.random.RandomState(args.seed)
|
||||
train_data_iterator = DataLoader_train(args.data_dir, 'train', args.batch_size * args.nr_gpu,
|
||||
'./cifar10_data/cifar10-LMscore',
|
||||
rng=rng, shuffle=True, return_labels=True)
|
||||
test_data_iterator = DataLoader(args.data_dir, 'test', args.batch_size * args.nr_gpu, shuffle=False, return_labels=True)
|
||||
|
||||
|
||||
class monitor(object):
|
||||
def __init__(self):
|
||||
if not os.path.exists(args.save_dir):
|
||||
os.makedirs(args.save_dir)
|
||||
self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
|
||||
self.Worker_L2I = worker_L2I(args, train_data_iterator.get_num_labels(), train_data_iterator.get_observation_size())
|
||||
self.Worker_I2L = worker_I2L(args)
|
||||
|
||||
self.image_LM = [tf.placeholder(tf.float32, shape=(args.batch_size,)) for _ in range(args.nr_gpu)]
|
||||
self.trade_off_I2L = tf.placeholder(tf.float32, shape=())
|
||||
self.trade_off_L2I = tf.placeholder(tf.float32, shape=())
|
||||
|
||||
self.I2L_grads = []
|
||||
self.train_uidx = 0
|
||||
self._build_onestep(oneside=args.oneside)
|
||||
|
||||
self.lr_l2i = self.Worker_L2I.args.learning_rate
|
||||
self.current_epoch = 0
|
||||
|
||||
self.assign_op = lambda ref_, val_: tf.assign(ref_, val_)
|
||||
|
||||
def get_I2L_lr(self):
|
||||
if args.use_wide_resnet:
|
||||
step_wise = [60, 120, 160]
|
||||
#step_wise = [51000, 76000, 102000] # counted by iter with batch_size 100
|
||||
if self.current_epoch < step_wise[0]:
|
||||
return args.learning_rate_I2L
|
||||
elif self.current_epoch < step_wise[1]:
|
||||
return args.learning_rate_I2L * 0.2
|
||||
elif self.current_epoch < step_wise[2]:
|
||||
return args.learning_rate_I2L * 0.04
|
||||
else:
|
||||
return args.learning_rate_I2L * 0.008
|
||||
else:
|
||||
step_wise = [102, 153, 204]
|
||||
#step_wise = [51000, 76000, 102000] # counted by iter with batch_size 100
|
||||
if self.current_epoch < step_wise[0]:
|
||||
return args.learning_rate_I2L
|
||||
elif self.current_epoch < step_wise[1]:
|
||||
return args.learning_rate_I2L * 0.1
|
||||
elif self.current_epoch < step_wise[2]:
|
||||
return args.learning_rate_I2L * 0.01
|
||||
else:
|
||||
return args.learning_rate_I2L * 0.001
|
||||
|
||||
def get_L2I_lr(self):
|
||||
self.lr_l2i *= self.Worker_L2I.args.lr_decay
|
||||
return self.lr_l2i
|
||||
|
||||
def __del__(self):
|
||||
self.sess.close()
|
||||
|
||||
def _build_onestep(self, oneside=None):
|
||||
# Calculate all the costs and gradients
|
||||
# Let us NOT use weight decay, since we already have a regularization term
|
||||
# self.weightDecay_I2L = self.Worker_I2L.model.GetWeightDecay()
|
||||
self.nlls_I2L = self.Worker_I2L.model.nlls
|
||||
self.soft_labels = self.Worker_I2L.model.predictions # this is the soft labels [optional, may not use it]
|
||||
nlls_L2I, loss_gen_test = self.Worker_L2I.GetLoss()
|
||||
|
||||
nlls_L2I_train_bpd_list, nlls_L2I_test_bpd_list, consistent_loss_list = \
|
||||
[None for _ in xrange(args.nr_gpu)], [None for _ in xrange(args.nr_gpu)], [None for _ in xrange(args.nr_gpu)]
|
||||
overall_cost_I2L_list, overall_cost_L2I_list, nlls_I2L_batchMean_list = \
|
||||
[None for _ in xrange(args.nr_gpu)], [None for _ in xrange(args.nr_gpu)], [None for _ in xrange(args.nr_gpu)]
|
||||
grads_I2L_list, grads_L2I_list = [None for _ in xrange(args.nr_gpu)], [None for _ in xrange(args.nr_gpu)]
|
||||
|
||||
for i in range(args.nr_gpu):
|
||||
with tf.device('/gpu:%d' % i):
|
||||
nlls_L2I_train_bpd = tf.reduce_mean(nlls_L2I[i]) / (np.log(2.) * 32 * 32 * 3 )
|
||||
nlls_L2I_test_bpd = tf.reduce_mean(loss_gen_test[i]) / (np.log(2.) * 32 * 32 * 3 * args.batch_size)
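# Interpretation note: image_LM is a precomputed marginal image likelihood in bits
# per dimension (hence the *log(2) conversion to nats), tf.log(0.1) acts as a uniform
# label prior log p(y) over the 10 classes, and 3072 = 32*32*3 rescales the per-image
# terms to per-dimension. The squared residual below therefore penalizes violations
# of the product-rule identity
#   log p(x) + log p(y|x) = log p(y) + log p(x|y),
# keeping the classifier (I2L) and the conditional image model (L2I) probabilistically consistent.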
|
||||
if args.L2I_normalization:
|
||||
consistent_loss = tf.reduce_mean(
|
||||
(self.image_LM[i] * np.log(2.) + self.nlls_I2L[i] + tf.log(0.1) - nlls_L2I[i] / (32. * 32 * 3)) ** 2.)
|
||||
else:
|
||||
consistent_loss = tf.reduce_mean(
|
||||
(self.image_LM[i] * np.log(2.) + (self.nlls_I2L[i] + tf.log(0.1) - nlls_L2I[i]) / 3072. + args.bias) ** 2.)
|
||||
|
||||
nlls_L2I_train_bpd_list[i] = nlls_L2I_train_bpd
|
||||
nlls_L2I_test_bpd_list[i] = nlls_L2I_test_bpd
|
||||
consistent_loss_list[i] = consistent_loss
|
||||
nlls_I2L_batchMean = tf.reduce_mean(self.nlls_I2L[i])
|
||||
overall_cost_I2L = nlls_I2L_batchMean + (self.trade_off_I2L ** 2.) * consistent_loss
|
||||
overall_cost_L2I = nlls_L2I_train_bpd + (self.trade_off_L2I ** 2.) * consistent_loss
|
||||
nlls_I2L_batchMean_list[i] = nlls_I2L_batchMean
|
||||
overall_cost_I2L_list[i] = overall_cost_I2L
|
||||
overall_cost_L2I_list[i] = overall_cost_L2I
|
||||
|
||||
if oneside is None:
|
||||
grads_I2L_list[i] = tf.gradients(overall_cost_I2L, self.Worker_I2L.model.trainable_variables)
|
||||
grads_L2I_list[i] = tf.gradients(overall_cost_L2I, self.Worker_L2I.all_params)
|
||||
elif oneside == 'I2L':
|
||||
grads_I2L_list[i] = tf.gradients(overall_cost_I2L, self.Worker_I2L.model.trainable_variables)
|
||||
elif oneside == 'L2I':
|
||||
grads_L2I_list[i] = tf.gradients(overall_cost_L2I, self.Worker_L2I.all_params)
|
||||
|
||||
with tf.device('/gpu:0'):
|
||||
for i in range(1, args.nr_gpu):
|
||||
nlls_L2I_train_bpd_list[0] += nlls_L2I_train_bpd_list[i]
|
||||
nlls_L2I_test_bpd_list[0] += nlls_L2I_test_bpd_list[i]
|
||||
consistent_loss_list[0] += consistent_loss_list[i]
|
||||
overall_cost_I2L_list[0] += overall_cost_I2L_list[i]
|
||||
overall_cost_L2I_list[0] += overall_cost_L2I_list[i]
|
||||
nlls_I2L_batchMean_list[0] += nlls_I2L_batchMean_list[i]
|
||||
|
||||
if oneside != 'L2I':
|
||||
for j in range(len(grads_I2L_list[0])):
|
||||
grads_I2L_list[0][j] += grads_I2L_list[i][j]
|
||||
if oneside != 'I2L':
|
||||
for j in range(len(grads_L2I_list[0])):
|
||||
grads_L2I_list[0][j] += grads_L2I_list[i][j]
|
||||
|
||||
if oneside != 'L2I':
|
||||
for j in range(len(grads_I2L_list[0])):
|
||||
grads_I2L_list[0][j] /= (args.nr_gpu * 1.)
|
||||
|
||||
if oneside != 'I2L':
|
||||
for j in range(len(grads_L2I_list[0])):
|
||||
grads_L2I_list[0][j] /= (args.nr_gpu * 1.)
|
||||
|
||||
|
||||
if oneside is None:
|
||||
self.Worker_I2L.model.Update(grads_I2L_list[0])
|
||||
self.Worker_L2I.Update(grads_L2I_list[0])
|
||||
elif oneside == 'I2L':
|
||||
self.Worker_I2L.model.Update(grads_I2L_list[0])
|
||||
elif oneside == 'L2I':
|
||||
self.Worker_L2I.Update(grads_L2I_list[0])
|
||||
|
||||
self.nlls_L2I_train_bpd = nlls_L2I_train_bpd_list[0] / args.nr_gpu
|
||||
self.nlls_L2I_test_bpd = nlls_L2I_test_bpd_list[0] / args.nr_gpu
|
||||
self.consistent_loss = consistent_loss_list[0] /args.nr_gpu
|
||||
self.nlls_I2L_batchMean = nlls_I2L_batchMean_list[0] / args.nr_gpu
|
||||
self.overall_cost_I2L = overall_cost_I2L_list[0] / args.nr_gpu
|
||||
self.overall_cost_L2I = overall_cost_L2I_list[0] / args.nr_gpu
|
||||
|
||||
# Build the sampler
|
||||
self.Worker_L2I.build_sample_from_model()
|
||||
|
||||
def step(self, images, labels, LMscores, currEpoch, use_soft_label=0):
|
||||
fetches = [self.nlls_I2L_batchMean, self.nlls_L2I_train_bpd, self.nlls_L2I_test_bpd, self.consistent_loss,
|
||||
self.overall_cost_I2L, self.overall_cost_L2I]
|
||||
if args.oneside is None:
|
||||
fetches.append(self.Worker_I2L.model.update_ops)
|
||||
fetches.append(self.Worker_L2I.update_ops)
|
||||
elif args.oneside == 'I2L':
|
||||
fetches.append(self.Worker_I2L.model.update_ops)
|
||||
elif args.oneside == 'L2I':
|
||||
fetches.append(self.Worker_L2I.update_ops)
|
||||
else:
|
||||
raise Exception('Currently, only None | I2L | L2I are supported')
|
||||
|
||||
feed_dict={
|
||||
# self.Worker_I2L.model.input_image: images.astype('float32'),
|
||||
# self.Worker_I2L.model.input_label: labels[:,None],
|
||||
self.Worker_I2L.model.lrn_rate: self.get_I2L_lr(),
|
||||
self.Worker_I2L.model.needImgAug: True,
|
||||
self.Worker_L2I.tf_lr: self.get_L2I_lr(),
|
||||
# self.image_LM: LMscores,
|
||||
self.trade_off_I2L: args.trade_off_I2L,
|
||||
self.trade_off_L2I: args.trade_off_L2I
|
||||
}
|
||||
|
||||
splitted_image = np.split(images.astype('float32'), args.nr_gpu)
|
||||
splitted_label = np.split(labels, args.nr_gpu)
|
||||
splitted_LM = np.split(LMscores, args.nr_gpu)
|
||||
|
||||
feed_dict.update({self.image_LM[i]: splitted_LM[i] for i in range(args.nr_gpu)})
|
||||
feed_dict.update({self.Worker_I2L.model.input_image[i]: splitted_image[i] for i in range(args.nr_gpu)})
|
||||
feed_dict.update({self.Worker_I2L.model.input_label[i]: splitted_label[i][:,None] for i in range(args.nr_gpu)})
|
||||
# Deal with xs and ys:
|
||||
x = np.cast[np.float32]((images - 127.5) / 127.5)
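# (images - 127.5) / 127.5 rescales uint8 pixels from [0, 255] to [-1, 1], the range the
# PixelCNN++ discretized-logistic loss expects.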
|
||||
x = np.split(x, self.Worker_L2I.args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.xs[i] : x[i] for i in range(self.Worker_L2I.args.nr_gpu)})
|
||||
if (use_soft_label == 2) or (use_soft_label == 1 and np.random.rand() < 0.8):
|
||||
soft_labels_ = self.sess.run(self.soft_labels, feed_dict={
|
||||
self.Worker_I2L.model.input_image: images.astype('float32'),
|
||||
self.Worker_I2L.model.needImgAug: True
|
||||
})
|
||||
if use_soft_label == 2:
|
||||
soft_labels_ -= 0.1
|
||||
feed_dict.update({self.Worker_L2I.hs[i]: soft_labels_ for i in range(self.Worker_L2I.args.nr_gpu)})
|
||||
else:
|
||||
y = np.split(labels, self.Worker_L2I.args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.ys[i] : y[i] for i in range(self.Worker_L2I.args.nr_gpu)})
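# use_soft_label modes, as read from the branches above: 0 always feeds one-hot labels to the
# generator; 1 feeds the classifier's soft predictions with probability 0.8 (one-hot otherwise);
# 2 always feeds the soft predictions shifted down by 0.1.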
|
||||
|
||||
if args.oneside is None:
|
||||
nlls_I2L_mean, nlls_L2I_mean, nlls_L2I_mean_test, consistent_loss, overall_cost_I2L, overall_cost_L2I, _, _ = \
|
||||
self.sess.run(fetches, feed_dict)
|
||||
else:
|
||||
nlls_I2L_mean, nlls_L2I_mean, nlls_L2I_mean_test, consistent_loss, overall_cost_I2L, overall_cost_L2I, _, = \
|
||||
self.sess.run(fetches, feed_dict)
|
||||
if self.train_uidx % args.show_interval == (args.show_interval - 1):
|
||||
print('iter={}, I2L={}, L2I={}/{}, Consistent={}, Overall_I2L={}, Overall_L2I={}'.format(
|
||||
self.train_uidx, '{0:.4f}'.format(nlls_I2L_mean), '{0:.4f}'.format(nlls_L2I_mean),
|
||||
'{0:.4f}'.format(nlls_L2I_mean_test), '{0:.4f}'.format(consistent_loss), '{0:.4f}'.format(overall_cost_I2L),
|
||||
'{0:.4f}'.format(overall_cost_L2I)
|
||||
))
|
||||
self.train_uidx += 1
|
||||
|
||||
def data_dependent_init(self):
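# One-off data-dependent initialization: all variables are initialized while the
# weight-normalized PixelCNN++ layers see a single init batch (presumably the init pass of
# Salimans & Kingma, 2016); the training iterator is then reset.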
|
||||
global_init = tf.global_variables_initializer()
|
||||
_images, _labels, _ = train_data_iterator.next(self.Worker_L2I.args.init_batch_size)
|
||||
initializer_dict = {
|
||||
self.Worker_L2I.x_init: (np.cast[np.float32](_images) - 127.5)/127.5,
|
||||
self.Worker_L2I.y_init: _labels
|
||||
}
|
||||
train_data_iterator.reset()
|
||||
self.sess.run(global_init, initializer_dict)
|
||||
|
||||
def L2I_TestNll(self, alpha_=1.):
|
||||
all_testnll = []
|
||||
for images, labels in test_data_iterator:
|
||||
feed_dict = {}
|
||||
x = np.cast[np.float32]((images - 127.5) / 127.5)
|
||||
x = np.split(x, args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.xs[i]: x[i] for i in range(args.nr_gpu)})
|
||||
if args.useSoftLabel == 1:
|
||||
soft_labels_ = self.sess.run(self.soft_labels, feed_dict={
|
||||
self.Worker_I2L.model.input_image: images.astype('float32'),
|
||||
self.Worker_I2L.model.needImgAug: False
|
||||
})
|
||||
one_hot_labels_ = np.zeros((args.batch_size, 10), dtype=np.float32)
|
||||
one_hot_labels_[np.arange(args.batch_size), labels] = 1.
|
||||
feed_dict.update({self.Worker_L2I.hs[i]: (1. - alpha_)*soft_labels_+alpha_*one_hot_labels_ for i in range(self.Worker_L2I.args.nr_gpu)})
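# alpha_ interpolates the label conditioning fed to the generator: alpha_=1 uses only the
# ground-truth one-hot labels, alpha_=0 uses only the classifier's soft predictions.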
|
||||
else:
|
||||
y = np.split(labels, args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.ys[i]: y[i] for i in range(args.nr_gpu)})
|
||||
all_testnll.append(self.sess.run([self.nlls_L2I_test_bpd], feed_dict))
|
||||
avg_testnll = np.mean(all_testnll)
|
||||
print('testnll=%f' % avg_testnll)
|
||||
|
||||
def build_saver(self):
|
||||
self.saver = tf.train.Saver(max_to_keep=None)
|
||||
#tf.reset_default_graph()
|
||||
if args.load_params is not None:
|
||||
print('Reload from ', args.save_dir)
|
||||
self.saver.restore(self.sess, args.save_dir + '/' + args.load_params)
|
||||
print('Done')
|
||||
else:
|
||||
print('Start to initialize the two models')
|
||||
self.data_dependent_init()
|
||||
print('Done')
|
||||
|
||||
def _steal_L2I(self):
|
||||
if args.steal_params_L2I is not None:
|
||||
# try to retrieve parameters that are not under the I2L/ scope from a well-trained model
|
||||
success_ = 0
|
||||
import pickle
|
||||
with open(args.steal_params_L2I, 'rb') as f:
|
||||
old_model = pickle.load(f)
|
||||
for vidx, v in enumerate(tf.global_variables()):
|
||||
if v.name in old_model and not v.name.startswith('I2L/'):
|
||||
self.sess.run(self.assign_op(v, old_model[v.name][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(tf.global_variables()))
|
||||
print('Retrieve %d / %d parameters from model %s' % (success_, len(old_model), args.steal_params_L2I))
|
||||
|
||||
'''
|
||||
# this version can only reload "trainable vars"
|
||||
success_ = 0
|
||||
import pickle
|
||||
with open(args.steal_params_L2I, 'rb') as f:
|
||||
old_model = pickle.load(f)
|
||||
for vidx, v in enumerate(self.Worker_L2I.all_params):
|
||||
if v.name in old_model:
|
||||
self.sess.run(self.assign_op(v, old_model[v.name][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(self.Worker_L2I.all_params))
|
||||
print('Retrieve %d / %d parameters from model %s' % (success_, len(old_model), args.steal_params_L2I))
|
||||
'''
|
||||
|
||||
def _steal_I2L(self):
|
||||
if args.steal_params_I2L is not None:
|
||||
# try to retrieve parameters from a well-trained model
|
||||
success_ = 0
|
||||
import pickle
|
||||
with open(args.steal_params_I2L, 'rb') as f:
|
||||
old_model = pickle.load(f)
|
||||
for vidx, v in enumerate(self.Worker_I2L.model.all_variables):
|
||||
if v.name[4:] in old_model:
|
||||
self.sess.run(self.assign_op(v, old_model[v.name[4:]][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(self.Worker_I2L.model.all_variables))
|
||||
print('Retrieve %d / %d parameters from model %s' % (success_, len(old_model), args.steal_params_I2L))
|
||||
|
||||
def _reload_from_pkl(self, filename):
|
||||
success_ = 0
|
||||
import pickle
|
||||
|
||||
with open(filename, 'rb') as f:
|
||||
old_model = pickle.load(f)
|
||||
|
||||
for vidx, v in enumerate(self.Worker_I2L.model.all_variables):
|
||||
if v.name in old_model:
|
||||
self.sess.run(self.assign_op(v, old_model[v.name][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(self.Worker_I2L.model.all_variables))
|
||||
|
||||
for vidx, v in enumerate(self.Worker_L2I.all_params):
|
||||
if v.name in old_model:
|
||||
self.sess.run(self.assign_op(v, old_model[v.name][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(tf.global_variables()))
|
||||
|
||||
print('Retrieve %d / %d parameters from %s' % (success_, len(old_model), filename))
|
||||
|
||||
def train(self):
|
||||
# do not delete the following three lines
|
||||
# self._reload_from_pkl('warm_values')
|
||||
# self.saver.save(self.sess, args.save_dir + '/params_stealt_models.ckpt')
|
||||
# return
|
||||
if args.load_params is None:
|
||||
self._steal_L2I()
|
||||
self._steal_I2L()
|
||||
self.saver.save(self.sess, args.save_dir + '/params_stealt_models.ckpt')
|
||||
for epoch in range(args.max_epochs):
|
||||
self.current_epoch = epoch
|
||||
for images, labels, LMscores in train_data_iterator:
|
||||
self.step(images, labels, LMscores, epoch, args.useSoftLabel)
|
||||
|
||||
# if epoch % args.valid_interval == (args.valid_interval - 1):
|
||||
# self.Worker_I2L.Valid(test_data_iterator, self.sess)
|
||||
# self.L2I_TestNll()
|
||||
|
||||
if epoch % args.save_interval == (args.save_interval - 1):
|
||||
self.saver.save(self.sess, args.save_dir + '/params_' + str(epoch) + 'uidx' + str(self.train_uidx) + '.ckpt')
|
||||
self.Worker_L2I.Gen_Images(self.sess, self.current_epoch)
|
||||
|
||||
def valid_I2L(self):
|
||||
self.Worker_I2L.Valid(test_data_iterator, self.sess)
|
||||
|
||||
def valid_L2I(self):
|
||||
self.L2I_TestNll()
|
||||
'''
|
||||
for alpha_ in range(11):
|
||||
print('alpha=%f' % (alpha_ * 0.1))
|
||||
self.L2I_TestNll(alpha_ * 0.1)
|
||||
'''
|
||||
|
||||
def valid_ImgGen(self):
|
||||
self.Worker_L2I.Gen_Images(self.sess, self.current_epoch)
|
||||
|
||||
def dump_model_to_pkl(self):
|
||||
warm_models = {}
|
||||
print('Classifier')
|
||||
classifier_size = len(self.Worker_I2L.model.all_variables)
|
||||
for idx, v in enumerate(self.Worker_I2L.model.all_variables):
|
||||
vv = self.sess.run([v])
|
||||
warm_models[v.name] = vv
|
||||
if idx % 10 == 0:
|
||||
print('{}-{}'.format(idx, classifier_size))
|
||||
|
||||
print('Generator')
|
||||
generator_size = len(self.Worker_L2I.all_params)
|
||||
for idx, v in enumerate(self.Worker_L2I.all_params):
|
||||
vv = self.sess.run([v])
|
||||
warm_models[v.name] = vv
|
||||
if idx % 10 == 0:
|
||||
print('{}-{}'.format(idx, generator_size))
|
||||
|
||||
import pickle
|
||||
with open('warm_values', 'wb') as f:
|
||||
pickle.dump(warm_models, f, protocol=2)
|
||||
|
||||
|
||||
|
||||
def main(_):
|
||||
#L2Ipath='./pxpp_c_2.95/params_cifar.ckpt'
|
||||
monitor_ = monitor()
|
||||
monitor_.build_saver()
|
||||
|
||||
if args.mode == 'train':
|
||||
monitor_.train()
|
||||
elif args.mode == 'I2L':
|
||||
monitor_.valid_I2L()
|
||||
elif args.mode == 'L2I':
|
||||
monitor_.valid_L2I()
|
||||
elif args.mode == 'ImgGen':
|
||||
monitor_.valid_ImgGen()
|
||||
else:
|
||||
print('Unsupported mode: ' + args.mode)
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.app.run()
|
|
@ -0,0 +1,332 @@
|
|||
import time
|
||||
import sys
|
||||
import os
|
||||
|
||||
import cifar_input
|
||||
import numpy as np
|
||||
import resnet_model_basic as resnet_model
|
||||
import tensorflow as tf
|
||||
import data.cifar10_data as cifar10_data
|
||||
import data2.cifar10_data as cifar_10data2
|
||||
|
||||
import json
|
||||
|
||||
from worker_I2L import worker_I2L, lr_I2L
|
||||
from worker_L2I import worker_L2I
|
||||
import argparse
|
||||
import time
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
parser = argparse.ArgumentParser()
|
||||
# data I/O
|
||||
parser.add_argument('-i', '--data_dir', type=str, default='/tmp/pxpp/data', help='Location for the dataset')
|
||||
parser.add_argument('-o', '--save_dir', type=str, default='/tmp/pxpp/save', help='Location for parameter checkpoints and samples')
|
||||
parser.add_argument('-d', '--data_set', type=str, default='cifar', help='Can be either cifar|imagenet')
|
||||
parser.add_argument('-t', '--save_interval', type=int, default=2, help='Every how many epochs to write checkpoint/samples?')
|
||||
parser.add_argument('--valid_interval', type=int, default=1, help='Every how many epochs to validate?')
|
||||
parser.add_argument('-r', '--load_params', type=str, default=None, help='The detailed model name')
|
||||
|
||||
|
||||
# model
|
||||
parser.add_argument('-q', '--nr_resnet', type=int, default=5, help='Number of residual blocks per stage of the model')
|
||||
parser.add_argument('-n', '--nr_filters', type=int, default=160, help='Number of filters to use across the model. Higher = larger model.')
|
||||
parser.add_argument('-m', '--nr_logistic_mix', type=int, default=10, help='Number of logistic components in the mixture. Higher = more flexible model')
|
||||
parser.add_argument('-z', '--resnet_nonlinearity', type=str, default='concat_elu', help='Which nonlinearity to use in the ResNet layers. One of "concat_elu", "elu", "relu" ')
|
||||
parser.add_argument('-c', '--class_conditional', dest='class_conditional', action='store_true', help='Condition generative model on labels?')
|
||||
parser.add_argument('--trade_off_I2L', type=float, default=5e-3, help='the consistency trade-off')
|
||||
parser.add_argument('--trade_off_L2I', type=float, default=0.3, help='the consistency trade-off')
|
||||
parser.add_argument('-w', '--use_wide_resnet', dest='use_wide_resnet', action='store_true', help='Use the wide ResNet classifier?')
|
||||
parser.add_argument('--show_interval', type=int, default=100, help='How often (in iterations) to print training statistics')
|
||||
parser.add_argument('--steal_params_L2I', type=str, default=None, help='Provide the file, which stores the warm values of L2I')
|
||||
parser.add_argument('--steal_params_I2L', type=str, default=None, help='Provide the file, which stores the warm values of I2L')
|
||||
parser.add_argument('--freezeL2I', dest='freezeL2I', action='store_true', help='Freeze L2I to quickly train L2I')
|
||||
|
||||
# optimization
|
||||
parser.add_argument('--learning_rate_I2L', type=float, default=0.001, help='Base learning rate')
|
||||
parser.add_argument('-l', '--learning_rate', type=float, default=0.001, help='Base learning rate')
|
||||
parser.add_argument('-e', '--lr_decay', type=float, default=0.999995, help='Learning rate decay, applied every step of the optimization')
|
||||
parser.add_argument('-b', '--batch_size', type=int, default=12, help='Batch size during training per GPU')
|
||||
parser.add_argument('-a', '--init_batch_size', type=int, default=100, help='How much data to use for data-dependent initialization.')
|
||||
parser.add_argument('-p', '--dropout_p', type=float, default=0.5, help='Dropout strength (i.e. 1 - keep_prob). 0 = No dropout, higher = more dropout.')
|
||||
parser.add_argument('-x', '--max_epochs', type=int, default=5000, help='How many epochs to run in total?')
|
||||
parser.add_argument('-g', '--nr_gpu', type=int, default=1, help='How many GPUs to distribute the training across?')
|
||||
parser.add_argument('--num_classes', type=int, default=10, help='number of classes')
|
||||
parser.add_argument('--L2I_normalization', dest='L2I_normalization', action='store_true', help='Use L2I normalization')
|
||||
parser.add_argument('--L2IuseSGD', dest='L2IuseSGD', action='store_true', help='Whether to use pure SGD to tune L2I')
|
||||
parser.add_argument('--useSoftLabel', type=int, default=0, help='0: no use | 1: use | 2: -0.1')
|
||||
parser.add_argument('--bias', type=float, default=0.0, help='introduce the bias')
|
||||
|
||||
|
||||
# evaluation
|
||||
parser.add_argument('--polyak_decay', type=float, default=0.9995, help='Exponential decay rate of the sum of previous model iterates during Polyak averaging')
|
||||
parser.add_argument('--mode', type=str, default='train', help='train | I2L | L2I | ImgGen' )
|
||||
|
||||
# reproducibility
|
||||
parser.add_argument('-s', '--seed', type=int, default=1, help='Random seed to use')
|
||||
args = parser.parse_args()
|
||||
print('input args:\n', json.dumps(vars(args), indent=4, separators=(',',':'))) # pretty print args
|
||||
|
||||
DataLoader = cifar10_data.DataLoader
|
||||
DataLoader_train = cifar_10data2.DataLoader
|
||||
rng = np.random.RandomState(args.seed)
|
||||
train_data_iterator = DataLoader_train(args.data_dir, 'train', args.batch_size * args.nr_gpu, './cifar10_data/cifar10-LMscore', './cifar10_data/cifar10-CLMscore', rng=rng, shuffle=True, return_labels=True)
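# Assumption (not documented here): the 'cifar10-LMscore' / 'cifar10-CLMscore' directories hold
# precomputed per-image language-model scores (unconditional and class-conditional) that the
# iterator returns alongside each batch and that feed the image_LM placeholder.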
|
||||
test_data_iterator = DataLoader(args.data_dir, 'test', args.batch_size * args.nr_gpu, shuffle=False, return_labels=True)
|
||||
|
||||
|
||||
class monitor(object):
|
||||
def __init__(self):
|
||||
if not os.path.exists(args.save_dir):
|
||||
os.makedirs(args.save_dir)
|
||||
self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
|
||||
self.Worker_L2I = worker_L2I(args, train_data_iterator.get_num_labels(), train_data_iterator.get_observation_size())
|
||||
self.Worker_I2L = worker_I2L(args)
|
||||
|
||||
self.image_LM = tf.placeholder(tf.float32, shape=(args.batch_size,))
|
||||
self.trade_off_I2L = tf.placeholder(tf.float32, shape=())
|
||||
self.trade_off_L2I = tf.placeholder(tf.float32, shape=())
|
||||
|
||||
self.I2L_grads = []
|
||||
self.train_uidx = 0
|
||||
self._build_onestep()
|
||||
|
||||
self.lr_l2i = self.Worker_L2I.args.learning_rate
|
||||
self.current_epoch = 0
|
||||
|
||||
self.assign_op = lambda ref_, val_: tf.assign(ref_, val_)
|
||||
|
||||
|
||||
def get_I2L_lr(self):
|
||||
if args.use_wide_resnet:
|
||||
step_wise = [60, 120, 160]
|
||||
#step_wise = [51000, 76000, 102000] # counted by iter with batch_size 100
|
||||
if self.current_epoch < step_wise[0]:
|
||||
return args.learning_rate_I2L
|
||||
elif self.current_epoch < step_wise[1]:
|
||||
return args.learning_rate_I2L * 0.2
|
||||
elif self.current_epoch < step_wise[2]:
|
||||
return args.learning_rate_I2L * 0.04
|
||||
else:
|
||||
return args.learning_rate_I2L * 0.008
|
||||
else:
|
||||
step_wise = [102, 153, 204]
|
||||
#step_wise = [51000, 76000, 102000] # counted by iter with batch_size 100
|
||||
if self.current_epoch < step_wise[0]:
|
||||
return args.learning_rate_I2L
|
||||
elif self.current_epoch < step_wise[1]:
|
||||
return args.learning_rate_I2L * 0.1
|
||||
elif self.current_epoch < step_wise[2]:
|
||||
return args.learning_rate_I2L * 0.01
|
||||
else:
|
||||
return args.learning_rate_I2L * 0.001
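# Step-wise classifier LR schedule: the base rate drops at epochs 60/120/160 (x0.2, x0.04,
# x0.008) for the wide ResNet, and at epochs 102/153/204 (x0.1, x0.01, x0.001) otherwise.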
|
||||
|
||||
def get_L2I_lr(self):
|
||||
self.lr_l2i *= self.Worker_L2I.args.lr_decay
|
||||
return self.lr_l2i
|
||||
|
||||
|
||||
def __del__(self):
|
||||
self.sess.close()
|
||||
|
||||
def _build_onestep(self):
|
||||
# Calculate all the costs and gradients
|
||||
# Let us NOT use weight decay, since we already have a regularization term
|
||||
# self.weightDecay_I2L = self.Worker_I2L.model.GetWeightDecay()
|
||||
self.nlls_I2L = self.Worker_I2L.model.nlls
|
||||
self.soft_labels = self.Worker_I2L.model.predictions # this is the soft labels [optional, may not use it]
|
||||
nlls_L2I, loss_gen_test = self.Worker_L2I.GetLoss()
|
||||
self.nlls_L2I_train_bpd = tf.reduce_mean(nlls_L2I) / (np.log(2.) * 32 * 32 * 3 * args.nr_gpu )
|
||||
self.nlls_L2I_test_bpd = tf.reduce_mean(loss_gen_test) / (np.log(2.) * 32 * 32 * 3 * args.nr_gpu * args.batch_size)
|
||||
if args.L2I_normalization:
|
||||
self.consistent_loss = tf.reduce_mean((self.image_LM * np.log(2.) + self.nlls_I2L + tf.log(0.1) - nlls_L2I / (32. * 32 * 3)) ** 2.)
|
||||
else:
|
||||
self.consistent_loss = tf.reduce_mean((self.image_LM * np.log(2.) + (self.nlls_I2L + tf.log(0.1))/3072. + args.bias) ** 2.)
|
||||
self.nlls_I2L_batchMean = tf.reduce_mean(self.nlls_I2L)
|
||||
|
||||
self.overall_cost_I2L = self.nlls_I2L_batchMean + (self.trade_off_I2L ** 2.) * self.consistent_loss
|
||||
self.overall_cost_L2I = self.nlls_L2I_train_bpd + (self.trade_off_L2I ** 2.) * self.consistent_loss
|
||||
#self.overall_cost_L2I = self.nlls_I2L_batchMean + self.nlls_L2I_train_bpd + self.weightDecay_I2L + self.trade_off * self.consistent_loss
|
||||
|
||||
grads_I2L = tf.gradients(self.overall_cost_I2L, self.Worker_I2L.model.trainable_variables)
|
||||
grads_L2I = tf.gradients(self.overall_cost_L2I, self.Worker_L2I.all_params)
|
||||
|
||||
# Update the parameters
|
||||
self.Worker_I2L.model.Update(grads_I2L)
|
||||
self.Worker_L2I.Update(grads_L2I, args.L2IuseSGD)
|
||||
|
||||
# Build the sampler
|
||||
self.Worker_L2I.build_sample_from_model()
|
||||
|
||||
def step(self, images, labels, LMscores, currEpoch, use_soft_label=0):
|
||||
fetches = [self.nlls_I2L_batchMean, self.consistent_loss,
|
||||
self.overall_cost_I2L,
|
||||
self.Worker_I2L.model.update_ops]
|
||||
|
||||
feed_dict={
|
||||
self.Worker_I2L.model.input_image: images.astype('float32'),
|
||||
self.Worker_I2L.model.input_label: labels[:,None],
|
||||
self.Worker_I2L.model.lrn_rate: self.get_I2L_lr(),
|
||||
self.Worker_I2L.model.needImgAug: True,
|
||||
self.Worker_L2I.tf_lr: self.get_L2I_lr(),
|
||||
self.image_LM: LMscores,
|
||||
self.trade_off_I2L: args.trade_off_I2L,
|
||||
self.trade_off_L2I: args.trade_off_L2I
|
||||
}
|
||||
# Deal with xs and ys:
|
||||
x = np.cast[np.float32]((images - 127.5) / 127.5)
|
||||
x = np.split(x, self.Worker_L2I.args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.xs[i] : x[i] for i in range(self.Worker_L2I.args.nr_gpu)})
|
||||
if (use_soft_label == 2) or (use_soft_label == 1 and np.random.rand() < 0.8):
|
||||
soft_labels_ = self.sess.run(self.soft_labels, feed_dict={
|
||||
self.Worker_I2L.model.input_image: images.astype('float32'),
|
||||
self.Worker_I2L.model.needImgAug: True
|
||||
})
|
||||
if use_soft_label == 2:
|
||||
soft_labels_ -= 0.1
|
||||
feed_dict.update({self.Worker_L2I.hs[i]: soft_labels_ for i in range(self.Worker_L2I.args.nr_gpu)})
|
||||
else:
|
||||
y = np.split(labels, self.Worker_L2I.args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.ys[i] : y[i] for i in range(self.Worker_L2I.args.nr_gpu)})
|
||||
|
||||
|
||||
nlls_I2L_mean, consistent_loss, overall_cost_I2L, _ = \
|
||||
self.sess.run(fetches, feed_dict)
|
||||
|
||||
if self.train_uidx % args.show_interval == (args.show_interval - 1):
|
||||
print('iter={}, I2L={}, Consistent={}, Overall_I2L={}'.format(
|
||||
self.train_uidx, '{0:.6f}'.format(nlls_I2L_mean), '{0:.6f}'.format(consistent_loss), '{0:.6f}'.format(overall_cost_I2L),
|
||||
))
|
||||
self.train_uidx += 1
|
||||
|
||||
def data_dependent_init(self):
|
||||
global_init = tf.global_variables_initializer()
|
||||
_images, _labels, _ = train_data_iterator.next(self.Worker_L2I.args.init_batch_size)
|
||||
initializer_dict = {
|
||||
self.Worker_L2I.x_init: (np.cast[np.float32](_images) - 127.5)/127.5,
|
||||
self.Worker_L2I.y_init: _labels
|
||||
}
|
||||
train_data_iterator.reset()
|
||||
self.sess.run(global_init, initializer_dict)
|
||||
|
||||
def L2I_TestNll(self, alpha_=1.):
|
||||
all_testnll = []
|
||||
for images, labels in test_data_iterator:
|
||||
feed_dict = {}
|
||||
x = np.cast[np.float32]((images - 127.5) / 127.5)
|
||||
x = np.split(x, args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.xs[i]: x[i] for i in range(args.nr_gpu)})
|
||||
if args.useSoftLabel == 1:
|
||||
soft_labels_ = self.sess.run(self.soft_labels, feed_dict={
|
||||
self.Worker_I2L.model.input_image: images.astype('float32'),
|
||||
self.Worker_I2L.model.needImgAug: False
|
||||
})
|
||||
one_hot_labels_ = np.zeros((args.batch_size, 10), dtype=np.float32)
|
||||
one_hot_labels_[np.arange(args.batch_size), labels] = 1.
|
||||
feed_dict.update({self.Worker_L2I.hs[i]: (1. - alpha_)*soft_labels_+alpha_*one_hot_labels_ for i in range(self.Worker_L2I.args.nr_gpu)})
|
||||
else:
|
||||
y = np.split(labels, args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.ys[i]: y[i] for i in range(args.nr_gpu)})
|
||||
all_testnll.append(self.sess.run([self.nlls_L2I_test_bpd], feed_dict))
|
||||
avg_testnll = np.mean(all_testnll)
|
||||
print('[L2I], testnll={0:.6f}'.format(avg_testnll))
|
||||
|
||||
def build_saver(self):
|
||||
self.saver = tf.train.Saver(max_to_keep=None)
|
||||
if args.load_params is not None:
|
||||
print('Reload from ', args.save_dir)
|
||||
self.saver.restore(self.sess, args.save_dir + '/' + args.load_params)
|
||||
print('Done')
|
||||
else:
|
||||
print('Start to initialize the two models')
|
||||
self.data_dependent_init()
|
||||
print('Done')
|
||||
|
||||
def _steal_L2I(self):
|
||||
if args.steal_params_L2I is not None:
|
||||
# try to retrieve parameters that are not under the I2L/ scope from a well-trained model
|
||||
success_ = 0
|
||||
import pickle
|
||||
with open(args.steal_params_L2I, 'rb') as f:
|
||||
old_model = pickle.load(f)
|
||||
for vidx, v in enumerate(tf.global_variables()):
|
||||
if v.name in old_model and not v.name.startswith('I2L/'):
|
||||
self.sess.run(self.assign_op(v, old_model[v.name][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(tf.global_variables()))
|
||||
print('Retrieve %d / %d parameters from model %s' % (success_, len(old_model), args.steal_params_L2I))
|
||||
|
||||
'''
|
||||
# this version can only reload "trainable vars"
|
||||
success_ = 0
|
||||
import pickle
|
||||
with open(args.steal_params_L2I, 'rb') as f:
|
||||
old_model = pickle.load(f)
|
||||
for vidx, v in enumerate(self.Worker_L2I.all_params):
|
||||
if v.name in old_model:
|
||||
self.sess.run(self.assign_op(v, old_model[v.name][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(self.Worker_L2I.all_params))
|
||||
print('Retrieve %d / %d parameters from model %s' % (success_, len(old_model), args.steal_params_L2I))
|
||||
'''
|
||||
|
||||
def _steal_I2L(self):
|
||||
if args.steal_params_I2L is not None:
|
||||
# try to retrieve parameters from a well-trained model
|
||||
success_ = 0
|
||||
import pickle
|
||||
with open(args.steal_params_I2L, 'rb') as f:
|
||||
old_model = pickle.load(f)
|
||||
for vidx, v in enumerate(self.Worker_I2L.model.all_variables):
|
||||
if v.name[4:] in old_model:
|
||||
self.sess.run(self.assign_op(v, old_model[v.name[4:]][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(self.Worker_I2L.model.all_variables))
|
||||
print('Retrieve %d / %d parameters from model %s' % (success_, len(old_model), args.steal_params_I2L))
|
||||
|
||||
def train(self):
|
||||
if args.load_params is None:
|
||||
self._steal_L2I()
|
||||
self._steal_I2L()
|
||||
self.saver.save(self.sess, args.save_dir + '/params_stealt_models.ckpt')
|
||||
for epoch in range(args.max_epochs):
|
||||
self.current_epoch = epoch
|
||||
for images, labels, LMscores, CLMscores in train_data_iterator:
|
||||
self.step(images, labels, LMscores - CLMscores, epoch, args.useSoftLabel)
|
||||
|
||||
#if epoch % args.valid_interval == (args.valid_interval - 1):
|
||||
#self.Worker_I2L.Valid(test_data_iterator, self.sess)
|
||||
#self.L2I_TestNll()
|
||||
|
||||
if epoch % args.save_interval == (args.save_interval - 1):
|
||||
self.saver.save(self.sess, args.save_dir + '/params_' + str(epoch) + 'uidx' + str(self.train_uidx) + '.ckpt')
|
||||
#self.Worker_L2I.Gen_Images(self.sess, self.current_epoch)
|
||||
|
||||
def valid_I2L(self):
|
||||
self.Worker_I2L.Valid(test_data_iterator, self.sess)
|
||||
|
||||
def valid_L2I(self):
|
||||
self.L2I_TestNll()
|
||||
'''
|
||||
for alpha_ in range(11):
|
||||
print('alpha=%f' % (alpha_ * 0.1))
|
||||
self.L2I_TestNll(alpha_ * 0.1)
|
||||
'''
|
||||
def valid_ImgGen(self):
|
||||
self.Worker_L2I.Gen_Images(self.sess, self.current_epoch)
|
||||
|
||||
def main(_):
|
||||
#L2Ipath='./pxpp_c_2.95/params_cifar.ckpt'
|
||||
monitor_ = monitor()
|
||||
monitor_.build_saver()
|
||||
if args.mode == 'train':
|
||||
monitor_.train()
|
||||
elif args.mode == 'I2L':
|
||||
monitor_.valid_I2L()
|
||||
elif args.mode == 'L2I':
|
||||
monitor_.valid_L2I()
|
||||
elif args.mode == 'ImgGen':
|
||||
monitor_.valid_ImgGen()
|
||||
else:
|
||||
print('Unsupported mode: ' + args.mode)
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.app.run()
|
|
@ -0,0 +1,358 @@
|
|||
import time
|
||||
import sys
|
||||
import os
|
||||
|
||||
import cifar_input
|
||||
import numpy as np
|
||||
import resnet_model_basic as resnet_model
|
||||
import tensorflow as tf
|
||||
import data.cifar10_data as cifar10_data
|
||||
import data2.cifar10_data as cifar_10data2
|
||||
import data4.cifar10_data as cifar_10data3
|
||||
import json
|
||||
|
||||
|
||||
from worker_I2L import worker_I2L, lr_I2L
|
||||
from worker_L2I import worker_L2I
|
||||
import argparse
|
||||
import time
|
||||
|
||||
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
parser = argparse.ArgumentParser()
|
||||
# data I/O
|
||||
parser.add_argument('-i', '--data_dir', type=str, default='/tmp/pxpp/data', help='Location for the dataset')
|
||||
parser.add_argument('-o', '--save_dir', type=str, default='/tmp/pxpp/save', help='Location for parameter checkpoints and samples')
|
||||
parser.add_argument('-d', '--data_set', type=str, default='cifar', help='Can be either cifar|imagenet')
|
||||
parser.add_argument('-t', '--save_interval', type=int, default=2, help='Every how many epochs to write checkpoint/samples?')
|
||||
parser.add_argument('--valid_interval', type=int, default=1, help='Every how many epochs to validate?')
|
||||
parser.add_argument('-r', '--load_params', type=str, default=None, help='The detailed model name')
|
||||
|
||||
|
||||
# model
|
||||
parser.add_argument('-q', '--nr_resnet', type=int, default=5, help='Number of residual blocks per stage of the model')
|
||||
parser.add_argument('-n', '--nr_filters', type=int, default=160, help='Number of filters to use across the model. Higher = larger model.')
|
||||
parser.add_argument('-m', '--nr_logistic_mix', type=int, default=10, help='Number of logistic components in the mixture. Higher = more flexible model')
|
||||
parser.add_argument('-z', '--resnet_nonlinearity', type=str, default='concat_elu', help='Which nonlinearity to use in the ResNet layers. One of "concat_elu", "elu", "relu" ')
|
||||
parser.add_argument('-c', '--class_conditional', dest='class_conditional', action='store_true', help='Condition generative model on labels?')
|
||||
parser.add_argument('--trade_off_I2L', type=float, default=5e-3, help='the consistency trade-off')
|
||||
parser.add_argument('--trade_off_L2I', type=float, default=0.3, help='the consistency trade-off')
|
||||
parser.add_argument('-w', '--use_wide_resnet', dest='use_wide_resnet', action='store_true', help='Use the wide ResNet classifier?')
|
||||
parser.add_argument('--show_interval', type=int, default=100, help='How often (in iterations) to print training statistics')
|
||||
parser.add_argument('--steal_params_L2I', type=str, default=None, help='Provide the file, which stores the warm values of L2I')
|
||||
parser.add_argument('--steal_params_I2L', type=str, default=None, help='Provide the file, which stores the warm values of I2L')
|
||||
|
||||
|
||||
# optimization
|
||||
parser.add_argument('--learning_rate_I2L', type=float, default=0.001, help='Base learning rate')
|
||||
parser.add_argument('-l', '--learning_rate', type=float, default=0.001, help='Base learning rate')
|
||||
parser.add_argument('-e', '--lr_decay', type=float, default=0.999995, help='Learning rate decay, applied every step of the optimization')
|
||||
parser.add_argument('-b', '--batch_size', type=int, default=12, help='Batch size during training per GPU')
|
||||
parser.add_argument('-a', '--init_batch_size', type=int, default=100, help='How much data to use for data-dependent initialization.')
|
||||
parser.add_argument('-p', '--dropout_p', type=float, default=0.5, help='Dropout strength (i.e. 1 - keep_prob). 0 = No dropout, higher = more dropout.')
|
||||
parser.add_argument('-x', '--max_epochs', type=int, default=5000, help='How many epochs to run in total?')
|
||||
parser.add_argument('-g', '--nr_gpu', type=int, default=1, help='How many GPUs to distribute the training across?')
|
||||
parser.add_argument('--num_classes', type=int, default=10, help='number of classes')
|
||||
parser.add_argument('--L2I_normalization', dest='L2I_normalization', action='store_true', help='Use L2I normalization')
|
||||
parser.add_argument('--L2IuseSGD', dest='L2IuseSGD', action='store_true', help='Whether to use pure SGD to tune L2I')
|
||||
parser.add_argument('--useSoftLabel', type=int, default=0, help='0: no use | 1: use | 2: -0.1')
|
||||
parser.add_argument('--bias', type=float, default=0.0, help='introduce the bias')
|
||||
|
||||
|
||||
# evaluation
|
||||
parser.add_argument('--polyak_decay', type=float, default=0.9995, help='Exponential decay rate of the sum of previous model iterates during Polyak averaging')
|
||||
parser.add_argument('--mode', type=str, default='train', help='train | I2L | L2I | ImgGen' )
|
||||
|
||||
# reproducibility
|
||||
parser.add_argument('-s', '--seed', type=int, default=1, help='Random seed to use')
|
||||
args = parser.parse_args()
|
||||
print('input args:\n', json.dumps(vars(args), indent=4, separators=(',',':'))) # pretty print args
|
||||
|
||||
DataLoader = cifar_10data3.DataLoader
|
||||
DataLoader_train = cifar_10data2.DataLoader
|
||||
rng = np.random.RandomState(args.seed)
|
||||
train_data_iterator = DataLoader_train(args.data_dir, 'train', args.batch_size * args.nr_gpu, './cifar10_data/cifar10-LMscore', rng=rng, shuffle=True, return_labels=True)
|
||||
test_data_iterator = DataLoader(args.data_dir, 'test', args.batch_size * args.nr_gpu, shuffle=False, return_labels=True,final=4)
|
||||
|
||||
|
||||
class monitor(object):
|
||||
def __init__(self):
|
||||
if not os.path.exists(args.save_dir):
|
||||
os.makedirs(args.save_dir)
|
||||
self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
|
||||
self.Worker_L2I = worker_L2I(args, train_data_iterator.get_num_labels(), train_data_iterator.get_observation_size())
|
||||
self.Worker_I2L = worker_I2L(args)
|
||||
|
||||
self.image_LM = tf.placeholder(tf.float32, shape=(args.batch_size,))
|
||||
self.trade_off_I2L = tf.placeholder(tf.float32, shape=())
|
||||
self.trade_off_L2I = tf.placeholder(tf.float32, shape=())
|
||||
|
||||
self.I2L_grads = []
|
||||
self.train_uidx = 0
|
||||
self._build_onestep()
|
||||
|
||||
self.lr_l2i = self.Worker_L2I.args.learning_rate
|
||||
self.current_epoch = 0
|
||||
|
||||
self.assign_op = lambda ref_, val_: tf.assign(ref_, val_)
|
||||
'''
|
||||
self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
|
||||
self.Worker_L2I = worker_L2I(args, train_data_iterator.get_num_labels(), train_data_iterator.get_observation_size())
|
||||
self.saver = tf.train.Saver()
|
||||
if load_warm_start_models is None:
|
||||
print('Start to retrieve the (warm) initial L2I model')
|
||||
self.saver.restore(self.sess, L2Ipath)
|
||||
print('Done')
|
||||
self.Worker_I2L = worker_I2L(args)
|
||||
if load_warm_start_models is None:
|
||||
print('Start to initialize I2L model')
|
||||
self.sess.run(tf.variables_initializer(self.Worker_I2L.model.all_variables, name='coldInit_I2L_model'))
|
||||
print('Done')
|
||||
|
||||
if load_warm_start_models:
|
||||
self.saver.restore(self.sess, load_warm_start_models)
|
||||
|
||||
self.image_LM = tf.placeholder(tf.float32, shape=(args.batch_size,))
|
||||
self.trade_off = tf.placeholder(tf.float32, shape=())
|
||||
|
||||
self.I2L_grads = []
|
||||
self.train_uidx = 0
|
||||
'''
|
||||
|
||||
|
||||
def get_I2L_lr(self):
|
||||
if args.use_wide_resnet:
|
||||
step_wise = [60, 120, 160]
|
||||
#step_wise = [51000, 76000, 102000] # counted by iter with batch_size 100
|
||||
if self.current_epoch < step_wise[0]:
|
||||
return args.learning_rate_I2L
|
||||
elif self.current_epoch < step_wise[1]:
|
||||
return args.learning_rate_I2L * 0.2
|
||||
elif self.current_epoch < step_wise[2]:
|
||||
return args.learning_rate_I2L * 0.04
|
||||
else:
|
||||
return args.learning_rate_I2L * 0.008
|
||||
else:
|
||||
step_wise = [102, 153, 204]
|
||||
#step_wise = [51000, 76000, 102000] # counted by iter with batch_size 100
|
||||
if self.current_epoch < step_wise[0]:
|
||||
return args.learning_rate_I2L
|
||||
elif self.current_epoch < step_wise[1]:
|
||||
return args.learning_rate_I2L * 0.1
|
||||
elif self.current_epoch < step_wise[2]:
|
||||
return args.learning_rate_I2L * 0.01
|
||||
else:
|
||||
return args.learning_rate_I2L * 0.001
|
||||
|
||||
def get_L2I_lr(self):
|
||||
self.lr_l2i *= self.Worker_L2I.args.lr_decay
|
||||
return self.lr_l2i
|
||||
|
||||
|
||||
def __del__(self):
|
||||
self.sess.close()
|
||||
|
||||
def _build_onestep(self):
|
||||
# Calculate all the costs and gradients
|
||||
# Let us NOT use weight decay, since we already have a regularization term
|
||||
# self.weightDecay_I2L = self.Worker_I2L.model.GetWeightDecay()
|
||||
self.nlls_I2L = self.Worker_I2L.model.nlls
|
||||
self.soft_labels = self.Worker_I2L.model.predictions # this is the soft labels [optional, may not use it]
|
||||
nlls_L2I, loss_gen_test = self.Worker_L2I.GetLoss()
|
||||
self.nlls_L2I_train_bpd = tf.reduce_mean(nlls_L2I) / (np.log(2.) * 32 * 32 * 3 * args.nr_gpu )
|
||||
self.nlls_L2I_test_bpd = tf.reduce_mean(loss_gen_test) / (np.log(2.) * 32 * 32 * 3 * args.nr_gpu * args.batch_size)
|
||||
if args.L2I_normalization:
|
||||
self.consistent_loss = tf.reduce_mean((self.image_LM * np.log(2.) + self.nlls_I2L + tf.log(0.1) - nlls_L2I / (32. * 32 * 3)) ** 2.)
|
||||
else:
|
||||
self.consistent_loss = tf.reduce_mean((self.image_LM * np.log(2.) + (self.nlls_I2L + tf.log(0.1) - nlls_L2I)/3072. + args.bias) ** 2.)
|
||||
self.nlls_I2L_batchMean = tf.reduce_mean(self.nlls_I2L)
|
||||
|
||||
self.overall_cost_I2L = self.nlls_I2L_batchMean + (self.trade_off_I2L ** 2.) * self.consistent_loss
|
||||
self.overall_cost_L2I = self.nlls_L2I_train_bpd + (self.trade_off_L2I ** 2.) * self.consistent_loss
|
||||
#self.overall_cost_L2I = self.nlls_I2L_batchMean + self.nlls_L2I_train_bpd + self.weightDecay_I2L + self.trade_off * self.consistent_loss
|
||||
|
||||
grads_I2L = tf.gradients(self.overall_cost_I2L, self.Worker_I2L.model.trainable_variables)
|
||||
grads_L2I = tf.gradients(self.overall_cost_L2I, self.Worker_L2I.all_params)
|
||||
|
||||
# Update the parameters
|
||||
self.Worker_I2L.model.Update(grads_I2L)
|
||||
self.Worker_L2I.Update(grads_L2I, args.L2IuseSGD)
|
||||
|
||||
# Build the sampler
|
||||
self.Worker_L2I.build_sample_from_model()
|
||||
|
||||
def step(self, images, labels, LMscores, currEpoch, use_soft_label=0):
|
||||
fetches = [self.nlls_I2L_batchMean, self.nlls_L2I_train_bpd, self.nlls_L2I_test_bpd, self.consistent_loss,
|
||||
self.overall_cost_I2L, self.overall_cost_L2I,
|
||||
self.Worker_I2L.model.update_ops, self.Worker_L2I.update_ops]
|
||||
feed_dict={
|
||||
self.Worker_I2L.model.input_image: images.astype('float32'),
|
||||
self.Worker_I2L.model.input_label: labels[:,None],
|
||||
self.Worker_I2L.model.lrn_rate: self.get_I2L_lr(),
|
||||
self.Worker_I2L.model.needImgAug: True,
|
||||
self.Worker_L2I.tf_lr: self.get_L2I_lr(),
|
||||
self.image_LM: LMscores,
|
||||
self.trade_off_I2L: args.trade_off_I2L if currEpoch>3 else 0.,
|
||||
self.trade_off_L2I: args.trade_off_L2I if currEpoch>3 else 0.
|
||||
}
|
||||
# Deal with xs and ys:
|
||||
x = np.cast[np.float32]((images - 127.5) / 127.5)
|
||||
x = np.split(x, self.Worker_L2I.args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.xs[i] : x[i] for i in range(self.Worker_L2I.args.nr_gpu)})
|
||||
if (use_soft_label == 2) or (use_soft_label == 1 and np.random.rand() < 0.8):
|
||||
soft_labels_ = self.sess.run(self.soft_labels, feed_dict={
|
||||
self.Worker_I2L.model.input_image: images.astype('float32'),
|
||||
self.Worker_I2L.model.needImgAug: True
|
||||
})
|
||||
if use_soft_label == 2:
|
||||
soft_labels_ -= 0.1
|
||||
feed_dict.update({self.Worker_L2I.hs[i]: soft_labels_ for i in range(self.Worker_L2I.args.nr_gpu)})
|
||||
else:
|
||||
y = np.split(labels, self.Worker_L2I.args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.ys[i] : y[i] for i in range(self.Worker_L2I.args.nr_gpu)})
|
||||
nlls_I2L_mean, nlls_L2I_mean, nlls_L2I_mean_test, consistent_loss, overall_cost_I2L, overall_cost_L2I, _, _ = \
|
||||
self.sess.run(fetches, feed_dict)
|
||||
if self.train_uidx % args.show_interval == (args.show_interval - 1):
|
||||
print('iter={}, I2L={}, L2I={}/{}, Consistent={}, Overall_I2L={}, Overall_L2I={}'.format(
|
||||
self.train_uidx, '{0:.4f}'.format(nlls_I2L_mean), '{0:.4f}'.format(nlls_L2I_mean),
|
||||
'{0:.4f}'.format(nlls_L2I_mean_test), '{0:.4f}'.format(consistent_loss), '{0:.4f}'.format(overall_cost_I2L),
|
||||
'{0:.4f}'.format(overall_cost_L2I)
|
||||
))
|
||||
self.train_uidx += 1
|
||||
|
||||
def data_dependent_init(self):
|
||||
global_init = tf.global_variables_initializer()
|
||||
_images, _labels, _ = train_data_iterator.next(self.Worker_L2I.args.init_batch_size)
|
||||
initializer_dict = {
|
||||
self.Worker_L2I.x_init: (np.cast[np.float32](_images) - 127.5)/127.5,
|
||||
self.Worker_L2I.y_init: _labels
|
||||
}
|
||||
train_data_iterator.reset()
|
||||
self.sess.run(global_init, initializer_dict)
|
||||
|
||||
def L2I_TestNll(self, alpha_=1.):
|
||||
all_testnll = []
|
||||
for images, labels in test_data_iterator:
|
||||
feed_dict = {}
|
||||
x = np.cast[np.float32]((images - 127.5) / 127.5)
|
||||
x = np.split(x, args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.xs[i]: x[i] for i in range(args.nr_gpu)})
|
||||
if args.useSoftLabel == 1:
|
||||
soft_labels_ = self.sess.run(self.soft_labels, feed_dict={
|
||||
self.Worker_I2L.model.input_image: images.astype('float32'),
|
||||
self.Worker_I2L.model.needImgAug: False
|
||||
})
|
||||
one_hot_labels_ = np.zeros((args.batch_size, 10), dtype=np.float32)
|
||||
one_hot_labels_[np.arange(args.batch_size), labels] = 1.
|
||||
feed_dict.update({self.Worker_L2I.hs[i]: (1. - alpha_)*soft_labels_+alpha_*one_hot_labels_ for i in range(self.Worker_L2I.args.nr_gpu)})
|
||||
else:
|
||||
y = np.split(labels, args.nr_gpu)
|
||||
feed_dict.update({self.Worker_L2I.ys[i]: y[i] for i in range(args.nr_gpu)})
|
||||
all_testnll.append(self.sess.run([self.nlls_L2I_test_bpd], feed_dict))
|
||||
avg_testnll = np.mean(all_testnll)
|
||||
print('[L2I], testnll={0:.6f}'.format(avg_testnll))
|
||||
|
||||
def build_saver(self):
|
||||
self.saver = tf.train.Saver(max_to_keep=None)
|
||||
if args.load_params is not None:
|
||||
print('Reload from ', args.save_dir)
|
||||
self.saver.restore(self.sess, args.save_dir + '/' + args.load_params)
|
||||
print('Done')
|
||||
else:
|
||||
print('Start to initialize the two models')
|
||||
self.data_dependent_init()
|
||||
print('Done')
|
||||
|
||||
|
||||
def _steal_L2I(self):
|
||||
if args.steal_params_L2I is not None:
|
||||
# try to retrieve every matching parameter from a well-trained model
|
||||
success_ = 0
|
||||
import pickle
|
||||
with open(args.steal_params_L2I, 'rb') as f:
|
||||
old_model = pickle.load(f)
|
||||
for vidx, v in enumerate(tf.global_variables()):
|
||||
if v.name in old_model:
|
||||
self.sess.run(self.assign_op(v, old_model[v.name][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(tf.global_variables()))
|
||||
print('Retrieve %d / %d parameters from model %s' % (success_, len(old_model), args.steal_params_L2I))
|
||||
|
||||
'''
|
||||
# this version can only reload "trainable vars"
|
||||
success_ = 0
|
||||
import pickle
|
||||
with open(args.steal_params_L2I, 'rb') as f:
|
||||
old_model = pickle.load(f)
|
||||
for vidx, v in enumerate(self.Worker_L2I.all_params):
|
||||
if v.name in old_model:
|
||||
self.sess.run(self.assign_op(v, old_model[v.name][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(self.Worker_L2I.all_params))
|
||||
print('Retrieve %d / %d parameters from model %s' % (success_, len(old_model), args.steal_params_L2I))
|
||||
'''
|
||||
|
||||
def _steal_I2L(self):
|
||||
if args.steal_params_I2L is not None:
|
||||
# try to retrieve parameters from a well-trained model
|
||||
success_ = 0
|
||||
import pickle
|
||||
with open(args.steal_params_I2L, 'rb') as f:
|
||||
old_model = pickle.load(f)
|
||||
for vidx, v in enumerate(self.Worker_I2L.model.all_variables):
|
||||
if v.name[4:] in old_model:
|
||||
self.sess.run(self.assign_op(v, old_model[v.name[4:]][0]))
|
||||
success_ += 1
|
||||
print(vidx, len(self.Worker_I2L.model.all_variables))
|
||||
print('Retrieve %d / %d parameters from model %s' % (success_, len(old_model), args.steal_params_I2L))
|
||||
|
||||
def train(self):
|
||||
if args.load_params is None:
|
||||
self._steal_L2I()
|
||||
self._steal_I2L()
|
||||
self.saver.save(self.sess, args.save_dir + '/params_stealt_models.ckpt')
|
||||
for epoch in range(args.max_epochs):
|
||||
self.current_epoch = epoch
|
||||
for images, labels, LMscores in train_data_iterator:
|
||||
self.step(images, labels, LMscores, epoch, args.useSoftLabel)
|
||||
|
||||
if epoch % args.valid_interval == (args.valid_interval - 1):
|
||||
self.Worker_I2L.Valid(test_data_iterator, self.sess)
|
||||
self.L2I_TestNll()
|
||||
|
||||
if epoch % args.save_interval == (args.save_interval - 1):
|
||||
self.saver.save(self.sess, args.save_dir + '/params_' + str(epoch) + 'uidx' + str(self.train_uidx) + '.ckpt')
|
||||
self.Worker_L2I.Gen_Images(self.sess, self.current_epoch)
|
||||
|
||||
def valid_I2L(self):
|
||||
self.Worker_I2L.Valid(test_data_iterator, self.sess)
|
||||
|
||||
def valid_L2I(self):
|
||||
self.L2I_TestNll()
|
||||
'''
|
||||
for alpha_ in range(11):
|
||||
print('alpha=%f' % (alpha_ * 0.1))
|
||||
self.L2I_TestNll(alpha_ * 0.1)
|
||||
'''
|
||||
def valid_ImgGen(self):
|
||||
self.Worker_L2I.Gen_Images(self.sess, self.current_epoch)
|
||||
|
||||
def main(_):
|
||||
#L2Ipath='./pxpp_c_2.95/params_cifar.ckpt'
|
||||
monitor_ = monitor()
|
||||
monitor_.build_saver()
|
||||
if args.mode == 'train':
|
||||
monitor_.train()
|
||||
elif args.mode == 'I2L':
|
||||
monitor_.valid_I2L()
|
||||
elif args.mode == 'L2I':
|
||||
monitor_.valid_L2I()
|
||||
elif args.mode == 'ImgGen':
|
||||
monitor_.valid_ImgGen()
|
||||
else:
|
||||
print('Unsupported mode: ' + args.mode)
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.app.run()
|
|
@ -0,0 +1,85 @@
|
|||
"""
|
||||
The core Pixel-CNN model
|
||||
"""
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.framework.python.ops import arg_scope
|
||||
import pixel_cnn_pp.nn as nn
|
||||
|
||||
def model_spec(x, h=None, init=False, ema=None, dropout_p=0.5, nr_resnet=5, nr_filters=160, nr_logistic_mix=10, resnet_nonlinearity='concat_elu'):
|
||||
"""
|
||||
We receive a Tensor x of shape (N,H,W,D1) (e.g. (12,32,32,3)) and produce
|
||||
a Tensor x_out of shape (N,H,W,D2) (e.g. (12,32,32,100)), where each fiber
|
||||
of the x_out tensor describes the predictive distribution for the RGB at
|
||||
that position.
|
||||
'h' is an optional N x K matrix of values to condition our generative model on
|
||||
"""
|
||||
|
||||
counters = {}
|
||||
with arg_scope([nn.conv2d, nn.deconv2d, nn.gated_resnet, nn.dense], counters=counters, init=init, ema=ema, dropout_p=dropout_p):
|
||||
|
||||
# parse resnet nonlinearity argument
|
||||
if resnet_nonlinearity == 'concat_elu':
|
||||
resnet_nonlinearity = nn.concat_elu
|
||||
elif resnet_nonlinearity == 'elu':
|
||||
resnet_nonlinearity = tf.nn.elu
|
||||
elif resnet_nonlinearity == 'relu':
|
||||
resnet_nonlinearity = tf.nn.relu
|
||||
else:
|
||||
raise ValueError('resnet nonlinearity ' + resnet_nonlinearity + ' is not supported')
|
||||
|
||||
with arg_scope([nn.gated_resnet], nonlinearity=resnet_nonlinearity, h=h):
|
||||
|
||||
# ////////// up pass through pixelCNN ////////
|
||||
xs = nn.int_shape(x)
|
||||
x_pad = tf.concat([x,tf.ones(xs[:-1]+[1])],3) # add channel of ones to distinguish image from padding later on
|
||||
u_list = [nn.down_shift(nn.down_shifted_conv2d(x_pad, num_filters=nr_filters, filter_size=[2, 3]))] # stream for pixels above
|
||||
ul_list = [nn.down_shift(nn.down_shifted_conv2d(x_pad, num_filters=nr_filters, filter_size=[1,3])) + \
|
||||
nn.right_shift(nn.down_right_shifted_conv2d(x_pad, num_filters=nr_filters, filter_size=[2,1]))] # stream for up and to the left
|
||||
|
||||
for rep in range(nr_resnet):
|
||||
u_list.append(nn.gated_resnet(u_list[-1], conv=nn.down_shifted_conv2d))
|
||||
ul_list.append(nn.gated_resnet(ul_list[-1], u_list[-1], conv=nn.down_right_shifted_conv2d))
|
||||
|
||||
u_list.append(nn.down_shifted_conv2d(u_list[-1], num_filters=nr_filters, stride=[2, 2]))
|
||||
ul_list.append(nn.down_right_shifted_conv2d(ul_list[-1], num_filters=nr_filters, stride=[2, 2]))
|
||||
|
||||
for rep in range(nr_resnet):
|
||||
u_list.append(nn.gated_resnet(u_list[-1], conv=nn.down_shifted_conv2d))
|
||||
ul_list.append(nn.gated_resnet(ul_list[-1], u_list[-1], conv=nn.down_right_shifted_conv2d))
|
||||
|
||||
u_list.append(nn.down_shifted_conv2d(u_list[-1], num_filters=nr_filters, stride=[2, 2]))
|
||||
ul_list.append(nn.down_right_shifted_conv2d(ul_list[-1], num_filters=nr_filters, stride=[2, 2]))
|
||||
|
||||
for rep in range(nr_resnet):
|
||||
u_list.append(nn.gated_resnet(u_list[-1], conv=nn.down_shifted_conv2d))
|
||||
ul_list.append(nn.gated_resnet(ul_list[-1], u_list[-1], conv=nn.down_right_shifted_conv2d))
|
||||
|
||||
# /////// down pass ////////
|
||||
u = u_list.pop()
|
||||
ul = ul_list.pop()
|
||||
for rep in range(nr_resnet):
|
||||
u = nn.gated_resnet(u, u_list.pop(), conv=nn.down_shifted_conv2d)
|
||||
ul = nn.gated_resnet(ul, tf.concat([u, ul_list.pop()],3), conv=nn.down_right_shifted_conv2d)
|
||||
|
||||
u = nn.down_shifted_deconv2d(u, num_filters=nr_filters, stride=[2, 2])
|
||||
ul = nn.down_right_shifted_deconv2d(ul, num_filters=nr_filters, stride=[2, 2])
|
||||
|
||||
for rep in range(nr_resnet+1):
|
||||
u = nn.gated_resnet(u, u_list.pop(), conv=nn.down_shifted_conv2d)
|
||||
ul = nn.gated_resnet(ul, tf.concat([u, ul_list.pop()],3), conv=nn.down_right_shifted_conv2d)
|
||||
|
||||
u = nn.down_shifted_deconv2d(u, num_filters=nr_filters, stride=[2, 2])
|
||||
ul = nn.down_right_shifted_deconv2d(ul, num_filters=nr_filters, stride=[2, 2])
|
||||
|
||||
for rep in range(nr_resnet+1):
|
||||
u = nn.gated_resnet(u, u_list.pop(), conv=nn.down_shifted_conv2d)
|
||||
ul = nn.gated_resnet(ul, tf.concat([u, ul_list.pop()],3), conv=nn.down_right_shifted_conv2d)
|
||||
|
||||
x_out = nn.nin(tf.nn.elu(ul),10*nr_logistic_mix)
|
||||
|
||||
assert len(u_list) == 0
|
||||
assert len(ul_list) == 0
|
||||
|
||||
return x_out
|
||||
|
|
@ -0,0 +1,319 @@
|
|||
"""
|
||||
Various tensorflow utilities
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.framework.python.ops import add_arg_scope
|
||||
|
||||
def int_shape(x):
|
||||
return list(map(int, x.get_shape()))
|
||||
|
||||
def concat_elu(x):
|
||||
""" like concatenated ReLU (http://arxiv.org/abs/1603.05201), but then with ELU """
|
||||
axis = len(x.get_shape())-1
|
||||
return tf.nn.elu(tf.concat([x, -x],axis))
|
||||
|
||||
def log_sum_exp(x):
|
||||
""" numerically stable log_sum_exp implementation that prevents overflow """
|
||||
axis = len(x.get_shape())-1
|
||||
m = tf.reduce_max(x, axis)
|
||||
m2 = tf.reduce_max(x, axis, keep_dims=True)
|
||||
return m + tf.log(tf.reduce_sum(tf.exp(x-m2), axis))
|
||||
|
||||
def log_prob_from_logits(x):
|
||||
""" numerically stable log_softmax implementation that prevents overflow """
|
||||
axis = len(x.get_shape())-1
|
||||
m = tf.reduce_max(x, axis, keep_dims=True)
|
||||
return x - m - tf.log(tf.reduce_sum(tf.exp(x-m), axis, keep_dims=True))
|
||||
|
||||
def discretized_mix_logistic_loss(x,l,sum_all=True):
|
||||
""" log-likelihood for mixture of discretized logistics, assumes the data has been rescaled to [-1,1] interval """
|
||||
xs = int_shape(x) # true image (i.e. labels) to regress to, e.g. (B,32,32,3)
|
||||
ls = int_shape(l) # predicted distribution, e.g. (B,32,32,100)
|
||||
nr_mix = int(ls[-1] / 10) # here and below: unpacking the params of the mixture of logistics
|
||||
logit_probs = l[:,:,:,:nr_mix]
|
||||
l = tf.reshape(l[:,:,:,nr_mix:], xs + [nr_mix*3])
|
||||
means = l[:,:,:,:,:nr_mix]
|
||||
log_scales = tf.maximum(l[:,:,:,:,nr_mix:2*nr_mix], -7.)
|
||||
coeffs = tf.nn.tanh(l[:,:,:,:,2*nr_mix:3*nr_mix])
|
||||
x = tf.reshape(x, xs + [1]) + tf.zeros(xs + [nr_mix]) # here and below: getting the means and adjusting them based on preceding sub-pixels
|
||||
m2 = tf.reshape(means[:,:,:,1,:] + coeffs[:, :, :, 0, :] * x[:, :, :, 0, :], [xs[0],xs[1],xs[2],1,nr_mix])
|
||||
m3 = tf.reshape(means[:, :, :, 2, :] + coeffs[:, :, :, 1, :] * x[:, :, :, 0, :] + coeffs[:, :, :, 2, :] * x[:, :, :, 1, :], [xs[0],xs[1],xs[2],1,nr_mix])
|
||||
means = tf.concat([tf.reshape(means[:,:,:,0,:], [xs[0],xs[1],xs[2],1,nr_mix]), m2, m3],3)
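# The G and B means are shifted by learned linear coefficients times the preceding sub-pixel
# values (R for G; R and G for B), giving the autoregressive coupling across channels used by
# PixelCNN++.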
|
||||
centered_x = x - means
|
||||
inv_stdv = tf.exp(-log_scales)
|
||||
plus_in = inv_stdv * (centered_x + 1./255.)
|
||||
cdf_plus = tf.nn.sigmoid(plus_in)
|
||||
min_in = inv_stdv * (centered_x - 1./255.)
|
||||
cdf_min = tf.nn.sigmoid(min_in)
|
||||
log_cdf_plus = plus_in - tf.nn.softplus(plus_in) # log probability for edge case of 0 (before scaling)
|
||||
log_one_minus_cdf_min = -tf.nn.softplus(min_in) # log probability for edge case of 255 (before scaling)
|
||||
cdf_delta = cdf_plus - cdf_min # probability for all other cases
|
||||
mid_in = inv_stdv * centered_x
|
||||
log_pdf_mid = mid_in - log_scales - 2.*tf.nn.softplus(mid_in) # log probability in the center of the bin, to be used in extreme cases (not actually used in our code)
|
||||
|
||||
# now select the right output: left edge case, right edge case, normal case, extremely low prob case (doesn't actually happen for us)
|
||||
|
||||
# this is what we are really doing, but using the robust version below for extreme cases in other applications and to avoid NaN issue with tf.select()
|
||||
# log_probs = tf.select(x < -0.999, log_cdf_plus, tf.select(x > 0.999, log_one_minus_cdf_min, tf.log(cdf_delta)))
|
||||
|
||||
# robust version, that still works if probabilities are below 1e-5 (which never happens in our code)
|
||||
# tensorflow backpropagates through tf.select() by multiplying with zero instead of selecting: this requires us to use some ugly tricks to avoid potential NaNs
|
||||
# the 1e-12 in tf.maximum(cdf_delta, 1e-12) is never actually used as output, it's purely there to get around the tf.select() gradient issue
|
||||
# if the probability on a sub-pixel is below 1e-5, we use an approximation based on the assumption that the log-density is constant in the bin of the observed sub-pixel value
|
||||
log_probs = tf.where(x < -0.999, log_cdf_plus, tf.where(x > 0.999, log_one_minus_cdf_min, tf.where(cdf_delta > 1e-5, tf.log(tf.maximum(cdf_delta, 1e-12)), log_pdf_mid - np.log(127.5))))
|
||||
|
||||
log_probs = tf.reduce_sum(log_probs,3) + log_prob_from_logits(logit_probs)
|
||||
if sum_all:
|
||||
return -tf.reduce_sum(log_sum_exp(log_probs))
|
||||
else:
|
||||
return -tf.reduce_sum(log_sum_exp(log_probs),[1,2])
|
||||
|
||||
def sample_from_discretized_mix_logistic(l,nr_mix):
|
||||
ls = int_shape(l)
|
||||
xs = ls[:-1] + [3]
|
||||
# unpack parameters
|
||||
logit_probs = l[:, :, :, :nr_mix]
|
||||
l = tf.reshape(l[:, :, :, nr_mix:], xs + [nr_mix*3])
|
||||
# sample mixture indicator from softmax
|
||||
sel = tf.one_hot(tf.argmax(logit_probs - tf.log(-tf.log(tf.random_uniform(logit_probs.get_shape(), minval=1e-5, maxval=1. - 1e-5))), 3), depth=nr_mix, dtype=tf.float32)
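# Mixture-component selection via the Gumbel-max trick: argmax(logits + Gumbel noise) draws a
# sample from the categorical distribution defined by logit_probs.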
|
||||
sel = tf.reshape(sel, xs[:-1] + [1,nr_mix])
|
||||
# select logistic parameters
|
||||
means = tf.reduce_sum(l[:,:,:,:,:nr_mix]*sel,4)
|
||||
log_scales = tf.maximum(tf.reduce_sum(l[:,:,:,:,nr_mix:2*nr_mix]*sel,4), -7.)
|
||||
coeffs = tf.reduce_sum(tf.nn.tanh(l[:,:,:,:,2*nr_mix:3*nr_mix])*sel,4)
|
||||
# sample from logistic & clip to interval
|
||||
# we don't actually round to the nearest 8bit value when sampling
|
||||
u = tf.random_uniform(means.get_shape(), minval=1e-5, maxval=1. - 1e-5)
|
||||
x = means + tf.exp(log_scales)*(tf.log(u) - tf.log(1. - u))
|
||||
x0 = tf.minimum(tf.maximum(x[:,:,:,0], -1.), 1.)
|
||||
x1 = tf.minimum(tf.maximum(x[:,:,:,1] + coeffs[:,:,:,0]*x0, -1.), 1.)
|
||||
x2 = tf.minimum(tf.maximum(x[:,:,:,2] + coeffs[:,:,:,1]*x0 + coeffs[:,:,:,2]*x1, -1.), 1.)
|
||||
return tf.concat([tf.reshape(x0,xs[:-1]+[1]), tf.reshape(x1,xs[:-1]+[1]), tf.reshape(x2,xs[:-1]+[1])],3)
|
||||
|
||||
def get_var_maybe_avg(var_name, ema, **kwargs):
|
||||
''' utility for retrieving polyak averaged params '''
|
||||
v = tf.get_variable(var_name, **kwargs)
|
||||
if ema is not None:
|
||||
v = ema.average(v)
|
||||
return v
|
||||
|
||||
def get_vars_maybe_avg(var_names, ema, **kwargs):
|
||||
''' utility for retrieving polyak averaged params '''
|
||||
vars = []
|
||||
for vn in var_names:
|
||||
vars.append(get_var_maybe_avg(vn, ema, **kwargs))
|
||||
return vars
|
||||
|
||||
def adam_updates(params, cost_or_grads, lr=0.001, mom1=0.9, mom2=0.999):
|
||||
''' Adam optimizer '''
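# A hand-rolled Adam: v/v_hat track the (bias-corrected) first moment, mg/mg_hat the
# (bias-corrected) second moment, and t is the shared step counter incremented once per update.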
|
||||
updates = []
|
||||
if type(cost_or_grads) is not list:
|
||||
grads = tf.gradients(cost_or_grads, params)
|
||||
else:
|
||||
grads = cost_or_grads
|
||||
t = tf.Variable(1., 'adam_t')
|
||||
for p, g in zip(params, grads):
|
||||
mg = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_mg')
|
||||
if mom1>0:
|
||||
v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
|
||||
v_t = mom1*v + (1. - mom1)*g
|
||||
v_hat = v_t / (1. - tf.pow(mom1,t))
|
||||
updates.append(v.assign(v_t))
|
||||
else:
|
||||
v_hat = g
|
||||
mg_t = mom2*mg + (1. - mom2)*tf.square(g)
|
||||
mg_hat = mg_t / (1. - tf.pow(mom2,t))
|
||||
g_t = v_hat / tf.sqrt(mg_hat + 1e-8)
|
||||
p_t = p - lr * g_t
|
||||
updates.append(mg.assign(mg_t))
|
||||
updates.append(p.assign(p_t))
|
||||
updates.append(t.assign_add(1))
|
||||
return tf.group(*updates)
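# Minimal usage sketch (hypothetical `loss`; not part of this file): the helper accepts either
# a scalar cost, in which case gradients are taken internally, or a precomputed list of
# gradients, and returns one grouped op that applies the bias-corrected Adam step and
# advances the step counter t.
#   params = tf.trainable_variables()
#   train_op = adam_updates(params, loss, lr=3e-4)
#   sess.run(train_op, feed_dict)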
|
||||
|
||||
def get_name(layer_name, counters):
|
||||
''' utility for keeping track of layer names '''
|
||||
if not layer_name in counters:
|
||||
counters[layer_name] = 0
|
||||
name = layer_name + '_' + str(counters[layer_name])
|
||||
counters[layer_name] += 1
|
||||
return name
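# Example: with a shared `counters` dict the same layer type gets a fresh scope name each call,
#   counters = {}
#   get_name('dense', counters)   # -> 'dense_0'
#   get_name('dense', counters)   # -> 'dense_1'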
|
||||
|
||||
@add_arg_scope
|
||||
def dense(x, num_units, nonlinearity=None, init_scale=1., counters={}, init=False, ema=None, **kwargs):
|
||||
''' fully connected layer '''
|
||||
name = get_name('dense', counters)
|
||||
with tf.variable_scope(name):
|
||||
if init:
|
||||
# data based initialization of parameters
|
||||
V = tf.get_variable('V', [int(x.get_shape()[1]),num_units], tf.float32, tf.random_normal_initializer(0, 0.05), trainable=True)
|
||||
V_norm = tf.nn.l2_normalize(V.initialized_value(), [0])
|
||||
x_init = tf.matmul(x, V_norm)
|
||||
m_init, v_init = tf.nn.moments(x_init, [0])
|
||||
scale_init = init_scale/tf.sqrt(v_init + 1e-10)
|
||||
g = tf.get_variable('g', dtype=tf.float32, initializer=scale_init, trainable=True)
|
||||
b = tf.get_variable('b', dtype=tf.float32, initializer=-m_init*scale_init, trainable=True)
|
||||
x_init = tf.reshape(scale_init,[1,num_units])*(x_init-tf.reshape(m_init,[1,num_units]))
|
||||
if nonlinearity is not None:
|
||||
x_init = nonlinearity(x_init)
|
||||
return x_init
|
||||
|
||||
else:
|
||||
V,g,b = get_vars_maybe_avg(['V','g','b'], ema)
|
||||
# According to the comments at
|
||||
# https://github.com/openai/pixel-cnn/issues/17,
|
||||
# I simply commented out the following line
|
||||
# tf.assert_variables_initialized([V,g,b])
|
||||
|
||||
# use weight normalization (Salimans & Kingma, 2016)
|
||||
x = tf.matmul(x, V)
|
||||
scaler = g/tf.sqrt(tf.reduce_sum(tf.square(V),[0]))
|
||||
x = tf.reshape(scaler,[1,num_units])*x + tf.reshape(b,[1,num_units])
|
||||
|
||||
# apply nonlinearity
|
||||
if nonlinearity is not None:
|
||||
x = nonlinearity(x)
|
||||
return x
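# The weight-normalization reparameterization used here (and in conv2d/deconv2d below),
# written out as a sketch of the math rather than extra functionality:
#   W = g * V / ||V||_2   (norm over every axis except the output-channel axis)
#   y = x @ W + b
# so V carries only the direction of each unit's weight vector while the scalar g carries
# its scale; the data-dependent init above sets g = init_scale/std and b = -mean*g from the
# first batch so that the initial pre-activations are roughly zero-mean, unit-variance.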
|
||||
|
||||
@add_arg_scope
|
||||
def conv2d(x, num_filters, filter_size=[3,3], stride=[1,1], pad='SAME', nonlinearity=None, init_scale=1., counters={}, init=False, ema=None, **kwargs):
|
||||
''' convolutional layer '''
|
||||
name = get_name('conv2d', counters)
|
||||
with tf.variable_scope(name):
|
||||
if init:
|
||||
# data based initialization of parameters
|
||||
V = tf.get_variable('V', filter_size+[int(x.get_shape()[-1]),num_filters], tf.float32, tf.random_normal_initializer(0, 0.05), trainable=True)
|
||||
V_norm = tf.nn.l2_normalize(V.initialized_value(), [0,1,2])
|
||||
x_init = tf.nn.conv2d(x, V_norm, [1]+stride+[1], pad)
|
||||
m_init, v_init = tf.nn.moments(x_init, [0,1,2])
|
||||
scale_init = init_scale/tf.sqrt(v_init + 1e-8)
|
||||
g = tf.get_variable('g', dtype=tf.float32, initializer=scale_init, trainable=True)
|
||||
b = tf.get_variable('b', dtype=tf.float32, initializer=-m_init*scale_init, trainable=True)
|
||||
x_init = tf.reshape(scale_init,[1,1,1,num_filters])*(x_init-tf.reshape(m_init,[1,1,1,num_filters]))
|
||||
if nonlinearity is not None:
|
||||
x_init = nonlinearity(x_init)
|
||||
return x_init
|
||||
|
||||
else:
|
||||
V, g, b = get_vars_maybe_avg(['V', 'g', 'b'], ema)
|
||||
# tf.assert_variables_initialized([V,g,b])
|
||||
|
||||
# use weight normalization (Salimans & Kingma, 2016)
|
||||
W = tf.reshape(g,[1,1,1,num_filters])*tf.nn.l2_normalize(V,[0,1,2])
|
||||
|
||||
# calculate convolutional layer output
|
||||
x = tf.nn.bias_add(tf.nn.conv2d(x, W, [1]+stride+[1], pad), b)
|
||||
|
||||
# apply nonlinearity
|
||||
if nonlinearity is not None:
|
||||
x = nonlinearity(x)
|
||||
return x
|
||||
|
||||
@add_arg_scope
|
||||
def deconv2d(x, num_filters, filter_size=[3,3], stride=[1,1], pad='SAME', nonlinearity=None, init_scale=1., counters={}, init=False, ema=None, **kwargs):
|
||||
''' transposed convolutional layer '''
|
||||
name = get_name('deconv2d', counters)
|
||||
xs = int_shape(x)
|
||||
if pad=='SAME':
|
||||
target_shape = [xs[0], xs[1]*stride[0], xs[2]*stride[1], num_filters]
|
||||
else:
|
||||
target_shape = [xs[0], xs[1]*stride[0] + filter_size[0]-1, xs[2]*stride[1] + filter_size[1]-1, num_filters]
|
||||
with tf.variable_scope(name):
|
||||
if init:
|
||||
# data based initialization of parameters
|
||||
V = tf.get_variable('V', filter_size+[num_filters,int(x.get_shape()[-1])], tf.float32, tf.random_normal_initializer(0, 0.05), trainable=True)
|
||||
V_norm = tf.nn.l2_normalize(V.initialized_value(), [0,1,3])
|
||||
x_init = tf.nn.conv2d_transpose(x, V_norm, target_shape, [1]+stride+[1], padding=pad)
|
||||
m_init, v_init = tf.nn.moments(x_init, [0,1,2])
|
||||
scale_init = init_scale/tf.sqrt(v_init + 1e-8)
|
||||
g = tf.get_variable('g', dtype=tf.float32, initializer=scale_init, trainable=True)
|
||||
b = tf.get_variable('b', dtype=tf.float32, initializer=-m_init*scale_init, trainable=True)
|
||||
x_init = tf.reshape(scale_init,[1,1,1,num_filters])*(x_init-tf.reshape(m_init,[1,1,1,num_filters]))
|
||||
if nonlinearity is not None:
|
||||
x_init = nonlinearity(x_init)
|
||||
return x_init
|
||||
|
||||
else:
|
||||
V, g, b = get_vars_maybe_avg(['V', 'g', 'b'], ema)
|
||||
# tf.assert_variables_initialized([V,g,b])
|
||||
|
||||
# use weight normalization (Salimans & Kingma, 2016)
|
||||
W = tf.reshape(g,[1,1,num_filters,1])*tf.nn.l2_normalize(V,[0,1,3])
|
||||
|
||||
# calculate convolutional layer output
|
||||
x = tf.nn.conv2d_transpose(x, W, target_shape, [1]+stride+[1], padding=pad)
|
||||
x = tf.nn.bias_add(x, b)
|
||||
|
||||
# apply nonlinearity
|
||||
if nonlinearity is not None:
|
||||
x = nonlinearity(x)
|
||||
return x
|
||||
|
||||
@add_arg_scope
|
||||
def nin(x, num_units, **kwargs):
|
||||
""" a network in network layer (1x1 CONV) """
|
||||
s = int_shape(x)
|
||||
x = tf.reshape(x, [np.prod(s[:-1]),s[-1]])
|
||||
x = dense(x, num_units, **kwargs)
|
||||
return tf.reshape(x, s[:-1]+[num_units])
|
||||
|
||||
''' meta-layer consisting of multiple base layers '''
|
||||
|
||||
@add_arg_scope
|
||||
def gated_resnet(x, a=None, h=None, nonlinearity=concat_elu, conv=conv2d, init=False, counters={}, ema=None, dropout_p=0., **kwargs):
|
||||
xs = int_shape(x)
|
||||
num_filters = xs[-1]
|
||||
|
||||
c1 = conv(nonlinearity(x), num_filters)
|
||||
if a is not None: # add short-cut connection if auxiliary input 'a' is given
|
||||
c1 += nin(nonlinearity(a), num_filters)
|
||||
c1 = nonlinearity(c1)
|
||||
if dropout_p > 0:
|
||||
c1 = tf.nn.dropout(c1, keep_prob=1. - dropout_p)
|
||||
c2 = conv(c1, num_filters * 2, init_scale=0.1)
|
||||
|
||||
# add projection of h vector if included: conditional generation
|
||||
if h is not None:
|
||||
with tf.variable_scope(get_name('conditional_weights', counters)):
|
||||
hw = get_var_maybe_avg('hw', ema, shape=[int_shape(h)[-1], 2 * num_filters], dtype=tf.float32,
|
||||
initializer=tf.random_normal_initializer(0, 0.05), trainable=True)
|
||||
if init:
|
||||
hw = hw.initialized_value()
|
||||
c2 += tf.reshape(tf.matmul(h, hw), [xs[0], 1, 1, 2 * num_filters])
|
||||
|
||||
a, b = tf.split(c2, 2, 3)
|
||||
c3 = a * tf.nn.sigmoid(b)
|
||||
return x + c3
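# In short, this is a gated residual unit: the second conv produces 2*num_filters channels,
# split into (a, b) and combined as a * sigmoid(b) (a learned per-channel gate), optionally
# shifted by a projection of the conditioning vector h, and added back onto the input x, so
# spatial size and channel count are unchanged.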
|
||||
|
||||
''' utilities for shifting the image around, efficient alternative to masking convolutions '''
|
||||
|
||||
def down_shift(x):
|
||||
xs = int_shape(x)
|
||||
return tf.concat([tf.zeros([xs[0],1,xs[2],xs[3]]), x[:,:xs[1]-1,:,:]],1)
|
||||
|
||||
def right_shift(x):
|
||||
xs = int_shape(x)
|
||||
return tf.concat([tf.zeros([xs[0],xs[1],1,xs[3]]), x[:,:,:xs[2]-1,:]],2)
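# Tiny worked example of the shifts (batch and channel axes omitted): down_shift moves every
# row one step down and zero-fills the top row, so output row i depends only on input rows < i;
# right_shift does the same along the width axis.
#   [[1, 2],      down_shift      [[0, 0],
#    [3, 4]]     ----------->      [1, 2]]
# Combined with the down_shifted / down_right_shifted convolutions below, this keeps each
# output pixel's receptive field strictly above / to the left of the pixel itself.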
|
||||
|
||||
@add_arg_scope
|
||||
def down_shifted_conv2d(x, num_filters, filter_size=[2,3], stride=[1,1], **kwargs):
|
||||
x = tf.pad(x, [[0,0],[filter_size[0]-1,0], [int((filter_size[1]-1)/2),int((filter_size[1]-1)/2)],[0,0]])
|
||||
return conv2d(x, num_filters, filter_size=filter_size, pad='VALID', stride=stride, **kwargs)
|
||||
|
||||
@add_arg_scope
|
||||
def down_shifted_deconv2d(x, num_filters, filter_size=[2,3], stride=[1,1], **kwargs):
|
||||
x = deconv2d(x, num_filters, filter_size=filter_size, pad='VALID', stride=stride, **kwargs)
|
||||
xs = int_shape(x)
|
||||
return x[:,:(xs[1]-filter_size[0]+1),int((filter_size[1]-1)/2):(xs[2]-int((filter_size[1]-1)/2)),:]
|
||||
|
||||
@add_arg_scope
|
||||
def down_right_shifted_conv2d(x, num_filters, filter_size=[2,2], stride=[1,1], **kwargs):
|
||||
x = tf.pad(x, [[0,0],[filter_size[0]-1, 0], [filter_size[1]-1, 0],[0,0]])
|
||||
return conv2d(x, num_filters, filter_size=filter_size, pad='VALID', stride=stride, **kwargs)
|
||||
|
||||
@add_arg_scope
|
||||
def down_right_shifted_deconv2d(x, num_filters, filter_size=[2,2], stride=[1,1], **kwargs):
|
||||
x = deconv2d(x, num_filters, filter_size=filter_size, pad='VALID', stride=stride, **kwargs)
|
||||
xs = int_shape(x)
|
||||
return x[:,:(xs[1]-filter_size[0]+1):,:(xs[2]-filter_size[1]+1),:]
|
|
@ -0,0 +1,194 @@
|
|||
import numpy as np
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
# Plot image examples.
|
||||
def plot_img(img, title=None):
|
||||
plt.figure()
|
||||
plt.imshow(img, interpolation='nearest')
|
||||
if title is not None:
|
||||
plt.title(title)
|
||||
plt.axis('off')
|
||||
plt.tight_layout()
|
||||
|
||||
def img_stretch(img):
|
||||
img = img.astype(float)
|
||||
img -= np.min(img)
|
||||
img /= np.max(img)+1e-12
|
||||
return img
|
||||
|
||||
def img_tile(imgs, aspect_ratio=1.0, tile_shape=None, border=1,
|
||||
border_color=0, stretch=False):
|
||||
''' Tile images in a grid.
|
||||
If tile_shape is provided only as many images as specified in tile_shape
|
||||
will be included in the output.
|
||||
'''
|
||||
|
||||
# Prepare images
|
||||
if stretch:
|
||||
imgs = img_stretch(imgs)
|
||||
imgs = np.array(imgs)
|
||||
if imgs.ndim != 3 and imgs.ndim != 4:
|
||||
raise ValueError('imgs has wrong number of dimensions.')
|
||||
n_imgs = imgs.shape[0]
|
||||
|
||||
# Grid shape
|
||||
img_shape = np.array(imgs.shape[1:3])
|
||||
if tile_shape is None:
|
||||
img_aspect_ratio = img_shape[1] / float(img_shape[0])
|
||||
aspect_ratio *= img_aspect_ratio
|
||||
tile_height = int(np.ceil(np.sqrt(n_imgs * aspect_ratio)))
|
||||
tile_width = int(np.ceil(np.sqrt(n_imgs / aspect_ratio)))
|
||||
grid_shape = np.array((tile_height, tile_width))
|
||||
else:
|
||||
assert len(tile_shape) == 2
|
||||
grid_shape = np.array(tile_shape)
|
||||
|
||||
# Tile image shape
|
||||
tile_img_shape = np.array(imgs.shape[1:])
|
||||
tile_img_shape[:2] = (img_shape[:2] + border) * grid_shape[:2] - border
|
||||
|
||||
# Assemble tile image
|
||||
tile_img = np.empty(tile_img_shape)
|
||||
tile_img[:] = border_color
|
||||
for i in range(grid_shape[0]):
|
||||
for j in range(grid_shape[1]):
|
||||
img_idx = j + i*grid_shape[1]
|
||||
if img_idx >= n_imgs:
|
||||
# No more images - stop filling out the grid.
|
||||
break
|
||||
img = imgs[img_idx]
|
||||
yoff = (img_shape[0] + border) * i
|
||||
xoff = (img_shape[1] + border) * j
|
||||
tile_img[yoff:yoff+img_shape[0], xoff:xoff+img_shape[1], ...] = img
|
||||
|
||||
return tile_img
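# Minimal usage sketch (hypothetical `samples` array):
#   samples = np.random.rand(16, 32, 32, 3)              # 16 RGB images with values in [0, 1]
#   tiled = img_tile(samples, border=1, stretch=True)    # one (131, 131, 3) array, a 4x4 grid
#   plot_img(tiled, title='samples')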
|
||||
|
||||
def conv_filter_tile(filters):
|
||||
n_filters, n_channels, height, width = filters.shape
|
||||
tile_shape = None
|
||||
if n_channels == 3:
|
||||
# Interpret 3 color channels as RGB
|
||||
filters = np.transpose(filters, (0, 2, 3, 1))
|
||||
else:
|
||||
# Organize tile such that each row corresponds to a filter and the
|
||||
# columns are the filter channels
|
||||
tile_shape = (n_channels, n_filters)
|
||||
filters = np.transpose(filters, (1, 0, 2, 3))
|
||||
filters = np.resize(filters, (n_filters*n_channels, height, width))
|
||||
filters = img_stretch(filters)
|
||||
return img_tile(filters, tile_shape=tile_shape)
|
||||
|
||||
def scale_to_unit_interval(ndar, eps=1e-8):
|
||||
""" Scales all values in the ndarray ndar to be between 0 and 1 """
|
||||
ndar = ndar.copy()
|
||||
ndar -= ndar.min()
|
||||
ndar *= 1.0 / (ndar.max() + eps)
|
||||
return ndar
|
||||
|
||||
|
||||
def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
|
||||
scale_rows_to_unit_interval=True,
|
||||
output_pixel_vals=True):
|
||||
"""
|
||||
Transform an array with one flattened image per row, into an array in
|
||||
which images are reshaped and laid out like tiles on a floor.
|
||||
|
||||
This function is useful for visualizing datasets whose rows are images,
|
||||
and also columns of matrices for transforming those rows
|
||||
(such as the first layer of a neural net).
|
||||
|
||||
:type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
|
||||
be 2-D ndarrays or None;
|
||||
:param X: a 2-D array in which every row is a flattened image.
|
||||
|
||||
:type img_shape: tuple; (height, width)
|
||||
:param img_shape: the original shape of each image
|
||||
|
||||
:type tile_shape: tuple; (rows, cols)
|
||||
:param tile_shape: the number of images to tile (rows, cols)
|
||||
|
||||
:param output_pixel_vals: if output should be pixel values (i.e. uint8
|
||||
values) or floats
|
||||
|
||||
:param scale_rows_to_unit_interval: if the values need to be scaled before
|
||||
being plotted to [0,1] or not
|
||||
|
||||
|
||||
:returns: array suitable for viewing as an image.
|
||||
(See:`PIL.Image.fromarray`.)
|
||||
:rtype: a 2-d array with same dtype as X.
|
||||
|
||||
"""
|
||||
|
||||
assert len(img_shape) == 2
|
||||
assert len(tile_shape) == 2
|
||||
assert len(tile_spacing) == 2
|
||||
|
||||
# The expression below can be re-written in a more C style as
|
||||
# follows :
|
||||
#
|
||||
# out_shape = [0,0]
|
||||
# out_shape[0] = (img_shape[0] + tile_spacing[0]) * tile_shape[0] -
|
||||
# tile_spacing[0]
|
||||
# out_shape[1] = (img_shape[1] + tile_spacing[1]) * tile_shape[1] -
|
||||
# tile_spacing[1]
|
||||
out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
|
||||
in zip(img_shape, tile_shape, tile_spacing)]
|
||||
|
||||
if isinstance(X, tuple):
|
||||
assert len(X) == 4
|
||||
# Create an output numpy ndarray to store the image
|
||||
if output_pixel_vals:
|
||||
out_array = np.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
|
||||
else:
|
||||
out_array = np.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)
|
||||
|
||||
#colors default to 0, alpha defaults to 1 (opaque)
|
||||
if output_pixel_vals:
|
||||
channel_defaults = [0, 0, 0, 255]
|
||||
else:
|
||||
channel_defaults = [0., 0., 0., 1.]
|
||||
|
||||
for i in range(4):
|
||||
if X[i] is None:
|
||||
# if channel is None, fill it with zeros of the correct
|
||||
# dtype
|
||||
out_array[:, :, i] = np.zeros(out_shape,
|
||||
dtype='uint8' if output_pixel_vals else out_array.dtype
|
||||
) + channel_defaults[i]
|
||||
else:
|
||||
# use a recurrent call to compute the channel and store it
|
||||
# in the output
|
||||
out_array[:, :, i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals)
|
||||
return out_array
|
||||
|
||||
else:
|
||||
# if we are dealing with only one channel
|
||||
H, W = img_shape
|
||||
Hs, Ws = tile_spacing
|
||||
|
||||
# generate a matrix to store the output
|
||||
out_array = np.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)
|
||||
|
||||
|
||||
for tile_row in range(tile_shape[0]):
|
||||
for tile_col in range(tile_shape[1]):
|
||||
if tile_row * tile_shape[1] + tile_col < X.shape[0]:
|
||||
if scale_rows_to_unit_interval:
|
||||
# if we should scale values to be between 0 and 1
|
||||
# do this by calling the `scale_to_unit_interval`
|
||||
# function
|
||||
this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape))
|
||||
else:
|
||||
this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
|
||||
# add the slice to the corresponding position in the
|
||||
# output array
|
||||
out_array[
|
||||
tile_row * (H+Hs): tile_row * (H + Hs) + H,
|
||||
tile_col * (W+Ws): tile_col * (W + Ws) + W
|
||||
] \
|
||||
= this_img * (255 if output_pixel_vals else 1)
|
||||
return out_array
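# Minimal usage sketch (hypothetical weight matrix `W`): each row of X must be one flattened
# image, e.g. the first-layer weights of a net with 100 hidden units on 28x28 inputs,
# transposed so that every row is one filter:
#   W = np.random.randn(784, 100)
#   img = tile_raster_images(W.T, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))
#   # img is a uint8 array ready for PIL.Image.fromarray(img)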
|
||||
|
|
@ -0,0 +1,436 @@
|
|||
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
"""ResNet model.
|
||||
|
||||
Related papers:
|
||||
https://arxiv.org/pdf/1603.05027v2.pdf
|
||||
https://arxiv.org/pdf/1512.03385v1.pdf
|
||||
https://arxiv.org/pdf/1605.07146v1.pdf
|
||||
"""
|
||||
from collections import namedtuple
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
from tensorflow.python.training import moving_averages
|
||||
|
||||
|
||||
HParams = namedtuple('HParams',
|
||||
'batch_size, num_classes, min_lrn_rate, lrn_rate, '
|
||||
'num_residual_units, use_bottleneck, weight_decay_rate, '
|
||||
'relu_leakiness, optimizer')
|
||||
|
||||
|
||||
class ResNet(object):
|
||||
"""ResNet model."""
|
||||
|
||||
def __init__(self, hps, mode, image_size=32, use_wide_resnet=False, nr_gpu=1):
|
||||
self.hps = hps
|
||||
self.batch_size = self.hps.batch_size
|
||||
self.input_image = [tf.placeholder(tf.float32, shape=(self.batch_size,image_size,image_size,3)) for _ in range(nr_gpu)]
|
||||
self.input_label = [tf.placeholder(tf.int32, shape=(self.batch_size,1)) for _ in range(nr_gpu)]
|
||||
self.mode = mode
|
||||
self.needImgAug = tf.placeholder(tf.bool, shape=())
|
||||
self.image_size = image_size
|
||||
self.nr_gpu = nr_gpu
|
||||
|
||||
self._extra_train_ops = []
|
||||
self.lrn_rate = tf.placeholder(tf.float32, shape=())
|
||||
self.use_wide_resnet = use_wide_resnet
|
||||
|
||||
def build_graph(self):
|
||||
"""Build a whole graph for the model."""
|
||||
with tf.variable_scope('I2L'):
|
||||
self.global_step = tf.contrib.framework.get_or_create_global_step()
|
||||
self._build_model()
|
||||
self.trainable_variables = [v for v in tf.trainable_variables() if v.name.startswith('I2L/')]
|
||||
self.all_variables = [v for v in tf.global_variables() if v.name.startswith('I2L/')]
|
||||
#if self.mode == 'train':
|
||||
# self._build_train_op()
|
||||
|
||||
def _stride_arr(self, stride):
|
||||
"""Map a stride scalar to the stride array for tf.nn.conv2d."""
|
||||
return [1, stride, stride, 1]
|
||||
|
||||
def _PreprocessImages(self):
|
||||
def _aug_one_img(img):
|
||||
img = tf.image.resize_image_with_crop_or_pad(img, self.image_size+4, self.image_size+4)
|
||||
img = tf.random_crop(img, [self.image_size, self.image_size, 3])
|
||||
img = tf.image.random_flip_left_right(img)
|
||||
return img
|
||||
def _deal_one_img(img):
|
||||
img = tf.cond(self.needImgAug, lambda: _aug_one_img(img), lambda: img)
|
||||
img = tf.image.per_image_standardization(img)
|
||||
return img
|
||||
#images = tf.map_fn(lambda img: _deal_one_img(img), self.input_image)
|
||||
#self.image = images
|
||||
self.image = [tf.map_fn(lambda img: _deal_one_img(img), X) for X in self.input_image]
|
||||
|
||||
def _make_1hot_labels(self):
|
||||
self.labels = []
|
||||
for L in self.input_label:
|
||||
labels = tf.reshape(L, [self.batch_size, 1])
|
||||
indices = tf.reshape(tf.range(0, self.batch_size, 1), [self.batch_size, 1])
|
||||
labels = tf.sparse_to_dense(
|
||||
tf.concat([indices, labels],1),
|
||||
[self.batch_size, self.hps.num_classes], 1.0, 0.0)
|
||||
self.labels.append(labels)
|
||||
|
||||
def _build_basic_structure(self, x, y):
|
||||
with tf.variable_scope('init'):
|
||||
x = self._conv('init_conv', x, 3, 3, 16, self._stride_arr(1))
|
||||
|
||||
strides = [1, 2, 2]
|
||||
activate_before_residual = [True, False, False]
|
||||
if self.hps.use_bottleneck:
|
||||
res_func = self._bottleneck_residual
|
||||
filters = [16, 64, 128, 256]
|
||||
else:
|
||||
res_func = self._residual
|
||||
if self.use_wide_resnet:
|
||||
filters = [16, 160, 320, 640]
|
||||
else:
|
||||
filters = [16, 16, 32, 64]
|
||||
# Uncomment the following code to use the w28-10 wide residual network.
|
||||
# It is more memory efficient than a very deep residual network and has
|
||||
# comparably good performance.
|
||||
# https://arxiv.org/pdf/1605.07146v1.pdf
|
||||
# filters = [16, 160, 320, 640]
|
||||
# Update hps.num_residual_units to 9
|
||||
|
||||
with tf.variable_scope('unit_1_0'):
|
||||
x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]),
|
||||
activate_before_residual[0])
|
||||
for i in range(1, self.hps.num_residual_units):
|
||||
with tf.variable_scope('unit_1_%d' % i):
|
||||
x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)
|
||||
|
||||
with tf.variable_scope('unit_2_0'):
|
||||
x = res_func(x, filters[1], filters[2], self._stride_arr(strides[1]),
|
||||
activate_before_residual[1])
|
||||
for i in range(1, self.hps.num_residual_units):
|
||||
with tf.variable_scope('unit_2_%d' % i):
|
||||
x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)
|
||||
|
||||
with tf.variable_scope('unit_3_0'):
|
||||
x = res_func(x, filters[2], filters[3], self._stride_arr(strides[2]),
|
||||
activate_before_residual[2])
|
||||
for i in range(1, self.hps.num_residual_units):
|
||||
with tf.variable_scope('unit_3_%d' % i):
|
||||
x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)
|
||||
|
||||
with tf.variable_scope('unit_last'):
|
||||
x = self._batch_norm('final_bn', x)
|
||||
x = self._relu(x, self.hps.relu_leakiness)
|
||||
x = self._global_avg_pool(x)
|
||||
|
||||
with tf.variable_scope('logit'):
|
||||
logits = self._fully_connected(x, self.hps.num_classes)
|
||||
predictions_ = tf.nn.softmax(logits)
|
||||
|
||||
with tf.variable_scope('costs'):
|
||||
xent = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits)
|
||||
nlls_ = xent
|
||||
cost_ = tf.reduce_mean(xent, name='xent')
|
||||
cost_ += self._decay()
|
||||
|
||||
return nlls_, cost_, predictions_
|
||||
|
||||
#tf.scalar_summary('cost', self.cost)
|
||||
|
||||
def _build_model(self):
|
||||
"""Build the core model within the graph."""
|
||||
# Preprocess
|
||||
self._PreprocessImages()
|
||||
self._make_1hot_labels()
|
||||
self.nlls = [None for _ in range(self.nr_gpu)]
|
||||
self.cost = [None for _ in range(self.nr_gpu)]
|
||||
self.predictions = [None for _ in range(self.nr_gpu)]
|
||||
|
||||
for i in range(self.nr_gpu):
|
||||
with tf.variable_scope('I2L', reuse=True if i >= 1 else None):
|
||||
with tf.device('/gpu:%d' % i):
|
||||
nll_, cost_, predicted_ = self._build_basic_structure(self.image[i], self.labels[i])
|
||||
self.nlls[i] = nll_
|
||||
self.cost[i] = cost_
|
||||
self.predictions[i] = predicted_
|
||||
|
||||
'''
|
||||
def _build_model(self):
|
||||
"""Build the core model within the graph."""
|
||||
# Preprocess
|
||||
self._PreprocessImages()
|
||||
self._make_1hot_labels()
|
||||
|
||||
with tf.variable_scope('init'):
|
||||
x = self.image
|
||||
x = self._conv('init_conv', x, 3, 3, 16, self._stride_arr(1))
|
||||
|
||||
strides = [1, 2, 2]
|
||||
activate_before_residual = [True, False, False]
|
||||
if self.hps.use_bottleneck:
|
||||
res_func = self._bottleneck_residual
|
||||
filters = [16, 64, 128, 256]
|
||||
else:
|
||||
res_func = self._residual
|
||||
if self.use_wide_resnet:
|
||||
filters = [16, 160, 320, 640]
|
||||
else:
|
||||
filters = [16, 16, 32, 64]
|
||||
# Uncomment the following codes to use w28-10 wide residual network.
|
||||
# It is more memory efficient than very deep residual network and has
|
||||
# comparably good performance.
|
||||
# https://arxiv.org/pdf/1605.07146v1.pdf
|
||||
# filters = [16, 160, 320, 640]
|
||||
# Update hps.num_residual_units to 9
|
||||
|
||||
with tf.variable_scope('unit_1_0'):
|
||||
x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]),
|
||||
activate_before_residual[0])
|
||||
for i in range(1, self.hps.num_residual_units):
|
||||
with tf.variable_scope('unit_1_%d' % i):
|
||||
x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)
|
||||
|
||||
with tf.variable_scope('unit_2_0'):
|
||||
x = res_func(x, filters[1], filters[2], self._stride_arr(strides[1]),
|
||||
activate_before_residual[1])
|
||||
for i in range(1, self.hps.num_residual_units):
|
||||
with tf.variable_scope('unit_2_%d' % i):
|
||||
x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)
|
||||
|
||||
with tf.variable_scope('unit_3_0'):
|
||||
x = res_func(x, filters[2], filters[3], self._stride_arr(strides[2]),
|
||||
activate_before_residual[2])
|
||||
for i in range(1, self.hps.num_residual_units):
|
||||
with tf.variable_scope('unit_3_%d' % i):
|
||||
x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)
|
||||
|
||||
with tf.variable_scope('unit_last'):
|
||||
x = self._batch_norm('final_bn', x)
|
||||
x = self._relu(x, self.hps.relu_leakiness)
|
||||
x = self._global_avg_pool(x)
|
||||
|
||||
with tf.variable_scope('logit'):
|
||||
logits = self._fully_connected(x, self.hps.num_classes)
|
||||
self.predictions = tf.nn.softmax(logits)
|
||||
|
||||
with tf.variable_scope('costs'):
|
||||
xent = tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=logits)
|
||||
self.nlls = xent
|
||||
self.cost = tf.reduce_mean(xent, name='xent')
|
||||
self.cost += self._decay()
|
||||
|
||||
#tf.scalar_summary('cost', self.cost)
|
||||
'''
|
||||
|
||||
def _build_train_op(self):
|
||||
"""Build training specific ops for the graph."""
|
||||
self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
|
||||
#tf.scalar_summary('learning rate', self.lrn_rate)
|
||||
|
||||
trainable_variables = tf.trainable_variables()
|
||||
#self.trainable_variables = [v for v in tf.trainable_variables() if v.name.startswith('LM/')]
|
||||
|
||||
grads = tf.gradients(self.cost, trainable_variables)
|
||||
|
||||
if self.hps.optimizer == 'sgd':
|
||||
optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
|
||||
elif self.hps.optimizer == 'mom':
|
||||
optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)
|
||||
|
||||
apply_op = optimizer.apply_gradients(
|
||||
zip(grads, trainable_variables),
|
||||
global_step=self.global_step, name='train_step')
|
||||
|
||||
train_ops = [apply_op] + self._extra_train_ops
|
||||
self.train_op = tf.group(*train_ops)
|
||||
|
||||
|
||||
def Update(self, grads):
|
||||
"""Build training specific ops for the graph."""
|
||||
if self.hps.optimizer == 'sgd':
|
||||
optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
|
||||
elif self.hps.optimizer == 'mom':
|
||||
optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)
|
||||
|
||||
apply_op = optimizer.apply_gradients(
|
||||
zip(grads, self.trainable_variables),
|
||||
global_step=self.global_step, name='train_step')
|
||||
|
||||
train_ops = [apply_op] + self._extra_train_ops
|
||||
self.update_ops = tf.group(*train_ops)
|
||||
|
||||
# TODO(xpan): Consider batch_norm in contrib/layers/python/layers/layers.py
|
||||
def _batch_norm(self, name, x):
|
||||
"""Batch normalization."""
|
||||
with tf.variable_scope(name):
|
||||
params_shape = [x.get_shape()[-1]]
|
||||
|
||||
beta = tf.get_variable(
|
||||
'beta', params_shape, tf.float32,
|
||||
initializer=tf.constant_initializer(0.0, tf.float32))
|
||||
gamma = tf.get_variable(
|
||||
'gamma', params_shape, tf.float32,
|
||||
initializer=tf.constant_initializer(1.0, tf.float32))
|
||||
|
||||
if self.mode == 'train':
|
||||
mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')
|
||||
|
||||
moving_mean = tf.get_variable(
|
||||
'moving_mean', params_shape, tf.float32,
|
||||
initializer=tf.constant_initializer(0.0, tf.float32),
|
||||
trainable=False)
|
||||
moving_variance = tf.get_variable(
|
||||
'moving_variance', params_shape, tf.float32,
|
||||
initializer=tf.constant_initializer(1.0, tf.float32),
|
||||
trainable=False)
|
||||
|
||||
self._extra_train_ops.append(moving_averages.assign_moving_average(
|
||||
moving_mean, mean, 0.9))
|
||||
self._extra_train_ops.append(moving_averages.assign_moving_average(
|
||||
moving_variance, variance, 0.9))
|
||||
else:
|
||||
mean = tf.get_variable(
|
||||
'moving_mean', params_shape, tf.float32,
|
||||
initializer=tf.constant_initializer(0.0, tf.float32),
|
||||
trainable=False)
|
||||
variance = tf.get_variable(
|
||||
'moving_variance', params_shape, tf.float32,
|
||||
initializer=tf.constant_initializer(1.0, tf.float32),
|
||||
trainable=False)
|
||||
#tf.histogram_summary(mean.op.name, mean)
|
||||
#tf.histogram_summary(variance.op.name, variance)
|
||||
# epsilon used to be 1e-5. Maybe 0.001 solves the NaN problem in deeper nets.
|
||||
y = tf.nn.batch_normalization(
|
||||
x, mean, variance, beta, gamma, 0.001)
|
||||
y.set_shape(x.get_shape())
|
||||
return y
|
||||
|
||||
def _residual(self, x, in_filter, out_filter, stride,
|
||||
activate_before_residual=False):
|
||||
"""Residual unit with 2 sub layers."""
|
||||
if activate_before_residual:
|
||||
with tf.variable_scope('shared_activation'):
|
||||
x = self._batch_norm('init_bn', x)
|
||||
x = self._relu(x, self.hps.relu_leakiness)
|
||||
orig_x = x
|
||||
else:
|
||||
with tf.variable_scope('residual_only_activation'):
|
||||
orig_x = x
|
||||
x = self._batch_norm('init_bn', x)
|
||||
x = self._relu(x, self.hps.relu_leakiness)
|
||||
|
||||
with tf.variable_scope('sub1'):
|
||||
x = self._conv('conv1', x, 3, in_filter, out_filter, stride)
|
||||
|
||||
with tf.variable_scope('sub2'):
|
||||
x = self._batch_norm('bn2', x)
|
||||
x = self._relu(x, self.hps.relu_leakiness)
|
||||
x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])
|
||||
|
||||
with tf.variable_scope('sub_add'):
|
||||
if in_filter != out_filter:
|
||||
orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID')
|
||||
orig_x = tf.pad(
|
||||
orig_x, [[0, 0], [0, 0], [0, 0],
|
||||
[(out_filter-in_filter)//2, (out_filter-in_filter)//2]])
|
||||
x += orig_x
|
||||
|
||||
tf.logging.info('image after unit %s', x.get_shape())
|
||||
return x
|
||||
|
||||
def _bottleneck_residual(self, x, in_filter, out_filter, stride,
|
||||
activate_before_residual=False):
|
||||
"""Bottleneck resisual unit with 3 sub layers."""
|
||||
if activate_before_residual:
|
||||
with tf.variable_scope('common_bn_relu'):
|
||||
x = self._batch_norm('init_bn', x)
|
||||
x = self._relu(x, self.hps.relu_leakiness)
|
||||
orig_x = x
|
||||
else:
|
||||
with tf.variable_scope('residual_bn_relu'):
|
||||
orig_x = x
|
||||
x = self._batch_norm('init_bn', x)
|
||||
x = self._relu(x, self.hps.relu_leakiness)
|
||||
|
||||
with tf.variable_scope('sub1'):
|
||||
x = self._conv('conv1', x, 1, in_filter, out_filter/4, stride)
|
||||
|
||||
with tf.variable_scope('sub2'):
|
||||
x = self._batch_norm('bn2', x)
|
||||
x = self._relu(x, self.hps.relu_leakiness)
|
||||
x = self._conv('conv2', x, 3, out_filter/4, out_filter/4, [1, 1, 1, 1])
|
||||
|
||||
with tf.variable_scope('sub3'):
|
||||
x = self._batch_norm('bn3', x)
|
||||
x = self._relu(x, self.hps.relu_leakiness)
|
||||
x = self._conv('conv3', x, 1, out_filter/4, out_filter, [1, 1, 1, 1])
|
||||
|
||||
with tf.variable_scope('sub_add'):
|
||||
if in_filter != out_filter:
|
||||
orig_x = self._conv('project', orig_x, 1, in_filter, out_filter, stride)
|
||||
x += orig_x
|
||||
|
||||
tf.logging.info('image after unit %s', x.get_shape())
|
||||
return x
|
||||
|
||||
def _decay(self):
|
||||
"""L2 weight decay loss."""
|
||||
costs = []
|
||||
for var in tf.trainable_variables():
|
||||
if var.op.name.find(r'DW') > 0:
|
||||
costs.append(tf.nn.l2_loss(var))
|
||||
# tf.histogram_summary(var.op.name, var)
|
||||
|
||||
return tf.multiply(self.hps.weight_decay_rate, tf.add_n(costs))
|
||||
|
||||
def GetWeightDecay(self):
|
||||
"""L2 weight decay loss."""
|
||||
costs = []
|
||||
for var in self.trainable_variables:
|
||||
if var.op.name.find(r'DW') > 0:
|
||||
costs.append(tf.nn.l2_loss(var))
|
||||
# tf.histogram_summary(var.op.name, var)
|
||||
|
||||
return tf.multiply(self.hps.weight_decay_rate, tf.add_n(costs))
|
||||
|
||||
def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
|
||||
"""Convolution."""
|
||||
with tf.variable_scope(name):
|
||||
n = filter_size * filter_size * out_filters
|
||||
kernel = tf.get_variable(
|
||||
'DW', [filter_size, filter_size, in_filters, out_filters],
|
||||
tf.float32, initializer=tf.random_normal_initializer(
|
||||
stddev=np.sqrt(2.0/n)))
|
||||
return tf.nn.conv2d(x, kernel, strides, padding='SAME')
|
||||
|
||||
def _relu(self, x, leakiness=0.0):
|
||||
"""Relu, with optional leaky support."""
|
||||
return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
|
||||
|
||||
def _fully_connected(self, x, out_dim):
|
||||
"""FullyConnected layer for final output."""
|
||||
x = tf.reshape(x, [self.batch_size, -1])
|
||||
w = tf.get_variable(
|
||||
'DW', [x.get_shape()[1], out_dim],
|
||||
initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
|
||||
b = tf.get_variable('biases', [out_dim],
|
||||
initializer=tf.constant_initializer())
|
||||
return tf.nn.xw_plus_b(x, w, b)
|
||||
|
||||
def _global_avg_pool(self, x):
|
||||
assert x.get_shape().ndims == 4
|
||||
return tf.reduce_mean(x, [1, 2])
|
|
@ -0,0 +1,196 @@
|
|||
"""
|
||||
Evaluates a trained Pixel-CNN++ generative model on CIFAR-10 or Tiny ImageNet data.
|
||||
Uses multiple GPUs, indicated by the flag --nr-gpu
|
||||
|
||||
Example usage:
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 python train_double_cnn.py --nr_gpu 4
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
import pixel_cnn_pp.nn as nn
|
||||
import pixel_cnn_pp.plotting as plotting
|
||||
from pixel_cnn_pp.model import model_spec
|
||||
import data.cifar10_data as cifar10_data
|
||||
import data.imagenet_data as imagenet_data
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
parser = argparse.ArgumentParser()
|
||||
# data I/O
|
||||
parser.add_argument('-i', '--data_dir', type=str, default='/tmp/pxpp/data', help='Location for the dataset')
|
||||
parser.add_argument('-o', '--save_dir', type=str, default='/tmp/pxpp/save', help='Location for parameter checkpoints and samples')
|
||||
parser.add_argument('-d', '--data_set', type=str, default='cifar', help='Can be either cifar|imagenet')
|
||||
parser.add_argument('-t', '--save_interval', type=int, default=20, help='Every how many epochs to write checkpoint/samples?')
|
||||
parser.add_argument('-r', '--load_params', dest='load_params', action='store_true', help='Restore training from previous model checkpoint?')
|
||||
# model
|
||||
parser.add_argument('-q', '--nr_resnet', type=int, default=5, help='Number of residual blocks per stage of the model')
|
||||
parser.add_argument('-n', '--nr_filters', type=int, default=160, help='Number of filters to use across the model. Higher = larger model.')
|
||||
parser.add_argument('-m', '--nr_logistic_mix', type=int, default=10, help='Number of logistic components in the mixture. Higher = more flexible model')
|
||||
parser.add_argument('-z', '--resnet_nonlinearity', type=str, default='concat_elu', help='Which nonlinearity to use in the ResNet layers. One of "concat_elu", "elu", "relu" ')
|
||||
parser.add_argument('-c', '--class_conditional', dest='class_conditional', action='store_true', help='Condition generative model on labels?')
|
||||
# optimization
|
||||
parser.add_argument('-l', '--learning_rate', type=float, default=0.001, help='Base learning rate')
|
||||
parser.add_argument('-e', '--lr_decay', type=float, default=0.999995, help='Learning rate decay, applied every step of the optimization')
|
||||
parser.add_argument('-b', '--batch_size', type=int, default=12, help='Batch size during training per GPU')
|
||||
parser.add_argument('-a', '--init_batch_size', type=int, default=100, help='How much data to use for data-dependent initialization.')
|
||||
parser.add_argument('-p', '--dropout_p', type=float, default=0.5, help='Dropout strength (i.e. 1 - keep_prob). 0 = No dropout, higher = more dropout.')
|
||||
parser.add_argument('-x', '--max_epochs', type=int, default=5000, help='How many epochs to run in total?')
|
||||
parser.add_argument('-g', '--nr_gpu', type=int, default=8, help='How many GPUs to distribute the training across?')
|
||||
# evaluation
|
||||
parser.add_argument('--polyak_decay', type=float, default=0.9995, help='Exponential decay rate of the sum of previous model iterates during Polyak averaging')
|
||||
# reproducibility
|
||||
parser.add_argument('-s', '--seed', type=int, default=1, help='Random seed to use')
|
||||
args = parser.parse_args()
|
||||
print('input args:\n', json.dumps(vars(args), indent=4, separators=(',',':'))) # pretty print args
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# fix random seed for reproducibility
|
||||
rng = np.random.RandomState(args.seed)
|
||||
tf.set_random_seed(args.seed)
|
||||
|
||||
# initialize data loaders for train/test splits
|
||||
if args.data_set == 'imagenet' and args.class_conditional:
|
||||
raise("We currently don't have labels for the small imagenet data set")
|
||||
DataLoader = {'cifar':cifar10_data.DataLoader, 'imagenet':imagenet_data.DataLoader}[args.data_set]
|
||||
train_data = DataLoader(args.data_dir, 'train', args.batch_size * args.nr_gpu, rng=rng, shuffle=False, return_labels=args.class_conditional)
|
||||
test_data = DataLoader(args.data_dir, 'test', args.batch_size * args.nr_gpu, shuffle=False, return_labels=args.class_conditional)
|
||||
obs_shape = train_data.get_observation_size() # e.g. a tuple (32,32,3)
|
||||
assert len(obs_shape) == 3, 'assumed right now'
|
||||
|
||||
# data place holders
|
||||
x_init = tf.placeholder(tf.float32, shape=(args.init_batch_size,) + obs_shape)
|
||||
xs = [tf.placeholder(tf.float32, shape=(args.batch_size, ) + obs_shape) for i in range(args.nr_gpu)]
|
||||
|
||||
# if the model is class-conditional we'll set up label placeholders + one-hot encodings 'h' to condition on
|
||||
if args.class_conditional:
|
||||
num_labels = train_data.get_num_labels()
|
||||
y_init = tf.placeholder(tf.int32, shape=(args.init_batch_size,))
|
||||
h_init = tf.one_hot(y_init, num_labels)
|
||||
y_sample = np.split(np.mod(np.arange(args.batch_size*args.nr_gpu), num_labels), args.nr_gpu)
|
||||
h_sample = [tf.one_hot(tf.Variable(y_sample[i], trainable=False), num_labels) for i in range(args.nr_gpu)]
|
||||
ys = [tf.placeholder(tf.int32, shape=(args.batch_size,)) for i in range(args.nr_gpu)]
|
||||
hs = [tf.one_hot(ys[i], num_labels) for i in range(args.nr_gpu)]
|
||||
else:
|
||||
h_init = None
|
||||
h_sample = [None] * args.nr_gpu
|
||||
hs = h_sample
|
||||
|
||||
# create the model
|
||||
model_opt = { 'nr_resnet': args.nr_resnet, 'nr_filters': args.nr_filters, 'nr_logistic_mix': args.nr_logistic_mix, 'resnet_nonlinearity': args.resnet_nonlinearity }
|
||||
model = tf.make_template('model', model_spec)
|
||||
|
||||
# run once for data dependent initialization of parameters
|
||||
gen_par = model(x_init, h_init, init=True, dropout_p=args.dropout_p, **model_opt)
|
||||
|
||||
# keep track of moving average
|
||||
all_params = tf.trainable_variables()
|
||||
ema = tf.train.ExponentialMovingAverage(decay=args.polyak_decay)
|
||||
maintain_averages_op = tf.group(ema.apply(all_params))
|
||||
|
||||
# get loss gradients over multiple GPUs
|
||||
grads = []
|
||||
loss_gen = []
|
||||
loss_gen_test = []
|
||||
for i in range(args.nr_gpu):
|
||||
with tf.device('/gpu:%d' % i):
|
||||
# train
|
||||
gen_par = model(xs[i], hs[i], ema=None, dropout_p=args.dropout_p, **model_opt)
|
||||
loss_gen.append(nn.discretized_mix_logistic_loss(xs[i], gen_par))
|
||||
# gradients
|
||||
grads.append(tf.gradients(loss_gen[i], all_params))
|
||||
# test
|
||||
gen_par = model(xs[i], hs[i], ema=ema, dropout_p=0., **model_opt)
|
||||
loss_gen_test.append(nn.discretized_mix_logistic_loss(xs[i], gen_par))
|
||||
|
||||
# add losses and gradients together and get training updates
|
||||
tf_lr = tf.placeholder(tf.float32, shape=[])
|
||||
with tf.device('/gpu:0'):
|
||||
for i in range(1,args.nr_gpu):
|
||||
loss_gen[0] += loss_gen[i]
|
||||
loss_gen_test[0] += loss_gen_test[i]
|
||||
for j in range(len(grads[0])):
|
||||
grads[0][j] += grads[i][j]
|
||||
# training op
|
||||
optimizer = tf.group(nn.adam_updates(all_params, grads[0], lr=tf_lr, mom1=0.95, mom2=0.9995), maintain_averages_op)
|
||||
|
||||
# convert loss to bits/dim
|
||||
bits_per_dim = loss_gen[0]/(args.nr_gpu*np.log(2.)*np.prod(obs_shape)*args.batch_size)
|
||||
bits_per_dim_test = loss_gen_test[0]/(args.nr_gpu*np.log(2.)*np.prod(obs_shape)*args.batch_size)
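# Spelled out: loss_gen[0] is (after the summation above) the total negative log-likelihood in
# nats over all nr_gpu * batch_size images, so dividing by (#images * prod(obs_shape)) gives
# nats per sub-pixel and the extra 1/log(2) converts nats to bits:
#   bits/dim = NLL_nats / (nr_gpu * batch_size * prod(obs_shape) * ln 2)
# For CIFAR-10, prod(obs_shape) = 32*32*3 = 3072.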
|
||||
|
||||
# sample from the model
|
||||
new_x_gen = []
|
||||
for i in range(args.nr_gpu):
|
||||
with tf.device('/gpu:%d' % i):
|
||||
gen_par = model(xs[i], h_sample[i], ema=ema, dropout_p=0, **model_opt)
|
||||
new_x_gen.append(nn.sample_from_discretized_mix_logistic(gen_par, args.nr_logistic_mix))
|
||||
def sample_from_model(sess):
|
||||
x_gen = [np.zeros((args.batch_size,) + obs_shape, dtype=np.float32) for i in range(args.nr_gpu)]
|
||||
for yi in range(obs_shape[0]):
|
||||
for xi in range(obs_shape[1]):
|
||||
new_x_gen_np = sess.run(new_x_gen, {xs[i]: x_gen[i] for i in range(args.nr_gpu)})
|
||||
for i in range(args.nr_gpu):
|
||||
x_gen[i][:,yi,xi,:] = new_x_gen_np[i][:,yi,xi,:]
|
||||
return np.concatenate(x_gen, axis=0)
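# Note on the loop above: the model is autoregressive over pixel positions, so the sampling
# graph is re-run once per (row, column) position and only the newly generated position is
# copied into x_gen; for 32x32 images that is 1024 forward passes per batch of samples.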
|
||||
|
||||
# init & save
|
||||
initializer = tf.initialize_all_variables()
|
||||
saver = tf.train.Saver()
|
||||
|
||||
# turn numpy inputs into feed_dict for use with tensorflow
|
||||
def make_feed_dict(data, init=False):
|
||||
if type(data) is tuple:
|
||||
x,y = data
|
||||
else:
|
||||
x = data
|
||||
y = None
|
||||
x = np.cast[np.float32]((x - 127.5) / 127.5) # input to pixelCNN is scaled from uint8 [0,255] to float in range [-1,1]
|
||||
if init:
|
||||
feed_dict = {x_init: x}
|
||||
if y is not None:
|
||||
feed_dict.update({y_init: y})
|
||||
else:
|
||||
x = np.split(x, args.nr_gpu)
|
||||
feed_dict = {xs[i]: x[i] for i in range(args.nr_gpu)}
|
||||
if y is not None:
|
||||
y = np.split(y, args.nr_gpu)
|
||||
feed_dict.update({ys[i]: y[i] for i in range(args.nr_gpu)})
|
||||
return feed_dict
|
||||
|
||||
# //////////// perform testing //////////////
|
||||
|
||||
print('starting testing')
|
||||
test_bpd = []
|
||||
lr = args.learning_rate
|
||||
|
||||
with tf.Session() as sess:
|
||||
# compute likelihood over test data
|
||||
ckpt_file = args.save_dir + '/params_' + args.data_set + '.ckpt'
|
||||
print('restoring parameters from', ckpt_file)
|
||||
saver.restore(sess, ckpt_file)
|
||||
|
||||
test_losses = []
|
||||
uidx = 0
|
||||
for d in train_data:
|
||||
feed_dict = make_feed_dict(d)
|
||||
l = sess.run(bits_per_dim_test, feed_dict)
|
||||
test_losses.append(l)
|
||||
uidx += 1
|
||||
if uidx % 100 == 0:
|
||||
print(uidx, l)
|
||||
test_loss_gen = np.mean(test_losses)
|
||||
print(uidx, ' -- ', test_loss_gen)
|
||||
test_bpd.append(test_loss_gen)
|
||||
print('Test nll=%.2f' % test_loss_gen)
|
||||
|
||||
np.savez('./TMD', np.array(test_losses))
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,222 @@
|
|||
"""
|
||||
Trains a Pixel-CNN++ generative model on CIFAR-10 or Tiny ImageNet data.
|
||||
Uses multiple GPUs, indicated by the flag --nr-gpu
|
||||
|
||||
Example usage:
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 python train_double_cnn.py --nr_gpu 4
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
import pixel_cnn_pp.nn as nn
|
||||
import pixel_cnn_pp.plotting as plotting
|
||||
from pixel_cnn_pp.model import model_spec
|
||||
import data.cifar10_data as cifar10_data
|
||||
import data.imagenet_data as imagenet_data
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
parser = argparse.ArgumentParser()
|
||||
# data I/O
|
||||
parser.add_argument('-i', '--data_dir', type=str, default='/tmp/pxpp/data', help='Location for the dataset')
|
||||
parser.add_argument('-o', '--save_dir', type=str, default='/tmp/pxpp/save', help='Location for parameter checkpoints and samples')
|
||||
parser.add_argument('-d', '--data_set', type=str, default='cifar', help='Can be either cifar|imagenet')
|
||||
parser.add_argument('-t', '--save_interval', type=int, default=20, help='Every how many epochs to write checkpoint/samples?')
|
||||
parser.add_argument('-r', '--load_params', dest='load_params', action='store_true', help='Restore training from previous model checkpoint?')
|
||||
# model
|
||||
parser.add_argument('-q', '--nr_resnet', type=int, default=5, help='Number of residual blocks per stage of the model')
|
||||
parser.add_argument('-n', '--nr_filters', type=int, default=160, help='Number of filters to use across the model. Higher = larger model.')
|
||||
parser.add_argument('-m', '--nr_logistic_mix', type=int, default=10, help='Number of logistic components in the mixture. Higher = more flexible model')
|
||||
parser.add_argument('-z', '--resnet_nonlinearity', type=str, default='concat_elu', help='Which nonlinearity to use in the ResNet layers. One of "concat_elu", "elu", "relu" ')
|
||||
parser.add_argument('-c', '--class_conditional', dest='class_conditional', action='store_true', help='Condition generative model on labels?')
|
||||
# optimization
|
||||
parser.add_argument('-l', '--learning_rate', type=float, default=0.001, help='Base learning rate')
|
||||
parser.add_argument('-e', '--lr_decay', type=float, default=0.999995, help='Learning rate decay, applied every step of the optimization')
|
||||
parser.add_argument('-b', '--batch_size', type=int, default=12, help='Batch size during training per GPU')
|
||||
parser.add_argument('-a', '--init_batch_size', type=int, default=100, help='How much data to use for data-dependent initialization.')
|
||||
parser.add_argument('-p', '--dropout_p', type=float, default=0.5, help='Dropout strength (i.e. 1 - keep_prob). 0 = No dropout, higher = more dropout.')
|
||||
parser.add_argument('-x', '--max_epochs', type=int, default=5000, help='How many epochs to run in total?')
|
||||
parser.add_argument('-g', '--nr_gpu', type=int, default=8, help='How many GPUs to distribute the training across?')
|
||||
# evaluation
|
||||
parser.add_argument('--polyak_decay', type=float, default=0.9995, help='Exponential decay rate of the sum of previous model iterates during Polyak averaging')
|
||||
# reproducibility
|
||||
parser.add_argument('-s', '--seed', type=int, default=1, help='Random seed to use')
|
||||
args = parser.parse_args()
|
||||
print('input args:\n', json.dumps(vars(args), indent=4, separators=(',',':'))) # pretty print args
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# fix random seed for reproducibility
|
||||
rng = np.random.RandomState(args.seed)
|
||||
tf.set_random_seed(args.seed)
|
||||
|
||||
# initialize data loaders for train/test splits
|
||||
if args.data_set == 'imagenet' and args.class_conditional:
|
||||
raise("We currently don't have labels for the small imagenet data set")
|
||||
DataLoader = {'cifar':cifar10_data.DataLoader, 'imagenet':imagenet_data.DataLoader}[args.data_set]
|
||||
train_data = DataLoader(args.data_dir, 'train', args.batch_size * args.nr_gpu, rng=rng, shuffle=True, return_labels=args.class_conditional)
|
||||
test_data = DataLoader(args.data_dir, 'test', args.batch_size * args.nr_gpu, shuffle=False, return_labels=args.class_conditional)
|
||||
obs_shape = train_data.get_observation_size() # e.g. a tuple (32,32,3)
|
||||
assert len(obs_shape) == 3, 'assumed right now'
|
||||
|
||||
# data place holders
|
||||
x_init = tf.placeholder(tf.float32, shape=(args.init_batch_size,) + obs_shape)
|
||||
xs = [tf.placeholder(tf.float32, shape=(args.batch_size, ) + obs_shape) for i in range(args.nr_gpu)]
|
||||
|
||||
# if the model is class-conditional we'll set up label placeholders + one-hot encodings 'h' to condition on
|
||||
if args.class_conditional:
|
||||
num_labels = train_data.get_num_labels()
|
||||
y_init = tf.placeholder(tf.int32, shape=(args.init_batch_size,))
|
||||
h_init = tf.one_hot(y_init, num_labels)
|
||||
y_sample = np.split(np.mod(np.arange(args.batch_size*args.nr_gpu), num_labels), args.nr_gpu)
|
||||
h_sample = [tf.one_hot(tf.Variable(y_sample[i], trainable=False), num_labels) for i in range(args.nr_gpu)]
|
||||
ys = [tf.placeholder(tf.int32, shape=(args.batch_size,)) for i in range(args.nr_gpu)]
|
||||
hs = [tf.one_hot(ys[i], num_labels) for i in range(args.nr_gpu)]
|
||||
else:
|
||||
h_init = None
|
||||
h_sample = [None] * args.nr_gpu
|
||||
hs = h_sample
|
||||
|
||||
# create the model
|
||||
model_opt = { 'nr_resnet': args.nr_resnet, 'nr_filters': args.nr_filters, 'nr_logistic_mix': args.nr_logistic_mix, 'resnet_nonlinearity': args.resnet_nonlinearity }
|
||||
model = tf.make_template('model', model_spec)
|
||||
|
||||
# run once for data dependent initialization of parameters
|
||||
gen_par = model(x_init, h_init, init=True, dropout_p=args.dropout_p, **model_opt)
|
||||
|
||||
# keep track of moving average
|
||||
all_params = tf.trainable_variables()
|
||||
ema = tf.train.ExponentialMovingAverage(decay=args.polyak_decay)
|
||||
maintain_averages_op = tf.group(ema.apply(all_params))
|
||||
|
||||
# get loss gradients over multiple GPUs
|
||||
grads = []
|
||||
loss_gen = []
|
||||
loss_gen_test = []
|
||||
for i in range(args.nr_gpu):
|
||||
with tf.device('/gpu:%d' % i):
|
||||
# train
|
||||
gen_par = model(xs[i], hs[i], ema=None, dropout_p=args.dropout_p, **model_opt)
|
||||
loss_gen.append(nn.discretized_mix_logistic_loss(xs[i], gen_par))
|
||||
# gradients
|
||||
grads.append(tf.gradients(loss_gen[i], all_params))
|
||||
# test
|
||||
gen_par = model(xs[i], hs[i], ema=ema, dropout_p=0., **model_opt)
|
||||
loss_gen_test.append(nn.discretized_mix_logistic_loss(xs[i], gen_par))
|
||||
|
||||
# add losses and gradients together and get training updates
|
||||
tf_lr = tf.placeholder(tf.float32, shape=[])
|
||||
with tf.device('/gpu:0'):
|
||||
for i in range(1,args.nr_gpu):
|
||||
loss_gen[0] += loss_gen[i]
|
||||
loss_gen_test[0] += loss_gen_test[i]
|
||||
for j in range(len(grads[0])):
|
||||
grads[0][j] += grads[i][j]
|
||||
# training op
|
||||
optimizer = tf.group(nn.adam_updates(all_params, grads[0], lr=tf_lr, mom1=0.95, mom2=0.9995), maintain_averages_op)
|
||||
|
||||
# convert loss to bits/dim
|
||||
bits_per_dim = loss_gen[0]/(args.nr_gpu*np.log(2.)*np.prod(obs_shape)*args.batch_size)
|
||||
bits_per_dim_test = loss_gen_test[0]/(args.nr_gpu*np.log(2.)*np.prod(obs_shape)*args.batch_size)
|
||||
|
||||
# sample from the model
|
||||
new_x_gen = []
|
||||
for i in range(args.nr_gpu):
|
||||
with tf.device('/gpu:%d' % i):
|
||||
gen_par = model(xs[i], h_sample[i], ema=ema, dropout_p=0, **model_opt)
|
||||
new_x_gen.append(nn.sample_from_discretized_mix_logistic(gen_par, args.nr_logistic_mix))
|
||||
def sample_from_model(sess):
|
||||
x_gen = [np.zeros((args.batch_size,) + obs_shape, dtype=np.float32) for i in range(args.nr_gpu)]
|
||||
for yi in range(obs_shape[0]):
|
||||
for xi in range(obs_shape[1]):
|
||||
new_x_gen_np = sess.run(new_x_gen, {xs[i]: x_gen[i] for i in range(args.nr_gpu)})
|
||||
for i in range(args.nr_gpu):
|
||||
x_gen[i][:,yi,xi,:] = new_x_gen_np[i][:,yi,xi,:]
|
||||
return np.concatenate(x_gen, axis=0)
|
||||
|
||||
# init & save
|
||||
initializer = tf.initialize_all_variables()
|
||||
saver = tf.train.Saver()
|
||||
|
||||
# turn numpy inputs into feed_dict for use with tensorflow
|
||||
def make_feed_dict(data, init=False):
|
||||
if type(data) is tuple:
|
||||
x,y = data
|
||||
else:
|
||||
x = data
|
||||
y = None
|
||||
x = np.cast[np.float32]((x - 127.5) / 127.5) # input to pixelCNN is scaled from uint8 [0,255] to float in range [-1,1]
|
||||
if init:
|
||||
feed_dict = {x_init: x}
|
||||
if y is not None:
|
||||
feed_dict.update({y_init: y})
|
||||
else:
|
||||
x = np.split(x, args.nr_gpu)
|
||||
feed_dict = {xs[i]: x[i] for i in range(args.nr_gpu)}
|
||||
if y is not None:
|
||||
y = np.split(y, args.nr_gpu)
|
||||
feed_dict.update({ys[i]: y[i] for i in range(args.nr_gpu)})
|
||||
return feed_dict
|
||||
|
||||
# //////////// perform training //////////////
|
||||
if not os.path.exists(args.save_dir):
|
||||
os.makedirs(args.save_dir)
|
||||
print('starting training')
|
||||
test_bpd = []
|
||||
lr = args.learning_rate
|
||||
with tf.Session() as sess:
|
||||
for epoch in range(args.max_epochs):
|
||||
begin = time.time()
|
||||
|
||||
# init
|
||||
if epoch == 0:
|
||||
feed_dict = make_feed_dict(train_data.next(args.init_batch_size), init=True) # manually retrieve exactly init_batch_size examples
|
||||
train_data.reset() # rewind the iterator back to 0 to do one full epoch
|
||||
sess.run(initializer, feed_dict)
|
||||
print('initializing the model...')
|
||||
if args.load_params:
|
||||
ckpt_file = args.save_dir + '/params_' + args.data_set + '.ckpt'
|
||||
print('restoring parameters from', ckpt_file)
|
||||
saver.restore(sess, ckpt_file)
|
||||
|
||||
# train for one epoch
|
||||
train_losses = []
|
||||
for d in train_data:
|
||||
feed_dict = make_feed_dict(d)
|
||||
# forward/backward/update model on each gpu
|
||||
lr *= args.lr_decay
|
||||
feed_dict.update({ tf_lr: lr })
|
||||
l,_ = sess.run([bits_per_dim, optimizer], feed_dict)
|
||||
train_losses.append(l)
|
||||
train_loss_gen = np.mean(train_losses)
|
||||
|
||||
# compute likelihood over test data
|
||||
test_losses = []
|
||||
for d in test_data:
|
||||
feed_dict = make_feed_dict(d)
|
||||
l = sess.run(bits_per_dim_test, feed_dict)
|
||||
test_losses.append(l)
|
||||
test_loss_gen = np.mean(test_losses)
|
||||
test_bpd.append(test_loss_gen)
|
||||
|
||||
# log progress to console
|
||||
print("Iteration %d, time = %ds, train bits_per_dim = %.4f, test bits_per_dim = %.4f" % (epoch, time.time()-begin, train_loss_gen, test_loss_gen))
|
||||
sys.stdout.flush()
|
||||
|
||||
if epoch % args.save_interval == 0:
|
||||
|
||||
# generate samples from the model
|
||||
sample_x = sample_from_model(sess)
|
||||
img_tile = plotting.img_tile(sample_x[:int(np.floor(np.sqrt(args.batch_size*args.nr_gpu))**2)], aspect_ratio=1.0, border_color=1.0, stretch=True)
|
||||
img = plotting.plot_img(img_tile, title=args.data_set + ' samples')
|
||||
plotting.plt.savefig(os.path.join(args.save_dir,'%s_sample%d.png' % (args.data_set, epoch)))
|
||||
plotting.plt.close('all')
|
||||
|
||||
# save params
|
||||
saver.save(sess, args.save_dir + '/params_' + args.data_set + '.ckpt')
|
||||
np.savez(args.save_dir + '/test_bpd_' + args.data_set + '.npz', test_bpd=np.array(test_bpd))
|
|
@ -0,0 +1,91 @@
|
|||
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
"""ResNet Train/Eval module.
|
||||
"""
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
|
||||
import cifar_input
|
||||
import numpy as np
|
||||
import resnet_model_basic as resnet_model
|
||||
import tensorflow as tf
|
||||
import data.cifar10_data as cifar10_data
|
||||
|
||||
|
||||
|
||||
def lr_I2L(train_step):
|
||||
#step_wise = [40000,60000,80000] # this is the schedule used in the original setting
|
||||
step_wise = [51000,76000,102000]
|
||||
if train_step < step_wise[0]:
|
||||
return 0.1
|
||||
elif train_step < step_wise[1]:
|
||||
return 0.01
|
||||
elif train_step < step_wise[2]:
|
||||
return 0.001
|
||||
else:
|
||||
return 0.0001
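# The resulting piecewise-constant schedule, as a quick reference (step -> learning rate):
#   step <  51000: 0.1
#   step <  76000: 0.01
#   step < 102000: 0.001
#   otherwise    : 0.0001
# e.g. lr_I2L(60000) == 0.01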
|
||||
|
||||
class worker_I2L(object):
|
||||
def __init__(self, args):
|
||||
|
||||
hps = resnet_model.HParams(batch_size=args.batch_size,
|
||||
num_classes=10,
|
||||
min_lrn_rate=0.0001,
|
||||
lrn_rate=0.1,
|
||||
num_residual_units=18,
|
||||
use_bottleneck=False,
|
||||
weight_decay_rate=0.0002,
|
||||
relu_leakiness=0.1,
|
||||
optimizer='mom')
|
||||
self.args = args
|
||||
self.model = resnet_model.ResNet(hps, args.mode, use_wide_resnet=args.use_wide_resnet, nr_gpu=args.nr_gpu)
|
||||
self.model.build_graph()
|
||||
|
||||
truth = tf.argmax(tf.concat(self.model.labels, axis=0), axis=1)
|
||||
predictions = tf.argmax(tf.concat(self.model.predictions,axis=0), axis=1)
|
||||
self.right_decision = tf.reduce_sum(tf.to_float(tf.equal(predictions, truth)))
|
||||
|
||||
def GetLoss(self):
|
||||
return self.model.nlls, self.model.GetWeightDecay()
|
||||
|
||||
def Valid(self, test_data, sess):
|
||||
with tf.device('/gpu:0'):
|
||||
cost_all = self.model.nlls[0]
|
||||
for i in range(1, self.args.nr_gpu):
|
||||
cost_all += self.model.nlls[i]
|
||||
|
||||
m_sample = 0
|
||||
m_correct = 0.
|
||||
costs = 0.
|
||||
for test_image, test_label in test_data:
|
||||
m_sample += test_image.shape[0]
|
||||
|
||||
splitted_image = np.split(test_image.astype('float32'), self.args.nr_gpu)
|
||||
splitted_label = np.split(test_label, self.args.nr_gpu)
|
||||
|
||||
feed_dict = {self.model.needImgAug: False}
|
||||
feed_dict.update({self.model.input_image[i]: splitted_image[i] for i in range(self.args.nr_gpu)})
|
||||
feed_dict.update({self.model.input_label[i]: splitted_label[i][:, None] for i in range(self.args.nr_gpu)})
|
||||
|
||||
_cost, _right_decision = sess.run([cost_all, self.right_decision], feed_dict)
|
||||
costs += np.sum(_cost)
|
||||
m_correct += _right_decision
|
||||
test_loss = costs / m_sample
|
||||
test_acc = m_correct * 1. / m_sample
|
||||
print('[I2L] test_nll={},test_acc={}'.format(
|
||||
'{0:.4f}'.format(test_loss), '{0:.6f}'.format(test_acc) )
|
||||
)
|
|
@ -0,0 +1,134 @@
|
|||
"""
|
||||
Trains a Pixel-CNN++ generative model on CIFAR-10 or Tiny ImageNet data.
|
||||
Uses multiple GPUs, indicated by the flag --nr-gpu
|
||||
|
||||
Example usage:
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 python train_double_cnn.py --nr_gpu 4
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
import pixel_cnn_pp.nn as nn
|
||||
import pixel_cnn_pp.plotting as plotting
|
||||
from pixel_cnn_pp.model import model_spec
|
||||
import data.cifar10_data as cifar10_data
|
||||
|
||||
class worker_L2I(object):
|
||||
def __init__(self, args, num_labels, image_shape):
|
||||
# Default parameters
|
||||
self.num_labels = num_labels
|
||||
self.image_shape=image_shape
|
||||
self.args = args
|
||||
|
||||
# Data used for data-dependent parameter initialization
|
||||
self.x_init = tf.placeholder(tf.float32, shape=(args.init_batch_size,) + self.image_shape)
|
||||
self.xs = [tf.placeholder(tf.float32, shape=(args.batch_size, ) + self.image_shape) for _ in range(args.nr_gpu)]
|
||||
self.y_init = tf.placeholder(tf.int32, shape=(args.init_batch_size,))
|
||||
self.h_init = tf.one_hot(self.y_init, self.num_labels)
|
||||
|
||||
# parameters used for sampling
|
||||
self.y_sample = np.split(np.mod(np.arange(args.batch_size*args.nr_gpu), self.num_labels), args.nr_gpu)
|
||||
# self.h_sample = [tf.one_hot(tf.Variable(self.y_sample[i], trainable=False), self.num_labels) for i in range(args.nr_gpu)]
|
||||
# the line above is the version used for the ICML paper; it is revised as follows
|
||||
self.h_sample = [tf.one_hot(self.y_sample[i], self.num_labels) for i in range(args.nr_gpu)]
|
||||
self.ys = [tf.placeholder(tf.int32, shape=(args.batch_size,)) for i in range(args.nr_gpu)]
|
||||
self.hs = [tf.one_hot(self.ys[i], self.num_labels) for i in range(args.nr_gpu)]
|
||||
# create the model
|
||||
self.model_opt = { 'nr_resnet': args.nr_resnet, 'nr_filters': args.nr_filters, 'nr_logistic_mix': args.nr_logistic_mix, 'resnet_nonlinearity': args.resnet_nonlinearity }
|
||||
self.model = tf.make_template('model', model_spec)
|
||||
|
||||
# run once for data dependent initialization of parameters
|
||||
# in the original code this is "gen_par = self.model(...)"; with init=True it runs the data-dependent initialization automatically
|
||||
self.model(self.x_init, self.h_init, init=True, dropout_p=args.dropout_p, **self.model_opt)
|
||||
|
||||
# keep track of moving average
|
||||
self.all_params = tf.trainable_variables()
|
||||
self.ema = tf.train.ExponentialMovingAverage(decay=args.polyak_decay)
|
||||
self.maintain_averages_op = tf.group(self.ema.apply(self.all_params))
|
||||
|
||||
# parameters for optimization
|
||||
self.tf_lr = tf.placeholder(tf.float32, shape=())
|
||||
|
||||
def GetLoss(self):
|
||||
# get loss gradients over multiple GPUs
|
||||
loss_gen = []
|
||||
loss_gen_test = []
|
||||
for i in range(self.args.nr_gpu):
|
||||
with tf.device('/gpu:%d' % i):
|
||||
# train
|
||||
gen_par = self.model(self.xs[i], self.hs[i], ema=None, dropout_p=self.args.dropout_p, **self.model_opt)
|
||||
loss_gen.append(nn.discretized_mix_logistic_loss(self.xs[i], gen_par, sum_all=False))
|
||||
|
||||
# test
|
||||
gen_par = self.model(self.xs[i], self.hs[i], ema=self.ema, dropout_p=0., **self.model_opt)
|
||||
loss_gen_test.append(nn.discretized_mix_logistic_loss(self.xs[i], gen_par))
|
||||
|
||||
return loss_gen, loss_gen_test
|
||||
|
||||
def GetOverallLoss(self):
|
||||
# get loss gradients over multiple GPUs
|
||||
loss_gen = []
|
||||
loss_gen_test = []
|
||||
for i in range(self.args.nr_gpu):
|
||||
with tf.device('/gpu:%d' % i):
|
||||
# train
|
||||
gen_par = self.model(self.xs[i], self.hs[i], ema=None, dropout_p=self.args.dropout_p, **self.model_opt)
|
||||
loss_gen.append(nn.discretized_mix_logistic_loss(self.xs[i], gen_par, sum_all=False))
|
||||
|
||||
# test
|
||||
gen_par = self.model(self.xs[i], self.hs[i], ema=self.ema, dropout_p=0., **self.model_opt)
|
||||
loss_gen_test.append(nn.discretized_mix_logistic_loss(self.xs[i], gen_par))
|
||||
|
||||
# accumulate the per-GPU losses on /gpu:0
|
||||
with tf.device('/gpu:0'):
|
||||
for i in range(1,self.args.nr_gpu):
|
||||
loss_gen[0] += loss_gen[i]
|
||||
loss_gen_test[0] += loss_gen_test[i]
|
||||
|
||||
# training op
|
||||
#optimizer = tf.group(nn.adam_updates(self.all_params, grads[0], lr=self.tf_lr, mom1=0.95, mom2=0.9995), self.maintain_averages_op)
|
||||
|
||||
# convert loss to bits/dim
|
||||
self.bits_per_dim = loss_gen[0]/(self.args.nr_gpu*np.log(2.)*np.prod(self.image_shape)*self.args.batch_size)
|
||||
self.bits_per_dim_test = loss_gen_test[0]/(self.args.nr_gpu*np.log(2.)*np.prod(self.image_shape)*self.args.batch_size)
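# loss_gen is a summed negative log-likelihood in nats; dividing by log(2)
# converts nats to bits, and dividing by nr_gpu * batch_size * prod(image_shape)
# (e.g. 32*32*3 for CIFAR-10) normalizes it to bits per sub-pixel, the usual
# PixelCNN++ metric.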
|
||||
|
||||
def Update(self, grads, useSGD=False):
|
||||
if useSGD:
|
||||
print('Use pure SGD for Label-->Image tasks')
|
||||
optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.tf_lr)
|
||||
apply_op = optimizer.apply_gradients(zip(grads, self.all_params))
|
||||
self.update_ops = tf.group(apply_op)
|
||||
else:
|
||||
self.update_ops = tf.group(nn.adam_updates(self.all_params, grads, lr=self.tf_lr, mom1=0.95, mom2=0.9995), self.maintain_averages_op)
|
||||
|
||||
def build_sample_from_model(self):
|
||||
# sample from the model
|
||||
self.new_x_gen = []
|
||||
for i in range(self.args.nr_gpu):
|
||||
with tf.device('/gpu:%d' % i):
|
||||
gen_par = self.model(self.xs[i], self.h_sample[i], ema=self.ema, dropout_p=0, **self.model_opt)
|
||||
self.new_x_gen.append(nn.sample_from_discretized_mix_logistic(gen_par, self.args.nr_logistic_mix))
|
||||
|
||||
def _sample_from_model(self, sess):
|
||||
x_gen = [np.zeros((self.args.batch_size,) + self.image_shape, dtype=np.float32) for _ in range(self.args.nr_gpu)]
|
||||
for yi in range(self.image_shape[0]):
|
||||
for xi in range(self.image_shape[1]):
|
||||
new_x_gen_np = sess.run(self.new_x_gen, {self.xs[i]: x_gen[i] for i in range(self.args.nr_gpu)})
|
||||
for i in range(self.args.nr_gpu):
|
||||
x_gen[i][:,yi,xi,:] = new_x_gen_np[i][:,yi,xi,:]
|
||||
return np.concatenate(x_gen, axis=0)
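# Sampling is autoregressive: each sess.run above re-evaluates the network on
# the partially generated canvas, and only the newly sampled position (yi, xi)
# is written back, so a full sample takes image_height * image_width forward
# passes.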
|
||||
|
||||
|
||||
def Gen_Images(self, sess, epoch):
|
||||
sample_x = self._sample_from_model(sess)
|
||||
img_tile = plotting.img_tile(sample_x[:int(np.floor(np.sqrt(self.args.batch_size*self.args.nr_gpu))**2)], aspect_ratio=1.0, border_color=1.0, stretch=True)
|
||||
img = plotting.plot_img(img_tile, title=self.args.data_set + ' samples')
|
||||
plotting.plt.savefig(os.path.join(self.args.save_dir,'%s_sample%d.png' % (self.args.data_set, epoch)))
|
||||
plotting.plt.close('all')
|
|
@ -0,0 +1,40 @@
|
|||
import sys
|
||||
|
||||
mapper_machine_freecard = {}
|
||||
mapper_machine_rank = {}
|
||||
|
||||
def MapIDs(m_machine):
|
||||
for i in range(m_machine):
|
||||
fo = open('record' + str(i))
|
||||
id = 0
|
||||
m_line = 0
|
||||
machine_name = ''
|
||||
for line in fo:
|
||||
if id == 0:
|
||||
machine_name = line[:-1]
|
||||
mapper_machine_freecard[machine_name] = []
|
||||
if mapper_machine_rank.has_key(machine_name):
|
||||
mapper_machine_rank[machine_name].append(i)
|
||||
else:
|
||||
mapper_machine_rank[machine_name] = [i]
|
||||
elif id > 1:
|
||||
mapper_machine_freecard[machine_name].append(int(line))
|
||||
id = id + 1
|
||||
fo.close()
|
||||
|
||||
def Map_Rank_Card(m_machine):
|
||||
MapIDs(m_machine)
|
||||
allocations = range(m_machine)
|
||||
for k in mapper_machine_rank.keys():
|
||||
ranks = mapper_machine_rank[k]
|
||||
cards = mapper_machine_freecard[k]
|
||||
#if len(ranks) == len(cards):
|
||||
for i in range(len(ranks)):
|
||||
allocations[ranks[i]] = cards[i]
|
||||
|
||||
for l in allocations:
|
||||
print l
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
Map_Rank_Card(int(sys.argv[1]))
|
|
@ -0,0 +1,40 @@
|
|||
import sys
|
||||
|
||||
mapper_machine_freecard = {}
|
||||
mapper_machine_rank = {}
|
||||
|
||||
def MapIDs(m_machine):
|
||||
for i in range(m_machine):
|
||||
fo = open('record' + str(i))
|
||||
id = 0
|
||||
m_line = 0
|
||||
machine_name = ''
|
||||
for line in fo:
|
||||
if id == 0:
|
||||
machine_name = line[:-1]
|
||||
mapper_machine_freecard[machine_name] = []
|
||||
if mapper_machine_rank.has_key(machine_name):
|
||||
mapper_machine_rank[machine_name].append(i)
|
||||
else:
|
||||
mapper_machine_rank[machine_name] = [i]
|
||||
elif id > 1:
|
||||
mapper_machine_freecard[machine_name].append(int(line))
|
||||
id = id + 1
|
||||
fo.close()
|
||||
|
||||
def Map_Rank_Card(m_machine):
|
||||
MapIDs(m_machine)
|
||||
allocations = range(m_machine)
|
||||
for k in mapper_machine_rank.keys():
|
||||
ranks = mapper_machine_rank[k]
|
||||
cards = mapper_machine_freecard[k]
|
||||
#if len(ranks) == len(cards):
|
||||
for i in range(len(ranks)):
|
||||
allocations[ranks[i]] = cards[i]
|
||||
|
||||
for l in allocations:
|
||||
print l
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
Map_Rank_Card(int(sys.argv[1]))
|
|
@ -0,0 +1,455 @@
|
|||
from nmt_base import *
|
||||
from Data import *
|
||||
|
||||
def _p(pp, name):
|
||||
return '%s_%s' % (pp, name)
|
||||
|
||||
class CLM_worker(object):
|
||||
def __init__(self,
|
||||
round = 0,
|
||||
dim_word=500, # word vector dimensionality
|
||||
dim_proj=1024, # the number of GRU units
|
||||
encoder='lstm',
|
||||
patience=10, # early stopping patience
|
||||
max_epochs=5000,
|
||||
finish_after=10000000000000, # finish after this many updates
|
||||
decay_c=-1., # L2 weight decay penalty
|
||||
clip_c=5.,
|
||||
lrate=1.,
|
||||
n_words=10000, # vocabulary size
|
||||
maxlen=None, # maximum length of the description
|
||||
minlen=1,
|
||||
start_iter=0,
|
||||
start_epoch=0,
|
||||
optimizer='adadelta',
|
||||
batch_size=16,
|
||||
valid_batch_size=16,
|
||||
saveto='model.npz',
|
||||
validFreq=2000,
|
||||
dispFreq=100,
|
||||
saveFreq=100000, # save the parameters after every saveFreq updates
|
||||
newDumpFreq=10000,
|
||||
syncFreq = 500000000000,
|
||||
sampleFreq=10000000000, # generate some samples after every sampleFreq
|
||||
valid_dataset=None,
|
||||
test_dataset=None,
|
||||
dictionary=None,
|
||||
sampleFileName="sampleFile.txt",
|
||||
embedding=None,
|
||||
dropout_input=None,
|
||||
dropout_output=None,
|
||||
reload_model=None,
|
||||
reload_option=None,
|
||||
log=None,
|
||||
monitor_grad=False,
|
||||
pad_sos=False):
|
||||
# Model options
|
||||
if pad_sos:
|
||||
n_words += 1
|
||||
self.options = locals().copy()
|
||||
|
||||
print('log = ', log)
|
||||
F_log = open(log, "a")
|
||||
|
||||
voc_size = n_words - 1 if pad_sos else n_words
|
||||
|
||||
# reload options
|
||||
if reload_option is not None and os.path.exists(reload_option):
|
||||
print "Reloading model options...",
|
||||
with open('%s' % reload_option, 'rb') as f:
|
||||
model_options = pkl.load(f)
|
||||
print "Done"
|
||||
|
||||
# init parameters
|
||||
print 'Initializing model parameters...',
|
||||
params = init_lm_params(self.options)
|
||||
print 'Done'
|
||||
|
||||
# load pre-trained word embedding
|
||||
if embedding is not None and os.path.exists(embedding):
|
||||
print 'Load Embedding from ', embedding
|
||||
Wemb = numpy.array(numpy.load(open(embedding, "rb")))
|
||||
assert Wemb.shape[0] == self.options['n_words']
|
||||
assert Wemb.shape[1] == self.options['dim_word']
|
||||
print 'Using pre-trained word embedding...',
|
||||
params['Wemb'] = Wemb.astype(numpy.float32)
|
||||
print 'vocab size', params['Wemb'].shape[0], ', dim', params['Wemb'].shape[1]
|
||||
|
||||
# reload parameters
|
||||
if reload_model is not None and os.path.exists(reload_model):
|
||||
print "Reloading model parameters...",
|
||||
params = load_params(reload_model, params)
|
||||
print "Done"
|
||||
|
||||
# create shared variables for parameters
|
||||
self.tparams = init_tparams(params)
|
||||
|
||||
# build the symbolic computational graph
|
||||
print 'Building model...'
|
||||
self.trng = RandomStreams(1234)
|
||||
self.use_noise = theano.shared(numpy.float32(0.))
|
||||
|
||||
def GetNll(self):
|
||||
srcx, srcx_mask, ctx_, cost, sentenceLen = self.build_lm_model()
|
||||
print 'Done'
|
||||
|
||||
print 'Building f_log_probs',
|
||||
self.f_log_probs = theano.function([srcx, srcx_mask, ctx_], cost, profile = profile)
|
||||
print 'Done'
|
||||
return srcx, srcx_mask, ctx_, cost, sentenceLen
|
||||
|
||||
# build a training model
|
||||
def build_lm_model(self):
|
||||
srcx = tensor.matrix('x', dtype='int64')
|
||||
srcx_mask = tensor.matrix('x_mask', dtype='float32')
|
||||
ctx_ = tensor.vector('ctx_', dtype='int64')
|
||||
x = srcx[:-1, :]
|
||||
y = srcx[1:,:]
|
||||
|
||||
n_timesteps = x.shape[0]
|
||||
n_samples = x.shape[1]
|
||||
print('check init ok')
|
||||
emb = self.tparams['Wemb'][x.flatten()]
|
||||
emb = emb.reshape([n_timesteps, n_samples, self.options['dim_word']])
|
||||
emb_ctx = self.tparams['Wemb_ctx'][ctx_].reshape([n_samples, self.options['dim_word']])
|
||||
print('check embed ok')
|
||||
# input
|
||||
|
||||
if self.options['dropout_input'] is not None and self.options['dropout_input'] > 0:
|
||||
print 'Applying drop-out on input embedding (dropout_input:', self.options['dropout_input'], ")"
|
||||
emb = dropout_layer(emb, self.use_noise, self.trng, self.options['dropout_input'])
|
||||
emb_ctx = dropout_layer(emb_ctx, self.use_noise, self.trng, self.options['dropout_input'])
|
||||
|
||||
init_state = tensor.alloc(0., n_samples, self.options['dim_proj'])
|
||||
init_cell = tensor.alloc(0., n_samples, self.options['dim_proj'])
|
||||
|
||||
# pass through gru layer, recurrence here
|
||||
print 'Using', self.options['encoder'], 'unit for encoder'
|
||||
print 'Training with successive sentences'
|
||||
init_states = [init_state, init_cell]
|
||||
proj = lstm_layer(self.tparams, emb, emb_ctx, self.options,
|
||||
prefix='encoder',
|
||||
init_state=init_state,
|
||||
cell_state=init_cell,
|
||||
mask = srcx_mask[:-1,:])
|
||||
|
||||
|
||||
proj_h = proj[0] # all hidden states
|
||||
|
||||
next_states = [st[-1] for st in proj]  # [last hidden state, last cell state]
|
||||
|
||||
if self.options['dropout_output'] is not None and self.options['dropout_output'] > 0:
|
||||
print 'Applying drop-out on hidden states (dropout_proj:', self.options['dropout_output'], ")"
|
||||
proj_h = dropout_layer(proj_h, self.use_noise, self.trng, self.options['dropout_output'])
|
||||
|
||||
|
||||
# compute word probabilities
|
||||
def _prob(proj_h):
|
||||
logit_lstm = get_layer('ff')[1](self.tparams, proj_h, self.options, prefix='ff_logit_lstm', activ='linear')
|
||||
logit_prev = get_layer('ff')[1](self.tparams, emb, self.options, prefix='ff_logit_prev', activ='linear')
|
||||
logit_label = get_layer('ff')[1](self.tparams, emb_ctx, self.options, prefix='ff_logit_label', activ='linear')
|
||||
logit = tensor.tanh(logit_lstm + logit_prev + logit_label)
|
||||
|
||||
#logit = tensor.tanh(logit_lstm)
|
||||
# split to calculate
|
||||
logit = get_layer('ff')[1](self.tparams, logit, self.options, prefix='ff_logit', activ='linear')
|
||||
logit_shp = logit.shape # n_timesteps * n_samples * n_words
|
||||
probs = tensor.nnet.softmax(logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))
|
||||
return probs
|
||||
|
||||
probs = _prob(proj_h)
|
||||
|
||||
# cost
|
||||
y_flat = y.flatten()
|
||||
y_flat_idx = tensor.arange(y_flat.shape[0]) * self.options['n_words'] + y_flat
|
||||
|
||||
# probs has shape (seq_len * batch, n_words); y_flat_idx below selects the probability of each target word
|
||||
# y:(seq_len, batch_size)
|
||||
def _cost(probs):
|
||||
cost = -tensor.log(probs.flatten()[y_flat_idx] + 1e-10)
|
||||
cost = cost.reshape([y.shape[0], y.shape[1]])
|
||||
sentenceLen = srcx_mask[1:,:].sum(axis=0)
|
||||
cost = (cost * srcx_mask[1:, :]).sum(axis=0) / sentenceLen
|
||||
return cost, sentenceLen
|
||||
|
||||
cost, sentenceLen = _cost(probs)
|
||||
|
||||
return srcx, srcx_mask, ctx_, cost, sentenceLen #(seq, batch, worddim)
|
||||
|
||||
# calculate the log probabilities on a given corpus using the language model
|
||||
def pred_probs(self, valid_Data, valid_batch_size):
|
||||
self.use_noise.set_value(0.)
|
||||
nlls = []
|
||||
dataLen = []
|
||||
valid_x, valid_y = valid_Data[0], valid_Data[1]
|
||||
|
||||
for idx in xrange((len(valid_x) + valid_batch_size - 1) // valid_batch_size ):
|
||||
data = valid_x[idx * valid_batch_size : (idx + 1) * valid_batch_size]
|
||||
label = valid_y[idx * valid_batch_size : (idx + 1) * valid_batch_size]
|
||||
dataLen += [len(tt) for tt in data]
|
||||
x, x_mask = prepare_data_x(data, pad_sos=self.options['pad_sos'], n_word=self.options['n_words'])
|
||||
cost = self.f_log_probs(x, x_mask, numpy.array(label).astype('int64'))
|
||||
nlls += cost.tolist()
|
||||
|
||||
nlls = numpy.array(nlls).astype('float32')
|
||||
dataLen = numpy.array(dataLen).astype('float32')
|
||||
return numpy.exp((nlls * dataLen).sum() / dataLen.sum())
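# nlls holds the per-sentence mean negative log-likelihood, so the value
# returned above is exp(sum_i nll_i * len_i / sum_i len_i), i.e. corpus
# perplexity as the exponential of the length-weighted average NLL per token.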
|
||||
|
||||
def evaluate(self, validSet, testSet):
|
||||
valid_ppl = self.pred_probs(validSet, 32)
|
||||
test_ppl = self.pred_probs(testSet, 32)
|
||||
return valid_ppl, test_ppl
|
||||
|
||||
|
||||
'''
|
||||
def train(round = 0,
|
||||
dim_word=1000, # word vector dimensionality
|
||||
dim_proj=1000, # the number of GRU units
|
||||
encoder='lstm',
|
||||
patience=10, # early stopping patience
|
||||
max_epochs=5000,
|
||||
finish_after=10000000000000, # finish after this many updates
|
||||
decay_c=0., # L2 weight decay penalty
|
||||
clip_c=5.,
|
||||
lrate=1.,
|
||||
n_words = 10000, # vocabulary size
|
||||
maxlen=None, # maximum length of the description
|
||||
minlen=1,
|
||||
start_iter=0,
|
||||
start_epoch=0,
|
||||
optimizer='adadelta',
|
||||
batch_size=32,
|
||||
valid_batch_size=20,
|
||||
saveto='model.npz',
|
||||
validFreq=1000,
|
||||
dispFreq=100,
|
||||
saveFreq=1000, # save the parameters after every saveFreq updates
|
||||
newDumpFreq=10000,
|
||||
syncFreq = 50,
|
||||
sampleFreq=100, # generate some samples after every sampleFreq
|
||||
sampleNum = 50, # generate sampleNum sentences
|
||||
dataset=None,
|
||||
valid_dataset=None,
|
||||
test_dataset=None,
|
||||
dictionary=None,
|
||||
sampleFileName="sampleFile.txt",
|
||||
embedding=None,
|
||||
dropout_input=None,
|
||||
dropout_output=None,
|
||||
reload_model=None,
|
||||
reload_option=None,
|
||||
log=None,
|
||||
monitor_grad=False,
|
||||
pad_sos=False):
|
||||
|
||||
# Model options
|
||||
if pad_sos:
|
||||
n_words += 1
|
||||
model_options = locals().copy()
|
||||
print "model options:"
|
||||
for kk, vv in model_options.iteritems():
|
||||
print "\t"+kk+":\t"+str(vv)
|
||||
|
||||
print('log = ', log)
|
||||
F_log = open(log, "a")
|
||||
|
||||
if start_iter == 0:
|
||||
F_log.write("model options:\n")
|
||||
for kk, vv in model_options.iteritems():
|
||||
F_log.write("\t"+kk+":\t"+str(vv)+"\n")
|
||||
F_log.write("-----------------------------------------\n\n")
|
||||
|
||||
|
||||
print 'Loading training dataset...'
|
||||
|
||||
voc_size = n_words - 1 if pad_sos else n_words
|
||||
|
||||
trainSet, validSet, testSet = load_data(path=dataset, n_words=n_words, maxlen=maxlen, sort_by_len=True, fixed_valid=True)
|
||||
|
||||
# reload options
|
||||
if reload_option is not None and os.path.exists(reload_option):
|
||||
print "Reloading model options...",
|
||||
with open('%s' % reload_option, 'rb') as f:
|
||||
model_options = pkl.load(f)
|
||||
print "Done"
|
||||
|
||||
# init parameters
|
||||
print 'Initializing model parameters...',
|
||||
params = init_lm_params(model_options)
|
||||
print 'Done'
|
||||
|
||||
# load pre-trained word embedding
|
||||
if embedding is not None and os.path.exists(embedding):
|
||||
print 'Load Embedding from ', embedding
|
||||
Wemb = numpy.array(numpy.load(open(embedding, "rb")))
|
||||
if Wemb.shape[0] == model_options['n_words'] and Wemb.shape[1] == model_options['dim_word']:
|
||||
print 'Using pre-trained word embedding...',
|
||||
params['Wemb'] = Wemb.astype(numpy.float32)
|
||||
print 'vocab size', params['Wemb'].shape[0], ', dim', params['Wemb'].shape[1]
|
||||
|
||||
# reload parameters
|
||||
if reload_model is not None and os.path.exists(reload_model):
|
||||
print "Reloading model parameters...",
|
||||
params = load_params(reload_model, params)
|
||||
print "Done"
|
||||
|
||||
# create shared variables for parameters
|
||||
tparams = init_tparams(params)
|
||||
|
||||
# build the symbolic computational graph
|
||||
print 'Building model...'
|
||||
trng, use_noise, srcx, srcx_mask, ctx_, cost = build_lm_model(tparams, model_options)
|
||||
|
||||
print 'Building f_log_probs',
|
||||
f_log_probs = theano.function([srcx, srcx_mask, ctx_], cost, profile = profile)
|
||||
print 'Done'
|
||||
cost = cost.mean(axis=0)
|
||||
# apply L2 regularization on weights
|
||||
if decay_c > 0.:
|
||||
print "Applying L2 regularization (decay_c: "+str(decay_c)+')...',
|
||||
cost = l2_regularization(tparams, cost, decay_c)
|
||||
print "Done"
|
||||
|
||||
# after any regularizer - compile the computational graph for cost
|
||||
print 'Building f_cost',
|
||||
f_cost = theano.function([srcx, srcx_mask, ctx_], cost, profile = profile)
|
||||
print 'Done'
|
||||
|
||||
print 'Computing gradient',
|
||||
grads = tensor.grad(cost, wrt=itemlist(tparams))
|
||||
print 'Done'
|
||||
|
||||
# apply gradient clipping here
|
||||
if clip_c > 0.:
|
||||
print 'Applying gradient clipping (clip_c:'+str(clip_c)+')...',
|
||||
grads = grad_clipping(grads, clip_c)
|
||||
print 'Done'
|
||||
|
||||
# compile the optimizer, the actual computational graph is compiled here
|
||||
print 'Building optimizers...',
|
||||
lr = tensor.scalar(name='lr')
|
||||
f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, [srcx, srcx_mask, ctx_], cost)
|
||||
#f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
|
||||
print 'Done'
|
||||
|
||||
sys.stdout.flush()
|
||||
|
||||
history_errs = []
|
||||
# reload history
|
||||
if reload_model is not None and os.path.exists(reload_model):
|
||||
history_errs = list(numpy.load(reload_model)['history_errs'])
|
||||
best_p = None
|
||||
bad_count = 0
|
||||
|
||||
# Training loop
|
||||
bad_counter = 0
|
||||
uidx = start_iter
|
||||
estop = False
|
||||
start_time = time.time()
|
||||
n_samples = 0
|
||||
cost_accu = 0
|
||||
|
||||
for eidx in xrange(start_epoch, max_epochs):
|
||||
epoch_start_time = time.time()
|
||||
print "Start epoch ", eidx
|
||||
n_samples = 0
|
||||
|
||||
|
||||
kf_train = get_minibatches_idx(len(trainSet[0]), batch_size, shuffle=True)
|
||||
|
||||
for _, train_index in kf_train:
|
||||
uidx += 1
|
||||
x = [trainSet[0][t] for t in train_index]
|
||||
y = [trainSet[1][t] for t in train_index]
|
||||
n_samples += len(x)
|
||||
use_noise.set_value(1.) #training mode
|
||||
|
||||
# pad batch and create mask
|
||||
x, x_mask = prepare_data_x(x, pad_eos=True,pad_sos=model_options['pad_sos'],n_word=model_options['n_words'])
|
||||
|
||||
if x is None:
|
||||
print 'Minibatch with zero sample under length ', maxlen
|
||||
uidx -= 1
|
||||
continue
|
||||
|
||||
ud_start = time.time()
|
||||
|
||||
# compute cost, grads and copy grads to shared variables
|
||||
cost = f_grad_shared(x, x_mask, y) # input argument issue fixed
|
||||
|
||||
# do the update on parameters
|
||||
f_update(lrate)
|
||||
|
||||
ud = time.time() - ud_start
|
||||
|
||||
# check for bad numbers
|
||||
if numpy.isnan(cost) or numpy.isinf(cost):
|
||||
print 'NaN detected'
|
||||
F_log.write("=========================================\nNaN detected\n")
|
||||
F_log.write('Epoch'+str(eidx)+'\tIter '+str(uidx)+'\tBatch Length '+str(x.shape[0])+'\n')
|
||||
return 1.
|
||||
|
||||
cost_accu += cost
|
||||
if numpy.mod(uidx, dispFreq) == 0:
|
||||
print 'Epoch ', eidx, '\tIter ', uidx, '\tLoss ', cost_accu/float(dispFreq), '\tUD ', ud,
|
||||
print '\tLength', x.shape[0], '\tSize ', x.shape[1]
|
||||
F_log.write('Epoch '+str(eidx)+'\tIter '+str(uidx)+'\tLoss '+str(cost_accu/float(dispFreq))
|
||||
+'\tUD '+str(ud)+'\tLength '+str(x.shape[0])+'\tSize '+str(x.shape[1])+'\n')
|
||||
cost_accu = 0
|
||||
sys.stdout.flush()
|
||||
|
||||
# validate model on validation set and early stop if necessary
|
||||
if numpy.mod(uidx, validFreq) == 0:
|
||||
print "Validating...",
|
||||
use_noise.set_value(0.)
|
||||
# fixed for successive mode
|
||||
valid_ppl = pred_probs(f_log_probs, prepare_data_x, model_options, validSet, batch_size)
|
||||
history_errs.append(valid_ppl)
|
||||
print "Done"
|
||||
|
||||
if uidx == 0 or valid_ppl <= numpy.array(history_errs).min():
|
||||
best_p = unzip(tparams)
|
||||
bad_counter = 0
|
||||
if len(history_errs) > patience and valid_ppl >= numpy.array(history_errs)[:-patience].min():
|
||||
bad_counter += 1
|
||||
if bad_counter > patience:
|
||||
print 'Early Stop!'
|
||||
F_log.write('##############\nEarly Stop!\n##############\n')
|
||||
estop = True
|
||||
break
|
||||
|
||||
# perplexity
|
||||
|
||||
test_ppl = pred_probs(f_log_probs, prepare_data_x, model_options, testSet, batch_size)
|
||||
|
||||
print 'Perplexity: { Valid', valid_ppl, ', Test', test_ppl, '}'
|
||||
F_log.write('Perplexity: Valid '+str(valid_ppl)+'\tTest '+str(test_ppl)+'\n')
|
||||
F_log.write('====================================\n')
|
||||
sys.stdout.flush()
|
||||
|
||||
# save the current models
|
||||
savefile = saveto + "_e" + str(eidx) + "_i" + str(uidx) + "_valid_" + str(valid_ppl) + '_test_' + str(test_ppl)
|
||||
numpy.savez(savefile, history_errs=history_errs, **unzip(tparams))
|
||||
pkl.dump(model_options, open('%s.option.pkl' % saveto, 'wb'))
|
||||
|
||||
# finish after this many updates
|
||||
if uidx >= finish_after:
|
||||
print 'Finishing after %d iterations!' % uidx
|
||||
F_log.write('##############\nFinishing after '+str(uidx)+' iterations!\n##############\n')
|
||||
estop = True
|
||||
break
|
||||
|
||||
epoch_end_time = time.time()
|
||||
print 'Epoch', eidx, 'completed, Seen', n_samples, 'samples, Time', epoch_end_time-epoch_start_time
|
||||
F_log.write("-----------------------------------------------------------\n")
|
||||
F_log.write("Epoch "+str(eidx)+" completed, Seen "+str(n_samples)+" samples, Time "+str(epoch_end_time-epoch_start_time)+"\n")
|
||||
F_log.write("------------------------------------------------------------\n")
|
||||
|
||||
if estop:
|
||||
break
|
||||
|
||||
end_time = time.time()
|
||||
'''
|
|
@ -0,0 +1,369 @@
|
|||
"""
|
||||
data loading and minibatch generation
|
||||
"""
|
||||
__author__ = 'v-yirwan'
|
||||
|
||||
import cPickle as pkl
|
||||
import gzip
|
||||
import os
|
||||
import numpy
|
||||
from theano import config
|
||||
|
||||
def get_dataset_file(dataset, default_dataset, origin):
|
||||
'''
|
||||
Look for it as if it were a full path; if not found, try a local file,
|
||||
then try the data directory.
|
||||
|
||||
Download dataset if it is not present
|
||||
'''
|
||||
data_dir, data_file = os.path.split(dataset)
|
||||
if data_dir == "" and not os.path.isfile(dataset):
|
||||
# Check if dataset is in the data directory.
|
||||
new_path = os.path.join(
|
||||
os.path.split(__file__)[0],
|
||||
"..",
|
||||
"data",
|
||||
dataset
|
||||
)
|
||||
if os.path.isfile(new_path) or data_file == default_dataset:
|
||||
dataset = new_path
|
||||
|
||||
if (not os.path.isfile(dataset)) and data_file == default_dataset:
|
||||
from six.moves import urllib
|
||||
print('Downloading data from %s' % origin)
|
||||
urllib.request.urlretrieve(origin, dataset)
|
||||
|
||||
return dataset
|
||||
|
||||
def load_data(path="imdb.pkl", n_words=100000, maxlen=None,
|
||||
sort_by_len=True, fixed_valid=True, valid_portion=0.1):
|
||||
'''
|
||||
Loads the dataset
|
||||
:type path: String
|
||||
:param path: The path to the dataset (here IMDB)
|
||||
:type n_words: int
|
||||
:param n_words: The number of word to keep in the vocabulary.
|
||||
All extra words are set to unknown (1).
|
||||
:type maxlen: None or positive int
|
||||
:param maxlen: the max sequence length we use in the train/valid set.
|
||||
:type sort_by_len: bool
|
||||
:param sort_by_len: Sort by the sequence length for the train,
|
||||
valid and test set. This allows faster execution as it causes
|
||||
less padding per minibatch. Another mechanism must be used to
|
||||
shuffle the train set at each epoch.
|
||||
:type fixed_valid: bool
|
||||
:param fixed_valid: load fixed validation set from the corpus file,
|
||||
which would otherwise be picked randomly from the training set with
|
||||
proportion [valid_portion]
|
||||
:type valid_portion: float
|
||||
:param valid_portion: The proportion of the full train set used for
|
||||
the validation set.
|
||||
|
||||
'''
|
||||
|
||||
# Load the dataset
|
||||
path = get_dataset_file(
|
||||
path, "imdb.pkl",
|
||||
"http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
|
||||
if path.endswith(".gz"):
|
||||
f = gzip.open(path, 'rb')
|
||||
else:
|
||||
f = open(path, 'rb')
|
||||
|
||||
train_set = pkl.load(f)
|
||||
if fixed_valid:
|
||||
valid_set = pkl.load(f)
|
||||
test_set = pkl.load(f)
|
||||
f.close()
|
||||
|
||||
def _truncate_data(train_set):
|
||||
'''
|
||||
truncate sequences whose lengths exceed the max-len threshold
|
||||
:param train_set: a list of sequences list and corresponding labels list
|
||||
:return: truncated train_set
|
||||
'''
|
||||
new_train_set_x = []
|
||||
new_train_set_y = []
|
||||
for x, y in zip(train_set[0], train_set[1]):
|
||||
if len(x) < maxlen:
|
||||
new_train_set_x.append(x)
|
||||
new_train_set_y.append(y)
|
||||
train_set = (new_train_set_x, new_train_set_y)
|
||||
del new_train_set_x, new_train_set_y
|
||||
return train_set
|
||||
|
||||
def _set_valid(train_set, valid_portion):
|
||||
'''
|
||||
set validation with [valid_portion] proportion of training set
|
||||
'''
|
||||
train_set_x, train_set_y = train_set
|
||||
n_samples = len(train_set_x)
|
||||
sidx = numpy.random.permutation(n_samples) # shuffle data
|
||||
n_train = int(numpy.round(n_samples * (1. - valid_portion)))
|
||||
valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
|
||||
valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
|
||||
train_set_x = [train_set_x[s] for s in sidx[:n_train]]
|
||||
train_set_y = [train_set_y[s] for s in sidx[:n_train]]
|
||||
train_set = (train_set_x, train_set_y)
|
||||
valid_set = (valid_set_x, valid_set_y)
|
||||
del train_set_x, train_set_y, valid_set_x, valid_set_y
|
||||
return train_set, valid_set
|
||||
|
||||
if maxlen:
|
||||
train_set = _truncate_data(train_set)
|
||||
if fixed_valid:
|
||||
print 'Loading with fixed validation set...',
|
||||
valid_set = _truncate_data(valid_set)
|
||||
else:
|
||||
print 'Setting validation set with proportion:', valid_portion, '...',
|
||||
train_set, valid_set = _set_valid(train_set, valid_portion)
|
||||
test_set = _truncate_data(test_set)
|
||||
|
||||
if maxlen is None and not fixed_valid:
|
||||
train_set, valid_set = _set_valid(train_set, valid_portion)
|
||||
|
||||
def remove_unk(x):
|
||||
return [[1 if w >= n_words else w for w in sen] for sen in x]
|
||||
|
||||
test_set_x, test_set_y = test_set
|
||||
valid_set_x, valid_set_y = valid_set
|
||||
train_set_x, train_set_y = train_set
|
||||
|
||||
# remove unk from dataset
|
||||
train_set_x = remove_unk(train_set_x) # use 1 if unk
|
||||
valid_set_x = remove_unk(valid_set_x)
|
||||
test_set_x = remove_unk(test_set_x)
|
||||
|
||||
def len_argsort(seq):
|
||||
return sorted(range(len(seq)), key=lambda x: len(seq[x]))
|
||||
|
||||
if sort_by_len:
|
||||
sorted_index = len_argsort(test_set_x)
|
||||
# ranked from shortest to longest
|
||||
test_set_x = [test_set_x[i] for i in sorted_index]
|
||||
test_set_y = [test_set_y[i] for i in sorted_index]
|
||||
|
||||
sorted_index = len_argsort(valid_set_x)
|
||||
valid_set_x = [valid_set_x[i] for i in sorted_index]
|
||||
valid_set_y = [valid_set_y[i] for i in sorted_index]
|
||||
|
||||
sorted_index = len_argsort(train_set_x)
|
||||
train_set_x = [train_set_x[i] for i in sorted_index]
|
||||
train_set_y = [train_set_y[i] for i in sorted_index]
|
||||
|
||||
train = (train_set_x, train_set_y)
|
||||
valid = (valid_set_x, valid_set_y)
|
||||
test = (test_set_x, test_set_y)
|
||||
|
||||
return train, valid, test
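# Illustrative usage (a sketch, assuming an IMDB-style pickle with
# train/valid/test splits; the path is a placeholder):
#
#   train, valid, test = load_data(path='imdb.pkl', n_words=10000,
#                                  sort_by_len=True, fixed_valid=True)
#   train_x, train_y = train   # lists of word-id sequences and their labels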
|
||||
|
||||
def load_mnist(path='mnist.pkl', fixed_permute=True, rand_permute=False):
|
||||
f = open(path, 'rb')
|
||||
train = pkl.load(f)
|
||||
valid = pkl.load(f)
|
||||
test = pkl.load(f)
|
||||
f.close()
|
||||
|
||||
def _permute(data, perm):
|
||||
x, y = data
|
||||
x_new = []
|
||||
for xx in x:
|
||||
xx_new = [xx[pp] for pp in perm]
|
||||
x_new.append(xx_new)
|
||||
return (x_new, y)
|
||||
|
||||
def _trans2list(data):
|
||||
x, y = data
|
||||
x = [list(xx) for xx in x]
|
||||
return (x, y)
|
||||
|
||||
if rand_permute:
|
||||
print 'Using a fixed random permutation of pixels...',
|
||||
perm = numpy.random.permutation(range(784))
|
||||
train = _permute(train, perm)
|
||||
valid = _permute(valid, perm)
|
||||
test = _permute(test, perm)
|
||||
elif fixed_permute:
|
||||
print 'Using permuted dataset...',
|
||||
|
||||
train = _trans2list(train)
|
||||
valid = _trans2list(valid)
|
||||
test = _trans2list(test)
|
||||
|
||||
return train, valid, test
|
||||
|
||||
def get_minibatches_idx(n, minibatch_size, shuffle=False):
|
||||
"""
|
||||
Used to shuffle the dataset at each iteration.
|
||||
"""
|
||||
|
||||
idx_list = numpy.arange(n, dtype="int32")
|
||||
|
||||
if shuffle:
|
||||
numpy.random.shuffle(idx_list)
|
||||
|
||||
minibatches = []
|
||||
minibatch_start = 0
|
||||
for i in range(n // minibatch_size):
|
||||
minibatches.append(idx_list[minibatch_start:
|
||||
minibatch_start + minibatch_size])
|
||||
minibatch_start += minibatch_size
|
||||
|
||||
if (minibatch_start != n):
|
||||
# Make a minibatch out of what is left
|
||||
minibatches.append(idx_list[minibatch_start:])
|
||||
|
||||
return zip(range(len(minibatches)), minibatches)
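# Illustrative usage (a sketch; train_x/train_y are assumed to be the lists
# returned by load_data):
#
#   kf = get_minibatches_idx(len(train_x), 32, shuffle=True)
#   for batch_id, idx in kf:
#       x = [train_x[t] for t in idx]
#       y = [train_y[t] for t in idx]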
|
||||
|
||||
def get_minibatches_idx_bucket(dataset, minibatch_size, shuffle=False):
|
||||
"""
|
||||
divide into different buckets according to sequence lengths
|
||||
dynamic batch size
|
||||
"""
|
||||
# divide into buckets
|
||||
slen = [len(ss) for ss in dataset]
|
||||
bucket1000 = [sidx for sidx in xrange(len(dataset))
|
||||
if slen[sidx] > 0 and slen[sidx] <= 1000]
|
||||
bucket3000 = [sidx for sidx in xrange(len(dataset))
|
||||
if slen[sidx] > 1000 and slen[sidx] <= 3000]
|
||||
bucket_long = [sidx for sidx in xrange(len(dataset))
|
||||
if slen[sidx] > 3000]
|
||||
|
||||
# shuffle each bucket
|
||||
if shuffle:
|
||||
numpy.random.shuffle(bucket1000)
|
||||
numpy.random.shuffle(bucket3000)
|
||||
numpy.random.shuffle(bucket_long)
|
||||
|
||||
# make minibatches
|
||||
def _make_batch(minibatches, bucket, minibatch_size):
|
||||
minibatch_start = 0
|
||||
n = len(bucket)
|
||||
for i in range(n // minibatch_size):
|
||||
minibatches.append(bucket[minibatch_start : minibatch_start + minibatch_size])
|
||||
minibatch_start += minibatch_size
|
||||
if (minibatch_start != n):
|
||||
# Make a minibatch out of what is left
|
||||
minibatches.append(bucket[minibatch_start:])
|
||||
return minibatches
|
||||
|
||||
minibatches = []
|
||||
_make_batch(minibatches, bucket1000, minibatch_size=minibatch_size)
|
||||
_make_batch(minibatches, bucket3000, minibatch_size=minibatch_size//2)
|
||||
_make_batch(minibatches, bucket_long, minibatch_size=minibatch_size//8)
|
||||
|
||||
# shuffle minibatches
|
||||
numpy.random.shuffle(minibatches)
|
||||
|
||||
return zip(range(len(minibatches)), minibatches)
|
||||
|
||||
def prepare_data(seqs, labels, maxlen=None, dataset='text'):
|
||||
"""Create the matrices from the datasets.
|
||||
|
||||
This pads each sequence to the same length: the length of the
|
||||
longest sequence or maxlen.
|
||||
|
||||
if maxlen is set, we will cut all sequences to this maximum
|
||||
length.
|
||||
|
||||
This swaps the axes!
|
||||
"""
|
||||
# x: a list of sentences
|
||||
lengths = [len(s) for s in seqs]
|
||||
|
||||
if maxlen is not None:
|
||||
new_seqs = []
|
||||
new_labels = []
|
||||
new_lengths = []
|
||||
for l, s, y in zip(lengths, seqs, labels):
|
||||
if l < maxlen:
|
||||
new_seqs.append(s)
|
||||
new_labels.append(y)
|
||||
new_lengths.append(l)
|
||||
lengths = new_lengths
|
||||
labels = new_labels
|
||||
seqs = new_seqs
|
||||
|
||||
if len(lengths) < 1:
|
||||
return None, None, None
|
||||
|
||||
n_samples = len(seqs)
|
||||
maxlen = numpy.max(lengths)
|
||||
|
||||
if dataset == 'mnist':
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('float32')
|
||||
else:
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('int64')
|
||||
x_mask = numpy.zeros((maxlen, n_samples)).astype(config.floatX)
|
||||
for idx, s in enumerate(seqs):
|
||||
x[:lengths[idx], idx] = s
|
||||
x_mask[:lengths[idx], idx] = 1.
|
||||
|
||||
return x, x_mask, labels
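# Shape sketch: given seqs = [[3, 5, 7], [2, 4]] and labels = [1, 0],
# prepare_data returns x of shape (3, 2) (time-major, zero-padded), x_mask of
# the same shape with 1. marking real tokens, and the (possibly filtered)
# labels.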
|
||||
|
||||
def prepare_data_hier(seqs, labels, hier_len, maxlen=None, dataset='text'):
|
||||
'''
|
||||
prepare minibatch for hierarchical model
|
||||
'''
|
||||
# sort (long->short)
|
||||
sorted_idx = sorted(range(len(seqs)), key=lambda x: len(seqs[x]), reverse=True)
|
||||
seqs = [seqs[i] for i in sorted_idx]
|
||||
labels = [labels[i] for i in sorted_idx]
|
||||
|
||||
# truncate data
|
||||
lengths = [len(s) for s in seqs]
|
||||
if maxlen is not None:
|
||||
new_seqs = []
|
||||
new_labels = []
|
||||
new_lengths = []
|
||||
for l, s, y in zip(lengths, seqs, labels):
|
||||
if l < maxlen:
|
||||
new_seqs.append(s)
|
||||
new_labels.append(y)
|
||||
new_lengths.append(l)
|
||||
lengths = new_lengths
|
||||
labels = new_labels
|
||||
seqs = new_seqs
|
||||
if len(lengths) < 1:
|
||||
return None, None, None
|
||||
|
||||
# set batch size
|
||||
n_samples = len(seqs)
|
||||
maxlen = numpy.max(lengths)
|
||||
if maxlen % hier_len == 0:
|
||||
n_batch = maxlen/hier_len
|
||||
else:
|
||||
n_batch = maxlen//hier_len + 1
|
||||
maxlen = n_batch * hier_len
|
||||
|
||||
# padding whole batch
|
||||
if dataset == 'mnist':
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('float32')
|
||||
else:
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('int64')
|
||||
x_mask = numpy.zeros((maxlen, n_samples)).astype(config.floatX)
|
||||
for idx, s in enumerate(seqs):
|
||||
x[:lengths[idx], idx] = s
|
||||
x_mask[:lengths[idx], idx] = 1
|
||||
|
||||
# slice to mini-batches
|
||||
x_batch = [x[bidx*hier_len:(bidx+1)*hier_len, :] for bidx in range(n_batch)]
|
||||
if dataset == 'mnist':
|
||||
x_batch = numpy.array(x_batch).astype('float32')
|
||||
else:
|
||||
x_batch = numpy.array(x_batch).astype('int64')
|
||||
mask_batch = [x_mask[bidx*hier_len:(bidx+1)*hier_len, :] for bidx in range(n_batch)]
|
||||
mask_batch = numpy.array(mask_batch).astype(config.floatX)
|
||||
|
||||
# mask for hier-level
|
||||
mask_hier = numpy.ones((n_batch, n_samples)).astype(config.floatX)
|
||||
for idx in range(n_samples):
|
||||
mpos = numpy.where(x_mask[:, idx]==0)[0]
|
||||
if len(mpos) == 0:
|
||||
continue
|
||||
bidx = min(mpos[0]//hier_len+1, n_batch)
|
||||
if mpos[0] % hier_len == 0:
|
||||
bidx -= 1 # bug fixed TODO: more elegant solution?
|
||||
mask_hier[bidx:, idx] = 0
|
||||
|
||||
return x_batch, mask_batch, mask_hier, labels
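# Shape sketch: with n_samples sequences padded up to a multiple of hier_len,
# x_batch and mask_batch have shape (n_batch, hier_len, n_samples), and
# mask_hier has shape (n_batch, n_samples) with 1. for every hier_len-sized
# chunk of a sequence that still contains real tokens.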
|
|
@ -0,0 +1,38 @@
|
|||
import re
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
|
||||
filename = r'.\gpu_usage_draft_'
|
||||
default_gpu = 58 + 30
|
||||
|
||||
|
||||
|
||||
def GrabGPU(rank):
|
||||
cmdstr = '\"C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe\" > ' + filename + rank
|
||||
os.system(cmdstr)
|
||||
|
||||
def GetGPUUSage(rank):
|
||||
pattern = re.compile(r'(?P<num>[0-9]{1,5})MiB[\s]+/')
|
||||
id = 0
|
||||
GPUs = []
|
||||
fo = open(filename + rank, 'r')
|
||||
for line in fo:
|
||||
result = pattern.search(line)
|
||||
if result:
|
||||
if int(result.group("num")) < default_gpu:
|
||||
GPUs.append(id)
|
||||
id = id + 1
|
||||
fo.close()
|
||||
|
||||
print len(GPUs)
|
||||
for gpu in GPUs:
|
||||
print gpu
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
rank = sys.argv[1]
|
||||
GrabGPU(rank)
|
||||
print socket.gethostname()
|
||||
GetGPUUSage(rank)
|
||||
#os.system('del /q ' + filename + rank)
|
|
@ -0,0 +1,17 @@
|
|||
import os
|
||||
def MapDeviceIds(comm):
|
||||
rank = comm.Get_rank()
|
||||
num_machine = comm.Get_size()
|
||||
os.system('python GPU_Usage.py ' + str(rank) + ' > record' + str(rank))
|
||||
comm.Barrier()
|
||||
if rank == 0:
|
||||
os.system('python AllocateGPU.py ' + str(num_machine) + ' > DirtyRecord')
|
||||
comm.Barrier()
|
||||
cardid = str(0)
|
||||
with open('DirtyRecord', 'r') as f:
|
||||
for idx, line in enumerate(f):
|
||||
if idx == rank:
|
||||
cardid = line.strip()
|
||||
break
|
||||
|
||||
return cardid
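# Illustrative usage (a sketch; assumes mpi4py and that GPU_Usage.py and
# AllocateGPU.py are in the working directory -- how the returned id is
# consumed is an assumption, not part of this file):
#
#   from mpi4py import MPI
#   cardid = MapDeviceIds(MPI.COMM_WORLD)
#   os.environ['THEANO_FLAGS'] = 'device=gpu' + cardid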
|
|
@ -0,0 +1,32 @@
|
|||
import sys
|
||||
import codecs
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
raise Exception('Not enough argv')
|
||||
|
||||
theano_rc = r"""
|
||||
[global]
|
||||
mode = FAST_RUN
|
||||
device = gpu
|
||||
floatX = float32
|
||||
on_unused_input = warn
|
||||
optimizer = fast_run
|
||||
#allow_gc=False
|
||||
cuda.disable_gcc_cudnn_check=True
|
||||
|
||||
[lib]
|
||||
cnmem = 0.75
|
||||
|
||||
[nvcc]
|
||||
flags=-L{0}\libs
|
||||
root=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
|
||||
fast_math = True
|
||||
|
||||
"""
|
||||
|
||||
theano_rc = theano_rc.format(sys.argv[1])
|
||||
|
||||
print(theano_rc)
|
||||
|
||||
with codecs.open(sys.argv[2], 'w', 'utf-8') as f:
|
||||
f.write(theano_rc)
|
|
@ -0,0 +1,24 @@
|
|||
# Copyright (c) 2007, 2010, 2011, 2012 Godefroid Chapelle
|
||||
#
|
||||
# This file is part of ipdb.
|
||||
# GNU package is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU General Public License as published by the Free
|
||||
# Software Foundation, either version 2 of the License, or (at your option)
|
||||
# any later version.
|
||||
#
|
||||
# GNU package is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
# for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.
|
||||
|
||||
from __main__ import set_trace, post_mortem, pm, run, runcall, runeval, launch_ipdb_on_exception
|
||||
|
||||
pm # please pyflakes
|
||||
post_mortem # please pyflakes
|
||||
run # please pyflakes
|
||||
runcall # please pyflakes
|
||||
runeval # please pyflakes
|
||||
set_trace # please pyflakes
|
||||
launch_ipdb_on_exception # please pyflakes
|
|
@ -0,0 +1,184 @@
|
|||
# Copyright (c) 2011, 2012 Godefroid Chapelle
|
||||
#
|
||||
# This file is part of ipdb.
|
||||
# GNU package is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU General Public License as published by the Free
|
||||
# Software Foundation, either version 2 of the License, or (at your option)
|
||||
# any later version.
|
||||
#
|
||||
# GNU package is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
# for more details.
|
||||
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import os
|
||||
import traceback
|
||||
|
||||
from contextlib import contextmanager
|
||||
|
||||
try:
|
||||
from pdb import Restart
|
||||
except ImportError:
|
||||
class Restart(Exception):
|
||||
pass
|
||||
|
||||
import IPython
|
||||
|
||||
if IPython.__version__ > '0.10.2':
|
||||
from IPython.core.debugger import Pdb, BdbQuit_excepthook
|
||||
try:
|
||||
get_ipython
|
||||
except NameError:
|
||||
# Make it more resilient to different versions of IPython and try to
|
||||
# find a module.
|
||||
possible_modules = ['IPython.terminal.embed', # Newer IPython
|
||||
'IPython.frontend.terminal.embed'] # Older IPython
|
||||
|
||||
count = len(possible_modules)
|
||||
for module in possible_modules:
|
||||
try:
|
||||
embed = __import__(module, fromlist=["InteractiveShellEmbed"])
|
||||
InteractiveShellEmbed = embed.InteractiveShellEmbed
|
||||
except ImportError:
|
||||
count -= 1
|
||||
if count == 0:
|
||||
raise
|
||||
else:
|
||||
break
|
||||
|
||||
ipshell = InteractiveShellEmbed()
|
||||
def_colors = ipshell.colors
|
||||
else:
|
||||
def_colors = get_ipython.im_self.colors
|
||||
|
||||
from IPython.utils import io
|
||||
|
||||
if 'nose' in sys.modules.keys():
|
||||
def update_stdout():
|
||||
# setup stdout to ensure output is available with nose
|
||||
io.stdout = sys.stdout = sys.__stdout__
|
||||
else:
|
||||
def update_stdout():
|
||||
pass
|
||||
else:
|
||||
from IPython.Debugger import Pdb, BdbQuit_excepthook
|
||||
from IPython.Shell import IPShell
|
||||
from IPython import ipapi
|
||||
|
||||
ip = ipapi.get()
|
||||
if ip is None:
|
||||
IPShell(argv=[''])
|
||||
ip = ipapi.get()
|
||||
def_colors = ip.options.colors
|
||||
|
||||
from IPython.Shell import Term
|
||||
|
||||
if 'nose' in sys.modules.keys():
|
||||
def update_stdout():
|
||||
# setup stdout to ensure output is available with nose
|
||||
Term.cout = sys.stdout = sys.__stdout__
|
||||
else:
|
||||
def update_stdout():
|
||||
pass
|
||||
|
||||
|
||||
def wrap_sys_excepthook():
|
||||
# make sure we wrap it only once or we would end up with a cycle
|
||||
# BdbQuit_excepthook.excepthook_ori == BdbQuit_excepthook
|
||||
if sys.excepthook != BdbQuit_excepthook:
|
||||
BdbQuit_excepthook.excepthook_ori = sys.excepthook
|
||||
sys.excepthook = BdbQuit_excepthook
|
||||
|
||||
|
||||
def set_trace(frame=None):
|
||||
update_stdout()
|
||||
wrap_sys_excepthook()
|
||||
if frame is None:
|
||||
frame = sys._getframe().f_back
|
||||
Pdb(def_colors).set_trace(frame)
|
||||
|
||||
|
||||
def post_mortem(tb):
|
||||
update_stdout()
|
||||
wrap_sys_excepthook()
|
||||
p = Pdb(def_colors)
|
||||
p.reset()
|
||||
if tb is None:
|
||||
return
|
||||
p.interaction(None, tb)
|
||||
|
||||
|
||||
def pm():
|
||||
post_mortem(sys.last_traceback)
|
||||
|
||||
|
||||
def run(statement, globals=None, locals=None):
|
||||
Pdb(def_colors).run(statement, globals, locals)
|
||||
|
||||
|
||||
def runcall(*args, **kwargs):
|
||||
return Pdb(def_colors).runcall(*args, **kwargs)
|
||||
|
||||
|
||||
def runeval(expression, globals=None, locals=None):
|
||||
return Pdb(def_colors).runeval(expression, globals, locals)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def launch_ipdb_on_exception():
|
||||
try:
|
||||
yield
|
||||
except Exception:
|
||||
e, m, tb = sys.exc_info()
|
||||
print(m.__repr__(), file=sys.stderr)
|
||||
post_mortem(tb)
|
||||
finally:
|
||||
pass
|
||||
|
||||
|
||||
def main():
|
||||
if not sys.argv[1:] or sys.argv[1] in ("--help", "-h"):
|
||||
print("usage: ipdb.py scriptfile [arg] ...")
|
||||
sys.exit(2)
|
||||
|
||||
mainpyfile = sys.argv[1] # Get script filename
|
||||
if not os.path.exists(mainpyfile):
|
||||
print('Error:', mainpyfile, 'does not exist')
|
||||
sys.exit(1)
|
||||
|
||||
del sys.argv[0] # Hide "pdb.py" from argument list
|
||||
|
||||
# Replace pdb's dir with script's dir in front of module search path.
|
||||
sys.path[0] = os.path.dirname(mainpyfile)
|
||||
|
||||
# Note on saving/restoring sys.argv: it's a good idea when sys.argv was
|
||||
# modified by the script being debugged. It's a bad idea when it was
|
||||
# changed by the user from the command line. There is a "restart" command
|
||||
# which allows explicit specification of command line arguments.
|
||||
pdb = Pdb(def_colors)
|
||||
while 1:
|
||||
try:
|
||||
pdb._runscript(mainpyfile)
|
||||
if pdb._user_requested_quit:
|
||||
break
|
||||
print("The program finished and will be restarted")
|
||||
except Restart:
|
||||
print("Restarting", mainpyfile, "with arguments:")
|
||||
print("\t" + " ".join(sys.argv[1:]))
|
||||
except SystemExit:
|
||||
# In most cases SystemExit does not warrant a post-mortem session.
|
||||
print("The program exited via sys.exit(). Exit status: ", end='')
|
||||
print(sys.exc_info()[1])
|
||||
except:
|
||||
traceback.print_exc()
|
||||
print("Uncaught exception. Entering post mortem debugging")
|
||||
print("Running 'cont' or 'step' will restart the program")
|
||||
t = sys.exc_info()[2]
|
||||
pdb.interaction(None, t)
|
||||
print("Post mortem debugger finished. The " + mainpyfile +
|
||||
" will be restarted")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
Binary file not shown.
|
@ -0,0 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
# coding:utf8
|
||||
|
||||
from api import init, shutdown, barrier, workers_num, worker_id, server_id, is_master_worker
|
||||
from tables import ArrayTableHandler, MatrixTableHandler
|
|
@ -0,0 +1,66 @@
|
|||
#!/usr/bin/env python
|
||||
# coding:utf8
|
||||
|
||||
import ctypes
|
||||
from utils import Loader
|
||||
import numpy as np
|
||||
|
||||
|
||||
mv_lib = Loader.get_lib()
|
||||
|
||||
|
||||
def init(sync=False):
|
||||
'''Initialize multiverso.
|
||||
|
||||
This should be called only once before training at the beginning of the
|
||||
whole project.
|
||||
If sync is True, a sync server will be created. Otherwise an async server
|
||||
will be created.
|
||||
'''
|
||||
args = [""] # the first argument will be ignored. So we put a placeholder here
|
||||
if sync:
|
||||
args.append("-sync=true")
|
||||
n = len(args)
|
||||
args_type = ctypes.c_char_p * n
|
||||
mv_lib.MV_Init(ctypes.pointer(ctypes.c_int(n)), args_type(*[ctypes.c_char_p(arg) for arg in args]))
|
||||
|
||||
|
||||
def shutdown():
|
||||
'''Shutdown multiverso.
|
||||
|
||||
This should be called only once after finishing training, at the end of the whole project.
|
||||
'''
|
||||
mv_lib.MV_ShutDown()
|
||||
|
||||
|
||||
def barrier():
|
||||
'''Set a barrier for all workers to wait.
|
||||
|
||||
Workers will wait until all workers reach a specific barrier.
|
||||
'''
|
||||
mv_lib.MV_Barrier()
|
||||
|
||||
|
||||
def workers_num():
|
||||
'''Return the total number of workers.'''
|
||||
return mv_lib.MV_NumWorkers()
|
||||
|
||||
|
||||
def worker_id():
|
||||
'''Return the id (zero-based index) for current worker.'''
|
||||
return mv_lib.MV_WorkerId()
|
||||
|
||||
|
||||
def server_id():
|
||||
return mv_lib.MV_ServerId()
|
||||
|
||||
|
||||
def is_master_worker():
|
||||
'''Return True if this worker is the master worker.
|
||||
|
||||
Some things only need one worker process, such as validation, outputting the
|
||||
result, initializing the parameters and so on. So we mark the worker 0 as
|
||||
the master worker to finish these things.
|
||||
'''
|
||||
return worker_id() == 0
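# A minimal end-to-end sketch of this API:
#
#   import multiverso as mv
#   mv.init(sync=True)            # start with a sync server
#   rank = mv.worker_id()         # zero-based index of this worker
#   total = mv.workers_num()
#   mv.barrier()                  # wait for all workers
#   if mv.is_master_worker():
#       pass                      # e.g. run validation only on worker 0
#   mv.shutdown()                 # call once at the end of training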
|
|
@ -0,0 +1,163 @@
|
|||
#!/usr/bin/env python
|
||||
# coding:utf8
|
||||
|
||||
import ctypes
|
||||
from utils import Loader
|
||||
from utils import convert_data
|
||||
import numpy as np
|
||||
import api
|
||||
|
||||
|
||||
mv_lib = Loader.get_lib()
|
||||
|
||||
|
||||
class TableHandler(object):
|
||||
'''`TableHandler` is an interface to sync different kinds of values.
|
||||
|
||||
If you are not writing python code based on theano or lasagne, you are
|
||||
supposed to sync models (for initialization) and gradients (during
|
||||
training) so as to let multiverso help you manage the models in distributed
|
||||
environments.
|
||||
Otherwise, you'd better use the classes in `multiverso.theano_ext` or
|
||||
`multiverso.theano_ext.lasagne_ext`
|
||||
'''
|
||||
def __init__(self, size, init_value=None):
|
||||
raise NotImplementedError("You must implement the __init__ method.")
|
||||
|
||||
def get(self, size):
|
||||
raise NotImplementedError("You must implement the get method.")
|
||||
|
||||
def add(self, data, sync=False):
|
||||
raise NotImplementedError("You must implement the add method.")
|
||||
|
||||
|
||||
# types
|
||||
C_FLOAT_P = ctypes.POINTER(ctypes.c_float)
|
||||
|
||||
|
||||
class ArrayTableHandler(TableHandler):
|
||||
'''`ArrayTableHandler` is used to sync array-like (one-dimensional) value.'''
|
||||
def __init__(self, size, init_value=None):
|
||||
'''Constructor for syncing array-like (one-dimensional) value.
|
||||
|
||||
The `size` should be an int equal to the size of the value we want to sync.
|
||||
If init_value is None, zeros will be used to initialize the tables,
|
||||
otherwise the table will be initialized as the init_value.
|
||||
Notice: if the init_value is different in different processes, the
|
||||
average of them will be used.
|
||||
'''
|
||||
self._handler = ctypes.c_void_p()
|
||||
self._size = size
|
||||
mv_lib.MV_NewArrayTable(size, ctypes.byref(self._handler))
|
||||
if init_value is not None:
|
||||
init_value = convert_data(init_value)
|
||||
# sync add is used because we want to make sure that the initial
|
||||
# value has taken effect when the call returns.
|
||||
self.add(init_value / api.workers_num(), sync=True)
|
||||
|
||||
def get(self):
|
||||
'''get the latest value from multiverso ArrayTable
|
||||
|
||||
The return value is a one-dimensional numpy.ndarray.
|
||||
'''
|
||||
data = np.zeros((self._size, ), dtype=np.dtype("float32"))
|
||||
mv_lib.MV_GetArrayTable(self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
|
||||
return data
|
||||
|
||||
def add(self, data, sync=False):
|
||||
'''add the data to the multiverso ArrayTable
|
||||
|
||||
`data` should be a one-dimensional numpy.ndarray.
|
||||
|
||||
If sync is True, this call will be blocked by IO until the update finishes.
|
||||
Otherwise it will return immediately
|
||||
'''
|
||||
data = convert_data(data)
|
||||
assert(data.size == self._size)
|
||||
if sync:
|
||||
mv_lib.MV_AddArrayTable(self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
|
||||
else:
|
||||
mv_lib.MV_AddAsyncArrayTable(self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
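# Illustrative usage of ArrayTableHandler (a sketch; assumes api.init() has
# already been called):
#
#   tbh = ArrayTableHandler(5, init_value=np.zeros(5, dtype='float32'))
#   api.barrier()
#   tbh.add(np.ones(5, dtype='float32'))   # push a local update
#   latest = tbh.get()                     # pull the aggregated value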
|
||||
|
||||
|
||||
class MatrixTableHandler(TableHandler):
|
||||
def __init__(self, num_row, num_col, init_value=None):
|
||||
'''Constructor for syncing matrix-like (two-dimensional) value.
|
||||
|
||||
The `num_row` should be the number of rows and the `num_col` should be
|
||||
the number of columns.
|
||||
|
||||
If init_value is None, zeros will be used to initialize the tables,
|
||||
otherwise the table will be initialized as the init_value.
|
||||
Notice: if the init_value is different in different processes, the
|
||||
average of them will be used.
|
||||
'''
|
||||
self._handler = ctypes.c_void_p()
|
||||
self._num_row = num_row
|
||||
self._num_col = num_col
|
||||
self._size = num_col * num_row
|
||||
mv_lib.MV_NewMatrixTable(num_row, num_col, ctypes.byref(self._handler))
|
||||
if init_value is not None:
|
||||
init_value = convert_data(init_value)
|
||||
# sync add is used because we want to make sure that the initial
|
||||
# value has taken effect when the call returns.
|
||||
self.add(init_value / api.workers_num(), sync=True)
|
||||
|
||||
def get(self, row_ids=None):
|
||||
'''get the latest value from multiverso MatrixTable
|
||||
|
||||
If row_ids is None, we will return all rows as a numpy.ndarray, e.g.
|
||||
array([[1, 3], [3, 4]]).
|
||||
Otherwise we will return the data according to row_ids (e.g. you can
|
||||
pass [1] as row_ids to get a single row; it will return a
|
||||
two-dimensional numpy.ndarray with one row)
|
||||
|
||||
The return value is a two-dimensional numpy.ndarray.
|
||||
'''
|
||||
if row_ids is None:
|
||||
data = np.zeros((self._num_row, self._num_col), dtype=np.dtype("float32"))
|
||||
mv_lib.MV_GetMatrixTableAll(self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
|
||||
return data
|
||||
else:
|
||||
row_ids_n = len(row_ids)
|
||||
int_array_type = ctypes.c_int * row_ids_n
|
||||
data = np.zeros((row_ids_n, self._num_col), dtype=np.dtype("float32"))
|
||||
mv_lib.MV_GetMatrixTableByRows(self._handler, data.ctypes.data_as(C_FLOAT_P),
|
||||
row_ids_n * self._num_col,
|
||||
int_array_type(*row_ids), row_ids_n)
|
||||
return data
|
||||
|
||||
def add(self, data=None, row_ids=None, sync=False):
|
||||
'''add the data to the multiverso MatrixTable
|
||||
|
||||
If row_ids is None, we will add the data to the whole table, and the data
|
||||
should cover every element of the table (data.size must equal num_row * num_col), e.g. [1, 2, 3, ...]
|
||||
|
||||
Otherwise we will add the data according to the row_ids
|
||||
|
||||
`data` should be a two-dimensional numpy.ndarray (or convertible to one).
|
||||
|
||||
If sync is True, this call will block until the add operation finishes.
|
||||
Otherwise it will return immediately
|
||||
'''
|
||||
assert(data is not None)
|
||||
data = convert_data(data)
|
||||
|
||||
if row_ids is None:
|
||||
assert(data.size == self._size)
|
||||
if sync:
|
||||
mv_lib.MV_AddMatrixTableAll(self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
|
||||
else:
|
||||
mv_lib.MV_AddAsyncMatrixTableAll(self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
|
||||
else:
|
||||
row_ids_n = len(row_ids)
|
||||
assert(data.size == row_ids_n * self._num_col)
|
||||
int_array_type = ctypes.c_int * row_ids_n
|
||||
if sync:
|
||||
mv_lib.MV_AddMatrixTableByRows(self._handler, data.ctypes.data_as(C_FLOAT_P),
|
||||
row_ids_n * self._num_col,
|
||||
int_array_type(*row_ids), row_ids_n)
|
||||
else:
|
||||
mv_lib.MV_AddAsyncMatrixTableByRows(self._handler, data.ctypes.data_as(C_FLOAT_P),
|
||||
row_ids_n * self._num_col,
|
||||
int_array_type(*row_ids), row_ids_n)
|
|
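The two handlers above are the raw building blocks for parameter synchronization. A minimal usage sketch (assuming multiverso has been installed and the script is launched once per worker process; the sizes and values below are made up for illustration):

    import multiverso as mv

    mv.init()                               # start the multiverso worker
    tbh = mv.ArrayTableHandler(5, init_value=[0, 0, 0, 0, 0])
    mv.barrier()                            # make sure every worker has created the table

    tbh.add([1, 2, 3, 4, 5], sync=True)     # each worker pushes its local delta
    mv.barrier()
    print tbh.get()                         # aggregated value across mv.workers_num() workers

    mtbh = mv.MatrixTableHandler(2, 3)
    mv.barrier()
    mtbh.add([[1, 2, 3], [4, 5, 6]])        # update the whole 2x3 table
    print mtbh.get(row_ids=[1])             # fetch a single row as a 1x3 ndarray

    mv.shutdown()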
@ -0,0 +1,111 @@
|
|||
#!/usr/bin/env python
|
||||
# coding:utf8
|
||||
import multiverso as mv
|
||||
import unittest
|
||||
import numpy as np
|
||||
import theano
|
||||
from multiverso.theano_ext import sharedvar
|
||||
|
||||
|
||||
def setUpModule():
|
||||
mv.init()
|
||||
|
||||
|
||||
def tearDownModule():
|
||||
mv.shutdown()
|
||||
|
||||
|
||||
class TestMultiversoTables(unittest.TestCase):
|
||||
'''
|
||||
Use the command below to run the tests
|
||||
$ nosetests
|
||||
'''
|
||||
|
||||
def _test_array(self, size):
|
||||
tbh = mv.ArrayTableHandler(size)
|
||||
mv.barrier()
|
||||
|
||||
for i in xrange(100):
|
||||
tbh.add(range(1, size + 1))
|
||||
tbh.add(range(1, size + 1))
|
||||
mv.barrier()
|
||||
for j, actual in enumerate(tbh.get()):
|
||||
self.assertEqual((j + 1) * (i + 1) * 2 * mv.workers_num(), actual)
|
||||
mv.barrier()
|
||||
|
||||
def test_small_array(self):
|
||||
# TODO : this is not supported by multiverso because of the size
|
||||
# limit. Waiting for the solution of this issue
|
||||
# https://github.com/Microsoft/multiverso/issues/69
|
||||
|
||||
# self._test_array(1)
|
||||
pass
|
||||
|
||||
def test_array(self):
|
||||
self._test_array(10000)
|
||||
|
||||
def test_matrix(self):
|
||||
num_row = 11
|
||||
num_col = 10
|
||||
size = num_col * num_row
|
||||
workers_num = mv.workers_num()
|
||||
tbh = mv.MatrixTableHandler(num_row, num_col)
|
||||
mv.barrier()
|
||||
for count in xrange(1, 21):
|
||||
row_ids = [0, 1, 5, 10]
|
||||
tbh.add(range(size))
|
||||
tbh.add([range(rid * num_col, (1 + rid) * num_col) for rid in row_ids], row_ids)
|
||||
mv.barrier()
|
||||
data = tbh.get()
|
||||
mv.barrier()
|
||||
for i, row in enumerate(data):
|
||||
for j, actual in enumerate(row):
|
||||
expected = (i * num_col + j) * count * workers_num
|
||||
if i in row_ids:
|
||||
expected += (i * num_col + j) * count * workers_num
|
||||
self.assertEqual(expected, actual)
|
||||
data = tbh.get(row_ids)
|
||||
mv.barrier()
|
||||
for i, row in enumerate(data):
|
||||
for j, actual in enumerate(row):
|
||||
expected = (row_ids[i] * num_col + j) * count * workers_num * 2
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
|
||||
class TestMultiversoSharedVariable(unittest.TestCase):
|
||||
'''
|
||||
Use the command below to run the tests
|
||||
$ nosetests
|
||||
'''
|
||||
|
||||
def _test_sharedvar(self, row, col):
|
||||
W = sharedvar.mv_shared(
|
||||
value=np.zeros(
|
||||
(row, col),
|
||||
dtype=theano.config.floatX
|
||||
),
|
||||
name='W',
|
||||
borrow=True
|
||||
)
|
||||
delta = np.array(range(1, row * col + 1),
|
||||
dtype=theano.config.floatX).reshape((row, col))
|
||||
train_model = theano.function([], updates=[(W, W + delta)])
|
||||
mv.barrier()
|
||||
|
||||
for i in xrange(100):
|
||||
train_model()
|
||||
train_model()
|
||||
sharedvar.sync_all_mv_shared_vars()
|
||||
mv.barrier()
|
||||
# to get the newest value, we must sync again
|
||||
sharedvar.sync_all_mv_shared_vars()
|
||||
for j, actual in enumerate(W.get_value().reshape(-1)):
|
||||
self.assertEqual((j + 1) * (i + 1) * 2 * mv.workers_num(), actual)
|
||||
mv.barrier()
|
||||
|
||||
def test_sharedvar(self):
|
||||
self._test_sharedvar(200, 200)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
|
@ -0,0 +1,64 @@
|
|||
#!/usr/bin/env python
|
||||
# coding:utf8
|
||||
|
||||
import lasagne
|
||||
import numpy as np
|
||||
import multiverso as mv
|
||||
|
||||
|
||||
class MVNetParamManager(object):
|
||||
'''
|
||||
MVNetParamManager is a manager that makes managing and synchronizing the
|
||||
variables in lasagne easier
|
||||
'''
|
||||
def __init__(self, network):
|
||||
''' The constructor of MVNetParamManager
|
||||
|
||||
The constructor will associate the parameter with multiverso array
|
||||
table. The initial value of the ArrayTableHandler will be the same as the
|
||||
parameters of the network. If different parameters are used in different
|
||||
processes, the average of them will be used as the initial value
|
||||
'''
|
||||
self.shapes = []
|
||||
self.dtypes = []
|
||||
self.sizes = []
|
||||
self.all_param_list = []
|
||||
self.network = network
|
||||
for arr in lasagne.layers.get_all_param_values(self.network):
|
||||
self.shapes.append(arr.shape)
|
||||
# TODO: Now only float32 is supported in multiverso. So I store all
|
||||
# the parameters in a float32 array. This place needs modification
|
||||
# after other types are supported
|
||||
assert(np.dtype("float32") == arr.dtype)
|
||||
self.dtypes.append(arr.dtype)
|
||||
self.sizes.append(arr.size)
|
||||
self.all_param_list.extend([i for i in np.nditer(arr)])
|
||||
self.all_param_list = np.array(self.all_param_list)
|
||||
|
||||
self.tbh = mv.ArrayTableHandler(len(self.all_param_list), init_value=self.all_param_list)
|
||||
mv.barrier() # add barrier to make sure the initial values have taken effect
|
||||
self.all_param_list = self.tbh.get()
|
||||
self._set_all_param_to_net()
|
||||
|
||||
def _set_all_param_to_net(self):
|
||||
n = 0
|
||||
params = []
|
||||
for i, size in enumerate(self.sizes):
|
||||
params.append(self.all_param_list[n:n + size].reshape(self.shapes[i]))
|
||||
n += size
|
||||
lasagne.layers.set_all_param_values(self.network, params)
|
||||
|
||||
def sync_all_param(self):
|
||||
'''sync all parameters with multiverso server
|
||||
|
||||
This function will
|
||||
1) calculate the delta of all params in the network and add the delta to the multiverso server
|
||||
2) get the latest value from the multiverso server
|
||||
'''
|
||||
cur_network_params = np.concatenate([
|
||||
arr.reshape(-1) for arr in lasagne.layers.get_all_param_values(self.network)])
|
||||
|
||||
params_delta = cur_network_params - self.all_param_list
|
||||
self.tbh.add(params_delta)
|
||||
self.all_param_list = self.tbh.get()
|
||||
self._set_all_param_to_net()
|
|
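A rough sketch of how MVNetParamManager is meant to sit inside a lasagne training loop. The module path (multiverso.theano_ext.lasagne_ext.param_manager), the network, the compiled train_fn and the minibatch iterator are assumptions made for illustration, not part of this file:

    import multiverso as mv
    from multiverso.theano_ext.lasagne_ext import param_manager  # assumed module name

    mv.init()
    manager = param_manager.MVNetParamManager(network)  # network: a lasagne output layer built elsewhere

    for epoch in xrange(n_epochs):
        for x_batch, y_batch in iterate_minibatches(train_data):  # hypothetical iterator
            train_fn(x_batch, y_batch)        # local gradient step on this worker
        manager.sync_all_param()              # add parameter deltas, pull the averaged parameters

    mv.shutdown()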
@ -0,0 +1,100 @@
|
|||
#!/usr/bin/env python
|
||||
# coding:utf8
|
||||
|
||||
from theano.tensor.basic import TensorType, _tensor_py_operators
|
||||
from theano.compile import SharedVariable
|
||||
from theano.compile.sharedvalue import shared
|
||||
from theano.gof import Variable, utils
|
||||
import numpy
|
||||
import multiverso as mv
|
||||
|
||||
|
||||
class MVSharedVariable(object):
|
||||
'''MVSharedVariable is a wrapper of SharedVariable
|
||||
|
||||
It will act the same as SharedVariable. The only difference is that a multiverso
|
||||
ArrayTable is added to make it easier to sync values.
|
||||
'''
|
||||
def __init__(self, svobj):
|
||||
'''Constructor of the MVSharedVariable
|
||||
|
||||
The constructor will create ArrayTableHandler and associate the shared
|
||||
variable with it. The initial value of the ArrayTableHandler will be the same
|
||||
as the value of the SharedVariable. If different initial values are used in
|
||||
different processes, the average of them will be used as the initial
|
||||
value
|
||||
'''
|
||||
assert(isinstance(svobj, SharedVariable))
|
||||
self._svobj = svobj
|
||||
self._mv_array = mv.ArrayTableHandler(self._svobj.get_value().size,
|
||||
init_value=self._svobj.get_value().reshape((-1,)))
|
||||
|
||||
mv.barrier() # add barrier to make sure the initial values have taken effect
|
||||
# _last_mv_data stores a copy of the value. It will be used to calculate
|
||||
# the update for multiverso when calling mv_sync
|
||||
self._last_mv_data = self._mv_array.get().reshape(self._svobj.get_value().shape)
|
||||
self._svobj.set_value(self._last_mv_data, borrow=False)
|
||||
|
||||
def mv_sync(self):
|
||||
''' sync values with multiverso server
|
||||
|
||||
mv_sync will add the delta of SharedVariable, which is usually the
|
||||
gradients in typical examples, to the parameter server and then get the
|
||||
latest value in multiverso.
|
||||
'''
|
||||
# because multiverso always uses the add method to sync values, the delta
|
||||
# will be the difference between the current value and the last synced value
|
||||
self._mv_array.add(self._svobj.get_value() - self._last_mv_data)
|
||||
|
||||
self._svobj.set_value(self._mv_array.get().reshape(self._svobj.get_value().shape))
|
||||
self._last_mv_data = self._svobj.get_value(borrow=False)
|
||||
|
||||
def __getstate__(self):
|
||||
'''This is for cPickle to store state.
|
||||
|
||||
It is usually called when you want to dump the model to file with
|
||||
cPickle
|
||||
'''
|
||||
odict = self.__dict__.copy() # copy the dict since we change it
|
||||
del odict['_mv_array'] # remove mv_array, because we can't pickle it
|
||||
return odict
|
||||
|
||||
def __getattribute__(self, attr):
|
||||
'''This function makes MVSharedVariable act the same as SharedVariable'''
|
||||
if attr in ['_svobj', '_mv_array', '_last_mv_data']:
|
||||
# If getting an attribute of self, use the parent __getattribute__ to get
|
||||
# the attribute from the object, otherwise it will fall into an infinite
|
||||
# loop
|
||||
return object.__getattribute__(self, attr)
|
||||
elif attr in ['mv_sync', "__getstate__"]:
|
||||
# If calling a method of MVSharedVariable, call the method directly
|
||||
# and bind the method to the self object
|
||||
return getattr(MVSharedVariable, attr).__get__(self)
|
||||
else:
|
||||
# Otherwise get the attribute from the wrapped object
|
||||
return getattr(self._svobj, attr)
|
||||
|
||||
|
||||
def mv_shared(*args, **kwargs):
|
||||
'''mv_shared works the same as `theano.shared`
|
||||
|
||||
It calls `theano.shared` to create the SharedVariable and uses
|
||||
MVSharedVariable to wrap it.
|
||||
'''
|
||||
var = shared(*args, **kwargs)
|
||||
mv_shared.shared_vars.append(MVSharedVariable(var))
|
||||
return var
|
||||
|
||||
|
||||
mv_shared.shared_vars = [] # all shared_vars in multiverso will be recorded here
|
||||
|
||||
|
||||
def sync_all_mv_shared_vars():
|
||||
'''Sync shared values created by `mv_shared` with multiverso
|
||||
|
||||
It is often used when you are training a model, and it will add the gradients
|
||||
(delta value) to the server and update the latest value from the server.
|
||||
Notice: It will **only** sync shared values created by `mv_shared`
|
||||
'''
|
||||
for sv in mv_shared.shared_vars:
|
||||
sv.mv_sync()
|
|
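A compact sketch of the intended workflow (essentially what the unit test earlier in this commit does): wrap shared variables with mv_shared, run local updates, then call sync_all_mv_shared_vars() to exchange deltas with the server. The toy update rule below is only for illustration:

    import numpy as np
    import theano
    import theano.tensor as T
    import multiverso as mv
    from multiverso.theano_ext import sharedvar

    mv.init()
    W = sharedvar.mv_shared(value=np.zeros((3, 3), dtype=theano.config.floatX),
                            name='W', borrow=True)
    x = T.matrix('x')
    train = theano.function([x], updates=[(W, W + x)])   # toy "training" step

    train(np.ones((3, 3), dtype=theano.config.floatX))   # local update
    sharedvar.sync_all_mv_shared_vars()                   # push delta, pull latest value
    mv.barrier()
    mv.shutdown()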
@ -0,0 +1,77 @@
|
|||
#!/usr/bin/env python
|
||||
# coding:utf8
|
||||
|
||||
import ctypes
|
||||
import os
|
||||
import platform
|
||||
from ctypes.util import find_library
|
||||
import numpy as np
|
||||
|
||||
PACKAGE_PATH = os.path.abspath(os.path.dirname(__file__))
|
||||
|
||||
|
||||
class Loader(object):
|
||||
'''
|
||||
This loader is responsible for loading multiverso dynamic library in both
|
||||
*nix and Windows
|
||||
'''
|
||||
|
||||
LIB = None
|
||||
|
||||
@classmethod
|
||||
def _find_mv_path(cls):
|
||||
if platform.system() == "Windows":
|
||||
mv_lib_path = find_library("Multiverso")
|
||||
if mv_lib_path is None:
|
||||
print "* Fail to load Multiverso.dll from the windows $PATH."\
|
||||
"Because Multiverso.dll can not be found in the $PATH "\
|
||||
"directories. Go on loading Multiverso from the package."
|
||||
else:
|
||||
return mv_lib_path
|
||||
|
||||
mv_lib_path = os.path.join(PACKAGE_PATH, "Multiverso.dll")
|
||||
if not os.path.exists(mv_lib_path):
|
||||
print "* Fail to load Multiverso.dll from the package. Because"\
|
||||
" the file " + mv_lib_path + " can not be found."
|
||||
else:
|
||||
return mv_lib_path
|
||||
else:
|
||||
mv_lib_path = find_library("multiverso")
|
||||
if mv_lib_path is None:
|
||||
print "* Fail to load libmultiverso.so from the system"\
|
||||
"libraries. Because libmultiverso.so can't be found in"\
|
||||
"library paths. Go on loading Multiverso from the package."
|
||||
else:
|
||||
return mv_lib_path
|
||||
|
||||
mv_lib_path = os.path.join(PACKAGE_PATH, "libmultiverso.so")
|
||||
if not os.path.exists(mv_lib_path):
|
||||
print "* Fail to load libmultiverso.so from the package. Because"\
|
||||
" the file " + mv_lib_path + " can not be found."
|
||||
else:
|
||||
return mv_lib_path
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def load_lib(cls):
|
||||
mv_lib_path = cls._find_mv_path()
|
||||
if mv_lib_path is None:
|
||||
print "Fail to load the multiverso library. Please make sure you"\
|
||||
" have installed multiverso successfully"
|
||||
else:
|
||||
print "Find the multiverso library successfully(%s)" % mv_lib_path
|
||||
return ctypes.cdll.LoadLibrary(mv_lib_path)
|
||||
|
||||
@classmethod
|
||||
def get_lib(cls):
|
||||
if not cls.LIB:
|
||||
cls.LIB = cls.load_lib()
|
||||
cls.LIB.MV_NumWorkers.restype = ctypes.c_int
|
||||
return cls.LIB
|
||||
|
||||
|
||||
def convert_data(data):
|
||||
'''convert the data to float32 ndarray'''
|
||||
if not isinstance(data, np.ndarray):
|
||||
data = np.array(data)
|
||||
return data.astype(np.float32)
|
The diff for this file is not shown because it is too large.
|
@ -0,0 +1,58 @@
|
|||
from CLM import train
|
||||
|
||||
def log_with_print(log, context):
|
||||
print >>log, context
|
||||
print context
|
||||
|
||||
|
||||
logfile = __file__ + 'log'
|
||||
log = open(logfile, 'w')
|
||||
|
||||
round = 0
|
||||
log_with_print(log, 'round ' + str(round) + ' begin ------------------------------- !!')
|
||||
# change some for round
|
||||
|
||||
|
||||
max_epochs = 100000
|
||||
|
||||
obj_directory = r'..\Sentiment_CLM_WithDropout'
|
||||
reload_model = obj_directory + r'\T.npz'
|
||||
|
||||
|
||||
train(round = round,
|
||||
saveto = obj_directory + '\\round%d_model_lstm.npz'%(round),
|
||||
reload_model = reload_model,
|
||||
reload_option = reload_model + '.pkl',
|
||||
dataset = r'../data/imdb.pkl', #%(work_id + 1),
|
||||
encoder = 'lstm',
|
||||
dropout_input = 0.5,
|
||||
dropout_output= 0.5,
|
||||
clip_c = 5.,
|
||||
dim_word = 500,
|
||||
dim_proj = 1024,
|
||||
n_words = 10000,
|
||||
#n_words_sqrt = n_words_sqrt,
|
||||
optimizer = 'adadelta',
|
||||
lrate = 0.5,
|
||||
maxlen = None,
|
||||
minlen = 1,
|
||||
start_iter = 0,
|
||||
start_epoch = 0,
|
||||
max_epochs = max_epochs, #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
batch_size = 16,
|
||||
patience = 100,
|
||||
validFreq = 5000,
|
||||
saveFreq = 50000000,
|
||||
dispFreq = 1,
|
||||
sampleFreq = 20000000,
|
||||
newDumpFreq = 20000,
|
||||
syncFreq = 5000000000,
|
||||
sampleNum = 25,
|
||||
decay_c = 0.,
|
||||
log = logfile,
|
||||
monitor_grad = False,
|
||||
sampleFileName= obj_directory + '\\round%d_sample.txt'%(round),
|
||||
pad_sos = False,
|
||||
embedding = '../data/embedding500.npz'
|
||||
)
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
from CLM import train
|
||||
|
||||
def log_with_print(log, context):
|
||||
print >>log, context
|
||||
print context
|
||||
|
||||
|
||||
logfile = __file__ + 'log'
|
||||
log = open(logfile, 'w')
|
||||
|
||||
round = 0
|
||||
log_with_print(log, 'round ' + str(round) + ' begin ------------------------------- !!')
|
||||
# change some for round
|
||||
|
||||
|
||||
max_epochs = 100000
|
||||
|
||||
obj_directory = r'..\Sentiment_CLM_nodrop'
|
||||
reload_model = obj_directory + r'\de.npz'
|
||||
|
||||
|
||||
train(round = round,
|
||||
saveto = obj_directory + '\\round%d_model_lstm.npz'%(round),
|
||||
reload_model = None, #reload_model,
|
||||
reload_option = None, #reload_model + '.pkl',
|
||||
dataset = r'../data/imdb.pkl', #%(work_id + 1),
|
||||
encoder = 'lstm',
|
||||
dropout_input = None,
|
||||
dropout_output= None,
|
||||
clip_c = 5.,
|
||||
dim_word = 500,
|
||||
dim_proj = 1024,
|
||||
n_words = 10000,
|
||||
#n_words_sqrt = n_words_sqrt,
|
||||
optimizer = 'adadelta',
|
||||
lrate = 1.0,
|
||||
maxlen = None,
|
||||
minlen = 1,
|
||||
start_iter = 0,
|
||||
start_epoch = 0,
|
||||
max_epochs = max_epochs, #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
batch_size = 16,
|
||||
patience = 100,
|
||||
validFreq = 10000,
|
||||
saveFreq = 50000000,
|
||||
dispFreq = 1,
|
||||
sampleFreq = 20000000,
|
||||
newDumpFreq = 20000,
|
||||
syncFreq = 5000000000,
|
||||
sampleNum = 25,
|
||||
decay_c = 0.,
|
||||
log = logfile,
|
||||
monitor_grad = False,
|
||||
sampleFileName= obj_directory + '\\round%d_sample.txt'%(round),
|
||||
pad_sos = False,
|
||||
embedding = '../data/embedding500.npz'
|
||||
)
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
from CLM import train
|
||||
|
||||
def log_with_print(log, context):
|
||||
print >>log, context
|
||||
print context
|
||||
|
||||
|
||||
logfile = __file__ + 'log'
|
||||
log = open(logfile, 'w')
|
||||
|
||||
round = 0
|
||||
log_with_print(log, 'round ' + str(round) + ' begin ------------------------------- !!')
|
||||
# change some for round
|
||||
|
||||
|
||||
max_epochs = 100000
|
||||
|
||||
obj_directory = r'..\Sentiment_CLM_nodrop_lr0.5'
|
||||
reload_model = obj_directory + r'\T.npz'
|
||||
|
||||
|
||||
train(round = round,
|
||||
saveto = obj_directory + '\\round%d_model_lstm.npz'%(round),
|
||||
reload_model = reload_model,
|
||||
reload_option = reload_model + '.pkl',
|
||||
dataset = r'../data/imdb.pkl', #%(work_id + 1),
|
||||
encoder = 'lstm',
|
||||
dropout_input = None,
|
||||
dropout_output= None,
|
||||
clip_c = 5.,
|
||||
dim_word = 500,
|
||||
dim_proj = 1024,
|
||||
n_words = 10000,
|
||||
#n_words_sqrt = n_words_sqrt,
|
||||
optimizer = 'adadelta',
|
||||
lrate = 0.5,
|
||||
maxlen = None,
|
||||
minlen = 1,
|
||||
start_iter = 0,
|
||||
start_epoch = 0,
|
||||
max_epochs = max_epochs, #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
batch_size = 16,
|
||||
patience = 100,
|
||||
validFreq = 5000,
|
||||
saveFreq = 50000000,
|
||||
dispFreq = 1,
|
||||
sampleFreq = 20000000,
|
||||
newDumpFreq = 20000,
|
||||
syncFreq = 5000000000,
|
||||
sampleNum = 25,
|
||||
decay_c = 0.,
|
||||
log = logfile,
|
||||
monitor_grad = False,
|
||||
sampleFileName= obj_directory + '\\round%d_sample.txt'%(round),
|
||||
pad_sos = False,
|
||||
embedding = '../data/embedding500.npz'
|
||||
)
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
@echo off
|
||||
setlocal ENABLEDELAYEDEXPANSION
|
||||
set THEANO_FLAGS=device=gpu1
|
||||
python train_clm_WithDropout_lr0.5.py
|
|
@ -0,0 +1,148 @@
|
|||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
@rem Windows batch file to use Theano on GCR
|
||||
@rem
|
||||
@rem Updated: April 7, 2016
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|
||||
@rem set the PATH system variable
|
||||
@rem Start from the 26th letter
|
||||
set working_sub_dir=%cd:~26%
|
||||
|
||||
set PATH=^
|
||||
C:\Windows\system32;^
|
||||
C:\Windows\System32\Wbem;^
|
||||
C:\Windows\System32\WindowsPowerShell\v1.0\;^
|
||||
C:\Windows;^
|
||||
C:\Program Files\Microsoft HPC Pack 2012\Bin\;^
|
||||
C:\Program Files\Microsoft MPI\Bin\;^
|
||||
C:\Program Files (x86)\Windows Kits\8.1\Windows Performance Toolkit\
|
||||
|
||||
pushd \\gcr\Scratch\RR1\v-yixia\Theano
|
||||
set ToolkitFolderDriver=%cd%
|
||||
|
||||
@rem set the environment variable for the CUDA 7.5 Toolkit
|
||||
rem set CUDA_HOME=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 rem the old version
|
||||
set CUDA_HOME=%ToolkitFolderDriver%\CUDA\v7.0+cudnn4008
|
||||
set CUDA_BIN=%CUDA_HOME%\bin
|
||||
set CUDA_INCLUDE=%CUDA_HOME%\include
|
||||
set CUDA_LIB=%CUDA_HOME%\lib\x64
|
||||
set CUDA_LIBNVVP=%CUDA_HOME%\libnvvp
|
||||
|
||||
@rem add all CUDA Toolkit folders to the PATH system variable
|
||||
set PATH=^
|
||||
%CUDA_HOME%;^
|
||||
%CUDA_BIN%;^
|
||||
%CUDA_INCLUDE%;^
|
||||
%CUDA_LIB%;^
|
||||
%CUDA_LIBNVVP%;^
|
||||
%PATH%
|
||||
|
||||
@echo %PATH%
|
||||
|
||||
@rem setting up the VC compiler
|
||||
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
@rem connect to the shared toolkit folder \\gcr\Tools\Shared_Toolkits\Theano
|
||||
rem pushd \\gcr\Tools\Shared_Toolkits\Theano
|
||||
|
||||
|
||||
|
||||
@rem unset these variables
|
||||
@set Framework40Version=
|
||||
@set FrameworkDIR32=
|
||||
@set FrameworkVersion32=
|
||||
@set FSHARPINSTALLDIR=
|
||||
@set VSINSTALLDIR=
|
||||
@set WindowsSDK_ExecutablePath_x64=
|
||||
@set WindowsSDK_ExecutablePath_x86=
|
||||
|
||||
@set VCINSTALLDIR=%ToolkitFolderDriver%\VS_portable\Microsoft Visual Studio 12.0v2\VC\
|
||||
@set WindowsSdkDir=%ToolkitFolderDriver%\Windows Kits\8.1v2\
|
||||
|
||||
:amd64
|
||||
|
||||
@rem set Windows SDK include/lib path
|
||||
@rem --------------------------------------------------
|
||||
if not "%WindowsSdkDir%" == "" @set PATH=%WindowsSdkDir%bin\x64;%WindowsSdkDir%bin\x86;%PATH%
|
||||
if not "%WindowsSdkDir%" == "" @set INCLUDE=%WindowsSdkDir%include\shared;%WindowsSdkDir%include\um;%WindowsSdkDir%include\winrt;%INCLUDE%
|
||||
if not "%WindowsSdkDir%" == "" @set LIB=%WindowsSdkDir%lib\winv6.3\um\x64;%LIB%
|
||||
if not "%WindowsSdkDir%" == "" @set LIBPATH=%WindowsLibPath%;%ExtensionSDKDir%\Microsoft.VCLibs\14.0\References\CommonConfiguration\neutral;%LIBPATH%
|
||||
|
||||
@rem set the environment variables for Microsoft Visual Studio
|
||||
@rem --------------------------------------------------
|
||||
@rem PATH
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%VCPackages" set PATH=%VCINSTALLDIR%VCPackages;%PATH%
|
||||
if exist "%VCINSTALLDIR%BIN\amd64" set PATH=%VCINSTALLDIR%BIN\amd64;%PATH%
|
||||
@rem --------------------------------------------------
|
||||
@rem INCLUDE
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%ATLMFC\INCLUDE" set INCLUDE=%VCINSTALLDIR%ATLMFC\INCLUDE;%INCLUDE%
|
||||
if exist "%VCINSTALLDIR%INCLUDE" set INCLUDE=%VCINSTALLDIR%INCLUDE;%INCLUDE%
|
||||
@rem --------------------------------------------------
|
||||
@rem LIB
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIB=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIB%
|
||||
if exist "%VCINSTALLDIR%LIB\amd64" set LIB=%VCINSTALLDIR%LIB\amd64;%LIB%
|
||||
@rem --------------------------------------------------
|
||||
@rem LIBPATH
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIBPATH=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIBPATH%
|
||||
if exist "%VCINSTALLDIR%LIB\amd64" set LIBPATH=%VCINSTALLDIR%LIB\amd64;%LIBPATH%
|
||||
|
||||
@rem set the environment variables for the cuDNN v4 (Feb 10, 2016) for CUDA 7.0 and later.
|
||||
rem set CUDNN_PATH=%ToolkitFolderDriver%\Shared_Toolkits\Theano\CUDA\cudnn-4.0.7\cuda
|
||||
rem set INCLUDE=%CUDNN_PATH%\include;%INCLUDE%
|
||||
rem set LIB=%CUDNN_PATH%\lib\x64;%LIB%
|
||||
rem set PATH=%CUDNN_PATH%\bin;%PATH%
|
||||
|
||||
set Platform=X64
|
||||
set CommandPromptType=Native
|
||||
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
@rem connect to your scratch storage \\gcr\Scratch\<location>\<alias>
|
||||
@rem Note: Please copy ANACONDA2 to \\gcr\Scratch\<location>\<alias>\Anaconda2
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
pushd \\gcr\scratch\RR1\v-yixia
|
||||
set CONDANETDRIVE=%cd:~0,2%
|
||||
|
||||
@rem set the environment variable for the Anaconda2
|
||||
set ANACONDA2=%CONDANETDRIVE%\RR1\taoqin\Anaconda-gpu002
|
||||
set ANACONDA2_SCRIPTS=%ANACONDA2%\Scripts
|
||||
set ANACONDA2_BIN=%ANACONDA2%\Library\bin
|
||||
|
||||
@rem add Anaconda2 folders to the PATH system variable
|
||||
set PATH=^
|
||||
%ANACONDA2%;^
|
||||
%ANACONDA2_BIN%;^
|
||||
%ANACONDA2_SCRIPTS%;^
|
||||
%PATH%
|
||||
|
||||
@echo %PATH%
|
||||
|
||||
@rem example files from DeepLearningTutorials are available at \\gcr\Tools\Shared_Toolkits\Theano\Examples
|
||||
set PROJDRIVE=%CONDANETDRIVE%
|
||||
set MYHOME=%PROJDRIVE%\RR1\v-yixia
|
||||
set PROJHOME=%MYHOME%\%working_sub_dir%
|
||||
|
||||
%PROJDRIVE%
|
||||
|
||||
cd %PROJHOME%
|
||||
|
||||
@rem setup theano env (generate .theanorc.txt)
|
||||
call python gen_theanorc.py %ANACONDA2% .theanorc.txt
|
||||
del %userprofile%\.theanorc.txt /Q /F
|
||||
copy .theanorc.txt %userprofile% /Y
|
||||
|
||||
call python write_script.py %*
|
||||
|
||||
call worker.bat
|
||||
|
||||
@echo delete theano env
|
||||
del %userprofile%\.theanorc.txt /Q /F
|
||||
|
||||
popd
|
||||
|
||||
popd
|
||||
|
||||
|
|
@ -0,0 +1,148 @@
|
|||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
@rem Windows batch file to use Theano on GCR
|
||||
@rem
|
||||
@rem Updated: April 7, 2016
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|
||||
@rem set the PATH system variable
|
||||
@rem Start from the 26th letter
|
||||
set working_sub_dir=%cd:~26%
|
||||
|
||||
set PATH=^
|
||||
C:\Windows\system32;^
|
||||
C:\Windows\System32\Wbem;^
|
||||
C:\Windows\System32\WindowsPowerShell\v1.0\;^
|
||||
C:\Windows;^
|
||||
C:\Program Files\Microsoft HPC Pack 2012\Bin\;^
|
||||
C:\Program Files\Microsoft MPI\Bin\;^
|
||||
C:\Program Files (x86)\Windows Kits\8.1\Windows Performance Toolkit\
|
||||
|
||||
pushd \\gcr\Scratch\RR1\v-yixia\Theano
|
||||
set ToolkitFolderDriver=%cd%
|
||||
|
||||
@rem set the environment variable for the CUDA 7.5 Toolkit
|
||||
rem set CUDA_HOME=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 rem the old version
|
||||
set CUDA_HOME=%ToolkitFolderDriver%\CUDA\v7.0
|
||||
set CUDA_BIN=%CUDA_HOME%\bin
|
||||
set CUDA_INCLUDE=%CUDA_HOME%\include
|
||||
set CUDA_LIB=%CUDA_HOME%\lib\x64
|
||||
set CUDA_LIBNVVP=%CUDA_HOME%\libnvvp
|
||||
|
||||
@rem add all CUDA Toolkit folders to the PATH system variable
|
||||
set PATH=^
|
||||
%CUDA_HOME%;^
|
||||
%CUDA_BIN%;^
|
||||
%CUDA_INCLUDE%;^
|
||||
%CUDA_LIB%;^
|
||||
%CUDA_LIBNVVP%;^
|
||||
%PATH%
|
||||
|
||||
@echo %PATH%
|
||||
|
||||
@rem setting up the VC compiler
|
||||
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
@rem connect to the shared toolkit folder \\gcr\Tools\Shared_Toolkits\Theano
|
||||
rem pushd \\gcr\Tools\Shared_Toolkits\Theano
|
||||
|
||||
|
||||
|
||||
@rem unset these variables
|
||||
@set Framework40Version=
|
||||
@set FrameworkDIR32=
|
||||
@set FrameworkVersion32=
|
||||
@set FSHARPINSTALLDIR=
|
||||
@set VSINSTALLDIR=
|
||||
@set WindowsSDK_ExecutablePath_x64=
|
||||
@set WindowsSDK_ExecutablePath_x86=
|
||||
|
||||
@set VCINSTALLDIR=%ToolkitFolderDriver%\VS_portable\Microsoft Visual Studio 12.0\VC\
|
||||
@set WindowsSdkDir=%ToolkitFolderDriver%\Windows Kits\8.1\
|
||||
|
||||
:amd64
|
||||
|
||||
@rem set Windows SDK include/lib path
|
||||
@rem --------------------------------------------------
|
||||
if not "%WindowsSdkDir%" == "" @set PATH=%WindowsSdkDir%bin\x64;%WindowsSdkDir%bin\x86;%PATH%
|
||||
if not "%WindowsSdkDir%" == "" @set INCLUDE=%WindowsSdkDir%include\%WindowsSDKVersion%shared;%WindowsSdkDir%include\%WindowsSDKVersion%um;%WindowsSdkDir%include\%WindowsSDKVersion%winrt;%INCLUDE%
|
||||
if not "%WindowsSdkDir%" == "" @set LIB=%WindowsSdkDir%lib\%WindowsSDKLibVersion%um\x64;%LIB%
|
||||
if not "%WindowsSdkDir%" == "" @set LIBPATH=%WindowsLibPath%;%ExtensionSDKDir%\Microsoft.VCLibs\14.0\References\CommonConfiguration\neutral;%LIBPATH%
|
||||
|
||||
@rem set the environment variables for Microsoft Visual Studio
|
||||
@rem --------------------------------------------------
|
||||
@rem PATH
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%VCPackages" set PATH=%VCINSTALLDIR%VCPackages;%PATH%
|
||||
if exist "%VCINSTALLDIR%BIN\amd64" set PATH=%VCINSTALLDIR%BIN\amd64;%PATH%
|
||||
@rem --------------------------------------------------
|
||||
@rem INCLUDE
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%ATLMFC\INCLUDE" set INCLUDE=%VCINSTALLDIR%ATLMFC\INCLUDE;%INCLUDE%
|
||||
if exist "%VCINSTALLDIR%INCLUDE" set INCLUDE=%VCINSTALLDIR%INCLUDE;%INCLUDE%
|
||||
@rem --------------------------------------------------
|
||||
@rem LIB
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIB=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIB%
|
||||
if exist "%VCINSTALLDIR%LIB\amd64" set LIB=%VCINSTALLDIR%LIB\amd64;%LIB%
|
||||
@rem --------------------------------------------------
|
||||
@rem LIBPATH
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIBPATH=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIBPATH%
|
||||
if exist "%VCINSTALLDIR%LIB\amd64" set LIBPATH=%VCINSTALLDIR%LIB\amd64;%LIBPATH%
|
||||
|
||||
@rem set the environment variables for the cuDNN v4 (Feb 10, 2016) for CUDA 7.0 and later.
|
||||
set CUDNN_PATH=%ToolkitFolderDriver%\Shared_Toolkits\Theano\CUDA\cudnn-4.0.7\cuda
|
||||
set INCLUDE=%CUDNN_PATH%\include;%INCLUDE%
|
||||
set LIB=%CUDNN_PATH%\lib\x64;%LIB%
|
||||
set PATH=%CUDNN_PATH%\bin;%PATH%
|
||||
|
||||
set Platform=X64
|
||||
set CommandPromptType=Native
|
||||
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
@rem connect to your scratch storage \\gcr\Scratch\<location>\<alias>
|
||||
@rem Note: Please copy ANACONDA2 to \\gcr\Scratch\<location>\<alias>\Anaconda2
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
pushd \\gcr\scratch\RR1\v-yixia
|
||||
set CONDANETDRIVE=%cd:~0,2%
|
||||
|
||||
@rem set the environment variable for the Anaconda2
|
||||
set ANACONDA2=%CONDANETDRIVE%\RR1\v-yirwan\Anaconda2
|
||||
set ANACONDA2_SCRIPTS=%ANACONDA2%\Scripts
|
||||
set ANACONDA2_BIN=%ANACONDA2%\Library\bin
|
||||
|
||||
@rem add Anaconda2 folders to the PATH system variable
|
||||
set PATH=^
|
||||
%ANACONDA2%;^
|
||||
%ANACONDA2_BIN%;^
|
||||
%ANACONDA2_SCRIPTS%;^
|
||||
%PATH%
|
||||
|
||||
@echo %PATH%
|
||||
|
||||
@rem example files from DeepLearningTutorials are available at \\gcr\Tools\Shared_Toolkits\Theano\Examples
|
||||
set PROJDRIVE=%CONDANETDRIVE%
|
||||
set MYHOME=%PROJDRIVE%\RR1\v-yixia
|
||||
set PROJHOME=%MYHOME%\%working_sub_dir%
|
||||
|
||||
%PROJDRIVE%
|
||||
|
||||
cd %PROJHOME%
|
||||
|
||||
@rem setup theano env (generate .theanorc.txt)
|
||||
call python gen_theanorc.py %ANACONDA2% .theanorc.txt
|
||||
del %userprofile%\.theanorc.txt /Q /F
|
||||
copy .theanorc.txt %userprofile% /Y
|
||||
|
||||
call python write_script.py %1
|
||||
|
||||
call worker.bat
|
||||
|
||||
@echo delete theano env
|
||||
del %userprofile%\.theanorc.txt /Q /F
|
||||
|
||||
popd
|
||||
|
||||
popd
|
||||
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
import re, os, numpy, sys
|
||||
|
||||
|
||||
filename = r'.\gpu_usage_draft'
|
||||
|
||||
|
||||
|
||||
def GrabGPU():
|
||||
cmdstr = '\"C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe\" > ' + filename
|
||||
os.system(cmdstr)
|
||||
|
||||
def GetGPUUSage():
|
||||
pattern = re.compile(r'(?P<num>[0-9]+)MiB[\s]+/')
|
||||
mem = []
|
||||
fo = open(filename, 'r')
|
||||
for line in fo:
|
||||
result = pattern.search(line)
|
||||
if result:
|
||||
mem.append(int(result.group('num')))
|
||||
fo.close()
|
||||
|
||||
return numpy.array(mem).argsort()[0]
|
||||
|
||||
def print_script(cmd):
|
||||
GrabGPU()
|
||||
with open('worker.bat', 'w') as f:
|
||||
f.write('@echo off\nsetlocal ENABLEDELAYEDEXPANSION\n')
|
||||
if len(cmd) == 1:
|
||||
f.write('set THEANO_FLAGS=device=gpu%d\n' % GetGPUUSage())
|
||||
f.write('python ' + cmd[0])
|
||||
elif len(cmd) == 2:
|
||||
f.write('set THEANO_FLAGS=device=gpu' + cmd[1] + '\n')
|
||||
f.write('python ' + cmd[0])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print_script(sys.argv[1:])
|
||||
|
||||
# os.system('del /q ' + filename + rank)
|
|
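For example (the card index is hypothetical), running `python write_script.py train_clm_WithDropout_lr0.5.py` on a node whose card with the lowest reported memory usage is GPU 3 would generate a worker.bat containing:

    @echo off
    setlocal ENABLEDELAYEDEXPANSION
    set THEANO_FLAGS=device=gpu3
    python train_clm_WithDropout_lr0.5.py

Passing an explicit card index as a second argument (e.g. `python write_script.py train_clm_WithDropout_lr0.5.py 0`) skips the nvidia-smi probe and pins the job to that device.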
@ -0,0 +1,40 @@
|
|||
import sys
|
||||
|
||||
mapper_machine_freecard = {}
|
||||
mapper_machine_rank = {}
|
||||
|
||||
def MapIDs(m_machine):
|
||||
for i in range(m_machine):
|
||||
fo = open('record' + str(i))
|
||||
id = 0
|
||||
m_line = 0
|
||||
machine_name = ''
|
||||
for line in fo:
|
||||
if id == 0:
|
||||
machine_name = line[:-1]
|
||||
mapper_machine_freecard[machine_name] = []
|
||||
if mapper_machine_rank.has_key(machine_name):
|
||||
mapper_machine_rank[machine_name].append(i)
|
||||
else:
|
||||
mapper_machine_rank[machine_name] = [i]
|
||||
elif id > 1:
|
||||
mapper_machine_freecard[machine_name].append(int(line))
|
||||
id = id + 1
|
||||
fo.close()
|
||||
|
||||
def Map_Rank_Card(m_machine):
|
||||
MapIDs(m_machine)
|
||||
allocations = range(m_machine)
|
||||
for k in mapper_machine_rank.keys():
|
||||
ranks = mapper_machine_rank[k]
|
||||
cards = mapper_machine_freecard[k]
|
||||
#if len(ranks) == len(cards):
|
||||
for i in range(len(ranks)):
|
||||
allocations[ranks[i]] = cards[i]
|
||||
|
||||
for l in allocations:
|
||||
print l
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
Map_Rank_Card(int(sys.argv[1]))
|
|
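To make the mapping concrete, suppose two ranks land on the same machine and their probe output (hostname, number of free cards, then one free card index per line, as produced by the per-rank GPU-probe script later in this commit) has been redirected to record0 and record1, both reading:

    GCR-NODE-01
    2
    2
    5

Running this script with argument 2 then prints `2` and `5`, i.e. rank 0 is assigned card 2 and rank 1 card 5. The hostname and card indices are made up for illustration.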
@ -0,0 +1,369 @@
|
|||
"""
|
||||
data loading and minibatch generation
|
||||
"""
|
||||
__author__ = 'v-yirwan'
|
||||
|
||||
import cPickle as pkl
|
||||
import gzip
|
||||
import os
|
||||
import numpy
|
||||
from theano import config
|
||||
|
||||
def get_dataset_file(dataset, default_dataset, origin):
|
||||
'''
|
||||
Look for it as if it was a full path, if not, try local file,
|
||||
if not try in the data directory.
|
||||
|
||||
Download dataset if it is not present
|
||||
'''
|
||||
data_dir, data_file = os.path.split(dataset)
|
||||
if data_dir == "" and not os.path.isfile(dataset):
|
||||
# Check if dataset is in the data directory.
|
||||
new_path = os.path.join(
|
||||
os.path.split(__file__)[0],
|
||||
"..",
|
||||
"data",
|
||||
dataset
|
||||
)
|
||||
if os.path.isfile(new_path) or data_file == default_dataset:
|
||||
dataset = new_path
|
||||
|
||||
if (not os.path.isfile(dataset)) and data_file == default_dataset:
|
||||
from six.moves import urllib
|
||||
print('Downloading data from %s' % origin)
|
||||
urllib.request.urlretrieve(origin, dataset)
|
||||
|
||||
return dataset
|
||||
|
||||
def load_data(path="imdb.pkl", n_words=100000, maxlen=None,
|
||||
sort_by_len=True, fixed_valid=True, valid_portion=0.1):
|
||||
'''
|
||||
Loads the dataset
|
||||
:type path: String
|
||||
:param path: The path to the dataset (here IMDB)
|
||||
:type n_words: int
|
||||
:param n_words: The number of word to keep in the vocabulary.
|
||||
All extra words are set to unknown (1).
|
||||
:type maxlen: None or positive int
|
||||
:param maxlen: the max sequence length we use in the train/valid set.
|
||||
:type sort_by_len: bool
|
||||
:param sort_by_len: Sort by sequence length for the train,
|
||||
valid and test set. This allows faster execution as it causes
|
||||
less padding per minibatch. Another mechanism must be used to
|
||||
shuffle the train set at each epoch.
|
||||
:type fixed_valid: bool
|
||||
:param fixed_valid: load fixed validation set from the corpus file,
|
||||
which would otherwise be picked randomly from the training set with
|
||||
proportion [valid_portion]
|
||||
:type valid_portion: float
|
||||
:param valid_portion: The proportion of the full train set used for
|
||||
the validation set.
|
||||
|
||||
'''
|
||||
|
||||
# Load the dataset
|
||||
path = get_dataset_file(
|
||||
path, "imdb.pkl",
|
||||
"http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
|
||||
if path.endswith(".gz"):
|
||||
f = gzip.open(path, 'rb')
|
||||
else:
|
||||
f = open(path, 'rb')
|
||||
|
||||
train_set = pkl.load(f)
|
||||
if fixed_valid:
|
||||
valid_set = pkl.load(f)
|
||||
test_set = pkl.load(f)
|
||||
f.close()
|
||||
|
||||
def _truncate_data(train_set):
|
||||
'''
|
||||
truncate sequences whose lengths exceed the max-len threshold
|
||||
:param train_set: a list of sequences list and corresponding labels list
|
||||
:return: truncated train_set
|
||||
'''
|
||||
new_train_set_x = []
|
||||
new_train_set_y = []
|
||||
for x, y in zip(train_set[0], train_set[1]):
|
||||
if len(x) < maxlen:
|
||||
new_train_set_x.append(x)
|
||||
new_train_set_y.append(y)
|
||||
train_set = (new_train_set_x, new_train_set_y)
|
||||
del new_train_set_x, new_train_set_y
|
||||
return train_set
|
||||
|
||||
def _set_valid(train_set, valid_portion):
|
||||
'''
|
||||
set validation with [valid_portion] proportion of training set
|
||||
'''
|
||||
train_set_x, train_set_y = train_set
|
||||
n_samples = len(train_set_x)
|
||||
sidx = numpy.random.permutation(n_samples) # shuffle data
|
||||
n_train = int(numpy.round(n_samples * (1. - valid_portion)))
|
||||
valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
|
||||
valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
|
||||
train_set_x = [train_set_x[s] for s in sidx[:n_train]]
|
||||
train_set_y = [train_set_y[s] for s in sidx[:n_train]]
|
||||
train_set = (train_set_x, train_set_y)
|
||||
valid_set = (valid_set_x, valid_set_y)
|
||||
del train_set_x, train_set_y, valid_set_x, valid_set_y
|
||||
return train_set, valid_set
|
||||
|
||||
if maxlen:
|
||||
train_set = _truncate_data(train_set)
|
||||
if fixed_valid:
|
||||
print 'Loading with fixed validation set...',
|
||||
valid_set = _truncate_data(valid_set)
|
||||
else:
|
||||
print 'Setting validation set with proportion:', valid_portion, '...',
|
||||
train_set, valid_set = _set_valid(train_set, valid_portion)
|
||||
test_set = _truncate_data(test_set)
|
||||
|
||||
if maxlen is None and not fixed_valid:
|
||||
train_set, valid_set = _set_valid(train_set, valid_portion)
|
||||
|
||||
def remove_unk(x):
|
||||
return [[1 if w >= n_words else w for w in sen] for sen in x]
|
||||
|
||||
test_set_x, test_set_y = test_set
|
||||
valid_set_x, valid_set_y = valid_set
|
||||
train_set_x, train_set_y = train_set
|
||||
|
||||
# remove unk from dataset
|
||||
train_set_x = remove_unk(train_set_x) # use 1 if unk
|
||||
valid_set_x = remove_unk(valid_set_x)
|
||||
test_set_x = remove_unk(test_set_x)
|
||||
|
||||
def len_argsort(seq):
|
||||
return sorted(range(len(seq)), key=lambda x: len(seq[x]))
|
||||
|
||||
if sort_by_len:
|
||||
sorted_index = len_argsort(test_set_x)
|
||||
# ranked from shortest to longest
|
||||
test_set_x = [test_set_x[i] for i in sorted_index]
|
||||
test_set_y = [test_set_y[i] for i in sorted_index]
|
||||
|
||||
sorted_index = len_argsort(valid_set_x)
|
||||
valid_set_x = [valid_set_x[i] for i in sorted_index]
|
||||
valid_set_y = [valid_set_y[i] for i in sorted_index]
|
||||
|
||||
sorted_index = len_argsort(train_set_x)
|
||||
train_set_x = [train_set_x[i] for i in sorted_index]
|
||||
train_set_y = [train_set_y[i] for i in sorted_index]
|
||||
|
||||
train = (train_set_x, train_set_y)
|
||||
valid = (valid_set_x, valid_set_y)
|
||||
test = (test_set_x, test_set_y)
|
||||
|
||||
return train, valid, test
|
||||
|
||||
def load_mnist(path='mnist.pkl', fixed_permute=True, rand_permute=False):
|
||||
f = open(path, 'rb')
|
||||
train = pkl.load(f)
|
||||
valid = pkl.load(f)
|
||||
test = pkl.load(f)
|
||||
f.close()
|
||||
|
||||
def _permute(data, perm):
|
||||
x, y = data
|
||||
x_new = []
|
||||
for xx in x:
|
||||
xx_new = [xx[pp] for pp in perm]
|
||||
x_new.append(xx_new)
|
||||
return (x_new, y)
|
||||
|
||||
def _trans2list(data):
|
||||
x, y = data
|
||||
x = [list(xx) for xx in x]
|
||||
return (x, y)
|
||||
|
||||
if rand_permute:
|
||||
print 'Using a fixed random permutation of pixels...',
|
||||
perm = numpy.random.permutation(range(784))
|
||||
train = _permute(train, perm)
|
||||
valid = _permute(valid, perm)
|
||||
test = _permute(test, perm)
|
||||
elif fixed_permute:
|
||||
print 'Using permuted dataset...',
|
||||
|
||||
train = _trans2list(train)
|
||||
valid = _trans2list(valid)
|
||||
test = _trans2list(test)
|
||||
|
||||
return train, valid, test
|
||||
|
||||
def get_minibatches_idx(n, minibatch_size, shuffle=False):
|
||||
"""
|
||||
Used to shuffle the dataset at each iteration.
|
||||
"""
|
||||
|
||||
idx_list = numpy.arange(n, dtype="int32")
|
||||
|
||||
if shuffle:
|
||||
numpy.random.shuffle(idx_list)
|
||||
|
||||
minibatches = []
|
||||
minibatch_start = 0
|
||||
for i in range(n // minibatch_size):
|
||||
minibatches.append(idx_list[minibatch_start:
|
||||
minibatch_start + minibatch_size])
|
||||
minibatch_start += minibatch_size
|
||||
|
||||
if (minibatch_start != n):
|
||||
# Make a minibatch out of what is left
|
||||
minibatches.append(idx_list[minibatch_start:])
|
||||
|
||||
return zip(range(len(minibatches)), minibatches)
|
||||
|
||||
def get_minibatches_idx_bucket(dataset, minibatch_size, shuffle=False):
|
||||
"""
|
||||
divide into different buckets according to sequence lengths
|
||||
dynamic batch size
|
||||
"""
|
||||
# divide into buckets
|
||||
slen = [len(ss) for ss in dataset]
|
||||
bucket1000 = [sidx for sidx in xrange(len(dataset))
|
||||
if slen[sidx] > 0 and slen[sidx] <= 1000]
|
||||
bucket3000 = [sidx for sidx in xrange(len(dataset))
|
||||
if slen[sidx] > 1000 and slen[sidx] <= 3000]
|
||||
bucket_long = [sidx for sidx in xrange(len(dataset))
|
||||
if slen[sidx] > 3000]
|
||||
|
||||
# shuffle each bucket
|
||||
if shuffle:
|
||||
numpy.random.shuffle(bucket1000)
|
||||
numpy.random.shuffle(bucket3000)
|
||||
numpy.random.shuffle(bucket_long)
|
||||
|
||||
# make minibatches
|
||||
def _make_batch(minibatches, bucket, minibatch_size):
|
||||
minibatch_start = 0
|
||||
n = len(bucket)
|
||||
for i in range(n // minibatch_size):
|
||||
minibatches.append(bucket[minibatch_start : minibatch_start + minibatch_size])
|
||||
minibatch_start += minibatch_size
|
||||
if (minibatch_start != n):
|
||||
# Make a minibatch out of what is left
|
||||
minibatches.append(bucket[minibatch_start:])
|
||||
return minibatches
|
||||
|
||||
minibatches = []
|
||||
_make_batch(minibatches, bucket1000, minibatch_size=minibatch_size)
|
||||
_make_batch(minibatches, bucket3000, minibatch_size=minibatch_size//2)
|
||||
_make_batch(minibatches, bucket_long, minibatch_size=minibatch_size//8)
|
||||
|
||||
# shuffle minibatches
|
||||
numpy.random.shuffle(minibatches)
|
||||
|
||||
return zip(range(len(minibatches)), minibatches)
|
||||
|
||||
def prepare_data(seqs, labels, maxlen=None, dataset='text'):
|
||||
"""Create the matrices from the datasets.
|
||||
|
||||
This pads each sequence to the same length: the length of the
|
||||
longest sequence or maxlen.
|
||||
|
||||
If maxlen is set, we will cut all sequences to this maximum
|
||||
length.
|
||||
|
||||
This swaps the axes (time becomes the first dimension)!
|
||||
"""
|
||||
# x: a list of sentences
|
||||
lengths = [len(s) for s in seqs]
|
||||
|
||||
if maxlen is not None:
|
||||
new_seqs = []
|
||||
new_labels = []
|
||||
new_lengths = []
|
||||
for l, s, y in zip(lengths, seqs, labels):
|
||||
if l < maxlen:
|
||||
new_seqs.append(s)
|
||||
new_labels.append(y)
|
||||
new_lengths.append(l)
|
||||
lengths = new_lengths
|
||||
labels = new_labels
|
||||
seqs = new_seqs
|
||||
|
||||
if len(lengths) < 1:
|
||||
return None, None, None
|
||||
|
||||
n_samples = len(seqs)
|
||||
maxlen = numpy.max(lengths)
|
||||
|
||||
if dataset == 'mnist':
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('float32')
|
||||
else:
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('int64')
|
||||
x_mask = numpy.zeros((maxlen, n_samples)).astype(config.floatX)
|
||||
for idx, s in enumerate(seqs):
|
||||
x[:lengths[idx], idx] = s
|
||||
x_mask[:lengths[idx], idx] = 1.
|
||||
|
||||
return x, x_mask, labels
|
||||
|
||||
def prepare_data_hier(seqs, labels, hier_len, maxlen=None, dataset='text'):
|
||||
'''
|
||||
prepare minibatch for hierarchical model
|
||||
'''
|
||||
# sort (long->short)
|
||||
sorted_idx = sorted(range(len(seqs)), key=lambda x: len(seqs[x]), reverse=True)
|
||||
seqs = [seqs[i] for i in sorted_idx]
|
||||
labels = [labels[i] for i in sorted_idx]
|
||||
|
||||
# truncate data
|
||||
lengths = [len(s) for s in seqs]
|
||||
if maxlen is not None:
|
||||
new_seqs = []
|
||||
new_labels = []
|
||||
new_lengths = []
|
||||
for l, s, y in zip(lengths, seqs, labels):
|
||||
if l < maxlen:
|
||||
new_seqs.append(s)
|
||||
new_labels.append(y)
|
||||
new_lengths.append(l)
|
||||
lengths = new_lengths
|
||||
labels = new_labels
|
||||
seqs = new_seqs
|
||||
if len(lengths) < 1:
|
||||
return None, None, None
|
||||
|
||||
# set batch size
|
||||
n_samples = len(seqs)
|
||||
maxlen = numpy.max(lengths)
|
||||
if maxlen % hier_len == 0:
|
||||
n_batch = maxlen/hier_len
|
||||
else:
|
||||
n_batch = maxlen//hier_len + 1
|
||||
maxlen = n_batch * hier_len
|
||||
|
||||
# padding whole batch
|
||||
if dataset == 'mnist':
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('float32')
|
||||
else:
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('int64')
|
||||
x_mask = numpy.zeros((maxlen, n_samples)).astype(config.floatX)
|
||||
for idx, s in enumerate(seqs):
|
||||
x[:lengths[idx], idx] = s
|
||||
x_mask[:lengths[idx], idx] = 1
|
||||
|
||||
# slice to mini-batches
|
||||
x_batch = [x[bidx*hier_len:(bidx+1)*hier_len, :] for bidx in range(n_batch)]
|
||||
if dataset == 'mnist':
|
||||
x_batch = numpy.array(x_batch).astype('float32')
|
||||
else:
|
||||
x_batch = numpy.array(x_batch).astype('int64')
|
||||
mask_batch = [x_mask[bidx*hier_len:(bidx+1)*hier_len, :] for bidx in range(n_batch)]
|
||||
mask_batch = numpy.array(mask_batch).astype(config.floatX)
|
||||
|
||||
# mask for hier-level
|
||||
mask_hier = numpy.ones((n_batch, n_samples)).astype(config.floatX)
|
||||
for idx in range(n_samples):
|
||||
mpos = numpy.where(x_mask[:, idx]==0)[0]
|
||||
if len(mpos) == 0:
|
||||
continue
|
||||
bidx = min(mpos[0]//hier_len+1, n_batch)
|
||||
if mpos[0] % hier_len == 0:
|
||||
bidx -= 1 # bug fixed TODO: more elegant solution?
|
||||
mask_hier[bidx:, idx] = 0
|
||||
|
||||
return x_batch, mask_batch, mask_hier, labels
|
|
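A small worked example of what prepare_data produces (the token ids are made up): with seqs = [[4, 7, 9], [3, 5]] and labels = [1, 0], the padded batch is time-major with shape (3, 2):

    x =          x_mask =
    [[4 3]       [[1. 1.]
     [7 5]        [1. 1.]
     [9 0]]       [1. 0.]]

Each column is one sample; the mask zeroes out the padded tail of the shorter sequence, which is exactly what the mask argument of the recurrent layers below consumes.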
@ -0,0 +1,38 @@
|
|||
import re
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
|
||||
filename = r'.\gpu_usage_draft_'
|
||||
default_gpu = 58 + 30
|
||||
|
||||
|
||||
|
||||
def GrabGPU(rank):
|
||||
cmdstr = '\"C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe\" > ' + filename + rank
|
||||
os.system(cmdstr)
|
||||
|
||||
def GetGPUUSage(rank):
|
||||
pattern = re.compile(r'(?P<num>[0-9]{1,5})MiB[\s]+/')
|
||||
id = 0
|
||||
GPUs = []
|
||||
fo = open(filename + rank, 'r')
|
||||
for line in fo:
|
||||
result = pattern.search(line)
|
||||
if result:
|
||||
if int(result.group("num")) < default_gpu:
|
||||
GPUs.append(id)
|
||||
id = id + 1
|
||||
fo.close()
|
||||
|
||||
print len(GPUs)
|
||||
for gpu in GPUs:
|
||||
print gpu
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
rank = sys.argv[1]
|
||||
GrabGPU(rank)
|
||||
print socket.gethostname()
|
||||
GetGPUUSage(rank)
|
||||
#os.system('del /q ' + filename + rank)
|
|
@ -0,0 +1,838 @@
|
|||
"""
|
||||
supports simple-rnn, lstm, hierarchical lstm
|
||||
supports lstm with identity skip-connections (soft), parametric skip-connections (soft)
|
||||
supports resnet, resnet with identity skip-connections (full and soft), parametric skip-connections (soft)
|
||||
supports hybrid structure (lstm+resnet)
|
||||
"""
|
||||
|
||||
__author__ = 'v-yirwan'
|
||||
|
||||
import theano.tensor as tensor
|
||||
from Util import *
|
||||
|
||||
layers = {'lstm': ('param_init_lstm', 'lstm_layer'),
|
||||
'lstm_skip': ('param_init_lstm', 'lstm_skip_layer'),
|
||||
'lstm_pskip': ('param_init_lstm_pskip', 'lstm_pskip_layer'),
|
||||
'residual': ('param_init_residual', 'residual_layer'),
|
||||
'residual_full_skip': ('param_init_residual', 'residual_full_skip_layer'),
|
||||
'residual_skip': ('param_init_residual', 'residual_skip_layer'),
|
||||
'residual_pskip': ('param_init_residual_pskip', 'residual_pskip_layer'),
|
||||
'rnn': ('param_init_rnn', 'rnn_layer'),
|
||||
'rnn_pskip': ('param_init_rnn_pskip', 'rnn_pskip_layer'),
|
||||
# modules for ResNet Modifications
|
||||
'presidual': ('param_init_presidual', 'presidual_layer'),
|
||||
'pxresidual': ('param_init_pxresidual', 'pxresidual_layer'),
|
||||
'residual_pskip_mod': ('param_init_residual_pskip', 'residual_pskip_mod_layer')
|
||||
}
|
||||
|
||||
def _p(pp, name):
|
||||
return '%s_%s' % (pp, name)
|
||||
|
||||
def get_layer(name):
|
||||
fns = layers[name]
|
||||
return (eval(fns[0]), eval(fns[1]))
|
||||
|
||||
# ===========================
|
||||
# LSTM-related layers
|
||||
# LSTM, LSTM with identity and parametric skip connections (soft)
|
||||
# ===========================
|
||||
|
||||
def param_init_lstm(options, params, prefix='lstm', hier_level=False):
|
||||
"""
|
||||
Init the LSTM parameter
|
||||
Support hierarchical architecture
|
||||
"""
|
||||
if hier_level:
|
||||
# bug fixed: dimension matching for hier-mode
|
||||
W = numpy.concatenate([ortho_weight(options['dim_proj']),
|
||||
ortho_weight(options['dim_proj']),
|
||||
ortho_weight(options['dim_proj']),
|
||||
ortho_weight(options['dim_proj'])], axis=1)
|
||||
else:
|
||||
# bug fixed: different dim for embedding and hidden state
|
||||
W = numpy.concatenate([norm_weight(options['dim_word'], options['dim_proj']),
|
||||
norm_weight(options['dim_word'], options['dim_proj']),
|
||||
norm_weight(options['dim_word'], options['dim_proj']),
|
||||
norm_weight(options['dim_word'], options['dim_proj'])], axis=1)
|
||||
params[_p(prefix, 'W')] = W
|
||||
U = numpy.concatenate([ortho_weight(options['dim_proj']),
|
||||
ortho_weight(options['dim_proj']),
|
||||
ortho_weight(options['dim_proj']),
|
||||
ortho_weight(options['dim_proj'])], axis=1)
|
||||
params[_p(prefix, 'U')] = U
|
||||
b = numpy.zeros((4 * options['dim_proj'],))
|
||||
params[_p(prefix, 'b')] = b.astype(config.floatX)
|
||||
|
||||
return params
|
||||
|
||||
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
|
||||
nsteps = state_below.shape[0]
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
assert mask is not None
|
||||
|
||||
def _slice(_x, n, dim):
|
||||
if _x.ndim == 3:
|
||||
return _x[:, :, n * dim:(n + 1) * dim]
|
||||
return _x[:, n * dim:(n + 1) * dim]
|
||||
|
||||
def _step(m_, x_, h_, c_):
|
||||
preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
|
||||
preact += x_
|
||||
|
||||
i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
|
||||
f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
|
||||
o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
|
||||
c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
|
||||
|
||||
c = f * c_ + i * c
|
||||
c = m_[:, None] * c + (1. - m_)[:, None] * c_
|
||||
|
||||
h = o * tensor.tanh(c)
|
||||
h = m_[:, None] * h + (1. - m_)[:, None] * h_
|
||||
|
||||
return h, c
|
||||
|
||||
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
|
||||
tparams[_p(prefix, 'b')])
|
||||
|
||||
dim_proj = options['dim_proj']
|
||||
rval, updates = theano.scan(_step,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[tensor.alloc(numpy_floatX(0.),
|
||||
n_samples,
|
||||
dim_proj),
|
||||
tensor.alloc(numpy_floatX(0.),
|
||||
n_samples,
|
||||
dim_proj)],
|
||||
name=_p(prefix, '_layers'),
|
||||
n_steps=nsteps,
|
||||
truncate_gradient=options['truncate_grad'])
|
||||
return rval[0]
|
||||
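# For reference, the _step function above is the standard LSTM recurrence
# (this comment only restates what the code already does):
#   i_t = sigmoid(W_i x_t + U_i h_{t-1} + b_i)
#   f_t = sigmoid(W_f x_t + U_f h_{t-1} + b_f)
#   o_t = sigmoid(W_o x_t + U_o h_{t-1} + b_o)
#   g_t = tanh(W_c x_t + U_c h_{t-1} + b_c)
#   c_t = f_t * c_{t-1} + i_t * g_t
#   h_t = o_t * tanh(c_t)
# W, U and b are stored concatenated (4 * dim_proj columns) and cut apart by
# _slice; the mask m_ carries h and c of padded positions through unchanged.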
|
||||
def lstm_skip_layer(tparams, state_below, options, prefix='lstm_skip', mask=None):
|
||||
'''
|
||||
lstm layer with soft identity skip connections
|
||||
'''
|
||||
nsteps = state_below.shape[0]
|
||||
n_skip = options['skip_steps']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
assert mask is not None
|
||||
|
||||
def _slice(_x, n, dim):
|
||||
if _x.ndim == 3:
|
||||
return _x[:, :, n * dim:(n + 1) * dim]
|
||||
return _x[:, n * dim:(n + 1) * dim]
|
||||
|
||||
def _lstm_unit(m_, x_, h_, c_, h_skip, hcnt):
|
||||
skip_flag = tensor.eq(hcnt % n_skip, 0)
|
||||
preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
|
||||
preact += x_
|
||||
|
||||
# gates
|
||||
i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
|
||||
f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
|
||||
o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
|
||||
c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
|
||||
|
||||
# cell state
|
||||
c = f * c_ + i * c
|
||||
c = m_[:, None] * c + (1. - m_)[:, None] * c_
|
||||
# new hidden state
|
||||
h = o * tensor.tanh(c) + h_skip * skip_flag
|
||||
h = m_[:, None] * h + (1. - m_)[:, None] * h_
|
||||
# update h_skip
|
||||
h_skip = h_skip * (1-skip_flag) + h * skip_flag
|
||||
hcnt += 1
|
||||
|
||||
return h, c, h_skip, hcnt
|
||||
|
||||
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
|
||||
tparams[_p(prefix, 'b')])
|
||||
|
||||
dim_proj = options['dim_proj']
|
||||
h = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
c = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
h_skip = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
hcnt = tensor.zeros_like(theano.shared(10.).astype('float32'))
|
||||
rval, updates = theano.scan(_lstm_unit,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[h, c, h_skip, hcnt],
|
||||
name=_p(prefix, 'layers'),
|
||||
n_steps=nsteps,
|
||||
truncate_gradient=options['truncate_grad'])
|
||||
# return all hidden states h(t)
|
||||
return rval[0]
|
||||
|
||||
def param_init_lstm_pskip(options, params, prefix='lstm_pskip', hier_level=False):
|
||||
"""
|
||||
Init the LSTM-pskip parameter
|
||||
"""
|
||||
# same as vanilla lstm layer
|
||||
params = param_init_lstm(options, params, prefix=prefix, hier_level=hier_level)
|
||||
# weight for skip connection
|
||||
params[_p(prefix, 'W_skip')] = numpy.array([numpy.random.random_sample()]).astype('float32')[0]
|
||||
# random value in (0,1)
|
||||
|
||||
return params
|
||||
|
||||
def lstm_pskip_layer(tparams, state_below, options, prefix='lstm_pskip', mask=None):
|
||||
'''
|
||||
lstm layer with soft parametric weighted skip connections
|
||||
'''
|
||||
nsteps = state_below.shape[0]
|
||||
n_skip = options['skip_steps']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
assert mask is not None
|
||||
|
||||
def _slice(_x, n, dim):
|
||||
if _x.ndim == 3:
|
||||
return _x[:, :, n * dim:(n + 1) * dim]
|
||||
return _x[:, n * dim:(n + 1) * dim]
|
||||
|
||||
def _lstm_unit(m_, x_, h_, c_, h_skip, hcnt):
|
||||
'''
|
||||
lstm_soft_pskip unit at each time step
|
||||
:param m_: mask
|
||||
:param x_: x(t) input
|
||||
:param h_: h(t-1) recurrent hidden state
|
||||
:param c_: c(t-1) cell state
|
||||
:param h_skip: h(t-n_skip) for skip connection
|
||||
:param hcnt: mark current time stamp (to determine whether skip connection exists)
|
||||
:return: h(t), c(t), h_skip, hcnt
|
||||
'''
|
||||
skip_flag = tensor.eq(hcnt % n_skip, 0)
|
||||
preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
|
||||
preact += x_
|
||||
|
||||
# gates
|
||||
i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
|
||||
f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
|
||||
o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
|
||||
c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
|
||||
|
||||
# cell state
|
||||
c = f * c_ + i * c
|
||||
c = m_[:, None] * c + (1. - m_)[:, None] * c_
|
||||
# new hidden state
|
||||
h = o * tensor.tanh(c) + h_skip * skip_flag * tparams[_p(prefix, 'W_skip')]
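# parametric variant: unlike the identity-skip unit above, the contribution of
# h(t - n_skip) is scaled by the learned scalar W_skip.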
|
||||
h = m_[:, None] * h + (1. - m_)[:, None] * h_
|
||||
# update h_skip
|
||||
h_skip = h_skip * (1-skip_flag) + h * skip_flag
|
||||
hcnt += 1 # advance the time-step counter (bug fixed)
|
||||
|
||||
return h, c, h_skip, hcnt
|
||||
|
||||
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
|
||||
tparams[_p(prefix, 'b')])
|
||||
|
||||
dim_proj = options['dim_proj']
|
||||
h = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
c = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
h_skip = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
hcnt = tensor.zeros_like(theano.shared(10.).astype('float32'))
|
||||
rval, updates = theano.scan(_lstm_unit,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[h, c, h_skip, hcnt],
|
||||
name=_p(prefix, 'layers'),
|
||||
n_steps=nsteps,
|
||||
truncate_gradient=options['truncate_grad'])
|
||||
# return all hidden states h(t)
|
||||
return rval[0]
|
||||
|
||||
|
||||
# ===========================
|
||||
# ResNet-related layers
|
||||
# ResNet, ResNet with identity skip connections (full and soft),
|
||||
# ResNet with parametric skip connections(soft)
|
||||
# ===========================
|
||||
|
||||
def param_init_residual(options, params, prefix='residual'):
|
||||
"""
|
||||
Init the residual_network parameter:
|
||||
"""
|
||||
# weight for input x
|
||||
depth = options['unit_depth']
|
||||
Wx = dict()
|
||||
for idx in xrange(depth):
|
||||
Wx[idx] = norm_weight(options['dim_word'], options['dim_proj'])
|
||||
W = numpy.concatenate([ww for kk, ww in Wx.iteritems()], axis=1)
|
||||
params[_p(prefix, 'W')] = W
|
||||
b = numpy.zeros((depth * options['dim_proj'],))
|
||||
params[_p(prefix, 'b')] = b.astype(config.floatX)
|
||||
|
||||
# weight for identity connection
|
||||
'''
|
||||
w_res = numpy.array([numpy.random.random_sample()]).astype('float32')[0]
|
||||
params[_p(prefix, 'w_res')] = w_res.astype(config.floatX)
|
||||
'''
|
||||
# weight for inter-states
|
||||
for idx in xrange(depth):
|
||||
U = ortho_weight(options['dim_proj'])
|
||||
params[_p(prefix, 'U'+str(idx+1))] = U
|
||||
|
||||
return params
|
||||
|
||||
def residual_layer(tparams, state_below, options, prefix='residual', mask=None):
|
||||
'''
|
||||
vanilla residual layer (recurrent depth adjustable)
|
||||
'''
|
||||
# here state_below in x_emb
|
||||
nsteps = state_below.shape[0]
|
||||
depth = options['unit_depth']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
assert mask is not None
|
||||
|
||||
def _slice(_x, n, dim):
|
||||
if _x.ndim == 3:
|
||||
return _x[:, :, n * dim:(n + 1) * dim]
|
||||
return _x[:, n * dim:(n + 1) * dim]
|
||||
|
||||
def _resblock(m_, x_, h_):
|
||||
y = h_
|
||||
for idx in xrange(depth):
|
||||
hy = tensor.dot(y, tparams[_p(prefix, 'U'+str(idx+1))])
|
||||
# y(i) = sigmoid(Wx(t)+b + Uy(i-1))
|
||||
y = tensor.nnet.sigmoid(_slice(x_, idx, options['dim_proj']) + hy)
|
||||
h = tensor.tanh(h_ + y)
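# residual update: h(t) = tanh(h(t-1) + F(x(t), h(t-1))), where F is the
# depth-step block computed by the loop above.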
|
||||
h = m_[:, None] * h + (1. - m_)[:, None] * h_
|
||||
return h
|
||||
|
||||
# state_below = W*x(t)+b (for all inter_state y)
|
||||
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
|
||||
tparams[_p(prefix, 'b')])
|
||||
|
||||
h = tensor.alloc(numpy_floatX(0.), n_samples, options['dim_proj'])
|
||||
rval, updates = theano.scan(_resblock,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[h],
|
||||
name=_p(prefix, 'layers'),
|
||||
n_steps=nsteps,
|
||||
truncate_gradient=options['truncate_grad'])
|
||||
return rval # bug fixed: scan has a single output here, so return rval itself, not rval[0]
|
||||
|
||||
def residual_full_skip_layer(tparams, state_below, options, prefix='residual_full_skip', mask=None):
|
||||
'''
|
||||
residual layer with full skip connections (direct link without weight)
|
||||
'''
|
||||
# here state_below in x_emb
|
||||
nsteps = state_below.shape[0]
|
||||
depth = options['unit_depth']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
assert mask is not None
|
||||
|
||||
def _slice(_x, n, dim):
|
||||
if _x.ndim == 3:
|
||||
return _x[:, :, n * dim:(n + 1) * dim]
|
||||
return _x[:, n * dim:(n + 1) * dim]
|
||||
|
||||
# input mask, x(t), h(t-1), H(t-1)
|
||||
def _resblock(m_, x_, h_, H_):
|
||||
y = h_
|
||||
for idx in xrange(depth):
|
||||
hy = tensor.dot(y, tparams[_p(prefix, 'U'+str(idx+1))])
|
||||
# y(i) = sigmoid(Wx(t)+b + Uy(i-1))
|
||||
y = tensor.nnet.sigmoid(_slice(x_, idx, options['dim_proj']) + hy)
|
||||
# new hidden state
|
||||
h = tensor.tanh(h_ + y + H_[:,:,0])
|
||||
h = m_[:, None] * h + (1. - m_)[:, None] * h_ # mask
|
||||
# update skip hidden matrix
|
||||
H = tensor.zeros_like(H_)
|
||||
H = tensor.set_subtensor(H[:,:,:-1], H_[:,:,1:])
|
||||
H = tensor.set_subtensor(H[:,:,-1], h)
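# H acts as a rolling buffer of the last n_skip hidden states: shift every
# slice one position to the left and write the newest state into the last
# slot, so H_[:,:,0] used above is always the state from n_skip steps back.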
|
||||
return h, H
|
||||
|
||||
# state_below = W*x(t)+b (for all inter_state y)
|
||||
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
|
||||
tparams[_p(prefix, 'b')])
|
||||
|
||||
dim_proj = options['dim_proj']
|
||||
n_skip = options['skip_steps']
|
||||
h = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
H = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj, n_skip)
|
||||
rval, updates = theano.scan(_resblock,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[h, H],
|
||||
name=_p(prefix, '_layers'),
|
||||
n_steps=nsteps,
|
||||
truncate_gradient=options['truncate_grad'])
|
||||
return rval[0] # return all hidden states h
|
||||
|
||||
def residual_skip_layer(tparams, state_below, options, prefix='residual_skip', mask=None):
|
||||
'''
|
||||
residual layer with (soft) skip connections (direct link without weight)
|
||||
'''
|
||||
# here state_below in x_emb
|
||||
nsteps = state_below.shape[0]
|
||||
depth = options['unit_depth']
|
||||
dim_proj = options['dim_proj']
|
||||
n_skip = options['skip_steps']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
assert mask is not None
|
||||
|
||||
def _slice(_x, n, dim):
|
||||
if _x.ndim == 3:
|
||||
return _x[:, :, n * dim:(n + 1) * dim]
|
||||
return _x[:, n * dim:(n + 1) * dim]
|
||||
|
||||
# input mask, x(t), h(t-1), h(skip), time_idx
|
||||
def _resblock(m_, x_, h_, h_skip, hcnt):
|
||||
y = h_
|
||||
skip_flag = theano.tensor.eq(hcnt%n_skip, 0)
|
||||
for idx in xrange(depth):
|
||||
hy = tensor.dot(y, tparams[_p(prefix, 'U'+str(idx+1))])
|
||||
# y(i) = sigmoid(Wx(t)+b + Uy(i-1))
|
||||
y = tensor.nnet.sigmoid(_slice(x_, idx, options['dim_proj']) + hy)
|
||||
# new hidden state
|
||||
h = tensor.tanh(h_ + y + h_skip*skip_flag)
|
||||
h = m_[:, None] * h + (1. - m_)[:, None] * h_ # mask
|
||||
# update h(skip)
|
||||
h_skip = h_skip*(1-skip_flag) + h*skip_flag
|
||||
hcnt += 1
|
||||
return h, h_skip, hcnt
|
||||
|
||||
# state_below = W*x(t)+b (for all inter_state y)
|
||||
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
|
||||
tparams[_p(prefix, 'b')])
|
||||
|
||||
h = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
h_skip = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
# fixme: 0-dim init
|
||||
hcnt = tensor.zeros_like(theano.shared(10.).astype('float32'))
|
||||
rval, updates = theano.scan(_resblock,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[h, h_skip, hcnt],
|
||||
name=_p(prefix, '_layers'),
|
||||
n_steps=nsteps,
|
||||
truncate_gradient=options['truncate_grad'])
|
||||
return rval[0] # return all hidden states h
|
||||
|
||||
def param_init_residual_pskip(options, params, prefix='residual_pskip'):
|
||||
"""
|
||||
Init the residual network with parametric weighted skip connections:
|
||||
"""
|
||||
# weight for input x
|
||||
depth = options['unit_depth']
|
||||
Wx = dict()
|
||||
for idx in xrange(depth):
|
||||
Wx[idx] = norm_weight(options['dim_word'], options['dim_proj'])
|
||||
W = numpy.concatenate([ww for kk, ww in Wx.iteritems()], axis=1)
|
||||
params[_p(prefix, 'W')] = W
|
||||
b = numpy.zeros((depth * options['dim_proj'],))
|
||||
params[_p(prefix, 'b')] = b.astype(config.floatX)
|
||||
|
||||
# weight for skip connection
|
||||
params[_p(prefix, 'W_skip')] = numpy.array([numpy.random.random_sample()]).astype('float32')[0]
|
||||
# W_skip is a single scalar skip-connection weight, initialized uniformly in [0, 1)
|
||||
|
||||
# weight for inter-states
|
||||
for idx in xrange(depth):
|
||||
U = ortho_weight(options['dim_proj'])
|
||||
params[_p(prefix, 'U'+str(idx+1))] = U
|
||||
return params
|
||||
|
||||
def residual_pskip_layer(tparams, state_below, options, prefix='residual_pskip', mask=None):
|
||||
'''
|
||||
residual layer with soft parametric weighted skip connections
|
||||
'''
|
||||
# here state_below in x_emb
|
||||
nsteps = state_below.shape[0]
|
||||
depth = options['unit_depth']
|
||||
dim_proj = options['dim_proj']
|
||||
n_skip = options['skip_steps']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
assert mask is not None
|
||||
|
||||
def _slice(_x, n, dim):
|
||||
if _x.ndim == 3:
|
||||
return _x[:, :, n * dim:(n + 1) * dim]
|
||||
return _x[:, n * dim:(n + 1) * dim]
|
||||
|
||||
# input mask, x(t), h(t-1), h(skip), time_idx
|
||||
def _resblock(m_, x_, h_, h_skip, hcnt):
|
||||
y = h_
|
||||
skip_flag = theano.tensor.eq(hcnt%n_skip, 0)
|
||||
for idx in xrange(depth):
|
||||
hy = tensor.dot(y, tparams[_p(prefix, 'U'+str(idx+1))])
|
||||
# y(i) = sigmoid(Wx(t)+b + Uy(i-1))
|
||||
y = tensor.nnet.sigmoid(_slice(x_, idx, options['dim_proj']) + hy)
|
||||
# new hidden state
|
||||
h = tensor.tanh(h_ + y + h_skip * skip_flag * tparams[_p(prefix, 'W_skip')])
|
||||
h = m_[:, None] * h + (1. - m_)[:, None] * h_ # mask
|
||||
# update h(skip)
|
||||
h_skip = h_skip*(1-skip_flag) + h*skip_flag
|
||||
hcnt += 1
|
||||
return h, h_skip, hcnt
|
||||
|
||||
# state_below = W*x(t)+b (for all inter_state y)
|
||||
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
|
||||
tparams[_p(prefix, 'b')])
|
||||
|
||||
h = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
h_skip = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
# fixme: 0-dim init
|
||||
hcnt = tensor.zeros_like(theano.shared(10.).astype('float32'))
|
||||
rval, updates = theano.scan(_resblock,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[h, h_skip, hcnt],
|
||||
name=_p(prefix, '_layers'),
|
||||
n_steps=nsteps,
|
||||
truncate_gradient=options['truncate_grad'])
|
||||
return rval[0] # return all hidden states h
|
||||
|
||||
|
||||
# ===========================
|
||||
# RNN-related layers
|
||||
# simple rnn and rnn with parametric skip connections (soft)
|
||||
# ===========================
|
||||
|
||||
def param_init_rnn(options, params, prefix='rnn', hier_level=False):
|
||||
'''
|
||||
Initialize parameters for simple rnn unit
|
||||
Support hierarchical architecture
|
||||
'''
|
||||
if hier_level:
|
||||
W = ortho_weight(options['dim_proj'])
|
||||
else:
|
||||
W = norm_weight(options['dim_word'], options['dim_proj'])
|
||||
params[_p(prefix, 'W')] = W
|
||||
U = ortho_weight(options['dim_proj'])
|
||||
params[_p(prefix, 'U')] = U
|
||||
b = numpy.zeros((options['dim_proj']))
|
||||
params[_p(prefix, 'b')] = b.astype(config.floatX)
|
||||
|
||||
return params
|
||||
|
||||
def rnn_layer(tparams, state_below, options, prefix='rnn', mask=None):
|
||||
nsteps = state_below.shape[0]
|
||||
dim_proj = options['dim_proj']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
assert mask is not None
|
||||
|
||||
# input: mask, x(t), h(t-1)
|
||||
def _rnn_unit(m_, x_, h_):
|
||||
h = tensor.tanh(tensor.dot(x_, tparams[_p(prefix, 'W')]) +
|
||||
tensor.dot(h_, tparams[_p(prefix, 'U')]) +
|
||||
tparams[_p(prefix, 'b')])
|
||||
h = m_[:, None] * h + (1.-m_)[:, None] * h_ # mask
|
||||
return h
|
||||
|
||||
h = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
rval, updates = theano.scan(_rnn_unit,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[h],
|
||||
name=_p(prefix, 'layers'),
|
||||
n_steps=nsteps,
|
||||
truncate_gradient=options['truncate_grad'])
|
||||
return rval
|
||||
|
||||
def param_init_rnn_pskip(options, params, prefix='rnn_pskip', hier_level=False):
|
||||
'''
|
||||
Initialize parameters for simple-rnn unit with parametric soft skip connections
|
||||
'''
|
||||
# weight for vanilla simple-rnn
|
||||
params = param_init_rnn(options, params, prefix=prefix, hier_level=hier_level)
|
||||
# weight for skip connection
|
||||
params[_p(prefix, 'W_skip')] = numpy.array([numpy.random.random_sample()]).astype('float32')[0]
|
||||
|
||||
return params
|
||||
|
||||
def rnn_pskip_layer(tparams, state_below, options, prefix='rnn_pskip', mask=None):
|
||||
nsteps = state_below.shape[0]
|
||||
n_skip = options['skip_steps']
|
||||
dim_proj = options['dim_proj']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
assert mask is not None
|
||||
|
||||
def _rnn_pskip(m_, x_, h_, h_skip, hcnt):
|
||||
skip_flag = tensor.eq(hcnt % n_skip, 0)
|
||||
h = tensor.tanh(tensor.dot(x_, tparams[_p(prefix, 'W')]) +
|
||||
tensor.dot(h_, tparams[_p(prefix, 'U')]) +
|
||||
tparams[_p(prefix, 'b')] +
|
||||
skip_flag * h_skip * tparams[_p(prefix, 'W_skip')])
|
||||
h = m_[:, None] * h + (1.-m_)[:, None] * h_
|
||||
h_skip = skip_flag * h + (1-skip_flag) * h_skip
|
||||
hcnt += 1
|
||||
return h, h_skip, hcnt
|
||||
|
||||
h = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
h_skip = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
hcnt = tensor.zeros_like(theano.shared(10.).astype('float32'))
|
||||
rval, updates = theano.scan(_rnn_pskip,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[h, h_skip, hcnt],
|
||||
name=_p(prefix, 'layers'),
|
||||
n_steps=nsteps,
|
||||
truncate_gradient=options['truncate_grad'])
|
||||
return rval[0]
|
||||
|
||||
|
||||
# ===========================
|
||||
# ResNet modifications
|
||||
# ===========================
|
||||
|
||||
def residual_pskip_mod_layer(tparams, state_below, options, prefix='residual_pskip_mod', mask=None):
|
||||
'''
|
||||
residual layer with soft parametric weighted skip connections
|
||||
modifications on original pskip model
|
||||
'''
|
||||
# here state_below in x_emb
|
||||
nsteps = state_below.shape[0]
|
||||
depth = options['unit_depth']
|
||||
dim_proj = options['dim_proj']
|
||||
n_skip = options['skip_steps']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
assert mask is not None
|
||||
|
||||
def _slice(_x, n, dim):
|
||||
if _x.ndim == 3:
|
||||
return _x[:, :, n * dim:(n + 1) * dim]
|
||||
return _x[:, n * dim:(n + 1) * dim]
|
||||
|
||||
# input mask, x(t), h(t-1), h(skip), time_idx
|
||||
def _resblock_mod(m_, x_, h_, h_skip, hcnt):
|
||||
y = h_
|
||||
skip_flag = theano.tensor.eq(hcnt % n_skip, 0)
|
||||
for idx in xrange(depth):
|
||||
hy = tensor.dot(y, tparams[_p(prefix, 'U'+str(idx+1))])
|
||||
# y(i) = sigmoid(Wx(t)+b + Uy(i-1))
|
||||
y = tensor.nnet.sigmoid(_slice(x_, idx, options['dim_proj']) + hy)
|
||||
# modification: skip connection after activation
|
||||
h = tensor.tanh(h_ + y) + h_skip * skip_flag * tparams[_p(prefix, 'W_skip')]
|
||||
h = m_[:, None] * h + (1. - m_)[:, None] * h_
|
||||
h_skip = h_skip*(1-skip_flag) + h*skip_flag
|
||||
hcnt += 1
|
||||
return h, h_skip, hcnt
|
||||
|
||||
# state_below = W*x(t)+b (for all inter_state y)
|
||||
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
|
||||
tparams[_p(prefix, 'b')])
|
||||
|
||||
h = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
h_skip = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)
|
||||
# fixme: 0-dim init
|
||||
hcnt = tensor.zeros_like(theano.shared(10.).astype('float32'))
|
||||
rval, updates = theano.scan(_resblock_mod,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[h, h_skip, hcnt],
|
||||
name=_p(prefix, '_layers'),
|
||||
n_steps=nsteps,
|
||||
truncate_gradient=options['truncate_grad'])
|
||||
return rval[0] # return all hidden states h
|
||||
|
||||
def param_init_presidual(options, params, prefix='presidual', nin=None, dim=None):
|
||||
"""
|
||||
Init the parametric_residual_network parameter:
|
||||
"""
|
||||
if nin is None:
|
||||
nin = options['dim_word']
|
||||
if dim is None:
|
||||
dim = options['dim_proj']
|
||||
|
||||
# weight for input x
|
||||
depth = options['unit_depth']
|
||||
Wx = dict()
|
||||
for idx in xrange(depth):
|
||||
Wx[idx] = norm_weight(nin, dim)
|
||||
W = numpy.concatenate([ww for kk, ww in Wx.iteritems()], axis=1)
|
||||
params[_p(prefix, 'W')] = W
|
||||
b = numpy.zeros((depth * dim,))
|
||||
params[_p(prefix, 'b')] = b.astype(config.floatX)
|
||||
w_res = rand_weight(dim, 1)
|
||||
params[_p(prefix, 'w_res')] = w_res.astype(config.floatX)
|
||||
b_res = numpy.array([numpy.random.random_sample()]).astype('float32')[0]
|
||||
params[_p(prefix, 'b_res')] = b_res
|
||||
|
||||
# weight for inter-states
|
||||
for idx in xrange(depth):
|
||||
U = ortho_weight(dim)
|
||||
params[_p(prefix, 'U'+str(idx+1))] = U
|
||||
return params
|
||||
|
||||
def presidual_layer(tparams, state_below, options, prefix='presidual', mask=None,
|
||||
one_step=False, init_state=None, **kwargs):
|
||||
'''
|
||||
parametric residual layer (recurrent depth adjustable)
|
||||
parametric vector on identity connection
|
||||
'''
|
||||
if one_step:
|
||||
assert init_state, 'previous state must be provided'
|
||||
|
||||
# here state_below in x_emb
|
||||
nsteps = state_below.shape[0]
|
||||
depth = options['unit_depth']
|
||||
dim = options['dim_proj']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
if mask is None:
|
||||
mask = tensor.alloc(1., state_below.shape[0], 1)
|
||||
|
||||
def _slice(_x, n, dim):
|
||||
if _x.ndim == 3:
|
||||
return _x[:, :, n * dim:(n + 1) * dim]
|
||||
return _x[:, n * dim:(n + 1) * dim]
|
||||
|
||||
# input mask, x(t), h(t-1)
|
||||
def _presblock(m_, x_, h_):
|
||||
y = h_
|
||||
for idx in xrange(depth):
|
||||
hy = tensor.dot(y, tparams[_p(prefix, 'U'+str(idx+1))])
|
||||
y = tensor.nnet.sigmoid(_slice(x_, idx, dim) + hy)
|
||||
# p = 2*sigmoid(wh(t-1)+b)-1
|
||||
p = 2 * tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'w_res')]) + tparams[_p(prefix, 'b_res')]) - 1
|
||||
p_vec = p.reshape(p.shape[0], 1)
|
||||
# h(t) = tanh(ph(t-1)+y)
|
||||
h = tensor.tanh(tensor.dot(tensor.nlinalg.alloc_diag(p_vec), h_) + y)
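# p lies in (-1, 1); alloc_diag builds a diagonal matrix from the per-sample
# gates so that each row of h(t-1) is scaled by its own scalar weight before
# the residual term y is added.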
|
||||
h = m_[:, None] * h + (1. - m_)[:, None] * h_
|
||||
return h
|
||||
|
||||
# state_below = W*x(t)+b (for all inter_state y)
|
||||
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
|
||||
tparams[_p(prefix, 'b')])
|
||||
|
||||
if init_state is None:
|
||||
init_state = tensor.alloc(numpy_floatX(0.), n_samples, dim)
|
||||
|
||||
if one_step:
|
||||
rval = _presblock(mask, state_below, init_state)
|
||||
else:
|
||||
rval, updates = theano.scan(_presblock,
|
||||
sequences=[mask, state_below],
|
||||
outputs_info=[init_state],
|
||||
name=_p(prefix, 'layers'),
|
||||
n_steps=nsteps)
|
||||
# rval = [rval] # note: for consistency among model layers
|
||||
return rval
|
||||
|
||||
def param_init_pxresidual(options, params, prefix='pxresidual', nin=None, dim=None):
|
||||
"""
|
||||
Init the parametric (with respect to input) residual network parameter:
|
||||
"""
|
||||
if nin is None:
|
||||
nin = options['dim_word']
|
||||
if dim is None:
|
||||
dim = options['dim_proj']
|
||||
|
||||
# weight for input x
|
||||
depth = options['unit_depth']
|
||||
Wx = dict()
|
||||
for idx in xrange(depth):
|
||||
Wx[idx] = norm_weight(nin, dim)
|
||||
W = numpy.concatenate([ww for kk, ww in Wx.iteritems()], axis=1)
|
||||
params[_p(prefix, 'W')] = W
|
||||
b = numpy.zeros((depth * dim,))
|
||||
params[_p(prefix, 'b')] = b.astype(config.floatX)
|
||||
w_res = rand_weight(dim, 1)
|
||||
params[_p(prefix, 'w_res')] = w_res.astype(config.floatX)
|
||||
u_res = rand_weight(nin, 1)
|
||||
params[_p(prefix, 'u_res')] = u_res.astype(config.floatX)
|
||||
b_res = numpy.array([numpy.random.random_sample()]).astype('float32')[0]
|
||||
params[_p(prefix, 'b_res')] = b_res
|
||||
|
||||
# weight for inter-states
|
||||
for idx in xrange(depth):
|
||||
U = ortho_weight(dim)
|
||||
params[_p(prefix, 'U'+str(idx+1))] = U
|
||||
return params
|
||||
|
||||
def pxresidual_layer(tparams, state_below, options, prefix='pxresidual', mask=None,
|
||||
one_step=False, init_state=None, **kwargs):
|
||||
'''
|
||||
parametric (with respect to input) residual layer (recurrent depth adjustable)
|
||||
parametric vector on identity connection
|
||||
'''
|
||||
if one_step:
|
||||
assert init_state, 'previous state must be provided'
|
||||
|
||||
# here state_below in x_emb
|
||||
nsteps = state_below.shape[0]
|
||||
depth = options['unit_depth']
|
||||
dim = options['dim_proj']
|
||||
if state_below.ndim == 3:
|
||||
n_samples = state_below.shape[1]
|
||||
else:
|
||||
n_samples = 1
|
||||
|
||||
if mask is None:
|
||||
mask = tensor.alloc(1., state_below.shape[0], 1)
|
||||
|
||||
def _slice(_x, n, dim):
|
||||
if _x.ndim == 3:
|
||||
return _x[:, :, n * dim:(n + 1) * dim]
|
||||
return _x[:, n * dim:(n + 1) * dim]
|
||||
|
||||
# input mask, x(t), h(t-1)
|
||||
def _presblock(m_, x_, px_, h_):
|
||||
y = h_
|
||||
for idx in xrange(depth):
|
||||
hy = tensor.dot(y, tparams[_p(prefix, 'U'+str(idx+1))])
|
||||
y = tensor.nnet.sigmoid(_slice(x_, idx, dim) + hy)
|
||||
# p = 2 * sigmoid(wh(t-1) + (ux(t)+b)) - 1
|
||||
p = 2 * tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'w_res')]) + px_) - 1
|
||||
p_vec = p.reshape(p.shape[0], 1)
|
||||
# h(t) = tanh(p*h(t-1) + y)
|
||||
h = tensor.tanh(tensor.dot(tensor.nlinalg.alloc_diag(p_vec), h_) + y)
|
||||
h = m_[:, None] * h + (1. - m_)[:, None] * h_
|
||||
return h
|
||||
|
||||
# state_below_x = W*x(t)+b (for all inter_state y)
|
||||
state_below_x = tensor.dot(state_below, tparams[_p(prefix, 'W')]) \
|
||||
+ tparams[_p(prefix, 'b')]
|
||||
# state_below_px = u_res*x(t)+b_res (for parametric weight on identity connection)
|
||||
state_below_px = tensor.dot(state_below, tparams[_p(prefix, 'u_res')]) \
|
||||
+ tparams[_p(prefix, 'b_res')]
|
||||
|
||||
if init_state is None:
|
||||
init_state = tensor.alloc(numpy_floatX(0.), n_samples, dim)
|
||||
|
||||
if one_step:
|
||||
rval = _presblock(mask, state_below_x, state_below_px, init_state)
|
||||
else:
|
||||
rval, updates = theano.scan(_presblock,
|
||||
sequences=[mask, state_below_x, state_below_px],
|
||||
outputs_info=[init_state],
|
||||
name=_p(prefix, 'layers'),
|
||||
n_steps=nsteps)
|
||||
# rval = [rval] # note: for consistency among model layers
|
||||
return rval
|
|
@ -0,0 +1,17 @@
|
|||
import os
|
||||
def MapDeviceIds(comm):
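# Presumably: every MPI rank dumps its view of GPU usage to 'record<rank>',
# rank 0 then runs AllocateGPU.py to assign one card per rank (one id per
# line in 'DirtyRecord'), and each rank reads back the line matching its rank.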
|
||||
rank = comm.Get_rank()
|
||||
num_machine = comm.Get_size()
|
||||
os.system('python GPU_Usage.py ' + str(rank) + ' > record' + str(rank))
|
||||
comm.Barrier()
|
||||
if rank == 0:
|
||||
os.system('python AllocateGPU.py ' + str(num_machine) + ' > DirtyRecord')
|
||||
comm.Barrier()
|
||||
cardid = str(0)
|
||||
with open('DirtyRecord', 'r') as f:
|
||||
for idx, line in enumerate(f):
|
||||
if idx == rank:
|
||||
cardid = line.strip()
|
||||
break
|
||||
|
||||
return cardid
|
|
@ -0,0 +1,379 @@
|
|||
"""
|
||||
model for classification task
|
||||
supports simple-rnn, lstm, hierarchical lstm
|
||||
supports lstm with identity skip-connections(soft), parametric skip-connections(soft)
|
||||
supports resnet, resnet with identity skip-connections(hard and soft), parametric skip connections(soft)
|
||||
supports hybrid structure (lstm+resnet)
|
||||
supports dropout on non-recurrent layers, gradient clipping, L2-regularization
|
||||
"""
|
||||
__author__ = 'v-yirwan'
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy
|
||||
import cPickle as pkl
|
||||
import theano
|
||||
import theano.tensor as tensor
|
||||
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
|
||||
|
||||
from Layers import get_layer
|
||||
from Data import *
|
||||
from Util import *
|
||||
|
||||
# Set the random number generators' seeds for consistency
|
||||
SEED = 123
|
||||
numpy.random.seed(SEED)
|
||||
|
||||
def _p(pp, name):
|
||||
return '%s_%s' % (pp, name)
|
||||
|
||||
def init_params(options):
|
||||
"""
|
||||
Global (not LSTM) parameter. For the embedding and the classifier.
|
||||
"""
|
||||
params = OrderedDict()
|
||||
# embedding
|
||||
if options['dataset'] != 'mnist':
|
||||
randn = rand_weight(options['n_words'], options['dim_word'])
|
||||
params['Wemb'] = randn.astype(config.floatX)
|
||||
|
||||
# encoder layer
|
||||
params = get_layer(options['encoder'])[0](options, params,
|
||||
prefix=options['encoder'])
|
||||
|
||||
# classifier
|
||||
if options['lastHiddenLayer'] is not None:
|
||||
params['U'] = 0.01 * numpy.random.randn(options['lastHiddenLayer'],
|
||||
options['ydim']).astype(config.floatX)
|
||||
params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX)
|
||||
|
||||
params['ToLastHidden_W'] = 0.01 * numpy.random.randn(options['dim_proj'],
|
||||
options['lastHiddenLayer']).astype(config.floatX)
|
||||
params['ToLastHidden_b'] = numpy.zeros((options['lastHiddenLayer'],)).astype(config.floatX)
|
||||
|
||||
|
||||
else:
|
||||
params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
|
||||
options['ydim']).astype(config.floatX)
|
||||
params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX)
|
||||
|
||||
return params
|
||||
|
||||
def load_params(path, params):
|
||||
failer=0
|
||||
pp = numpy.load(path)
|
||||
for kk, vv in params.items():
|
||||
if kk not in pp:
|
||||
failer += 1
|
||||
print 'Warning: %s is not in the archive' % kk
|
||||
continue
|
||||
params[kk] = pp[kk]
|
||||
print failer, ' failed out of ', len(params)
|
||||
return params
|
||||
|
||||
def init_tparams(params):
|
||||
tparams = OrderedDict()
|
||||
for kk, pp in params.items():
|
||||
tparams[kk] = theano.shared(params[kk], name=kk)
|
||||
return tparams
|
||||
|
||||
def encoder_word_layer(tparams, state_below, options, mask=None):
|
||||
'''
|
||||
word(bottom)-level encoder for hierarchical architecture
|
||||
'''
|
||||
def _encode(x_sub, mask_sub, proj_sub):
|
||||
n_timesteps = x_sub.shape[0]
|
||||
n_samples = x_sub.shape[1]
|
||||
emb_sub = tparams['Wemb'][x_sub.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])
|
||||
proj_sub = get_layer(options['encoder'])[1](tparams, emb_sub, options,
|
||||
prefix=options['encoder']+'_word',
|
||||
mask=mask_sub)
|
||||
return proj_sub[-1]
|
||||
proj_sub = tensor.alloc(numpy_floatX(0.), state_below.shape[2], options['dim_proj'])
|
||||
rval, update = theano.scan(_encode,
|
||||
sequences=[state_below, mask],
|
||||
outputs_info=[proj_sub],
|
||||
name='word_encoder_layer',
|
||||
n_steps=state_below.shape[0])
|
||||
return rval
|
||||
|
||||
def build_model(tparams, options):
|
||||
trng = RandomStreams(SEED)
|
||||
|
||||
# Used for dropout.
|
||||
use_noise = theano.shared(numpy_floatX(0.))
|
||||
|
||||
if options['dataset'] == 'mnist':
|
||||
print 'Using mnist dataset with single number input'
|
||||
x = tensor.matrix('x', dtype='float32')
|
||||
else:
|
||||
print 'Using text dataset with embedding input'
|
||||
x = tensor.matrix('x', dtype='int64')
|
||||
mask = tensor.matrix('mask', dtype=config.floatX)
|
||||
y = tensor.vector('y', dtype='int64')
|
||||
|
||||
n_timesteps = x.shape[0]
|
||||
n_samples = x.shape[1]
|
||||
|
||||
# input word embedding
|
||||
if options['dataset'] == 'mnist':
|
||||
emb = x.reshape([n_timesteps, n_samples, options['dim_word']])
|
||||
else:
|
||||
emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])
|
||||
|
||||
# dropout on embedding
|
||||
if options['dropout_input'] > 0:
|
||||
print 'Applying drop-out on input embedding (dropout_input:', options['dropout_input'], ')'
|
||||
emb = dropout_layer(emb, options['dropout_input'], use_noise, trng)
|
||||
|
||||
# encoder information
|
||||
print 'Using', options['encoder'], 'unit'
|
||||
if options['truncate_grad'] is not None and options['truncate_grad'] > 0:
|
||||
print 'Using gradient truncation to', options['truncate_grad'], 'steps'
|
||||
else:
|
||||
options['truncate_grad'] = -1
|
||||
|
||||
# encoding layer
|
||||
proj = get_layer(options['encoder'])[1](tparams, emb, options,
|
||||
prefix=options['encoder'],
|
||||
mask=mask)
|
||||
|
||||
# pooling
|
||||
if options['mean_pooling']:
|
||||
print 'Using mean_pooling'
|
||||
proj = (proj * mask[:, :, None]).sum(axis=0) # mean pooling
|
||||
proj = proj / mask.sum(axis=0)[:, None]
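# mask-weighted mean over time: sum the hidden states at valid (unpadded)
# positions and divide by each sequence's true length.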
|
||||
else:
|
||||
print 'Using last hidden state'
|
||||
proj = proj[-1] # last hidden state
|
||||
|
||||
sys.stdout.flush()
|
||||
|
||||
# dropout on hidden states
|
||||
if options['lastHiddenLayer'] is not None:
|
||||
lastH = tensor.dot(proj, tparams['ToLastHidden_W']) + tparams['ToLastHidden_b']
|
||||
lastH = tensor.nnet.sigmoid(lastH)
|
||||
if options['dropout_output'] > 0:
|
||||
lastH = dropout_layer(lastH, options['dropout_output'], use_noise, trng)
|
||||
pred = tensor.nnet.softmax(tensor.dot(lastH, tparams['U']) + tparams['b'])
|
||||
else:
|
||||
if options['dropout_output'] > 0:
|
||||
print 'Applying drop-out on hidden states (dropout_output:', options['dropout_output'], ")"
|
||||
proj = dropout_layer(proj, options['dropout_output'], use_noise, trng)
|
||||
|
||||
pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
|
||||
|
||||
# for training
|
||||
f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
|
||||
f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred') # sample by argmax
|
||||
|
||||
off = 1e-8
|
||||
if pred.dtype == 'float16':
|
||||
off = 1e-6
|
||||
nlls = -tensor.log(pred[tensor.arange(n_samples), y] + off)
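# per-sample negative log-likelihood of the gold label; `off` keeps the log
# away from log(0) when a predicted probability underflows.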
|
||||
|
||||
return use_noise, x, mask, y, f_pred_prob, f_pred, nlls
|
||||
|
||||
class Model:
|
||||
def __init__(self,
|
||||
dim_word=500, # word embedding dimension
|
||||
dim_proj=1024, # LSTM number of hidden units
|
||||
patience=10, # Number of epoch to wait before early stop if no progress
|
||||
max_epochs=5000, # The maximum number of epoch to run
|
||||
decay_c=-1., # Weight decay (for L2-regularization)
|
||||
clip_c=-1., # gradient clipping threshold
|
||||
lrate=1., # Learning rate for sgd (not used for adadelta and rmsprop)
|
||||
n_words=10000, # Vocabulary size
|
||||
optimizer='adadelta',
|
||||
encoder='lstm', # name of encoder unit, refer to 'layers'
|
||||
encoder2=None, # only used in hybrid mode
|
||||
hierarchical=False, # whether use hierarchical structure
|
||||
hier_len=None, # length of bottom (word-level) encoder
|
||||
hybrid=False, # whether use hybrid model
|
||||
mean_pooling=False, # use last hidden state if false
|
||||
unit_depth=-1, # recurrent depth of residual unit
|
||||
skip_steps=-1, # skip connection length (h(t) -> h(t+skip_steps))
|
||||
skip_steps2=-1, # only used in hybrid mode
|
||||
truncate_grad=-1, # the number of steps to use in truncated BPTT; set to -1 to disable
|
||||
saveto='model.npz', # The best model will be saved there
|
||||
dispFreq=50, # Display the training progress after this number of updates
|
||||
validFreq=300, # Compute the validation error after this number of updates
|
||||
newDumpFreq=5000000, # Dump model into a new file after this number of updates
|
||||
maxlen=None, # Sequences longer than this get ignored
|
||||
batch_size=16, # The batch size during training.
|
||||
batch_len_threshold=None, # use dynamic batch size if sequence lengths exceed this threshold
|
||||
valid_batch_size=16, # The batch size used for validation/test set.
|
||||
dataset='text', # dataset type
|
||||
corpus='imdb.pkl', # path to load training data
|
||||
start_iter=0,
|
||||
start_epoch=0,
|
||||
noise_std=0.,
|
||||
lastHiddenLayer=None,
|
||||
dropout_output=None, # Dropout on output hidden states (before softmax layer)
|
||||
dropout_input=None, # Dropout on input embeddings
|
||||
reload_options=None, # Path to a saved model options we want to start from
|
||||
reload_model=None, # Path to a saved model we want to start from.
|
||||
embedding=None, # Path to the word embedding file (otherwise randomized)
|
||||
warm_LM=None,
|
||||
test_size=None, # If >0, we keep only this number of test examples.
|
||||
monitor_grad=False, # Print gradient norm to log file at each iteration if set True
|
||||
logFile='log.txt' # Path to log file
|
||||
):
|
||||
|
||||
# Model options
|
||||
self.model_options = locals().copy()
|
||||
self.model_options['self'] = None
|
||||
|
||||
# log files
|
||||
self.F_log = open(logFile, "a")
|
||||
|
||||
if start_iter == 0:
|
||||
self.F_log.write("model options:\n")
|
||||
for kk, vv in self.model_options.iteritems():
|
||||
self.F_log.write("\t"+kk+":\t"+str(vv)+"\n")
|
||||
self.F_log.write("-----------------------------------------\n")
|
||||
|
||||
pkl.dump(self.model_options, open('%s.pkl' % saveto, 'wb'))
|
||||
|
||||
print 'Loading data...',
|
||||
if dataset == 'mnist':
|
||||
self.trainSet, self.validSet, self.testSet = load_mnist(path=corpus,
|
||||
fixed_permute=True,
|
||||
rand_permute=False)
|
||||
else:
|
||||
self.trainSet, self.validSet, self.testSet = load_data(path=corpus,
|
||||
n_words=n_words,
|
||||
maxlen=maxlen,
|
||||
sort_by_len=True,
|
||||
fixed_valid=True)
|
||||
print 'Done! '
|
||||
print 'Training', len(self.trainSet[0]), 'Valid', len(self.validSet[0]), 'Test', len(self.testSet[0])
|
||||
sys.stdout.flush()
|
||||
|
||||
if test_size > 0:
|
||||
test_size = min(test_size, len(self.testSet[0]))
|
||||
idx = numpy.arange(len(self.testSet[0]))
|
||||
numpy.random.shuffle(idx)
|
||||
idx = idx[:test_size]
|
||||
self.testSet = ([self.testSet[0][n] for n in idx], [self.testSet[1][n] for n in idx])
|
||||
|
||||
# number of classes
|
||||
ydim = numpy.max(self.trainSet[1]) + 1
|
||||
self.model_options['ydim'] = ydim
|
||||
|
||||
print 'Initializing model parameters...',
|
||||
params = init_params(self.model_options)
|
||||
print 'Done'
|
||||
print 'Model size:', self.model_options['dim_word'], '*', self.model_options['dim_proj']
|
||||
sys.stdout.flush()
|
||||
|
||||
# load pre-trained word embedding
|
||||
if embedding is not None and os.path.exists(embedding):
|
||||
Wemb = numpy.array(numpy.load(open(embedding, "rb")))
|
||||
if Wemb.shape[0] == self.model_options['n_words'] and \
|
||||
Wemb.shape[1] == self.model_options['dim_word']:
|
||||
print 'Using pre-trained word embedding'
|
||||
params['Wemb'] = Wemb.astype(numpy.float32) # bug fixed
|
||||
print 'vocab size', params['Wemb'].shape[0], ', dim', params['Wemb'].shape[1]
|
||||
|
||||
# reload options
|
||||
if reload_options is not None and os.path.exists(reload_options):
|
||||
print "Reloading model options...",
|
||||
with open(reload_options, 'rb') as f:
|
||||
self.model_options = pkl.load(f)
|
||||
print "Done"
|
||||
|
||||
# reload parameters
|
||||
self.start_iter = 0
|
||||
self.start_epoch = 0
|
||||
self.history_errs = []
|
||||
if reload_model is not None and os.path.exists(reload_model): # bug fixed
|
||||
print 'Reloading model parameters...',
|
||||
load_params(reload_model, params)
|
||||
self.start_iter = start_iter
|
||||
self.start_epoch = start_epoch
|
||||
#self.history_errs = list(numpy.load(self.model_options['reload_model'])['history_errs'])
|
||||
print 'Done'
|
||||
sys.stdout.flush()
|
||||
|
||||
if warm_LM is not None:
|
||||
print 'Steal from language model'
|
||||
warmLM_ = numpy.load(warm_LM)
|
||||
assert params['lstm_W'].shape == warmLM_['encoder_W'].shape
|
||||
assert params['lstm_b'].shape == warmLM_['encoder_b'].shape
|
||||
assert params['lstm_U'].shape == warmLM_['encoder_U'].shape
|
||||
assert params['Wemb'].shape == warmLM_['Wemb'].shape
|
||||
params['lstm_W'] = warmLM_['encoder_W']
|
||||
params['lstm_b'] = warmLM_['encoder_b']
|
||||
params['lstm_U'] = warmLM_['encoder_U']
|
||||
params['Wemb'] = warmLM_['Wemb']
|
||||
|
||||
self.tparams = init_tparams(params)
|
||||
|
||||
# build model
|
||||
mask_proj = None
|
||||
# vanilla structure
|
||||
def GetNll(self):
|
||||
print 'Using vanilla structure'
|
||||
self.use_noise, x, mask, y, \
|
||||
self.f_pred_prob, self.f_pred, nlls = \
|
||||
build_model(self.tparams, self.model_options)
|
||||
#inps = [x, mask, y]
|
||||
return x, mask, y, nlls
|
||||
|
||||
def get_accu(self, data, iterator, hier_len=None):
|
||||
"""
|
||||
Just compute the accuracy
|
||||
modified to support hierarchical mode
|
||||
"""
|
||||
valid_acc = 0
|
||||
for _, valid_index in iterator:
|
||||
if hier_len is not None:
|
||||
x, mask, mask_proj, y = prepare_data_hier([data[0][t] for t in valid_index],
|
||||
numpy.array(data[1])[valid_index],
|
||||
hier_len=hier_len)
|
||||
preds = self.f_pred(x, mask, mask_proj)
|
||||
else:
|
||||
x, mask, y = prepare_data([data[0][t] for t in valid_index],
|
||||
numpy.array(data[1])[valid_index],
|
||||
maxlen=None,
|
||||
dataset=self.model_options['dataset'])
|
||||
preds = self.f_pred(x, mask) # result obtained by argmax
|
||||
valid_acc += (preds == y).sum() # note that batch is sorted in hier-mode
|
||||
valid_acc = numpy_floatX(valid_acc) / numpy_floatX(len(data[0])) # accuracy
|
||||
|
||||
return valid_acc
|
||||
|
||||
def save_model(self, savefile, best_p=None):
|
||||
if best_p is not None: # save the best model so far
|
||||
params = best_p
|
||||
else:
|
||||
params = unzip(self.tparams)
|
||||
numpy.savez(savefile, history_errs=self.history_errs, **params)
|
||||
pkl.dump(self.model_options, open('%s.pkl' % self.model_options['saveto'], 'wb'))
|
||||
|
||||
def valid(self):
|
||||
train_acc = self.get_accu(self.trainSet, self.kf_train)
|
||||
#hier_len=self.model_options['hier_len'])
|
||||
valid_acc = self.get_accu(self.validSet, self.kf_valid)
|
||||
#hier_len=self.model_options['hier_len'])
|
||||
test_acc = self.get_accu(self.testSet, self.kf_test)
|
||||
#hier_len=self.model_options['hier_len'])
|
||||
return train_acc, valid_acc, test_acc
|
||||
|
||||
def evaluate(self, *dataset):
|
||||
acc = []
|
||||
for k in xrange(len(dataset)):
|
||||
data = dataset[k]
|
||||
idx = get_minibatches_idx(len(data[0]), 16)
|
||||
acc.append(self.get_accu(data, idx))
|
||||
return acc
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,257 @@
|
|||
import numpy
|
||||
from collections import OrderedDict
|
||||
|
||||
import theano
|
||||
import theano.tensor as tensor
|
||||
from theano import config
|
||||
|
||||
# ==========================
|
||||
# some operations with hyper-parameters
|
||||
# supports non-recurrent layer dropout, L2-regularization, gradient clipping
|
||||
# ==========================
|
||||
|
||||
def l2_regularization(tparams, cost, decay_c):
|
||||
decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
|
||||
weight_decay = 0.
|
||||
weight_decay += (tparams['U'] ** 2).sum()
|
||||
weight_decay *= decay_c
|
||||
cost += weight_decay
|
||||
return cost
|
||||
|
||||
def grad_clipping(grads, clip_c):
|
||||
g2 = 0.
|
||||
for g in grads:
|
||||
g2 += (g**2).sum()
|
||||
new_grads = []
|
||||
for g in grads:
|
||||
new_grads.append(tensor.switch(g2 > (clip_c**2), g/tensor.sqrt(g2) * clip_c, g))
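# global-norm clipping: if the total gradient norm exceeds clip_c, every
# gradient is rescaled by clip_c / ||g||; otherwise it is left unchanged.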
|
||||
grads = new_grads
|
||||
return grads
|
||||
|
||||
def dropout_layer(state_before, dropout, use_noise, trng):
|
||||
proj = tensor.switch(use_noise,
|
||||
(state_before *
|
||||
trng.binomial(state_before.shape,
|
||||
p=(1-dropout), n=1,
|
||||
dtype=state_before.dtype)),
|
||||
state_before * (1-dropout))
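# standard (non-inverted) dropout: with use_noise=1 units are zeroed with
# probability `dropout`; with use_noise=0 (evaluation) activations are scaled
# by (1 - dropout) to keep the same expected magnitude.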
|
||||
return proj
|
||||
|
||||
# ==========================
|
||||
# optimizers
|
||||
# supports sgd, adadelta and rmsprop
|
||||
# only adadelta supports hierarchical structure
|
||||
# ==========================
|
||||
|
||||
def sgd(lr, tparams, grads, x, mask, y, cost):
|
||||
""" Stochastic Gradient Descent
|
||||
|
||||
:note: A more complicated version of sgd than needed. This is
|
||||
done like that for adadelta and rmsprop.
|
||||
|
||||
"""
|
||||
# New set of shared variable that will contain the gradient
|
||||
# for a mini-batch.
|
||||
gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
|
||||
for k, p in tparams.items()]
|
||||
gsup = [(gs, g) for gs, g in zip(gshared, grads)]
|
||||
|
||||
# Function that computes gradients for a mini-batch, but does not
|
||||
# update the weights.
|
||||
f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
|
||||
name='sgd_f_grad_shared')
|
||||
|
||||
pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]
|
||||
|
||||
# Function that updates the weights from the previously computed
|
||||
# gradient.
|
||||
f_update = theano.function([lr], [], updates=pup,
|
||||
name='sgd_f_update')
|
||||
|
||||
return f_grad_shared, f_update
|
||||
|
||||
def adadelta(lr, tparams, grads, x, mask, y, cost, mask_hier=None):
|
||||
"""
|
||||
An adaptive learning rate optimizer
|
||||
# modified to support hierarchical mode
|
||||
|
||||
Parameters
|
||||
----------
|
||||
lr : Theano SharedVariable
|
||||
Initial learning rate
|
||||
tparams: Theano SharedVariable
|
||||
Model parameters
|
||||
grads: Theano variable
|
||||
Gradients of cost w.r.t. the parameters
|
||||
x: Theano variable
|
||||
Model inputs
|
||||
mask: Theano variable
|
||||
Sequence mask
|
||||
y: Theano variable
|
||||
Targets
|
||||
cost: Theano variable
|
||||
Objective function to minimize
|
||||
|
||||
Notes
|
||||
-----
|
||||
For more information, see [ADADELTA]_.
|
||||
|
||||
.. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
|
||||
Rate Method*, arXiv:1212.5701.
|
||||
"""
|
||||
|
||||
zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
|
||||
name='%s_grad' % k)
|
||||
for k, p in tparams.items()]
|
||||
running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
|
||||
name='%s_rup2' % k)
|
||||
for k, p in tparams.items()]
|
||||
running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
|
||||
name='%s_rgrad2' % k)
|
||||
for k, p in tparams.items()]
|
||||
|
||||
zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
|
||||
rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
|
||||
for rg2, g in zip(running_grads2, grads)]
|
||||
if mask_hier is not None:
|
||||
f_grad_shared = theano.function([x, mask, mask_hier, y], cost, updates=zgup + rg2up,
|
||||
name='adadelta_f_grad_shared')
|
||||
else:
|
||||
f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
|
||||
name='adadelta_f_grad_shared')
|
||||
|
||||
updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
|
||||
for zg, ru2, rg2 in zip(zipped_grads,
|
||||
running_up2,
|
||||
running_grads2)]
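# ADADELTA update direction: delta = -(RMS of previous updates / RMS of
# gradients) * g, with a small epsilon (1e-6) for numerical stability.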
|
||||
ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
|
||||
for ru2, ud in zip(running_up2, updir)]
|
||||
param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]
|
||||
|
||||
f_update = theano.function([lr], [], updates=ru2up + param_up,
|
||||
on_unused_input='ignore',
|
||||
name='adadelta_f_update')
|
||||
|
||||
return f_grad_shared, f_update
|
||||
|
||||
def rmsprop(lr, tparams, grads, x, mask, y, cost):
|
||||
"""
|
||||
A variant of SGD that scales the step size by a running average of the
|
||||
recent gradient magnitudes.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
lr : Theano SharedVariable
|
||||
Initial learning rate
|
||||
tparams: Theano SharedVariable
|
||||
Model parameters
|
||||
grads: Theano variable
|
||||
Gradients of cost w.r.t. the parameters
|
||||
x: Theano variable
|
||||
Model inputs
|
||||
mask: Theano variable
|
||||
Sequence mask
|
||||
y: Theano variable
|
||||
Targets
|
||||
cost: Theano variable
|
||||
Objective function to minimize
|
||||
|
||||
Notes
|
||||
-----
|
||||
For more information, see [Hint2014]_.
|
||||
|
||||
.. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
|
||||
lecture 6a,
|
||||
http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
|
||||
"""
|
||||
|
||||
zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
|
||||
name='%s_grad' % k)
|
||||
for k, p in tparams.items()]
|
||||
running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
|
||||
name='%s_rgrad' % k)
|
||||
for k, p in tparams.items()]
|
||||
running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
|
||||
name='%s_rgrad2' % k)
|
||||
for k, p in tparams.items()]
|
||||
|
||||
zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
|
||||
rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
|
||||
rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
|
||||
for rg2, g in zip(running_grads2, grads)]
|
||||
|
||||
f_grad_shared = theano.function([x, mask, y], cost,
|
||||
updates=zgup + rgup + rg2up,
|
||||
name='rmsprop_f_grad_shared')
|
||||
|
||||
updir = [theano.shared(p.get_value() * numpy_floatX(0.),
|
||||
name='%s_updir' % k)
|
||||
for k, p in tparams.items()]
|
||||
updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
|
||||
for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
|
||||
running_grads2)]
|
||||
param_up = [(p, p + udn[1])
|
||||
for p, udn in zip(tparams.values(), updir_new)]
|
||||
f_update = theano.function([lr], [], updates=updir_new + param_up,
|
||||
on_unused_input='ignore',
|
||||
name='rmsprop_f_update')
|
||||
|
||||
return f_grad_shared, f_update
|
||||
|
||||
# ==========================
|
||||
# matrix initializations
|
||||
# supports normalized, orthogonal and randomized
|
||||
# ==========================
|
||||
|
||||
def ortho_weight(ndim):
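# draw a random Gaussian matrix and take the U factor of its SVD, which is
# orthogonal; a common initialization for recurrent weight matrices.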
|
||||
W = numpy.random.randn(ndim, ndim)
|
||||
u, s, v = numpy.linalg.svd(W)
|
||||
return u.astype(config.floatX)
|
||||
|
||||
def norm_weight(nin, nout=None, scale=0.01, ortho=True):
|
||||
if nout is None:
|
||||
nout = nin
|
||||
if nout == nin and ortho:
|
||||
W = ortho_weight(nin)
|
||||
else:
|
||||
# bug fixed: set to be ortho_init
|
||||
# W = scale * numpy.random.randn(nin, nout)
|
||||
W = numpy.random.randn(nin, nout)
|
||||
u, s, v = numpy.linalg.svd(W)
|
||||
if nin > nout:
|
||||
W = u[:, :nout]
|
||||
else:
|
||||
W = v[:nin, :]
|
||||
return W.astype('float32')
|
||||
|
||||
def rand_weight(nin, nout=None, scale=0.01, ortho=True):
|
||||
if nout is None:
|
||||
nout = nin
|
||||
if nout == nin and ortho:
|
||||
W = ortho_weight(nin)
|
||||
else:
|
||||
W = scale * numpy.random.randn(nin, nout)
|
||||
return W.astype('float32')
|
||||
|
||||
# ==========================
|
||||
# some utility functions
|
||||
# ==========================
|
||||
|
||||
def zipp(params, tparams):
|
||||
"""
|
||||
When we reload the model. Needed for the GPU stuff.
|
||||
"""
|
||||
for kk, vv in params.items():
|
||||
tparams[kk].set_value(vv)
|
||||
|
||||
def unzip(zipped):
|
||||
"""
|
||||
When we pickle the model. Needed for the GPU stuff.
|
||||
"""
|
||||
new_params = OrderedDict()
|
||||
for kk, vv in zipped.items():
|
||||
new_params[kk] = vv.get_value()
|
||||
return new_params
|
||||
|
||||
def numpy_floatX(data):
|
||||
return numpy.asarray(data, dtype=config.floatX)
|
|
@ -0,0 +1,32 @@
|
|||
import sys
|
||||
import codecs
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
raise Exception('Not enough argv')
|
||||
|
||||
theano_rc = r"""
|
||||
[global]
|
||||
mode = FAST_RUN
|
||||
device = gpu
|
||||
floatX = float32
|
||||
on_unused_input = warn
|
||||
optimizer = fast_run
|
||||
#allow_gc=False
|
||||
cuda.disable_gcc_cudnn_check=True
|
||||
|
||||
[lib]
|
||||
cnmem = 0.75
|
||||
|
||||
[nvcc]
|
||||
flags=-L{0}\libs
|
||||
root=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5
|
||||
fast_math = True
|
||||
|
||||
"""
|
||||
|
||||
theano_rc = theano_rc.format(sys.argv[1])
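# sys.argv[1] is expected to be the Anaconda root that gets inserted into the
# nvcc flags above; sys.argv[2] is the output path for the generated .theanorc.txt.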
|
||||
|
||||
print(theano_rc)
|
||||
|
||||
with codecs.open(sys.argv[2], 'w', 'utf-8') as f:
|
||||
f.write(theano_rc)
|
|
@ -0,0 +1,4 @@
|
|||
@echo off
|
||||
setlocal ENABLEDELAYEDEXPANSION
|
||||
set THEANO_FLAGS=device=gpu5
|
||||
python train_classifier_LM_NoDrop_google_sgd0.2.py
|
|
@ -0,0 +1,148 @@
|
|||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
@rem Windows batch file to use Theano on GCR
|
||||
@rem
|
||||
@rem Updated: April 7, 2016
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|
||||
@rem set the PATH system variable
|
||||
@rem keep the current path from character offset 26 onward (strip the share prefix)
|
||||
set working_sub_dir=%cd:~26%
|
||||
|
||||
set PATH=^
|
||||
C:\Windows\system32;^
|
||||
C:\Windows\System32\Wbem;^
|
||||
C:\Windows\System32\WindowsPowerShell\v1.0\;^
|
||||
C:\Windows;^
|
||||
C:\Program Files\Microsoft HPC Pack 2012\Bin\;^
|
||||
C:\Program Files\Microsoft MPI\Bin\;^
|
||||
C:\Program Files (x86)\Windows Kits\8.1\Windows Performance Toolkit\
|
||||
|
||||
pushd \\gcr\Scratch\RR1\v-yixia\Theano
|
||||
set ToolkitFolderDriver=%cd%
|
||||
|
||||
@rem set the environment variable for the CUDA 7.5 Toolkit
|
||||
rem set CUDA_HOME=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 rem the old version
|
||||
set CUDA_HOME=%ToolkitFolderDriver%\CUDA\v7.0+cudnn4008
|
||||
set CUDA_BIN=%CUDA_HOME%\bin
|
||||
set CUDA_INCLUDE=%CUDA_HOME%\include
|
||||
set CUDA_LIB=%CUDA_HOME%\lib\x64
|
||||
set CUDA_LIBNVVP=%CUDA_HOME%\libnvvp
|
||||
|
||||
@rem add all CUDA Toolkit folders to the PATH system variable
|
||||
set PATH=^
|
||||
%CUDA_HOME%;^
|
||||
%CUDA_BIN%;^
|
||||
%CUDA_INCLUDE%;^
|
||||
%CUDA_LIB%;^
|
||||
%CUDA_LIBNVVP%;^
|
||||
%PATH%
|
||||
|
||||
@echo %PATH%
|
||||
|
||||
@rem setting up VC complier
|
||||
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
@rem connect to the shared toolkit folder \\gcr\Tools\Shared_Toolkits\Theano
|
||||
rem pushd \\gcr\Tools\Shared_Toolkits\Theano
|
||||
|
||||
|
||||
|
||||
@rem unset these variables
|
||||
@set Framework40Version=
|
||||
@set FrameworkDIR32=
|
||||
@set FrameworkVersion32=
|
||||
@set FSHARPINSTALLDIR=
|
||||
@set VSINSTALLDIR=
|
||||
@set WindowsSDK_ExecutablePath_x64=
|
||||
@set WindowsSDK_ExecutablePath_x86=
|
||||
|
||||
@set VCINSTALLDIR=%ToolkitFolderDriver%\VS_portable\Microsoft Visual Studio 12.0v2\VC\
|
||||
@set WindowsSdkDir=%ToolkitFolderDriver%\Windows Kits\8.1v2\
|
||||
|
||||
:amd64
|
||||
|
||||
@rem set Windows SDK include/lib path
|
||||
@rem --------------------------------------------------
|
||||
if not "%WindowsSdkDir%" == "" @set PATH=%WindowsSdkDir%bin\x64;%WindowsSdkDir%bin\x86;%PATH%
|
||||
if not "%WindowsSdkDir%" == "" @set INCLUDE=%WindowsSdkDir%include\shared;%WindowsSdkDir%include\um;%WindowsSdkDir%include\winrt;%INCLUDE%
|
||||
if not "%WindowsSdkDir%" == "" @set LIB=%WindowsSdkDir%lib\winv6.3\um\x64;%LIB%
|
||||
if not "%WindowsSdkDir%" == "" @set LIBPATH=%WindowsLibPath%;%ExtensionSDKDir%\Microsoft.VCLibs\14.0\References\CommonConfiguration\neutral;%LIBPATH%
|
||||
|
||||
@rem set the environment variables for Microsoft Visual Studio
|
||||
@rem --------------------------------------------------
|
||||
@rem PATH
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%VCPackages" set PATH=%VCINSTALLDIR%VCPackages;%PATH%
|
||||
if exist "%VCINSTALLDIR%BIN\amd64" set PATH=%VCINSTALLDIR%BIN\amd64;%PATH%
|
||||
@rem --------------------------------------------------
|
||||
@rem INCLUDE
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%ATLMFC\INCLUDE" set INCLUDE=%VCINSTALLDIR%ATLMFC\INCLUDE;%INCLUDE%
|
||||
if exist "%VCINSTALLDIR%INCLUDE" set INCLUDE=%VCINSTALLDIR%INCLUDE;%INCLUDE%
|
||||
@rem --------------------------------------------------
|
||||
@rem LIB
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIB=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIB%
|
||||
if exist "%VCINSTALLDIR%LIB\amd64" set LIB=%VCINSTALLDIR%LIB\amd64;%LIB%
|
||||
@rem --------------------------------------------------
|
||||
@rem LIBPATH
|
||||
@rem --------------------------------------------------
|
||||
if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIBPATH=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIBPATH%
|
||||
if exist "%VCINSTALLDIR%LIB\amd64" set LIBPATH=%VCINSTALLDIR%LIB\amd64;%LIBPATH%
|
||||
|
||||
@rem set the environment variables for the cuDNN v4 (Feb 10, 2016) for CUDA 7.0 and later.
|
||||
rem set CUDNN_PATH=%ToolkitFolderDriver%\Shared_Toolkits\Theano\CUDA\cudnn-4.0.7\cuda
|
||||
rem set INCLUDE=%CUDNN_PATH%\include;%INCLUDE%
|
||||
rem set LIB=%CUDNN_PATH%\lib\x64;%LIB%
|
||||
rem set PATH=%CUDNN_PATH%\bin;%PATH%
|
||||
|
||||
set Platform=X64
|
||||
set CommandPromptType=Native
|
||||
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
@rem connect to your scratch storage \\gcr\Scratch\<location>\<alias>
|
||||
@rem Note: Please copy ANACONDA2 to \\gcr\Scratch\<location>\<alias>\Anaconda2
|
||||
@rem =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
pushd \\gcr\scratch\RR1\v-yixia
|
||||
set CONDANETDRIVE=%cd:~0,2%
|
||||
|
||||
@rem set the environment variable for the Anaconda2
|
||||
set ANACONDA2=%CONDANETDRIVE%\RR1\taoqin\Anaconda-gpu002
|
||||
set ANACONDA2_SCRIPTS=%ANACONDA2%\Scripts
|
||||
set ANACONDA2_BIN=%ANACONDA2%\Library\bin
|
||||
|
||||
@rem add Anaconda2 folders to the PATH system variable
|
||||
set PATH=^
|
||||
%ANACONDA2%;^
|
||||
%ANACONDA2_BIN%;^
|
||||
%ANACONDA2_SCRIPTS%;^
|
||||
%PATH%
|
||||
|
||||
@echo %PATH%
|
||||
|
||||
@rem example files from DeepLearningTutorials are available at \\gcr\Tools\Shared_Toolkits\Theano\Examples
|
||||
set PROJDRIVE=%CONDANETDRIVE%
|
||||
set MYHOME=%PROJDRIVE%\RR1\v-yixia
|
||||
set PROJHOME=%MYHOME%\%working_sub_dir%
|
||||
|
||||
%PROJDRIVE%
|
||||
|
||||
cd %PROJHOME%
|
||||
|
||||
@rem setup theano env (generate .theanorc.txt)
|
||||
call python gen_theanorc.py %ANACONDA2% .theanorc.txt
|
||||
del %userprofile%\.theanorc.txt /Q /F
|
||||
copy .theanorc.txt %userprofile% /Y
|
||||
|
||||
call python write_script.py %*
|
||||
|
||||
call worker.bat
|
||||
|
||||
@echo delete theano env
|
||||
del %userprofile%\.theanorc.txt /Q /F
|
||||
|
||||
popd
|
||||
|
||||
popd
|
||||
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
import re, os, numpy, sys
|
||||
|
||||
|
||||
filename = r'.\gpu_usage_draft'
|
||||
|
||||
|
||||
|
||||
def GrabGPU():
|
||||
cmdstr = '\"C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe\" > ' + filename
|
||||
os.system(cmdstr)
|
||||
|
||||
def GetGPUUSage():
|
||||
pattern = re.compile(r'(?P<num>[0-9]+)MiB[\s]+/')
|
||||
mem = []
|
||||
fo = open(filename, 'r')
|
||||
for line in fo:
|
||||
result = pattern.search(line)
|
||||
if result:
|
||||
mem.append(int(result.group('num')))
|
||||
fo.close()
|
||||
|
||||
return numpy.array(mem).argsort()[0]
|
||||
|
||||
def print_script(cmd):
|
||||
GrabGPU()
|
||||
with open('worker.bat', 'w') as f:
|
||||
f.write('@echo off\nsetlocal ENABLEDELAYEDEXPANSION\n')
|
||||
if len(cmd) == 1:
|
||||
f.write('set THEANO_FLAGS=device=gpu%d\n' % GetGPUUSage())
|
||||
f.write('python ' + cmd[0])
|
||||
elif len(cmd) == 2:
|
||||
f.write('set THEANO_FLAGS=device=gpu' + cmd[1] + '\n')
|
||||
f.write('python ' + cmd[0])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print_script(sys.argv[1:])
|
||||
|
||||
# os.system('del /q ' + filename + rank)
|
|
@ -0,0 +1,369 @@
|
|||
"""
|
||||
data loading and minibatch generation
|
||||
"""
|
||||
__author__ = 'v-yirwan'
|
||||
|
||||
import cPickle as pkl
|
||||
import gzip
|
||||
import os
|
||||
import numpy
|
||||
from theano import config
|
||||
|
||||
def get_dataset_file(dataset, default_dataset, origin):
|
||||
'''
|
||||
Look for it as if it were a full path; if not found, try a local file,
|
||||
then look in the data directory.
|
||||
|
||||
Download dataset if it is not present
|
||||
'''
|
||||
data_dir, data_file = os.path.split(dataset)
|
||||
if data_dir == "" and not os.path.isfile(dataset):
|
||||
# Check if dataset is in the data directory.
|
||||
new_path = os.path.join(
|
||||
os.path.split(__file__)[0],
|
||||
"..",
|
||||
"data",
|
||||
dataset
|
||||
)
|
||||
if os.path.isfile(new_path) or data_file == default_dataset:
|
||||
dataset = new_path
|
||||
|
||||
if (not os.path.isfile(dataset)) and data_file == default_dataset:
|
||||
from six.moves import urllib
|
||||
print('Downloading data from %s' % origin)
|
||||
urllib.request.urlretrieve(origin, dataset)
|
||||
|
||||
return dataset
|
||||
|
||||
def load_data(path="imdb.pkl", n_words=100000, maxlen=None,
|
||||
sort_by_len=True, fixed_valid=True, valid_portion=0.1):
|
||||
'''
|
||||
Loads the dataset
|
||||
:type path: String
|
||||
:param path: The path to the dataset (here IMDB)
|
||||
:type n_words: int
|
||||
:param n_words: The number of word to keep in the vocabulary.
|
||||
All extra words are set to unknown (1).
|
||||
:type maxlen: None or positive int
|
||||
:param maxlen: the max sequence length we use in the train/valid set.
|
||||
:type sort_by_len: bool
|
||||
:param sort_by_len: Sort by the sequence length for the train,
|
||||
valid and test set. This allows faster execution as it causes
|
||||
less padding per minibatch. Another mechanism must be used to
|
||||
shuffle the train set at each epoch.
|
||||
:type fixed_valid: bool
|
||||
:param fixed_valid: load fixed validation set from the corpus file,
|
||||
which would otherwise be picked randomly from the training set with
|
||||
proportion [valid_portion]
|
||||
:type valid_portion: float
|
||||
:param valid_portion: The proportion of the full train set used for
|
||||
the validation set.
|
||||
|
||||
'''
|
||||
|
||||
# Load the dataset
|
||||
path = get_dataset_file(
|
||||
path, "imdb.pkl",
|
||||
"http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
|
||||
if path.endswith(".gz"):
|
||||
f = gzip.open(path, 'rb')
|
||||
else:
|
||||
f = open(path, 'rb')
|
||||
|
||||
train_set = pkl.load(f)
|
||||
if fixed_valid:
|
||||
valid_set = pkl.load(f)
|
||||
test_set = pkl.load(f)
|
||||
f.close()
|
||||
|
||||
def _truncate_data(train_set):
|
||||
'''
|
||||
drop sequences whose lengths exceed the max-len threshold
|
||||
:param train_set: a tuple of (list of sequences, list of corresponding labels)
|
||||
:return: truncated train_set
|
||||
'''
|
||||
new_train_set_x = []
|
||||
new_train_set_y = []
|
||||
for x, y in zip(train_set[0], train_set[1]):
|
||||
if len(x) < maxlen:
|
||||
new_train_set_x.append(x)
|
||||
new_train_set_y.append(y)
|
||||
train_set = (new_train_set_x, new_train_set_y)
|
||||
del new_train_set_x, new_train_set_y
|
||||
return train_set
|
||||
|
||||
def _set_valid(train_set, valid_portion):
|
||||
'''
|
||||
set validation with [valid_portion] proportion of training set
|
||||
'''
|
||||
train_set_x, train_set_y = train_set
|
||||
n_samples = len(train_set_x)
|
||||
sidx = numpy.random.permutation(n_samples) # shuffle data
|
||||
n_train = int(numpy.round(n_samples * (1. - valid_portion)))
|
||||
valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
|
||||
valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
|
||||
train_set_x = [train_set_x[s] for s in sidx[:n_train]]
|
||||
train_set_y = [train_set_y[s] for s in sidx[:n_train]]
|
||||
train_set = (train_set_x, train_set_y)
|
||||
valid_set = (valid_set_x, valid_set_y)
|
||||
del train_set_x, train_set_y, valid_set_x, valid_set_y
|
||||
return train_set, valid_set
|
||||
|
||||
if maxlen:
|
||||
train_set = _truncate_data(train_set)
|
||||
if fixed_valid:
|
||||
print 'Loading with fixed validation set...',
|
||||
valid_set = _truncate_data(valid_set)
|
||||
else:
|
||||
print 'Setting validation set with proportion:', valid_portion, '...',
|
||||
train_set, valid_set = _set_valid(train_set, valid_portion)
|
||||
test_set = _truncate_data(test_set)
|
||||
|
||||
if maxlen is None and not fixed_valid:
|
||||
train_set, valid_set = _set_valid(train_set, valid_portion)
|
||||
|
||||
def remove_unk(x):
|
||||
return [[1 if w >= n_words else w for w in sen] for sen in x]
|
||||
|
||||
test_set_x, test_set_y = test_set
|
||||
valid_set_x, valid_set_y = valid_set
|
||||
train_set_x, train_set_y = train_set
|
||||
|
||||
# remove unk from dataset
|
||||
train_set_x = remove_unk(train_set_x) # use 1 if unk
|
||||
valid_set_x = remove_unk(valid_set_x)
|
||||
test_set_x = remove_unk(test_set_x)
|
||||
|
||||
def len_argsort(seq):
|
||||
return sorted(range(len(seq)), key=lambda x: len(seq[x]))
|
||||
|
||||
if sort_by_len:
|
||||
sorted_index = len_argsort(test_set_x)
|
||||
# ranked from shortest to longest
|
||||
test_set_x = [test_set_x[i] for i in sorted_index]
|
||||
test_set_y = [test_set_y[i] for i in sorted_index]
|
||||
|
||||
sorted_index = len_argsort(valid_set_x)
|
||||
valid_set_x = [valid_set_x[i] for i in sorted_index]
|
||||
valid_set_y = [valid_set_y[i] for i in sorted_index]
|
||||
|
||||
sorted_index = len_argsort(train_set_x)
|
||||
train_set_x = [train_set_x[i] for i in sorted_index]
|
||||
train_set_y = [train_set_y[i] for i in sorted_index]
|
||||
|
||||
train = (train_set_x, train_set_y)
|
||||
valid = (valid_set_x, valid_set_y)
|
||||
test = (test_set_x, test_set_y)
|
||||
|
||||
return train, valid, test
|
||||
|
||||
def load_mnist(path='mnist.pkl', fixed_permute=True, rand_permute=False):
|
||||
f = open(path, 'rb')
|
||||
train = pkl.load(f)
|
||||
valid = pkl.load(f)
|
||||
test = pkl.load(f)
|
||||
f.close()
|
||||
|
||||
def _permute(data, perm):
|
||||
x, y = data
|
||||
x_new = []
|
||||
for xx in x:
|
||||
xx_new = [xx[pp] for pp in perm]
|
||||
x_new.append(xx_new)
|
||||
return (x_new, y)
|
||||
|
||||
def _trans2list(data):
|
||||
x, y = data
|
||||
x = [list(xx) for xx in x]
|
||||
return (x, y)
|
||||
|
||||
if rand_permute:
|
||||
print 'Using a fixed random permutation of pixels...',
|
||||
perm = numpy.random.permutation(range(784))
|
||||
train = _permute(train, perm)
|
||||
valid = _permute(valid, perm)
|
||||
test = _permute(test, perm)
|
||||
elif fixed_permute:
|
||||
print 'Using permuted dataset...',
|
||||
|
||||
train = _trans2list(train)
|
||||
valid = _trans2list(valid)
|
||||
test = _trans2list(test)
|
||||
|
||||
return train, valid, test
|
||||
|
||||
def get_minibatches_idx(n, minibatch_size, shuffle=False):
|
||||
"""
|
||||
Used to shuffle the dataset at each iteration.
|
||||
"""
|
||||
|
||||
idx_list = numpy.arange(n, dtype="int32")
|
||||
|
||||
if shuffle:
|
||||
numpy.random.shuffle(idx_list)
|
||||
|
||||
minibatches = []
|
||||
minibatch_start = 0
|
||||
for i in range(n // minibatch_size):
|
||||
minibatches.append(idx_list[minibatch_start:
|
||||
minibatch_start + minibatch_size])
|
||||
minibatch_start += minibatch_size
|
||||
|
||||
if (minibatch_start != n):
|
||||
# Make a minibatch out of what is left
|
||||
minibatches.append(idx_list[minibatch_start:])
|
||||
|
||||
return zip(range(len(minibatches)), minibatches)
|
||||
|
||||
def get_minibatches_idx_bucket(dataset, minibatch_size, shuffle=False):
|
||||
"""
|
||||
divide into different buckets according to sequence lengths
|
||||
dynamic batch size
|
||||
"""
|
||||
# divide into buckets
|
||||
slen = [len(ss) for ss in dataset]
|
||||
bucket1000 = [sidx for sidx in xrange(len(dataset))
|
||||
if slen[sidx] > 0 and slen[sidx] <= 1000]
|
||||
bucket3000 = [sidx for sidx in xrange(len(dataset))
|
||||
if slen[sidx] > 1000 and slen[sidx] <= 3000]
|
||||
bucket_long = [sidx for sidx in xrange(len(dataset))
|
||||
if slen[sidx] > 3000]
|
||||
|
||||
# shuffle each bucket
|
||||
if shuffle:
|
||||
numpy.random.shuffle(bucket1000)
|
||||
numpy.random.shuffle(bucket3000)
|
||||
numpy.random.shuffle(bucket_long)
|
||||
|
||||
# make minibatches
|
||||
def _make_batch(minibatches, bucket, minibatch_size):
|
||||
minibatch_start = 0
|
||||
n = len(bucket)
|
||||
for i in range(n // minibatch_size):
|
||||
minibatches.append(bucket[minibatch_start : minibatch_start + minibatch_size])
|
||||
minibatch_start += minibatch_size
|
||||
if (minibatch_start != n):
|
||||
# Make a minibatch out of what is left
|
||||
minibatches.append(bucket[minibatch_start:])
|
||||
return minibatches
|
||||
|
||||
minibatches = []
|
||||
_make_batch(minibatches, bucket1000, minibatch_size=minibatch_size)
|
||||
_make_batch(minibatches, bucket3000, minibatch_size=minibatch_size//2)
|
||||
_make_batch(minibatches, bucket_long, minibatch_size=minibatch_size//8)
|
||||
|
||||
# shuffle minibatches
|
||||
numpy.random.shuffle(minibatches)
|
||||
|
||||
return zip(range(len(minibatches)), minibatches)
|
||||
|
||||
def prepare_data(seqs, labels, maxlen=None, dataset='text'):
|
||||
"""Create the matrices from the datasets.
|
||||
|
||||
This pads each sequence to the same length: the length of the
|
||||
longest sequence or maxlen.
|
||||
|
||||
If maxlen is set, we will cut all sequences to this maximum
|
||||
length.
|
||||
|
||||
This swaps the axes!
|
||||
"""
|
||||
# x: a list of sentences
|
||||
lengths = [len(s) for s in seqs]
|
||||
|
||||
if maxlen is not None:
|
||||
new_seqs = []
|
||||
new_labels = []
|
||||
new_lengths = []
|
||||
for l, s, y in zip(lengths, seqs, labels):
|
||||
if l < maxlen:
|
||||
new_seqs.append(s)
|
||||
new_labels.append(y)
|
||||
new_lengths.append(l)
|
||||
lengths = new_lengths
|
||||
labels = new_labels
|
||||
seqs = new_seqs
|
||||
|
||||
if len(lengths) < 1:
|
||||
return None, None, None
|
||||
|
||||
n_samples = len(seqs)
|
||||
maxlen = numpy.max(lengths)
|
||||
|
||||
if dataset == 'mnist':
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('float32')
|
||||
else:
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('int64')
|
||||
x_mask = numpy.zeros((maxlen, n_samples)).astype(config.floatX)
|
||||
for idx, s in enumerate(seqs):
|
||||
x[:lengths[idx], idx] = s
|
||||
x_mask[:lengths[idx], idx] = 1.
|
||||
|
||||
return x, x_mask, labels
|
||||
|
||||
def prepare_data_hier(seqs, labels, hier_len, maxlen=None, dataset='text'):
|
||||
'''
|
||||
prepare minibatch for hierarchical model
|
||||
'''
|
||||
# sort (long->short)
|
||||
sorted_idx = sorted(range(len(seqs)), key=lambda x: len(seqs[x]), reverse=True)
|
||||
seqs = [seqs[i] for i in sorted_idx]
|
||||
labels = [labels[i] for i in sorted_idx]
|
||||
|
||||
# truncate data
|
||||
lengths = [len(s) for s in seqs]
|
||||
if maxlen is not None:
|
||||
new_seqs = []
|
||||
new_labels = []
|
||||
new_lengths = []
|
||||
for l, s, y in zip(lengths, seqs, labels):
|
||||
if l < maxlen:
|
||||
new_seqs.append(s)
|
||||
new_labels.append(y)
|
||||
new_lengths.append(l)
|
||||
lengths = new_lengths
|
||||
labels = new_labels
|
||||
seqs = new_seqs
|
||||
if len(lengths) < 1:
|
||||
return None, None, None
|
||||
|
||||
# set batch size
|
||||
n_samples = len(seqs)
|
||||
maxlen = numpy.max(lengths)
|
||||
if maxlen % hier_len == 0:
|
||||
n_batch = maxlen/hier_len
|
||||
else:
|
||||
n_batch = maxlen//hier_len + 1
|
||||
maxlen = n_batch * hier_len
|
||||
|
||||
# padding whole batch
|
||||
if dataset == 'mnist':
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('float32')
|
||||
else:
|
||||
x = numpy.zeros((maxlen, n_samples)).astype('int64')
|
||||
x_mask = numpy.zeros((maxlen, n_samples)).astype(config.floatX)
|
||||
for idx, s in enumerate(seqs):
|
||||
x[:lengths[idx], idx] = s
|
||||
x_mask[:lengths[idx], idx] = 1
|
||||
|
||||
# slice to mini-batches
|
||||
x_batch = [x[bidx*hier_len:(bidx+1)*hier_len, :] for bidx in range(n_batch)]
|
||||
if dataset == 'mnist':
|
||||
x_batch = numpy.array(x_batch).astype('float32')
|
||||
else:
|
||||
x_batch = numpy.array(x_batch).astype('int64')
|
||||
mask_batch = [x_mask[bidx*hier_len:(bidx+1)*hier_len, :] for bidx in range(n_batch)]
|
||||
mask_batch = numpy.array(mask_batch).astype(config.floatX)
|
||||
|
||||
# mask for hier-level
|
||||
mask_hier = numpy.ones((n_batch, n_samples)).astype(config.floatX)
|
||||
for idx in range(n_samples):
|
||||
mpos = numpy.where(x_mask[:, idx]==0)[0]
|
||||
if len(mpos) == 0:
|
||||
continue
|
||||
bidx = min(mpos[0]//hier_len+1, n_batch)
|
||||
if mpos[0] % hier_len == 0:
|
||||
bidx -= 1 # bug fixed TODO: more elegant solution?
|
||||
mask_hier[bidx:, idx] = 0
|
||||
|
||||
return x_batch, mask_batch, mask_hier, labels
|
|
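For orientation, a minimal usage sketch of the loader above (assuming the module is importable as Data; the corpus path and the batch size of 16 are placeholders), mirroring how the training code consumes it:

from Data import load_data, get_minibatches_idx, prepare_data

# load the IMDB corpus (imdb.pkl is downloaded if missing) and keep at most 10000 words
train, valid, test = load_data(path='imdb.pkl', n_words=10000,
                               maxlen=None, sort_by_len=True, fixed_valid=True)

# iterate over shuffled minibatch indices and build padded (maxlen, n_samples) matrices
for _, idx in get_minibatches_idx(len(train[0]), 16, shuffle=True):
    x, x_mask, y = prepare_data([train[0][i] for i in idx],
                                [train[1][i] for i in idx])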
@ -0,0 +1,38 @@
|
|||
import re
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
|
||||
filename = r'.\gpu_usage_draft_'
|
||||
default_gpu = 58 + 30
|
||||
|
||||
|
||||
|
||||
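# GrabGPU dumps nvidia-smi output to a per-rank draft file; GetGPUUSage then prints how many
# GPUs have less than default_gpu MiB in use, followed by their ids (one per line).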
def GrabGPU(rank):
|
||||
cmdstr = '\"C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe\" > ' + filename + rank
|
||||
os.system(cmdstr)
|
||||
|
||||
def GetGPUUSage(rank):
|
||||
pattern = re.compile(r'(?P<num>[0-9]{1,5})MiB[\s]+/')
|
||||
id = 0
|
||||
GPUs = []
|
||||
fo = open(filename + rank, 'r')
|
||||
for line in fo:
|
||||
result = pattern.search(line)
|
||||
if result:
|
||||
if int(result.group("num")) < default_gpu:
|
||||
GPUs.append(id)
|
||||
id = id + 1
|
||||
fo.close()
|
||||
|
||||
print len(GPUs)
|
||||
for gpu in GPUs:
|
||||
print gpu
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
rank = sys.argv[1]
|
||||
GrabGPU(rank)
|
||||
print socket.gethostname()
|
||||
GetGPUUSage(rank)
|
||||
#os.system('del /q ' + filename + rank)
|
|
@ -0,0 +1,17 @@
|
|||
import os
|
||||
def MapDeviceIds(comm):
|
||||
rank = comm.Get_rank()
|
||||
num_machine = comm.Get_size()
|
||||
os.system('python GPU_Usage.py ' + str(rank) + ' > record' + str(rank))
|
||||
comm.Barrier()
|
||||
if rank == 0:
|
||||
os.system('python AllocateGPU.py ' + str(num_machine) + ' > DirtyRecord')
|
||||
comm.Barrier()
|
||||
cardid = str(0)
|
||||
with open('DirtyRecord', 'r') as f:
|
||||
for idx, line in enumerate(f):
|
||||
if idx == rank:
|
||||
cardid = line.strip()
|
||||
break
|
||||
|
||||
return cardid
|
|
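A minimal sketch of how MapDeviceIds is meant to be called under MPI (assuming mpi4py is available, that GPU_Usage.py and AllocateGPU.py from this commit sit in the working directory, and that the helper above is saved as MapDeviceIds.py; that module name is an assumption):

from mpi4py import MPI
from MapDeviceIds import MapDeviceIds  # hypothetical module name for the helper above

comm = MPI.COMM_WORLD
cardid = MapDeviceIds(comm)  # rank 0 runs AllocateGPU.py; every rank then reads its GPU id
print 'rank %d -> gpu %s' % (comm.Get_rank(), cardid)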
@ -0,0 +1,6 @@
|
|||
Dual supervised learning for sentiment analysis.
|
||||
|
||||
The models are at:
|
||||
https://www.dropbox.com/sh/sbl9lv6q0agsrrz/AADIYiS_4stp36X2waW2Wfiaa?dl=0
|
||||
|
||||
Please refer to "train.bat"/"train_linux.sh" and "valid.bat"/"valid_linux.sh" for how to train and evaluate with our code.
|
(Diff for one file is not shown here because of its size.)
|
@ -0,0 +1,45 @@
|
|||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# data I/O
|
||||
|
||||
parser.add_argument('--data_dir', type=str, default='./data/imdb.pkl', help='Location for the dataset')
|
||||
parser.add_argument('--LMScoreFile', type=str, default='./data/LMScore.npz', help='Location for the LMScoreFile')
|
||||
parser.add_argument('--GCRmode', dest='GCRmode', action='store_true', help='GCRmode')
|
||||
parser.add_argument('--gpu', type=int, default=0, help='')
|
||||
|
||||
|
||||
# optimization parameters
|
||||
parser.add_argument('--model_dir', type=str, default=None)
|
||||
parser.add_argument('--model_S2L', type=str, default='warmClassifier.npz')
|
||||
parser.add_argument('--model_S2L_pkl', type=str, default=None)
|
||||
parser.add_argument('--model_L2S', type=str, default='warmCLM.npz')
|
||||
parser.add_argument('--model_L2S_pkl', type=str, default=None)
|
||||
parser.add_argument('--dual_style', type=str, default='all', help='all | S2L | L2S ')
|
||||
parser.add_argument('--optim', type=str, default='adadelta')
|
||||
|
||||
parser.add_argument('--minibatch', type=int, default=16, help='')
|
||||
parser.add_argument('--trade_off_S2L', type=float, default=5e-3, help='the consistency trade-off (S2L)')
|
||||
parser.add_argument('--trade_off_L2S', type=float, default=5e-3, help='the consistency trade-off (L2S)')
|
||||
parser.add_argument('--clip_S2L', type=float, default=-1., help='gradient clip S2L')
|
||||
parser.add_argument('--clip_L2S', type=float, default=5., help='gradient clip L2S')
|
||||
parser.add_argument('--bias', type=float, default=0.02, help='the bias')
|
||||
parser.add_argument('--FreezeEmb', dest='FreezeEmb', action='store_true', help='FreezeEmb')
|
||||
parser.add_argument('--lrS2L', type=float, default=0.1, help='')
|
||||
parser.add_argument('--lrL2S', type=float, default=0.1, help='learning rate for the L2S model')
|
||||
parser.add_argument('--lrate', type=float, default=0.1, help='learning rate passed to the optimizer update')
|
||||
parser.add_argument('--maxEpoch', type=int, default=100, help='')
|
||||
parser.add_argument('--validFreq', type=int, default=2000, help='')
|
||||
parser.add_argument('--classifier_drop_in', type=float, default=0.8, help='classifier_drop_in')
|
||||
parser.add_argument('--classifier_drop_out', type=float, default=0.5, help='classifier_drop_out')
|
||||
parser.add_argument('--CLM_drop_in', type=float, default=0.5, help='CLM_drop_in')
|
||||
parser.add_argument('--CLM_drop_out', type=float, default=0.5, help='CLM_drop_out')
|
||||
|
||||
config_params = parser.parse_args()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
Please download the files from
|
||||
https://www.dropbox.com/sh/j9l5hhnjsyhtd02/AABMk8m6b_8tS8fuURqk66zCa?dl=0
|
|
@ -0,0 +1,6 @@
|
|||
from monitor import *
|
||||
|
||||
|
||||
runner = monitor()
|
||||
print 'valid classifier', runner.valid_S2L()
|
||||
print 'valid CLM:', runner.valid_L2S()
|
|
@ -0,0 +1,24 @@
|
|||
# Copyright (c) 2007, 2010, 2011, 2012 Godefroid Chapelle
|
||||
#
|
||||
# This file is part of ipdb.
|
||||
# GNU package is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU General Public License as published by the Free
|
||||
# Software Foundation, either version 2 of the License, or (at your option)
|
||||
# any later version.
|
||||
#
|
||||
# GNU package is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
# for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.
|
||||
|
||||
from __main__ import set_trace, post_mortem, pm, run, runcall, runeval, launch_ipdb_on_exception
|
||||
|
||||
pm # please pyflakes
|
||||
post_mortem # please pyflakes
|
||||
run # please pyflakes
|
||||
runcall # please pyflakes
|
||||
runeval # please pyflakes
|
||||
set_trace # please pyflakes
|
||||
launch_ipdb_on_exception # please pyflakes
|
|
@ -0,0 +1,184 @@
|
|||
# Copyright (c) 2011, 2012 Godefroid Chapelle
|
||||
#
|
||||
# This file is part of ipdb.
|
||||
# GNU package is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU General Public License as published by the Free
|
||||
# Software Foundation, either version 2 of the License, or (at your option)
|
||||
# any later version.
|
||||
#
|
||||
# GNU package is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
# for more details.
|
||||
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import os
|
||||
import traceback
|
||||
|
||||
from contextlib import contextmanager
|
||||
|
||||
try:
|
||||
from pdb import Restart
|
||||
except ImportError:
|
||||
class Restart(Exception):
|
||||
pass
|
||||
|
||||
import IPython
|
||||
|
||||
if IPython.__version__ > '0.10.2':
|
||||
from IPython.core.debugger import Pdb, BdbQuit_excepthook
|
||||
try:
|
||||
get_ipython
|
||||
except NameError:
|
||||
# Make it more resilient to different versions of IPython and try to
|
||||
# find a module.
|
||||
possible_modules = ['IPython.terminal.embed', # Newer IPython
|
||||
'IPython.frontend.terminal.embed'] # Older IPython
|
||||
|
||||
count = len(possible_modules)
|
||||
for module in possible_modules:
|
||||
try:
|
||||
embed = __import__(module, fromlist=["InteractiveShellEmbed"])
|
||||
InteractiveShellEmbed = embed.InteractiveShellEmbed
|
||||
except ImportError:
|
||||
count -= 1
|
||||
if count == 0:
|
||||
raise
|
||||
else:
|
||||
break
|
||||
|
||||
ipshell = InteractiveShellEmbed()
|
||||
def_colors = ipshell.colors
|
||||
else:
|
||||
def_colors = get_ipython.im_self.colors
|
||||
|
||||
from IPython.utils import io
|
||||
|
||||
if 'nose' in sys.modules.keys():
|
||||
def update_stdout():
|
||||
# setup stdout to ensure output is available with nose
|
||||
io.stdout = sys.stdout = sys.__stdout__
|
||||
else:
|
||||
def update_stdout():
|
||||
pass
|
||||
else:
|
||||
from IPython.Debugger import Pdb, BdbQuit_excepthook
|
||||
from IPython.Shell import IPShell
|
||||
from IPython import ipapi
|
||||
|
||||
ip = ipapi.get()
|
||||
if ip is None:
|
||||
IPShell(argv=[''])
|
||||
ip = ipapi.get()
|
||||
def_colors = ip.options.colors
|
||||
|
||||
from IPython.Shell import Term
|
||||
|
||||
if 'nose' in sys.modules.keys():
|
||||
def update_stdout():
|
||||
# setup stdout to ensure output is available with nose
|
||||
Term.cout = sys.stdout = sys.__stdout__
|
||||
else:
|
||||
def update_stdout():
|
||||
pass
|
||||
|
||||
|
||||
def wrap_sys_excepthook():
|
||||
# make sure we wrap it only once or we would end up with a cycle
|
||||
# BdbQuit_excepthook.excepthook_ori == BdbQuit_excepthook
|
||||
if sys.excepthook != BdbQuit_excepthook:
|
||||
BdbQuit_excepthook.excepthook_ori = sys.excepthook
|
||||
sys.excepthook = BdbQuit_excepthook
|
||||
|
||||
|
||||
def set_trace(frame=None):
|
||||
update_stdout()
|
||||
wrap_sys_excepthook()
|
||||
if frame is None:
|
||||
frame = sys._getframe().f_back
|
||||
Pdb(def_colors).set_trace(frame)
|
||||
|
||||
|
||||
def post_mortem(tb):
|
||||
update_stdout()
|
||||
wrap_sys_excepthook()
|
||||
p = Pdb(def_colors)
|
||||
p.reset()
|
||||
if tb is None:
|
||||
return
|
||||
p.interaction(None, tb)
|
||||
|
||||
|
||||
def pm():
|
||||
post_mortem(sys.last_traceback)
|
||||
|
||||
|
||||
def run(statement, globals=None, locals=None):
|
||||
Pdb(def_colors).run(statement, globals, locals)
|
||||
|
||||
|
||||
def runcall(*args, **kwargs):
|
||||
return Pdb(def_colors).runcall(*args, **kwargs)
|
||||
|
||||
|
||||
def runeval(expression, globals=None, locals=None):
|
||||
return Pdb(def_colors).runeval(expression, globals, locals)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def launch_ipdb_on_exception():
|
||||
try:
|
||||
yield
|
||||
except Exception:
|
||||
e, m, tb = sys.exc_info()
|
||||
print(m.__repr__(), file=sys.stderr)
|
||||
post_mortem(tb)
|
||||
finally:
|
||||
pass
|
||||
|
||||
|
||||
def main():
|
||||
if not sys.argv[1:] or sys.argv[1] in ("--help", "-h"):
|
||||
print("usage: ipdb.py scriptfile [arg] ...")
|
||||
sys.exit(2)
|
||||
|
||||
mainpyfile = sys.argv[1] # Get script filename
|
||||
if not os.path.exists(mainpyfile):
|
||||
print('Error:', mainpyfile, 'does not exist')
|
||||
sys.exit(1)
|
||||
|
||||
del sys.argv[0] # Hide "pdb.py" from argument list
|
||||
|
||||
# Replace pdb's dir with script's dir in front of module search path.
|
||||
sys.path[0] = os.path.dirname(mainpyfile)
|
||||
|
||||
# Note on saving/restoring sys.argv: it's a good idea when sys.argv was
|
||||
# modified by the script being debugged. It's a bad idea when it was
|
||||
# changed by the user from the command line. There is a "restart" command
|
||||
# which allows explicit specification of command line arguments.
|
||||
pdb = Pdb(def_colors)
|
||||
while 1:
|
||||
try:
|
||||
pdb._runscript(mainpyfile)
|
||||
if pdb._user_requested_quit:
|
||||
break
|
||||
print("The program finished and will be restarted")
|
||||
except Restart:
|
||||
print("Restarting", mainpyfile, "with arguments:")
|
||||
print("\t" + " ".join(sys.argv[1:]))
|
||||
except SystemExit:
|
||||
# In most cases SystemExit does not warrant a post-mortem session.
|
||||
print("The program exited via sys.exit(). Exit status: ", end='')
|
||||
print(sys.exc_info()[1])
|
||||
except:
|
||||
traceback.print_exc()
|
||||
print("Uncaught exception. Entering post mortem debugging")
|
||||
print("Running 'cont' or 'step' will restart the program")
|
||||
t = sys.exc_info()[2]
|
||||
pdb.interaction(None, t)
|
||||
print("Post mortem debugger finished. The " + mainpyfile +
|
||||
" will be restarted")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -0,0 +1,128 @@
|
|||
(dp1
|
||||
S'monitor_grad'
|
||||
p2
|
||||
I00
|
||||
sS'dropout_output'
|
||||
p3
|
||||
F0.5
|
||||
sS'n_words'
|
||||
p4
|
||||
I10000
|
||||
sS'start_epoch'
|
||||
p5
|
||||
I0
|
||||
sS'dataset'
|
||||
p6
|
||||
S'text'
|
||||
p7
|
||||
sS'patience'
|
||||
p8
|
||||
I10
|
||||
sS'skip_steps2'
|
||||
p9
|
||||
I-1
|
||||
sS'hier_len'
|
||||
p10
|
||||
NsS'max_epochs'
|
||||
p11
|
||||
I5000
|
||||
sS'dispFreq'
|
||||
p12
|
||||
I50
|
||||
sS'newDumpFreq'
|
||||
p13
|
||||
I5000000
|
||||
sS'self'
|
||||
p14
|
||||
NsS'hybrid'
|
||||
p15
|
||||
I00
|
||||
sS'clip_c'
|
||||
p16
|
||||
F-1
|
||||
sS'dim_proj'
|
||||
p17
|
||||
I1024
|
||||
sS'saveto'
|
||||
p18
|
||||
S'model.npz'
|
||||
p19
|
||||
sS'start_iter'
|
||||
p20
|
||||
I0
|
||||
sS'lastHiddenLayer'
|
||||
p21
|
||||
NsS'noise_std'
|
||||
p22
|
||||
F0
|
||||
sS'batch_len_threshold'
|
||||
p23
|
||||
NsS'valid_batch_size'
|
||||
p24
|
||||
I16
|
||||
sS'corpus'
|
||||
p25
|
||||
S'imdb.pkl'
|
||||
p26
|
||||
sS'reload_options'
|
||||
p27
|
||||
NsS'optimizer'
|
||||
p28
|
||||
S'adadelta'
|
||||
p29
|
||||
sS'validFreq'
|
||||
p30
|
||||
I2000
|
||||
sS'dropout_input'
|
||||
p31
|
||||
F0.80000000000000004
|
||||
sS'warm_LM'
|
||||
p32
|
||||
NsS'batch_size'
|
||||
p33
|
||||
I16
|
||||
sS'encoder'
|
||||
p34
|
||||
S'lstm'
|
||||
p35
|
||||
sS'hierarchical'
|
||||
p36
|
||||
I00
|
||||
sS'reload_model'
|
||||
p37
|
||||
S'winner/warmClassifier.npz'
|
||||
p38
|
||||
sS'lrate'
|
||||
p39
|
||||
F1
|
||||
sS'truncate_grad'
|
||||
p40
|
||||
I-1
|
||||
sS'decay_c'
|
||||
p41
|
||||
F-1
|
||||
sS'encoder2'
|
||||
p42
|
||||
NsS'test_size'
|
||||
p43
|
||||
NsS'dim_word'
|
||||
p44
|
||||
I500
|
||||
sS'unit_depth'
|
||||
p45
|
||||
I-1
|
||||
sS'maxlen'
|
||||
p46
|
||||
NsS'skip_steps'
|
||||
p47
|
||||
I-1
|
||||
sS'embedding'
|
||||
p48
|
||||
NsS'logFile'
|
||||
p49
|
||||
S'log2'
|
||||
p50
|
||||
sS'mean_pooling'
|
||||
p51
|
||||
I00
|
||||
s.
|
|
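The block above is a protocol-0 Python pickle of the saved model options (an ordinary dict holding dropout rates, dim_proj, optimizer, reload_model path, and so on). A minimal sketch for inspecting it; the file name is a placeholder:

import cPickle as pkl

with open('model.npz.pkl', 'rb') as f:  # hypothetical name of the pickled options file
    options = pkl.load(f)
print options['dim_proj'], options['optimizer'], options['reload_model']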
@ -0,0 +1,209 @@
|
|||
from config import config_params
|
||||
import os
|
||||
os.environ['THEANO_FLAGS']='floatX=float32,device=cuda%d' % (config_params.gpu)
|
||||
if os.name == 'nt':
|
||||
cmdstr = '\"C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe\" '
|
||||
os.system(cmdstr)
|
||||
else:
|
||||
os.system(r'nvidia-smi')
|
||||
|
||||
from CLM.CLM import CLM_worker
|
||||
from Classifier.Models import Model as Classifier
|
||||
import theano
|
||||
import theano.tensor as tensor
|
||||
import numpy
|
||||
from Util_basic import sgd_joint, prepare_data_x, unzip, itemlist_NoEmb, adadelta_joint, Optim
|
||||
from Data import load_data, get_minibatches_idx, get_minibatches_idx_bucket
|
||||
from collections import OrderedDict
|
||||
|
||||
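# grad_clipping rescales the whole gradient list when its global L2 norm exceeds clip_c,
# and also returns that norm so training can log it.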
def grad_clipping(grads, clip_c):
|
||||
g2 = 0.
|
||||
for g in grads:
|
||||
g2 += (g**2).sum()
|
||||
new_grads = []
|
||||
for g in grads:
|
||||
new_grads.append(tensor.switch(g2 > (clip_c**2), g/tensor.sqrt(g2) * clip_c, g))
|
||||
return new_grads, tensor.sqrt(g2)
|
||||
|
||||
class monitor(object):
|
||||
def __init__(self):
|
||||
print config_params
|
||||
self.CLM = CLM_worker(lrate=1.,
|
||||
optimizer='adadelta',
|
||||
batch_size=config_params.minibatch,
|
||||
saveto='model.npz',
|
||||
validFreq=2000,
|
||||
dispFreq=100,
|
||||
dropout_input=config_params.CLM_drop_in,
|
||||
dropout_output=config_params.CLM_drop_out,
|
||||
reload_model=config_params.model_dir + '/' + config_params.model_L2S,
|
||||
reload_option=None,
|
||||
log='log1'
|
||||
)
|
||||
self.classifier = Classifier(lrate=1., # Learning rate for sgd (not used for adadelta and rmsprop)
|
||||
optimizer='adadelta',
|
||||
saveto='model.npz', # The best model will be saved there
|
||||
dispFreq=50, # Display the training progress after this number of updates
|
||||
validFreq=2000, # Compute the validation error after this number of updates
|
||||
batch_size=config_params.minibatch, # The batch size during training.
|
||||
batch_len_threshold=None, # use dynamic batch size if sequence lengths exceed this threshold
|
||||
valid_batch_size=config_params.minibatch, # The batch size used for validation/test set.
|
||||
lastHiddenLayer=None,
|
||||
dropout_output=config_params.classifier_drop_out,
|
||||
dropout_input=config_params.classifier_drop_in,
|
||||
reload_options=None, # Path to a saved model options we want to start from
|
||||
reload_model=config_params.model_dir + '/' + config_params.model_S2L,
|
||||
embedding=None, # Path to the word embedding file (otherwise randomized)
|
||||
warm_LM=None,
|
||||
logFile='log2' # Path to log file
|
||||
)
|
||||
self.trainSet, self.validSet, self.testSet = \
|
||||
load_data(path=config_params.data_dir, n_words=10000, maxlen=None, sort_by_len=True, fixed_valid=True)
|
||||
self.LMscore = numpy.load(config_params.LMScoreFile)
|
||||
self.LMscore = self.LMscore[self.LMscore.files[0]]
|
||||
self.build()
|
||||
|
||||
def build(self):
|
||||
LMsores = tensor.vector('LMScore', dtype='float32')
|
||||
lrate = tensor.scalar(dtype='float32')
|
||||
|
||||
CLM_srcx, CLM_srcx_mask, CLM_ctx_, CLM_cost, CLM_sentenceLen = self.CLM.GetNll()
|
||||
classifier_x, classifier_mask, classifier_y, classifier_nlls = self.classifier.GetNll()
|
||||
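# Duality regularizer of dual supervised learning: penalize the squared mismatch between the two
# factorizations of log p(x, y), with log(0.5) standing in for the uniform prior over the two
# sentiment labels and the precomputed LMScore supplying the marginal language-model term
# (classifier terms are normalized by sentence length to match the per-word CLM cost).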
consistent_loss = (((classifier_nlls + numpy.log(0.5))/CLM_sentenceLen + LMsores - CLM_cost) ** 2).mean()
|
||||
CLM_cost_avg = CLM_cost.mean()
|
||||
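# note: the consistency term enters with the trade-off coefficient applied squared
# (trade_off_L2S * trade_off_L2S here, and likewise trade_off_S2L below)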
overall_L2S = CLM_cost_avg + config_params.trade_off_L2S * config_params.trade_off_L2S * consistent_loss
|
||||
classifier_nlls_avg = classifier_nlls.mean()
|
||||
overall_S2L = classifier_nlls_avg + config_params.trade_off_S2L * config_params.trade_off_S2L * consistent_loss
|
||||
|
||||
if config_params.FreezeEmb:
|
||||
grads_L2S = tensor.grad(overall_L2S, wrt=itemlist_NoEmb(self.CLM.tparams))
|
||||
else:
|
||||
grads_L2S = tensor.grad(overall_L2S, wrt=self.CLM.tparams.values())
|
||||
if config_params.clip_L2S > 0.:
|
||||
grads_L2S, norm_grads_L2S = grad_clipping(grads_L2S, config_params.clip_L2S)
|
||||
else:
|
||||
norm_grads_L2S = tensor.alloc(-1.)
|
||||
|
||||
if config_params.FreezeEmb:
|
||||
grads_S2L = tensor.grad(overall_S2L, wrt=itemlist_NoEmb(self.classifier.tparams))
|
||||
else:
|
||||
grads_S2L = tensor.grad(overall_S2L, wrt=self.classifier.tparams.values())
|
||||
if config_params.clip_S2L > 0.:
|
||||
grads_S2L, norm_grads_S2L = grad_clipping(grads_S2L, config_params.clip_S2L)
|
||||
else:
|
||||
norm_grads_S2L = tensor.alloc(-1.)
|
||||
|
||||
if config_params.dual_style == 'all':
|
||||
merged_var_dic = OrderedDict()
|
||||
if config_params.FreezeEmb:
|
||||
merged_var_dic.update(OrderedDict((k + '_S2L',v) for (k,v) in self.classifier.tparams.iteritems() if 'Wemb' not in k ))
|
||||
merged_var_dic.update(OrderedDict((k + '_L2S',v) for (k,v) in self.CLM.tparams.iteritems() if 'Wemb' not in k ))
|
||||
else:
|
||||
merged_var_dic.update(OrderedDict((k + '_S2L',v) for (k,v) in self.classifier.tparams.iteritems()))
|
||||
merged_var_dic.update(OrderedDict((k + '_L2S',v) for (k,v) in self.CLM.tparams.iteritems()))
|
||||
|
||||
inps = [CLM_srcx, CLM_srcx_mask, CLM_ctx_, classifier_x, classifier_mask, classifier_y, LMsores]
|
||||
outs = [CLM_cost_avg, classifier_nlls_avg, consistent_loss, overall_L2S, overall_S2L, norm_grads_L2S, norm_grads_S2L]
|
||||
self.f_grad_shared, self.f_update = Optim[config_params.optim](lrate, merged_var_dic, grads_S2L + grads_L2S, inps, outs)
|
||||
elif config_params.dual_style == 'S2L':
|
||||
if config_params.FreezeEmb:
|
||||
merged_var_dic = OrderedDict((k + '_S2L',v) for (k,v) in self.classifier.tparams.iteritems() if 'Wemb' not in k )
|
||||
else:
|
||||
merged_var_dic = OrderedDict((k + '_S2L',v) for (k,v) in self.classifier.tparams.iteritems())
|
||||
|
||||
inps = [CLM_srcx, CLM_srcx_mask, CLM_ctx_, classifier_x, classifier_mask, classifier_y, LMsores]
|
||||
norm_grads_L2S = tensor.alloc(-1.)
|
||||
outs = [CLM_cost_avg, classifier_nlls_avg, consistent_loss, overall_L2S, overall_S2L, norm_grads_L2S, norm_grads_S2L]
|
||||
self.f_grad_shared, self.f_update = Optim[config_params.optim](lrate, merged_var_dic, grads_S2L, inps, outs)
|
||||
elif config_params.dual_style == 'L2S':
|
||||
if config_params.FreezeEmb:
|
||||
merged_var_dic = OrderedDict((k + '_L2S',v) for (k,v) in self.CLM.tparams.iteritems() if 'Wemb' not in k )
|
||||
else:
|
||||
merged_var_dic = OrderedDict((k + '_L2S',v) for (k,v) in self.CLM.tparams.iteritems())
|
||||
|
||||
inps = [CLM_srcx, CLM_srcx_mask, CLM_ctx_, classifier_x, classifier_mask, classifier_y, LMsores]
|
||||
norm_grads_S2L = tensor.alloc(-1.)
|
||||
outs = [CLM_cost_avg, classifier_nlls_avg, consistent_loss, overall_L2S, overall_S2L, norm_grads_L2S, norm_grads_S2L]
|
||||
self.f_grad_shared, self.f_update = Optim[config_params.optim](lrate, merged_var_dic, grads_L2S, inps, outs)
|
||||
else:
|
||||
raise Exception('Unsupported dual_style: {}'.format(config_params.dual_style))
|
||||
|
||||
def train_one_minibatch(self, seqx, seqy, LMscore):
|
||||
CLM_x, CLM_xmask = prepare_data_x(seqx, pad_eos=True)
|
||||
labels = numpy.array(seqy).astype('int64')
|
||||
classifier_x, classifier_xmask = prepare_data_x(seqx, pad_eos=False)
|
||||
CLM_cost_avg, classifier_nlls_avg, consistent_loss, overall_L2S, overall_S2L, norm_grads_L2S, norm_grads_S2L = self.f_grad_shared(
|
||||
CLM_x, CLM_xmask, labels, classifier_x, classifier_xmask, labels, LMscore
|
||||
)
|
||||
print 'CLM_cost_avg=%f, classifier_nlls_avg=%f, norm_grads_L2S=%f, norm_grads_S2L=%f, consistent_loss=%f,' \
|
||||
' overall_L2S=%f, overall_S2L=%f' % (
|
||||
CLM_cost_avg, classifier_nlls_avg, norm_grads_L2S, norm_grads_S2L, consistent_loss, overall_L2S, overall_S2L )
|
||||
self.f_update(config_params.lrate)
|
||||
|
||||
def train(self):
|
||||
uidx = 0
|
||||
for eidx in xrange(0, config_params.maxEpoch):
|
||||
n_samples = 0
|
||||
self.kf_train = get_minibatches_idx_bucket(self.trainSet[0],config_params.minibatch,shuffle=True)
|
||||
|
||||
for _, train_index in self.kf_train:
|
||||
uidx += 1
|
||||
self.classifier.use_noise.set_value(1.)
|
||||
self.CLM.use_noise.set_value(1.)
|
||||
|
||||
# Select the random examples for this minibatch
|
||||
seqx = [self.trainSet[0][t] for t in train_index]
|
||||
seqy = [self.trainSet[1][t] for t in train_index]
|
||||
LMscore = [self.LMscore[t] for t in train_index]
|
||||
self.train_one_minibatch(seqx, seqy, numpy.array(LMscore).astype('float32'))
|
||||
|
||||
if uidx % config_params.validFreq == 0:
|
||||
self.classifier.use_noise.set_value(0.)
|
||||
self.CLM.use_noise.set_value(0.)
|
||||
|
||||
if config_params.dual_style == 'all':
|
||||
suffix_S2L = self.valid_S2L()
|
||||
suffix_L2S = self.valid_L2S()
|
||||
|
||||
S2Lpath = config_params.model_dir + '/model_S2L_' + suffix_S2L + '_uidx' + str(uidx)
|
||||
L2Spath = config_params.model_dir + '/model_L2S_' + suffix_L2S + '_uidx' + str(uidx)
|
||||
|
||||
numpy.savez(S2Lpath, history_errs=[], **unzip(self.classifier.tparams) )
|
||||
numpy.savez(L2Spath, history_errs=[], **unzip(self.CLM.tparams) )
|
||||
elif config_params.dual_style == 'S2L':
|
||||
suffix_S2L = self.valid_S2L()
|
||||
S2Lpath = config_params.model_dir + '/model_S2L_' + suffix_S2L + '_uidx' + str(uidx)
|
||||
numpy.savez(S2Lpath, history_errs=[], **unzip(self.classifier.tparams) )
|
||||
elif config_params.dual_style == 'L2S':
|
||||
suffix_L2S = self.valid_L2S()
|
||||
L2Spath = config_params.model_dir + '/model_L2S_' + suffix_L2S + '_uidx' + str(uidx)
|
||||
numpy.savez(L2Spath, history_errs=[], **unzip(self.CLM.tparams) )
|
||||
|
||||
|
||||
def valid_S2L(self):
|
||||
acc = self.classifier.evaluate(self.trainSet, self.validSet, self.testSet)
|
||||
print 'TrainAcc=%f, ValidAcc=%f, TestAcc=%f' % (acc[0], acc[1], acc[2])
|
||||
return 'train_{}_valid_{}_test_{}'.format(acc[0], acc[1], acc[2])
|
||||
|
||||
def valid_L2S(self):
|
||||
valid_ppl, test_ppl = self.CLM.evaluate(self.validSet, self.testSet)
|
||||
print 'Valid_PPL=%f, Test_PPL=%f' % (valid_ppl, test_ppl)
|
||||
return 'valid_{}_test_{}'.format(valid_ppl, test_ppl)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
runner = monitor()
|
||||
runner.train()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1 @@
|
|||
python monitor.py --classifier_drop_in=0.8 --classifier_drop_out=0.5 --clip_L2S=5.0 --trade_off_S2L=5.0 --trade_off_L2S=5.0 --validFreq=5000 --model_dir=your_model_folder --lrS2L=0.1 --lrL2S=0.1 --lrate=0.1 --bias=0.0 --dual_style=all --optim=adadelta
|
|
@ -0,0 +1 @@
|
|||
python monitor.py --classifier_drop_in=0.8 --classifier_drop_out=0.5 --clip_L2S=5.0 --trade_off_S2L=5.0 --trade_off_L2S=5.0 --validFreq=5000 --model_dir=Sentiment_model --lrS2L=0.1 --lrL2S=0.1 --lrate=0.1 --bias=0.0 --dual_style=all --optim=adadelta
|
|
@ -0,0 +1 @@
|
|||
python inference.py --model_dir=winner --model_S2L=warmClassifier.npz --model_L2S=warmCLM.npz
|
|
@ -0,0 +1 @@
|
|||
python inference.py --model_dir=winner --model_S2L=warmClassifier.npz --model_L2S=warmCLM.npz --gpu=3
|
|
@ -12,3 +12,7 @@ provided by the bot. You will only need to do this once across all repos using o
|
|||
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
|
||||
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
|
||||
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
|
||||
|
||||
The code consists of two parts:
|
||||
(1) dual supervised learning for image processing: DSL_ImgProcess
|
||||
(2) dual supervised learning for sentiment analysis: DSL_SentimentAnalysis
|