Updated examples to use the latest progress writer API

NORTHAMERICA\vistepan 2017-02-22 16:37:37 -08:00
Parent 52ebff65e0
Commit 6256c6e355
33 changed files with 275 additions and 474 deletions
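
For reference, every file below follows the same migration: progress writers (ProgressPrinter, and optionally TensorBoardProgressWriter) are now passed to the Trainer constructor, train_minibatch lets the trainer log progress by itself, and the per-epoch summary comes from trainer.summarize_training_progress() instead of ProgressPrinter.update_with_trainer() / epoch_summary(). Below is a minimal self-contained sketch of the new style; it assumes the CNTK 2.0 beta package layout used by these examples (cntk.learner, cntk.layers, cntk.ops, cntk.utils), and the tiny random-data classifier is illustrative only, not taken from any file in this commit.

import numpy as np
from cntk import Trainer
from cntk.learner import sgd, learning_rate_schedule, UnitType
from cntk.layers import Dense
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error
from cntk.utils import ProgressPrinter

input_dim, num_classes = 4, 2
features = input_variable(input_dim, np.float32)
label = input_variable(num_classes, np.float32)
z = Dense(num_classes)(features)                        # toy one-layer classifier
ce = cross_entropy_with_softmax(z, label)
pe = classification_error(z, label)

# New style: the writer is handed to the Trainer instead of being driven by hand.
progress_printer = ProgressPrinter(freq=16, tag='Training')
lr_per_minibatch = learning_rate_schedule(0.125, UnitType.minibatch)
trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch), progress_printer)

for i in range(64):
    x = np.random.rand(25, input_dim).astype(np.float32)
    y = np.eye(num_classes, dtype=np.float32)[np.random.randint(0, num_classes, size=25)]
    trainer.train_minibatch({features: x, label: y})    # the writer logs automatically

trainer.summarize_training_progress()                   # replaces progress_printer.epoch_summary()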

View file

@ -109,7 +109,8 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
unit_gain = True,
l2_regularization_weight = l2_reg_weight)
trainer = cntk.Trainer(z, (ce, pe), learner)
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
trainer = cntk.Trainer(z, (ce, pe), learner, progress_printer)
# define mapping from reader streams to network inputs
input_map = {
@ -118,7 +119,6 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
# perform model training
for epoch in range(max_epochs): # loop over epochs
@ -127,9 +127,8 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
data = reader_train.next_minibatch(min(minibatch_size, epoch_size-sample_count), input_map=input_map) # fetch minibatch.
trainer.train_minibatch(data) # update model with it
sample_count += trainer.previous_minibatch_sample_count # count samples processed so far
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
progress_printer.epoch_summary(with_metric=True)
trainer.summarize_training_progress()
z.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))
### Evaluation action

View file

@ -64,6 +64,7 @@ def convnet_cifar10(debug_output=False):
# training config
epoch_size = 50000 # for now we manually specify epoch size
minibatch_size = 64
max_epochs = 30
# Set learning parameters
lr_per_sample = [0.0015625]*10 + [0.00046875]*10 + [0.00015625]
@ -75,7 +76,8 @@ def convnet_cifar10(debug_output=False):
# Instantiate the trainer object to drive the model training
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
l2_regularization_weight = l2_reg_weight)
trainer = cntk.Trainer(z, (ce, pe), learner)
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
trainer = cntk.Trainer(z, (ce, pe), learner, progress_printer)
# define mapping from reader streams to network inputs
input_map = {
@ -84,8 +86,6 @@ def convnet_cifar10(debug_output=False):
}
cntk.utils.log_number_of_parameters(z) ; print()
max_epochs = 30
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
# Get minibatches of images to train with and perform model training
for epoch in range(max_epochs): # loop over epochs
@ -94,9 +94,8 @@ def convnet_cifar10(debug_output=False):
data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map) # fetch minibatch.
trainer.train_minibatch(data) # update model with it
sample_count += trainer.previous_minibatch_sample_count # count samples processed so far
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
progress_printer.epoch_summary(with_metric=True)
trainer.summarize_training_progress()
z.save(os.path.join(model_path, "ConvNet_CIFAR10_{}.dnn".format(epoch)))
# Load test data

View file

@ -87,7 +87,8 @@ def convnet_cifar10_dataaug(reader_train, reader_test, epoch_size = 50000, max_e
# trainer object
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
l2_regularization_weight = l2_reg_weight)
trainer = cntk.Trainer(z, (ce, pe), learner)
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
trainer = cntk.Trainer(z, (ce, pe), learner, progress_printer)
# define mapping from reader streams to network inputs
input_map = {
@ -96,7 +97,6 @@ def convnet_cifar10_dataaug(reader_train, reader_test, epoch_size = 50000, max_e
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
# perform model training
for epoch in range(max_epochs): # loop over epochs
@ -105,9 +105,8 @@ def convnet_cifar10_dataaug(reader_train, reader_test, epoch_size = 50000, max_e
data = reader_train.next_minibatch(min(minibatch_size, epoch_size-sample_count), input_map=input_map) # fetch minibatch.
trainer.train_minibatch(data) # update model with it
sample_count += trainer.previous_minibatch_sample_count # count samples processed so far
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
progress_printer.epoch_summary(with_metric=True)
trainer.summarize_training_progress()
z.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))
### Evaluation action

View file

@ -150,25 +150,25 @@ def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64,
network = create_conv_network()
progress_printer = cntk.utils.ProgressPrinter(
progress_writers = [cntk.utils.ProgressPrinter(
freq=num_mbs_per_log,
tag='Training',
log_to_file=log_to_file,
rank=cntk.distributed.Communicator.rank(),
gen_heartbeat=gen_heartbeat,
num_epochs=max_epochs)
num_epochs=max_epochs)]
tensorboard_writer = cntk.utils.TensorBoardProgressWriter(
freq=num_mbs_per_log,
log_dir=tensorboard_logdir if tensorboard_logdir is not None else 'log',
rank=cntk.distributed.Communicator.rank(),
model=network['output'])
if tensorboard_logdir is not None:
progress_writers.append(cntk.utils.TensorBoardProgressWriter(
freq=num_mbs_per_log,
log_dir=tensorboard_logdir,
rank=cntk.distributed.Communicator.rank(),
model=network['output']))
trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, [progress_printer, tensorboard_writer])
trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers)
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, minibatch_size,
epoch_size, restore, profiling)
train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore, profiling)
if __name__=='__main__':
@ -179,7 +179,7 @@ if __name__=='__main__':
parser.add_argument('-datadir', '--datadir', help='Data directory where the CIFAR dataset is located', required=False, default=data_path)
parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
parser.add_argument('-tensorboard_logdir', '--tensorboard_logdir', help='Directory where to tensorboard logs should be written', required=False, default='log')
parser.add_argument('-tensorboard_logdir', '--tensorboard_logdir', help='Directory where TensorBoard logs should be created', required=False, default=None)
parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='160')
parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='64')
parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='50000')
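
Restating the writer setup from the distributed ConvNet example above as an isolated, hedged fragment (network, num_mbs_per_log, log_to_file, gen_heartbeat, max_epochs, tensorboard_logdir and the create_trainer arguments are the names from that file, not new definitions): the ProgressPrinter receives the worker's rank so each distributed worker logs to its own file, and the TensorBoard writer is only appended when a log directory was requested on the command line.

progress_writers = [cntk.utils.ProgressPrinter(
    freq=num_mbs_per_log,
    tag='Training',
    log_to_file=log_to_file,
    rank=cntk.distributed.Communicator.rank(),    # each rank logs to its own file
    gen_heartbeat=gen_heartbeat,
    num_epochs=max_epochs)]
if tensorboard_logdir is not None:
    progress_writers.append(cntk.utils.TensorBoardProgressWriter(
        freq=num_mbs_per_log,
        log_dir=tensorboard_logdir,
        rank=cntk.distributed.Communicator.rank(),    # rank passed so workers do not write conflicting event files
        model=network['output']))
trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers)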

View file

@ -56,6 +56,7 @@ def convnet_mnist(debug_output=False):
# training config
epoch_size = 60000 # for now we manually specify epoch size
minibatch_size = 128
max_epochs = 40
# Set learning parameters
lr_per_sample = [0.001]*10 + [0.0005]*10 + [0.0001]
@ -65,7 +66,8 @@ def convnet_mnist(debug_output=False):
# Instantiate the trainer object to drive the model training
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule)
trainer = cntk.Trainer(z, (ce, pe), learner)
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
trainer = cntk.Trainer(z, (ce, pe), learner, progress_printer)
# define mapping from reader streams to network inputs
input_map = {
@ -74,8 +76,6 @@ def convnet_mnist(debug_output=False):
}
cntk.utils.log_number_of_parameters(z) ; print()
max_epochs = 40
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
# Get minibatches of images to train with and perform model training
for epoch in range(max_epochs): # loop over epochs
@ -84,9 +84,8 @@ def convnet_mnist(debug_output=False):
data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map) # fetch minibatch.
trainer.train_minibatch(data) # update model with it
sample_count += data[label_var].num_samples # count samples processed so far
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
progress_printer.epoch_summary(with_metric=True)
trainer.summarize_training_progress()
z.save(os.path.join(model_path, "ConvNet_MNIST_{}.dnn".format(epoch)))
# Load test data

View file

@ -4,16 +4,16 @@
# for full license information.
# ==============================================================================
import argparse
import numpy as np
import sys
import os
from cntk import Trainer, minibatch_size_schedule
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from cntk.device import cpu, set_default_device
from cntk.learner import sgd, learning_rate_schedule, UnitType
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, relu, element_times, constant
from cntk.utils import ProgressPrinter
from cntk.training_session import *
from cntk.utils import ProgressPrinter, TensorBoardProgressWriter
abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "common"))
@ -35,7 +35,7 @@ def create_reader(path, is_training, input_dim, label_dim):
# Creates and trains a feedforward classification model for MNIST images
def simple_mnist():
def simple_mnist(tensorboard_logdir=None):
input_dim = 784
num_output_classes = 10
num_hidden_layers = 1
@ -65,24 +65,27 @@ def simple_mnist():
label : reader_train.streams.labels
}
lr_per_minibatch=learning_rate_schedule(0.2, UnitType.minibatch)
# Get minibatches of images to train with and perform model training
# Training config
minibatch_size = 64
num_samples_per_sweep = 60000
num_sweeps_to_train_with = 10
#training_progress_output_freq = 100
progress_printer = ProgressPrinter(
# Instantiate progress writers.
#training_progress_output_freq = 100
progress_writers = [ProgressPrinter(
#freq=training_progress_output_freq,
tag='Training',
num_epochs=num_sweeps_to_train_with)
num_epochs=num_sweeps_to_train_with)]
if tensorboard_logdir is not None:
progress_writers.append(TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z))
# Instantiate the trainer object to drive the model training
trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch), progress_printer)
lr_per_minibatch = learning_rate_schedule(0.2, UnitType.minibatch)
trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch), progress_writers)
training_session(
trainer=trainer,
trainer=trainer,
mb_source = reader_train,
mb_size = minibatch_size,
var_to_stream = input_map,
@ -120,5 +123,10 @@ if __name__=='__main__':
# use the best available one, e.g.
# set_default_device(cpu())
error = simple_mnist()
parser = argparse.ArgumentParser()
parser.add_argument('-tensorboard_logdir', '--tensorboard_logdir',
help='Directory where TensorBoard logs should be created', required=False, default=None)
args = vars(parser.parse_args())
error = simple_mnist(args['tensorboard_logdir'])
print("Error: %f" % error)

View file

@ -7,11 +7,9 @@
from __future__ import print_function
import os
import argparse
import math
import numpy as np
from cntk.utils import *
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, reduce_mean
from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs
import cntk.io.transforms as xforms
from cntk import Trainer, cntk_py
@ -53,7 +51,8 @@ def create_reader(map_file, mean_file, train):
# Train and evaluate the network.
def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_epochs, profiler_dir=None, model_dir=None):
def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_epochs, profiler_dir=None,
model_dir=None, tensorboard_logdir=None):
set_computation_network_trace_level(0)
@ -85,10 +84,17 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)
# progress writers
progress_writers = [ProgressPrinter(tag='Training', num_epochs=max_epochs)]
tensorboard_writer = None
if tensorboard_logdir is not None:
tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z)
progress_writers.append(tensorboard_writer)
# trainer object
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule,
l2_regularization_weight = l2_reg_weight)
trainer = Trainer(z, (ce, pe), learner)
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule,
l2_regularization_weight = l2_reg_weight)
trainer = Trainer(z, (ce, pe), learner, progress_writers)
# define mapping from reader streams to network inputs
input_map = {
@ -97,7 +103,6 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
}
log_number_of_parameters(z) ; print()
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
# perform model training
if profiler_dir:
@ -109,8 +114,14 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
data = reader_train.next_minibatch(min(minibatch_size, epoch_size-sample_count), input_map=input_map) # fetch minibatch.
trainer.train_minibatch(data) # update model with it
sample_count += trainer.previous_minibatch_sample_count # count samples processed so far
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
progress_printer.epoch_summary(with_metric=True)
trainer.summarize_training_progress()
# Log mean of each parameter tensor, so that we can confirm that the parameters change indeed.
if tensorboard_writer:
for parameter in z.parameters:
tensorboard_writer.write_value(parameter.uid + "/mean", reduce_mean(parameter).eval(), epoch)
if model_dir:
z.save(os.path.join(model_dir, network_name + "_{}.dnn".format(epoch)))
enable_profiler() # begin to collect profiler data after first epoch
@ -126,7 +137,6 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
metric_numer = 0
metric_denom = 0
sample_count = 0
minibatch_index = 0
while sample_count < test_epoch_size:
current_minibatch = min(minibatch_size, test_epoch_size - sample_count)
@ -137,10 +147,9 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
metric_denom += current_minibatch
# Keep track of the number of samples processed so far.
sample_count += data[label_var].num_samples
minibatch_index += 1
print("")
print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
trainer.summarize_test_progress()
print("")
return metric_numer/metric_denom
@ -151,6 +160,7 @@ if __name__=='__main__':
parser.add_argument('-e', '--epochs', help='total epochs', required=False, default='160')
parser.add_argument('-p', '--profiler_dir', help='directory for saving profiler output', required=False, default=None)
parser.add_argument('-m', '--model_dir', help='directory for saving model', required=False, default=None)
parser.add_argument('-tensorboard_logdir', '--tensorboard_logdir', help='Directory where TensorBoard logs should be created', required=False, default=None)
args = vars(parser.parse_args())
epochs = int(args['epochs'])
@ -164,4 +174,5 @@ if __name__=='__main__':
reader_test = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False)
epoch_size = 50000
train_and_evaluate(reader_train, reader_test, network_name, epoch_size, epochs, args['profiler_dir'], model_dir)
train_and_evaluate(reader_train, reader_test, network_name, epoch_size, epochs, args['profiler_dir'], model_dir,
args['tensorboard_logdir'])
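
The ResNet example above also shows how to log custom scalars: TensorBoardProgressWriter.write_value(name, value, step) records an arbitrary value under a name at a given step. Restated as an isolated fragment (z, epoch and tensorboard_writer are the names from that file), it logs the mean of every parameter tensor once per epoch so TensorBoard can confirm the parameters are actually being updated.

from cntk.ops import reduce_mean

if tensorboard_writer:
    for parameter in z.parameters:
        # one scalar per parameter tensor, tagged by the parameter uid, indexed by epoch
        tensorboard_writer.write_value(parameter.uid + "/mean",
                                       reduce_mean(parameter).eval(),
                                       epoch)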

View file

@ -94,7 +94,7 @@ def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, b
learner = block_momentum_distributed_learner(local_learner, block_size=block_size)
else:
learner = data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)
return Trainer(network['output'], (network['ce'], network['pe']), learner, progress_printer)
# Train and test

View file

@ -159,21 +159,20 @@ def train_fast_rcnn(debug_output=False):
# Instantiate the trainer object
learner = momentum_sgd(frcn_output.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight)
trainer = Trainer(frcn_output, (ce, pe), learner)
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
trainer = Trainer(frcn_output, (ce, pe), learner, progress_printer)
# Get minibatches of images and perform model training
print("Training Fast R-CNN model for %s epochs." % max_epochs)
log_number_of_parameters(frcn_output)
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
for epoch in range(max_epochs): # loop over epochs
sample_count = 0
while sample_count < epoch_size: # loop over minibatches in the epoch
data = minibatch_source.next_minibatch(min(mb_size, epoch_size-sample_count), input_map=input_map)
trainer.train_minibatch(data) # update model with it
sample_count += trainer.previous_minibatch_sample_count # count samples processed so far
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
progress_printer.epoch_summary(with_metric=True)
trainer.summarize_training_progress()
if debug_output:
frcn_output.save(os.path.join(abs_path, "Output", "frcn_py_%s.model" % (epoch+1)))

View file

@ -62,7 +62,8 @@ def deconv_mnist(max_epochs=3):
# Instantiate the trainer object to drive the model training
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule, unit_gain=True)
trainer = cntk.Trainer(z, (rmse_loss, rmse_eval), learner)
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
trainer = cntk.Trainer(z, (rmse_loss, rmse_eval), learner, progress_printer)
# define mapping from reader streams to network inputs
input_map = {
@ -70,7 +71,6 @@ def deconv_mnist(max_epochs=3):
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
# Get minibatches of images to train with and perform model training
for epoch in range(max_epochs): # loop over epochs
@ -79,9 +79,8 @@ def deconv_mnist(max_epochs=3):
data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map) # fetch minibatch.
trainer.train_minibatch(data) # update model with it
sample_count += data[input_var].num_samples # count samples processed so far
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
progress_printer.epoch_summary(with_metric=True)
trainer.summarize_training_progress()
z.save(os.path.join(model_path, "07_Deconvolution_PY_{}.model".format(epoch)))
# rename final model

View file

@ -114,23 +114,22 @@ def train_model(base_model_file, feature_node_name, last_hidden_node_name,
lr_schedule = learning_rate_schedule(lr_per_mb, unit=UnitType.minibatch)
mm_schedule = momentum_schedule(momentum_per_mb)
learner = momentum_sgd(tl_model.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight)
trainer = Trainer(tl_model, (ce, pe), learner)
progress_printer = ProgressPrinter(tag='Training', num_epochs=num_epochs)
trainer = Trainer(tl_model, (ce, pe), learner, progress_printer)
# Get minibatches of images and perform model training
print("Training transfer learning model for {0} epochs (epoch_size = {1}).".format(num_epochs, epoch_size))
log_number_of_parameters(tl_model)
progress_printer = ProgressPrinter(tag='Training', num_epochs=num_epochs)
for epoch in range(num_epochs): # loop over epochs
sample_count = 0
while sample_count < epoch_size: # loop over minibatches in the epoch
data = minibatch_source.next_minibatch(min(mb_size, epoch_size-sample_count), input_map=input_map)
trainer.train_minibatch(data) # update model with it
sample_count += trainer.previous_minibatch_sample_count # count samples processed so far
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
if sample_count % (100 * mb_size) == 0:
print ("Processed {0} samples".format(sample_count))
progress_printer.epoch_summary(with_metric=True)
trainer.summarize_training_progress()
return tl_model

View file

@ -52,7 +52,7 @@ def create_model():
# train action #
########################
def train(reader, model, max_epochs, model_dir=None):
def train(reader, model, max_epochs, model_dir=None, tensorboard_logdir=None):
# Input variables denoting the features and label data
query = cntk.blocks.Input(input_dim, is_sparse=False)
slot_labels = cntk.blocks.Input(num_labels, is_sparse=True) # TODO: make sparse once it works
@ -73,16 +73,21 @@ def train(reader, model, max_epochs, model_dir=None):
momentum_time_constant = cntk.learner.momentum_as_time_constant_schedule(minibatch_size / -math.log(0.9))
# LR schedule over epochs (we don't run that many epochs, but if we did, these are good values)
lr_schedule = [0.003]*2+[0.0015]*12+[0.0003]
# trainer object
lr_schedule = [0.003]*2+[0.0015]*12+[0.0003]
lr_per_sample = cntk.learner.learning_rate_schedule(lr_schedule, cntk.learner.UnitType.sample, epoch_size)
learner = cntk.learner.adam_sgd(z.parameters,
lr=lr_per_sample, momentum=momentum_time_constant,
low_memory=True,
gradient_clipping_threshold_per_sample=15, gradient_clipping_with_truncation=True)
trainer = cntk.Trainer(z, (ce, pe), [learner])
# Progress writers
progress_writers = [cntk.ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs)] # more detailed logging
#progress_writers = [cntk.ProgressPrinter(tag='Training', num_epochs=max_epochs)]
if tensorboard_logdir is not None:
progress_writers.append(cntk.TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z))
# trainer object
trainer = cntk.Trainer(z, (ce, pe), [learner], progress_writers)
# define mapping from reader streams to network inputs
input_map = {
@ -92,22 +97,30 @@ def train(reader, model, max_epochs, model_dir=None):
# process minibatches and perform model training
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs) # more detailed logging
#progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
t = 0
aggregate_loss = 0
aggregate_error = 0
total_samples = 0
# loop over epochs
for epoch in range(max_epochs):
epoch_end = (epoch+1) * epoch_size
aggregate_loss = 0
aggregate_error = 0
total_samples = 0
# loop over minibatches on the epoch
while t < epoch_end:
# BUGBUG? The change of minibatch_size parameter vv has no effect.
data = reader.next_minibatch(min(minibatch_size, epoch_end-t), input_map=input_map) # fetch minibatch
trainer.train_minibatch(data) # update model with it
t += trainer.previous_minibatch_sample_count # count samples processed so far
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
samples = trainer.previous_minibatch_sample_count
t += samples
total_samples += samples
aggregate_loss += trainer.previous_minibatch_loss_average * samples
aggregate_error += trainer.previous_minibatch_evaluation_average * samples
#def trace_node(name):
# nl = [n for n in z.parameters if n.name() == name]
@ -117,9 +130,9 @@ def train(reader, model, max_epochs, model_dir=None):
#trace_node('stabilizer_param')
if model_dir:
z.save(os.path.join(model_dir, "atis" + "_{}.dnn".format(epoch)))
loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True)
trainer.summarize_training_progress()
return loss, metric
return aggregate_loss / total_samples, aggregate_error / total_samples
#############################
@ -129,6 +142,8 @@ def train(reader, model, max_epochs, model_dir=None):
if __name__=='__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-e', '--epochs', help='total epochs', required=False, default='8')
parser.add_argument('-tensorboard_logdir', '--tensorboard_logdir',
help='Directory where TensorBoard logs should be created', required=False, default=None)
args = vars(parser.parse_args())
max_epochs = int(args['epochs'])
@ -145,7 +160,7 @@ if __name__=='__main__':
model_path = os.path.join(abs_path, "Models")
# train
train(reader, model, max_epochs, model_path)
train(reader, model, max_epochs, model_path, args['tensorboard_logdir'])
# test (TODO)
reader = create_reader(data_dir + "/atis.test.ctf")
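
In the ATIS training script above, the old return value loss, metric, actual_samples = progress_printer.epoch_summary(...) goes away, since trainer.summarize_training_progress() only prints. The script therefore accumulates the averages itself; a hedged fragment of that inner loop (trainer, reader, input_map, minibatch_size, epoch_end and t are the names from the train() function):

aggregate_loss = 0
aggregate_error = 0
total_samples = 0
while t < epoch_end:
    data = reader.next_minibatch(min(minibatch_size, epoch_end - t), input_map=input_map)
    trainer.train_minibatch(data)
    samples = trainer.previous_minibatch_sample_count
    t += samples
    total_samples += samples
    # sample-weighted running sums of the per-minibatch averages
    aggregate_loss += trainer.previous_minibatch_loss_average * samples
    aggregate_error += trainer.previous_minibatch_evaluation_average * samples

trainer.summarize_training_progress()
avg_loss = aggregate_loss / total_samples     # what train() now returns
avg_error = aggregate_error / total_samples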

View file

@ -1,139 +0,0 @@
# ==============================================================================
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
from __future__ import print_function
import os
import math
from cntk.layers import * # Layers library
from cntk.utils import *
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
from cntk import Trainer
from cntk.learner import adam_sgd, learning_rate_schedule, UnitType, momentum_as_time_constant_schedule
from cntk.ops import cross_entropy_with_softmax, classification_error
########################
# variables and stuff #
########################
cntk_dir = os.path.dirname(os.path.abspath(__file__)) + "/../.." # data resides in the CNTK folder
data_dir = cntk_dir + "/Examples/LanguageUnderstanding/ATIS/Data" # under Examples/LanguageUnderstanding/ATIS
vocab_size = 943 ; num_labels = 129 ; num_intents = 26 # number of words in vocab, slot labels, and intent labels
model_dir = "./Models"
# model dimensions
input_dim = vocab_size
label_dim = num_labels
emb_dim = 150
hidden_dim = 300
########################
# define the reader #
########################
def create_reader(path):
return MinibatchSource(CTFDeserializer(path, StreamDefs(
query = StreamDef(field='S0', shape=input_dim, is_sparse=True),
intent_unused = StreamDef(field='S1', shape=num_intents, is_sparse=True), # BUGBUG: unused, and should infer dim
slot_labels = StreamDef(field='S2', shape=label_dim, is_sparse=True)
)))
########################
# define the model #
########################
def create_model():
with default_options(initial_state=0.1): # inject an option to mimic the BS version identically; remove some day
return Sequential([
Embedding(emb_dim),
Recurrence(LSTM(hidden_dim), go_backwards=False),
Dense(label_dim)
])
########################
# train action #
########################
def train(reader, model, max_epochs):
# Input variables denoting the features and label data
query = Input(input_dim, is_sparse=False)
slot_labels = Input(num_labels, is_sparse=True) # TODO: make sparse once it works
# apply model to input
z = model(query)
# loss and metric
ce = cross_entropy_with_softmax(z, slot_labels)
pe = classification_error (z, slot_labels)
# training config
epoch_size = 36000
minibatch_size = 70
num_mbs_to_show_result = 100
momentum_time_constant = momentum_as_time_constant_schedule(minibatch_size / -math.log(0.9)) # TODO: Change to round number. This is 664.39. 700?
lr_schedule = [0.003]*2+[0.0015]*12+[0.0003] # LR schedule over epochs (we don't run that many epochs, but if we did, these are good values)
# trainer object
lr_per_sample = learning_rate_schedule(lr_schedule, UnitType.sample, epoch_size)
learner = adam_sgd(z.parameters,
lr=lr_per_sample, momentum=momentum_time_constant,
unit_gain=True,
low_memory=True,
gradient_clipping_threshold_per_sample=15, gradient_clipping_with_truncation=True)
# more detailed logging
progress_printer = ProgressPrinter(freq=100, first=10, tag='Training')
#progress_printer = ProgressPrinter(tag='Training')
tensorboard_writer = TensorBoardProgressWriter(freq=100, log_dir='atis_log', model=z)
trainer = Trainer(z, (ce, pe), [learner], [progress_printer, tensorboard_writer])
# define mapping from reader streams to network inputs
input_map = {
query : reader.streams.query,
slot_labels : reader.streams.slot_labels
}
# process minibatches and perform model training
log_number_of_parameters(z) ; print()
t = 0
for epoch in range(max_epochs): # loop over epochs
epoch_end = (epoch+1) * epoch_size
while t < epoch_end: # loop over minibatches on the epoch
# BUGBUG? The change of minibatch_size parameter vv has no effect.
data = reader.next_minibatch(min(minibatch_size, epoch_end-t), input_map=input_map) # fetch minibatch
trainer.train_minibatch(data) # update model with it
t += trainer.previous_minibatch_sample_count # count samples processed so far
#def trace_node(name):
# nl = [n for n in z.parameters if n.name() == name]
# if len(nl) > 0:
# print (name, np.asarray(nl[0].value))
#trace_node('W')
#trace_node('stabilizer_param')
trainer.summarize_training_progress()
tensorboard_writer.close()
#############################
# main function boilerplate #
#############################
if __name__=='__main__':
# TODO: leave these in for now as debugging aids; remove for beta
from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
#set_computation_network_trace_level(1) # TODO: remove debugging facilities once this all works
set_fixed_random_seed(1) # BUGBUG: has no effect at present # TODO: remove debugging facilities once this all works
force_deterministic_algorithms()
reader = create_reader(data_dir + "/atis.train.ctf")
model = create_model()
# train
train(reader, model, max_epochs=8)
# test (TODO)
reader = create_reader(data_dir + "/atis.test.ctf")
#test(reader, model_dir + "/slu.cmf") # TODO: what is the correct pattern here?

View file

@ -1,134 +0,0 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
import numpy as np
import sys
import os
from cntk import Trainer
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from cntk.learner import sgd, learning_rate_schedule, UnitType
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, relu, element_times, constant, \
reduce_max, reduce_mean, reduce_min
from cntk.utils import *
abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(abs_path, "..", ".."))
from Examples.common.nn import fully_connected_classifier_net
def check_path(path):
if not os.path.exists(path):
readme_file = os.path.normpath(os.path.join(
os.path.dirname(path), "..", "README.md"))
raise RuntimeError(
"File '%s' does not exist. Please follow the instructions at %s to download and prepare it." %
(path, readme_file))
def create_reader(path, is_training, input_dim, label_dim):
return MinibatchSource(CTFDeserializer(path, StreamDefs(
features=StreamDef(field='features', shape=input_dim, is_sparse=False),
labels=StreamDef(field='labels', shape=label_dim, is_sparse=False)
)), randomize=is_training, epoch_size=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
# Creates and trains a feedforward classification model for MNIST images
def simple_mnist():
input_dim = 784
num_output_classes = 10
num_hidden_layers = 1
hidden_layers_dim = 200
# Input variables denoting the features and label data
features = input_variable(input_dim, np.float32)
label = input_variable(num_output_classes, np.float32)
# Instantiate the feedforward classification model
scaled_input = element_times(constant(0.00390625), features)
netout = fully_connected_classifier_net(
scaled_input, num_output_classes, hidden_layers_dim, num_hidden_layers, relu)
ce = cross_entropy_with_softmax(netout, label)
pe = classification_error(netout, label)
try:
rel_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
*"Image/MNIST/v0/Train-28x28_cntk_text.txt".split("/"))
except KeyError:
rel_path = os.path.join(*"../Image/DataSets/MNIST/Train-28x28_cntk_text.txt".split("/"))
path = os.path.normpath(os.path.join(abs_path, rel_path))
check_path(path)
reader_train = create_reader(path, True, input_dim, num_output_classes)
input_map = {
features: reader_train.streams.features,
label: reader_train.streams.labels
}
# Instantiate progress writers.
logdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "mnist_log")
tensorboard_writer = TensorBoardProgressWriter(freq=1, log_dir=logdir, model=netout)
progress_printer = ProgressPrinter(freq=10, tag='Training')
# Instantiate the trainer object to drive the model training
lr_per_minibatch = learning_rate_schedule(0.2, UnitType.minibatch)
learner = sgd(netout.parameters, lr=lr_per_minibatch)
trainer = Trainer(netout, (ce, pe), learner, [tensorboard_writer, progress_printer])
# Get minibatches of images to train with and perform model training
minibatch_size = 64
num_samples_per_sweep = 6000
num_sweeps_to_train_with = 2
num_minibatches_to_train = (num_samples_per_sweep * num_sweeps_to_train_with) / minibatch_size
for minibatch_idx in range(0, int(num_minibatches_to_train)):
trainer.train_minibatch(reader_train.next_minibatch(minibatch_size, input_map=input_map))
# Log max/min/mean of each parameter tensor, so that we can confirm that the parameters change indeed.
# Don't want to do that very often though, otherwise will spend too much time computing min/max/mean.
if minibatch_idx % 10 == 9:
for p in netout.parameters:
tensorboard_writer.write_value(p.uid + "/max", reduce_max(p).eval(), minibatch_idx)
tensorboard_writer.write_value(p.uid + "/min", reduce_min(p).eval(), minibatch_idx)
tensorboard_writer.write_value(p.uid + "/mean", reduce_mean(p).eval(), minibatch_idx)
trainer.summarize_training_progress()
# Load test data
try:
rel_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
*"Image/MNIST/v0/Test-28x28_cntk_text.txt".split("/"))
except KeyError:
rel_path = os.path.join(*"../Image/DataSets/MNIST/Test-28x28_cntk_text.txt".split("/"))
path = os.path.normpath(os.path.join(abs_path, rel_path))
check_path(path)
reader_test = create_reader(path, False, input_dim, num_output_classes)
input_map = {
features: reader_test.streams.features,
label: reader_test.streams.labels
}
# Test data for trained model
test_minibatch_size = 1024
num_samples = 10000
num_minibatches_to_test = num_samples / test_minibatch_size
test_result = 0.0
for i in range(0, int(num_minibatches_to_test)):
mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
test_result += trainer.test_minibatch(mb)
# Average of evaluation errors of all test minibatches
trainer.summarize_test_progress()
return test_result / num_minibatches_to_test
if __name__ == '__main__':
# Specify the target device to be used for computing, if you do not want to
# use the best available one, e.g.
# set_default_device(cpu())
error = simple_mnist()
print("Error: %f" % error)

View file

@ -165,7 +165,8 @@ def train_lm(training_file, epochs, max_num_minibatches):
learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant,
gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
gradient_clipping_with_truncation=gradient_clipping_with_truncation)
trainer = Trainer(z, (ce, errs), learner)
progress_printer = ProgressPrinter(freq=100, tag='Training')
trainer = Trainer(z, (ce, errs), learner, progress_printer)
sample_freq = 1000
minibatches_per_epoch = min(data_size // minibatch_size, max_num_minibatches // epochs)
@ -174,8 +175,6 @@ def train_lm(training_file, epochs, max_num_minibatches):
log_number_of_parameters(z)
print ("Running %d epochs with %d minibatches per epoch" % (epochs, minibatches_per_epoch))
print()
progress_printer = ProgressPrinter(freq=100, tag='Training')
for e in range(0, epochs):
# Specify the mapping of input variables in the model to actual minibatch data to be trained with
@ -188,7 +187,6 @@ def train_lm(training_file, epochs, max_num_minibatches):
mask = [False]
trainer.train_minibatch(arguments)
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
global_minibatch = e*minibatches_per_epoch + b
if global_minibatch % sample_freq == 0:
print(sample(z, ix_to_char, vocab_dim, char_to_ix))

View file

@ -15,7 +15,7 @@ from cntk.ops.functions import load_model
from cntk.blocks import LSTM, Stabilizer
from cntk.layers import Recurrence, Dense
from cntk.models import For, Sequential
from cntk.utils import log_number_of_parameters, ProgressPrinter
from cntk.utils import log_number_of_parameters
from data_reader import DataReader
from math import log, exp
from cntk.device import set_default_device, cpu, gpu

View file

@ -197,11 +197,11 @@ def conv3d_ucf11(train_reader, test_reader, max_epochs=30):
mm_schedule = momentum_as_time_constant_schedule([momentum_time_constant], epoch_size=epoch_size)
# Instantiate the trainer object to drive the model training
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule, True)
trainer = Trainer(z, (ce, pe), learner)
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule, True)
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
trainer = Trainer(z, (ce, pe), learner, progress_printer)
log_number_of_parameters(z) ; print()
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
# Get minibatches of images to train with and perform model training
for epoch in range(max_epochs): # loop over epochs
@ -211,8 +211,7 @@ def conv3d_ucf11(train_reader, test_reader, max_epochs=30):
videos, labels, current_minibatch = train_reader.next_minibatch(minibatch_size)
trainer.train_minibatch({input_var : videos, label_var : labels})
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
progress_printer.epoch_summary(with_metric=True)
trainer.summarize_training_progress()
# Test data for trained model
epoch_size = 332

View file

@ -8,6 +8,7 @@ import numpy as np
import os
import sys
import signal
import shutil
import subprocess
import re
import pytest
@ -53,15 +54,29 @@ def mpiexec_test(device_id, script, params, expected_test_error, match_exactly=T
assert np.allclose(float(results[0])/100, expected_test_error, atol=error_tolerance)
def test_cifar_convnet_distributed(device_id):
# Create a path to TensorBoard log directory and make sure it does not exist.
abs_path = os.path.dirname(os.path.abspath(__file__))
tb_logdir = os.path.join(abs_path, 'ConvNet_CIFAR10_DataAug_Distributed_test_log')
if os.path.exists(tb_logdir):
shutil.rmtree(tb_logdir)
params = [ "-n", "2",
"-m", "64",
"-e", "3200",
"-datadir", prepare_CIFAR10_data(),
"-tensorboard_logdir", tb_logdir,
"-q", "32",
"-r",
"-device", "0" ]
mpiexec_test(device_id, script_under_test, params, 0.75, True)
# Ensure that the TensorBoard log directory was created and contains exactly one file with the expected name.
tb_files = 0
for tb_file in os.listdir(tb_logdir):
assert tb_file.startswith("events.out.tfevents")
tb_files += 1
assert tb_files == 1
def test_cifar_convnet_distributed_1bitsgd(device_id):
params = [ "-n", "2",
"-m", "64",

View file

@ -6,6 +6,7 @@
import numpy as np
import os
import shutil
import sys
from cntk.ops.tests.ops_test_utils import cntk_device
from cntk.cntk_py import DeviceKind_GPU
@ -38,7 +39,14 @@ def test_cifar_resnet_error(device_id):
reader_train = create_reader(os.path.join(base_path, 'train_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), True)
reader_test = create_reader(os.path.join(base_path, 'test_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), False)
test_error = train_and_evaluate(reader_train, reader_test, 'resnet20', epoch_size=512, max_epochs=1)
# Create a path to TensorBoard log directory and make sure it does not exist.
abs_path = os.path.dirname(os.path.abspath(__file__))
tb_logdir = os.path.join(abs_path, 'TrainResNet_CIFAR10_test_log')
if os.path.exists(tb_logdir):
shutil.rmtree(tb_logdir)
test_error = train_and_evaluate(reader_train, reader_test, 'resnet20', epoch_size=512, max_epochs=1,
tensorboard_logdir=tb_logdir)
# We are removing tolerance in error because running small epoch size has huge variance in accuracy. Will add
# tolerance back once convolution operator is determinsitic.
@ -47,3 +55,9 @@ def test_cifar_resnet_error(device_id):
# assert np.allclose(test_error, expected_test_error,
# atol=TOLERANCE_ABSOLUTE)
files = 0
for file in os.listdir(tb_logdir):
assert file.startswith("events.out.tfevents")
files += 1
assert files == 1
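
The updated tests validate TensorBoard output the same way in each case; as an isolated, hedged sketch (the directory name is illustrative): the log directory is wiped before the run, and afterwards it must contain exactly one events.out.tfevents file.

import os
import shutil

tb_logdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'TrainResNet_CIFAR10_test_log')
if os.path.exists(tb_logdir):
    shutil.rmtree(tb_logdir)          # start from a clean log directory

# ... run training with tensorboard_logdir=tb_logdir ...

tb_files = 0
for tb_file in os.listdir(tb_logdir):
    assert tb_file.startswith("events.out.tfevents")
    tb_files += 1
assert tb_files == 1                  # exactly one event file was written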

View file

@ -1,14 +1,14 @@
import cntk as C
import numpy as np
from cntk.io import MinibatchSource, HTKFeatureDeserializer, HTKMLFDeserializer, StreamDef, StreamDefs
from cntk.blocks import LSTM, Placeholder, Input
from cntk.layers import Recurrence, Dense, BatchNormalization
from cntk.blocks import LSTM
from cntk.layers import Recurrence, Dense
from cntk.models import Sequential, For
import os, sys
import os
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Speech", "AN4", "Data")
def test_htk_deserializers():
mbsize = 640
epoch_size = 1000 * mbsize
@ -46,16 +46,15 @@ def test_htk_deserializers():
momentum=C.momentum_as_time_constant_schedule(1000),
low_memory=True,
gradient_clipping_threshold_per_sample=15, gradient_clipping_with_truncation=True)
trainer = C.Trainer(z, (ce, errs), learner)
progress_printer = C.ProgressPrinter(freq=0)
trainer = C.Trainer(z, (ce, errs), learner, progress_printer)
input_map={ features: reader.streams.amazing_features, labels: reader.streams.awesome_labels }
pp = C.ProgressPrinter(freq=0)
# just run and verify it doesn't crash
for i in range(3):
mb_data = reader.next_minibatch(mbsize, input_map=input_map)
trainer.train_minibatch(mb_data)
pp.update_with_trainer(trainer, with_metric=True)
assert True
os.chdir(abs_path)

View file

@ -7,6 +7,7 @@
from __future__ import print_function
import os, sys
import numpy as np
import shutil
from cntk import DeviceDescriptor
TOLERANCE_ABSOLUTE = 1E-1 # TODO: Once set_fixed_random_seed(1) is honored, this must be tightened a lot.
@ -244,12 +245,26 @@ def test_language_understanding(device_id):
# test of a config like in the example but with additions to test many code paths
if device_id >= 0: # BatchNormalization currently does not run on CPU
# Create a path to TensorBoard log directory and make sure it does not exist.
abs_path = os.path.dirname(os.path.abspath(__file__))
tb_logdir = os.path.join(abs_path, 'language_understanding_test_log')
if os.path.exists(tb_logdir):
shutil.rmtree(tb_logdir)
reader = create_reader(data_dir + "/atis.train.ctf", is_training=True)
model = create_test_model()
loss_avg, evaluation_avg = train(reader, model, max_epochs=1)
loss_avg, evaluation_avg = train(reader, model, max_epochs=1, tensorboard_logdir=tb_logdir)
log_number_of_parameters(model, trace_level=1) ; print()
expected_avg = [0.084, 0.407364]
assert np.allclose([evaluation_avg, loss_avg], expected_avg, atol=TOLERANCE_ABSOLUTE)
# Ensure that the TensorBoard log directory was created and contains exactly one file with the expected name.
tb_files = 0
for tb_file in os.listdir(tb_logdir):
assert tb_file.startswith("events.out.tfevents")
tb_files += 1
assert tb_files == 1
# example also saves and loads; we skip it here, so that we get a test case of no save/load
# (we save/load in all cases above)

View file

@ -6,6 +6,7 @@
import os, sys
import numpy as np
import shutil
from cntk.device import set_default_device
abs_path = os.path.dirname(os.path.abspath(__file__))
@ -15,11 +16,24 @@ from SimpleMNIST import simple_mnist
TOLERANCE_ABSOLUTE = 1E-1
def test_simple_mnist_error(device_id):
# Create a path to TensorBoard log directory and make sure it does not exist.
abs_path = os.path.dirname(os.path.abspath(__file__))
tb_logdir = os.path.join(abs_path, 'simple_mnist_test_log')
if os.path.exists(tb_logdir):
shutil.rmtree(tb_logdir)
from cntk.ops.tests.ops_test_utils import cntk_device
set_default_device(cntk_device(device_id))
test_error = simple_mnist()
test_error = simple_mnist(tb_logdir)
expected_test_error = 0.09
assert np.allclose(test_error, expected_test_error,
atol=TOLERANCE_ABSOLUTE)
# Ensure that the TensorBoard log directory was created and contains exactly one file with the expected name.
tb_files = 0
for tb_file in os.listdir(tb_logdir):
assert tb_file.startswith("events.out.tfevents")
tb_files += 1
assert tb_files == 1

View file

@ -24,7 +24,7 @@ def test_cntk_202_language_understanding_trainerror(nb):
for cell in nb.cells:
try:
if cell.cell_type == 'code':
m = re.search('\[Evaluation\].* metric = (?P<metric>\d+\.\d+)%', cell.outputs[0]['text'])
m = re.search('Finished Evaluation.* metric = (?P<metric>\d+\.\d+)%', cell.outputs[0]['text'])
if m:
metrics.append(float(m.group('metric')))
except IndexError:

View file

@ -350,7 +350,8 @@
" lr=lr_schedule, momentum=momentum_as_time_constant) \n",
" \n",
" # Instantiate the trainer\n",
" trainer = Trainer(model, (loss, label_error), learner)\n",
" progress_printer = ProgressPrinter(0)\n",
" trainer = Trainer(model, (loss, label_error), learner, progress_printer)\n",
" \n",
" # Map the data streams to the input and labels.\n",
" # Note: for autoencoders input == label\n",
@ -359,17 +360,18 @@
" label : reader_train.streams.features\n",
" } \n",
" \n",
" pp = ProgressPrinter(0)\n",
" aggregate_metric = 0\n",
" for i in range(num_minibatches_to_train):\n",
" # Read a mini batch from the training data file\n",
" data = reader_train.next_minibatch(minibatch_size, input_map = input_map)\n",
" \n",
" # Run the trainer on and perform model training\n",
" trainer.train_minibatch(data) \n",
" pp.update_with_trainer(trainer, with_metric=True)\n",
" trainer.train_minibatch(data)\n",
" samples = trainer.previous_minibatch_sample_count\n",
" aggregate_metric += trainer.previous_minibatch_evaluation_average * samples\n",
" \n",
" train_error = pp.avg_metric_since_start()*100\n",
" print(\"Average training error: {0:0.2f}%\".format(pp.avg_metric_since_start()*100))\n",
" train_error = (aggregate_metric*100.0) / (trainer.total_number_of_samples_seen)\n",
" print(\"Average training error: {0:0.2f}%\".format(train_error))\n",
" \n",
" #############################################################################\n",
" # Testing the model\n",

View file

@ -472,10 +472,11 @@
" l2_reg_weight = 0.001\n",
" \n",
" # trainer object\n",
" learner = momentum_sgd(z.parameters, \n",
" lr = lr_per_minibatch, momentum = momentum_time_constant, \n",
" l2_regularization_weight=l2_reg_weight)\n",
" trainer = Trainer(z, (ce, pe), [learner])\n",
" learner = momentum_sgd(z.parameters, \n",
" lr = lr_per_minibatch, momentum = momentum_time_constant, \n",
" l2_regularization_weight=l2_reg_weight)\n",
" progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)\n",
" trainer = Trainer(z, (ce, pe), [learner], [progress_printer])\n",
"\n",
" # define mapping from reader streams to network inputs\n",
" input_map = {\n",
@ -484,7 +485,6 @@
" }\n",
"\n",
" log_number_of_parameters(z) ; print()\n",
" progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)\n",
"\n",
" # perform model training\n",
" batch_index = 0\n",
@ -502,9 +502,8 @@
" plot_data['loss'].append(trainer.previous_minibatch_loss_average)\n",
" plot_data['error'].append(trainer.previous_minibatch_evaluation_average)\n",
" \n",
" progress_printer.update_with_trainer(trainer, with_metric=True) # log progress\n",
" batch_index += 1\n",
" progress_printer.epoch_summary(with_metric=True)\n",
" trainer.summarize_training_progress()\n",
" \n",
" #\n",
" # Evaluation action\n",

View file

@ -433,12 +433,12 @@
" gradient_clipping_threshold_per_sample=15, gradient_clipping_with_truncation=True)\n",
"\n",
" # trainer\n",
" trainer = Trainer(model, criterion, learner)\n",
" progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)\n",
" #progress_printer = ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs) # more detailed logging\n",
" trainer = Trainer(model, criterion, learner, progress_printer)\n",
"\n",
" # process minibatches and perform model training\n",
" log_number_of_parameters(model)\n",
" progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)\n",
" #progress_printer = ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs) # more detailed logging\n",
"\n",
" t = 0\n",
" for epoch in range(max_epochs): # loop over epochs\n",
@ -450,10 +450,7 @@
" })\n",
" trainer.train_minibatch(data) # update model with it\n",
" t += data[criterion.arguments[1]].num_samples # samples so far\n",
" progress_printer.update_with_trainer(trainer, with_metric=True) # log progress\n",
" loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True)\n",
"\n",
" return loss, metric"
" trainer.summarize_training_progress()\n"
]
},
{
@ -559,8 +556,8 @@
" momentum_as_time_constant = momentum_as_time_constant_schedule(0)\n",
" dummy_learner = adam_sgd(criterion.parameters, \n",
" lr=lr_schedule, momentum=momentum_as_time_constant, low_memory=True)\n",
" evaluator = Trainer(model, criterion, dummy_learner)\n",
" progress_printer = ProgressPrinter(tag='Evaluation', num_epochs=0)\n",
" evaluator = Trainer(model, criterion, dummy_learner, progress_printer)\n",
"\n",
" while True:\n",
" minibatch_size = 500\n",
@ -570,11 +567,8 @@
" })\n",
" if not data: # until we hit the end\n",
" break\n",
" metric = evaluator.test_minibatch(data)\n",
" progress_printer.update(0, data[criterion.arguments[1]].num_samples, metric) # log progress\n",
" loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True)\n",
"\n",
" return loss, metric"
" evaluator.test_minibatch(data)\n",
" evaluator.summarize_test_progress()\n"
]
},
{

View file

@ -387,7 +387,8 @@
},
"outputs": [],
"source": [
"def build_graph(noise_shape, image_shape):\n",
"def build_graph(noise_shape, image_shape,\n",
" G_progress_printer, D_progress_printer):\n",
" input_dynamic_axes = [C.Axis.default_batch_axis()]\n",
" Z = Input(noise_shape, dynamic_axes=input_dynamic_axes)\n",
" X_real = Input(image_shape, dynamic_axes=input_dynamic_axes)\n",
@ -420,12 +421,14 @@
" G_trainer = Trainer(\n",
" X_fake,\n",
" (G_loss, None),\n",
" G_learner\n",
" G_learner,\n",
" G_progress_printer\n",
" )\n",
" D_trainer = Trainer(\n",
" D_real,\n",
" (D_loss, None),\n",
" D_learner\n",
" D_learner,\n",
" D_progress_printer\n",
" )\n",
"\n",
" return X_real, X_fake, Z, G_trainer, D_trainer"
@ -447,16 +450,15 @@
"outputs": [],
"source": [
"def train(reader_train):\n",
" X_real, X_fake, Z, G_trainer, D_trainer = \\\n",
" build_graph(g_input_dim, d_input_dim)\n",
" k = 2\n",
" \n",
" # print out loss for each model for upto 50 times\n",
" print_frequency_mbsize = num_minibatches // 50\n",
"\n",
" pp_G = ProgressPrinter(print_frequency_mbsize)\n",
" pp_D = ProgressPrinter(print_frequency_mbsize)\n",
" \n",
" k = 2\n",
" pp_D = ProgressPrinter(print_frequency_mbsize * k)\n",
"\n",
" X_real, X_fake, Z, G_trainer, D_trainer = \\\n",
" build_graph(g_input_dim, d_input_dim, pp_G, pp_D)\n",
" \n",
" input_map = {X_real: reader_train.streams.features}\n",
" for train_step in range(num_minibatches):\n",
@ -473,10 +475,7 @@
" Z_data = noise_sample(minibatch_size)\n",
" batch_inputs = {Z: Z_data}\n",
" G_trainer.train_minibatch(batch_inputs)\n",
" \n",
" pp_G.update_with_trainer(G_trainer)\n",
" pp_D.update_with_trainer(D_trainer)\n",
" \n",
"\n",
" G_trainer_loss = C.utils.get_train_loss(G_trainer)\n",
"\n",
" return Z, X_fake, G_trainer_loss"

View file

@ -325,16 +325,16 @@
" learning_rate_schedule = C.learning_rate_schedule(Param.learning_rate, C.UnitType.sample)\n",
" momentum_schedule = C.momentum_as_time_constant_schedule(Param.momentum_time_constant)\n",
" learner = C.momentum_sgd(z.parameters, learning_rate_schedule, momentum_schedule, True)\n",
" trainer = C.Trainer(z, (cross_entropy, errs), learner)\n",
" progress_writers = None\n",
" if do_print_progress:\n",
" progress_writers = [ProgressPrinter(freq=Param.reporting_interval, tag='Training')]\n",
" trainer = C.Trainer(z, (cross_entropy, errs), learner, progress_writers)\n",
"\n",
" minbatch = 0\n",
" average_cross_entropy = compute_average_cross_entropy(z)\n",
" minbatch_data = [0] # store minibatch values\n",
" cross_entropy_data = [average_cross_entropy] # store cross_entropy values\n",
"\n",
" if do_print_progress:\n",
" progress_printer = ProgressPrinter(freq=Param.reporting_interval, tag='Training') \n",
"\n",
" # Run training\n",
" t_total= 0\n",
"\n",
@ -344,6 +344,8 @@
" label_data, indices = get_random_one_hot_data(Param.minibatch_size)\n",
" arguments = ({labels : label_data})\n",
"\n",
" # If do_print_progress is True, this will automatically print the progress using ProgressPrinter\n",
" # The printed loss numbers are computed using the sampled softmax criterion\n",
" t_start = timeit.default_timer()\n",
" trainer.train_minibatch(arguments)\n",
" t_end = timeit.default_timer()\n",
@ -355,11 +357,6 @@
" if minbatch > 2:\n",
" t_total += t_delta\n",
"\n",
" # Print the progress using progress printer\n",
" # The prints numbers computed the the sampled softmax criterion\n",
" if do_print_progress:\n",
" progress_printer.update_with_trainer(trainer, with_metric=True)\n",
" \n",
" # For comparison also print result using the full criterion\n",
" if minbatch % Param.reporting_interval == int(Param.reporting_interval/2):\n",
" # memorize the progress data for plotting\n",

View file

@ -54,20 +54,21 @@ def ffnet():
lr_per_minibatch=learning_rate_schedule(0.5, UnitType.minibatch)
# Instantiate the trainer object to drive the model training
trainer = Trainer(netout, (ce, pe), sgd(netout.parameters, lr=lr_per_minibatch))
learner = sgd(netout.parameters, lr=lr_per_minibatch)
progress_printer = ProgressPrinter(128)
trainer = Trainer(netout, (ce, pe), learner, progress_printer)
# Get minibatches of training data and perform model training
minibatch_size = 25
pp = ProgressPrinter(128)
for i in range(1024):
features, labels = generate_random_data(
minibatch_size, input_dim, num_output_classes)
# Specify the mapping of input variables in the model to actual
# minibatch data to be trained with
trainer.train_minibatch({input: features, label: labels})
pp.update_with_trainer(trainer)
pp.epoch_summary()
trainer.summarize_training_progress()
test_features, test_labels = generate_random_data(
minibatch_size, input_dim, num_output_classes)
avg_error = trainer.test_minibatch(

View file

@ -50,20 +50,17 @@ def ffnet(optimizer):
# Instantiate the trainer object to drive the model training
lr_per_minibatch = learning_rate_schedule(0.125, UnitType.minibatch)
trainer = C.Trainer(z, (ce, pe), [optimizer(z.parameters, lr_per_minibatch)])
progress_printer = ProgressPrinter(0)
trainer = C.Trainer(z, (ce, pe), [optimizer(z.parameters, lr_per_minibatch)], progress_printer)
# Get minibatches of training data and perform model training
minibatch_size = 25
num_minibatches_to_train = 63
pp = ProgressPrinter(0)
for i in range(num_minibatches_to_train):
train_features, labels = generate_random_data(minibatch_size, inputs, outputs)
# Specify the mapping of input variables in the model to actual minibatch data to be trained with
trainer.train_minibatch({features : train_features, label : labels})
pp.update_with_trainer(trainer)
last_avg_error = pp.avg_loss_since_start()
test_features, test_labels = generate_random_data(minibatch_size, inputs, outputs)
avg_error = trainer.test_minibatch({features : test_features, label : test_labels})

View file

@ -4,12 +4,12 @@
# for full license information.
# ==============================================================================
from __future__ import print_function
import os
import sys
import time
from cntk import cntk_py
def _warn_deprecated(message):
from warnings import warn
warn('DEPRECATED: ' + message, DeprecationWarning, stacklevel=2)
@ -26,32 +26,33 @@ def _avg(numerator, denominator):
# TODO: Let's switch to import logging in the future instead of print. [ebarsoum]
class ProgressPrinter(cntk_py.ProgressWriter):
'''
Allows printing various training time statistics (e.g. loss and metric) and printing them as training progresses.
Allows printing various statistics (e.g. loss and metric) as training/evaluation progresses.
Args:
freq (`int` or `None`, default `None`): determines how often printing of training progress will occur.
A value of 0 means a geometric schedule (1,2,4,...).
A value > 0 means an arithmetic schedule (print for minibatch number: ``freq``,
print for minibatch number: 2*``freq``, print for minibatch number: 3*``freq``,...).
A value of None means no per-minibatch log.
first (`int`, default 0): Only start printing after the training minibatch number is greater than or equal to
``first``.
tag (`string`, default EmptyString): prepend minibatch log lines with your own string
log_to_file (`string` or `None`, default `None`): if None, output log data to stdout.
If a string is passed, the string is the path to a file for log data.
rank (`int` or `None`, default `None`): set this to distributed.rank if you are using distributed
parallelism -- each rank's log will go to a separate file.
gen_heartbeat (`bool`, default `False`): If True, output a progress message every 10 seconds or so to stdout.
num_epochs (`int`, default 300): The total number of epochs to be trained. Used for some metadata.
This parameter is optional.
test_freq (`int` or `None`, default `None`): similar to ``freq``, but applies to printing intermediate
test results.
test_first (`int`, default 0): similar to ``first``, but applies to printing intermediate test results.
'''
def __init__(self, freq=None, first=0, tag='', log_to_file=None, rank=None, gen_heartbeat=False, num_epochs=300,
test_freq=None, test_first=0):
'''
Constructor.
Args:
freq (`int` or `None`, default `None`): determines how often
printing will occur. The value of 0 means an geometric
schedule (1,2,4,...). A value > 0 means a arithmetic schedule
(a log print for minibatch number: ``freq``, a log print for minibatch number: 2*``freq``,
a log print for minibatch number: 3*``freq``,...), and a value of None means no per-minibatch log.
first (`int`, default 0): Only start logging after the minibatch number is greater or equal to ``first``.
tag (`string`, default EmptyString): prepend minibatch log lines with your own string
log_to_file (`string` or `None`, default `None`): if None, output log data to stdout.
If a string is passed, the string is path to a file for log data.
rank (`int` or `None`, default `None`): set this to distributed.rank if you are using distributed
parallelism -- each rank's log will go to separate file.
gen_heartbeat (`bool`, default `False`): If True output a progress message every 10 seconds or so to stdout.
num_epochs (`int`, default 300): The total number of epochs to be trained. Used for some metadata.
This parameter is optional.
test_freq (`int` or `None`, default `None`): similar to ``freq``, but applies to printing intermediate
test results.
test_first (`int`, default 0): similar to ``first``, but applies to printing intermediate test results.
'''
if freq is None:
freq = sys.maxsize
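As an illustration of the arguments documented above, a ProgressPrinter that logs every 100 minibatches to a file might be constructed as follows (a sketch only; the values are arbitrary and not taken from this commit):

from cntk.utils import ProgressPrinter

# Sketch only: the values below are arbitrary examples of the documented arguments.
progress_printer = ProgressPrinter(freq=100,                 # arithmetic schedule: log at minibatch 100, 200, 300, ...
                                   first=10,                  # suppress per-minibatch logging before minibatch 10
                                   tag='Training',            # prefix for every log line
                                   log_to_file='train.log',   # write to this file instead of stdout
                                   num_epochs=30)             # optional metadata about the planned run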
@@ -289,7 +290,7 @@ class ProgressPrinter(cntk_py.ProgressWriter):
def update_with_trainer(self, trainer, with_metric=False):
'''
DEPRECATED. Use :func:`cntk.utils.ProgressPrinter.update_training` instead.
DEPRECATED.
Update the current loss, the minibatch size and optionally the metric using the information from the
``trainer``.
@@ -300,7 +301,8 @@ class ProgressPrinter(cntk_py.ProgressWriter):
'''
if self.total_updates == 0:
# Only warn once to avoid flooding with warnings.
_warn_deprecated('Use ProgressPrinter.update_progress() instead.')
_warn_deprecated('Inefficient. '
'Please pass an instance of ProgressPrinter to Trainer upon construction.')
if trainer is not None and trainer.previous_minibatch_sample_count != 0:
self.update(
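The migration the new warning asks for, sketched with placeholder names:

from cntk import Trainer
from cntk.utils import ProgressPrinter

# Deprecated pattern: push updates into the printer by hand after every minibatch.
#   progress_printer.update_with_trainer(trainer, with_metric=True)

# Recommended pattern (sketch; z, ce, pe, learner, and data are placeholders for a
# model Function, criterion pair, learner, and minibatch dictionary):
progress_printer = ProgressPrinter(tag='Training')
trainer = Trainer(z, (ce, pe), learner, progress_printer)
trainer.train_minibatch(data)                # the attached printer is updated automatically
trainer.summarize_training_progress()        # prints the per-epoch summary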
@@ -389,25 +391,25 @@ class ProgressPrinter(cntk_py.ProgressWriter):
class TensorBoardProgressWriter(cntk_py.ProgressWriter):
'''
Allows tracking various training time statistics (e.g. loss and metric) and write them as TensorBoard event files.
Allows writing various statistics (e.g. loss and metric) to TensorBoard event files during training/evaluation.
The generated files can be opened in TensorBoard to visualize the progress.
Args:
freq (`int` or `None`, default `None`): frequency at which training progress is written.
For example, the value of 2 will cause the progress to be logged every second time when
`:func:cntk.util.TensorBoardFileWriter.update_with_trainer` is invoked.
None indicates that progress is logged only when
`:func:cntk.util.TensorBoardFileWriter.summarize_progress` is invoked.
Must be a positive integer otherwise.
log_dir (`string`, default '.'): directory in which to create a TensorBoard event file.
rank (`int` or `None`, default `None`): rank of a worker when using distributed training, or `None` if
training locally. If not `None`, event files will be created only by rank 0.
model (:class:`cntk.ops.Function` or `None`, default `None`): model graph to plot.
'''
def __init__(self, freq=None, log_dir='.', rank=None, model=None):
'''
Constructor.
Args:
freq (`int` or `None`, default `None`): frequency at which progress is logged.
For example, the value of 2 will cause the progress to be logged every second time when
`:func:cntk.util.TensorBoardFileWriter.update_with_trainer` is invoked.
None indicates that progress is logged only when
`:func:cntk.util.TensorBoardFileWriter.summarize_progress` is invoked.
Must be a positive integer otherwise.
log_dir (`string`, default '.'): directory where to create a TensorBoard event file.
rank (`int` or `None`, default `None`): rank of a worker when using distributed training, or `None` if
training locally. If not `None`, event files will be created in log_dir/rank[rank] rather than log_dir.
model (:class:`cntk.ops.Function` or `None`, default `None`): model graph to plot.
'''
if freq is None:
freq = sys.maxsize
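A hedged sketch combining this writer with a ProgressPrinter, mirroring how the examples in this commit attach the printer to the Trainer (the log directory and the model/criterion/learner names are placeholders):

from cntk import Trainer
from cntk.utils import ProgressPrinter, TensorBoardProgressWriter

# Sketch only: 'log' is an arbitrary directory; z, ce, pe, learner are
# placeholders for a model Function, criterion pair, and learner.
tensorboard_writer = TensorBoardProgressWriter(freq=10,        # write every 10th update
                                               log_dir='log',  # event files are created here
                                               model=z)        # also record the model graph
writers = [ProgressPrinter(freq=10, tag='Training'), tensorboard_writer]
trainer = Trainer(z, (ce, pe), learner, writers)               # both writers receive progress updates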

View file

@@ -39,20 +39,22 @@ def ffnet():
# Instantiate the trainer object to drive the model training
lr_per_minibatch = learning_rate_schedule(0.125, UnitType.minibatch)
trainer = C.Trainer(z, (ce, pe), [sgd(z.parameters, lr=lr_per_minibatch)])
progress_printer = ProgressPrinter(0)
trainer = C.Trainer(z, (ce, pe), [sgd(z.parameters, lr=lr_per_minibatch)], [progress_printer])
# Get minibatches of training data and perform model training
minibatch_size = 25
num_minibatches_to_train = 1024
pp = ProgressPrinter(0)
aggregate_loss = 0.0
for i in range(num_minibatches_to_train):
train_features, labels = generate_random_data(minibatch_size, inputs, outputs)
# Specify the mapping of input variables in the model to actual minibatch data to be trained with
trainer.train_minibatch({features : train_features, label : labels})
pp.update_with_trainer(trainer)
sample_count = trainer.previous_minibatch_sample_count
aggregate_loss += trainer.previous_minibatch_loss_average * sample_count
last_avg_error = pp.avg_loss_since_start()
last_avg_error = aggregate_loss / trainer.total_number_of_samples_seen
test_features, test_labels = generate_random_data(minibatch_size, inputs, outputs)
avg_error = trainer.test_minibatch({features : test_features, label : test_labels})

View file

@@ -65,17 +65,17 @@ def train_sequence_classifier(debug_output=False):
lr_per_sample = learning_rate_schedule(0.0005, UnitType.sample)
# Instantiate the trainer object to drive the model training
progress_printer = ProgressPrinter(0)
trainer = Trainer(classifier_output, (ce, pe),
sgd(classifier_output.parameters, lr=lr_per_sample))
sgd(classifier_output.parameters, lr=lr_per_sample),
progress_printer)
# Get minibatches of sequences to train with and perform model training
minibatch_size = 200
pp = ProgressPrinter(0)
for i in range(255):
mb = reader.next_minibatch(minibatch_size, input_map=input_map)
trainer.train_minibatch(mb)
pp.update_with_trainer(trainer, True)
evaluation_average = float(trainer.previous_minibatch_evaluation_average)
loss_average = float(trainer.previous_minibatch_loss_average)