Merge remote-tracking branch 'origin/master' into thilow/SimpleExampleSSM
Commit 98c2fbdcd1
@@ -269,3 +269,4 @@ Tutorials/slots.wl
/packages
/CNTK.VC.db
/CNTK.VC.VC.opendb
.cache
@@ -6,8 +6,6 @@ DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

ndlMacros="$ConfigDir$/Macros.ndl"

precision = "float"
deviceId = "Auto"

@@ -88,7 +86,7 @@ Train = {
    SGD = {
        epochSize = 0
        minibatchSize = 256
        # CNTK weights new gradient by (1-momentum) for unit gain, thus we multiply Caffe's learning rate by (1-momentum)
        # CNTK weights new gradient by (1-momentum) for unit gain, thus we divide Caffe's learning rate by (1-momentum)
        learningRatesPerMB = 0.1*25:0.01*25:0.001*25:0.0001*25:0.00001
        momentumPerMB = 0.9
        maxEpochs = 112
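A side note on the corrected comment (not part of the diff): with unit-gain momentum, CNTK weights each incoming gradient by (1-momentum), so matching Caffe's effective step size means dividing Caffe's learning rate by (1-momentum). A minimal sketch of that conversion, assuming Caffe's published AlexNet base rate of 0.01:

```python
# Hedged sketch: convert a Caffe learning rate to its CNTK unit-gain equivalent.
def caffe_lr_to_cntk_unit_gain(caffe_lr, momentum):
    # CNTK's unit-gain update weights the new gradient by (1 - momentum),
    # so we divide to keep the effective step size unchanged.
    return caffe_lr / (1.0 - momentum)

# momentumPerMB = 0.9 above, so Caffe's 0.01 maps to 0.1 -- the first entry
# of learningRatesPerMB = 0.1*25:0.01*25:...
assert abs(caffe_lr_to_cntk_unit_gain(0.01, 0.9) - 0.1) < 1e-12
```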
@@ -17,7 +17,7 @@ from cntk.ops import *
from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
from cntk.blocks import Placeholder, Block
from cntk.layers import Convolution, Activation, MaxPooling, Dense, Dropout, default_options
from cntk.layers import Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options
from cntk.models import Sequential
from cntk.initializer import normal

@@ -32,6 +32,7 @@ image_height = 227
image_width = 227
num_channels = 3 # RGB
num_classes = 1000
model_name = "AlexNet.model"

# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):

@@ -58,6 +59,7 @@ def create_image_mb_source(map_file, is_training, total_number_of_samples):
        ImageDeserializer(map_file, StreamDefs(
            features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels = StreamDef(field='label', shape=num_classes))),     # and second as 'label'
        randomize = is_training,
        epoch_size=total_number_of_samples,
        multithreaded_deserializer = True)

@@ -95,21 +97,21 @@ def create_alexnet():
    with default_options(activation=None, pad=True, bias=True):
        z = Sequential([
            # we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
            Convolution((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
            Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
            Activation(activation=relu, name='relu1'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm1'),
            MaxPooling((3,3), (2,2), name='pool1'),

            Convolution((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
            Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
            Activation(activation=relu, name='relu2'),
            LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm2'),
            MaxPooling((3,3), (2,2), name='pool2'),

            Convolution((3,3), 384, init=normal(0.01), name='conv3'),
            Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
            Activation(activation=relu, name='relu3'),
            Convolution((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
            Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
            Activation(activation=relu, name='relu4'),
            Convolution((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
            Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
            Activation(activation=relu, name='relu5'),
            MaxPooling((3,3), (2,2), name='pool5'),

@@ -123,8 +125,9 @@ def create_alexnet():
        ])(input)

    # loss and metric
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)
    pe5 = classification_error(z, label_var, topN=5)

    log_number_of_parameters(z) ; print()

@@ -133,6 +136,7 @@ def create_alexnet():
        'label': label_var,
        'ce' : ce,
        'pe' : pe,
        'pe5': pe5,
        'output': z
    }

@@ -145,9 +149,10 @@ def create_trainer(network, epoch_size, num_quantization_bits):
    l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight)
    # Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
    parameter_learner = data_parallel_distributed_learner(
        cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight),
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

@@ -155,7 +160,7 @@ def create_trainer(network, epoch_size, num_quantization_bits):
    return cntk.Trainer(network['output'], network['ce'], network['pe'], parameter_learner)

# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size):
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {

@@ -163,36 +168,27 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(train_source, trainer,
        cntk.minibatch_size_schedule(minibatch_size), progress_printer, input_map, os.path.join(model_path, "AlexNet_"), epoch_size)
    training_session = cntk.training_session(
        training_minibatch_source = train_source,
        trainer = trainer,
        model_inputs_to_mb_source_mapping = input_map,
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        progress_printer = progress_printer,
        # checkpoint_frequency = epoch_size,
        checkpoint_filename = os.path.join(model_path, model_name),
        # save_all_checkpoints = True,
        progress_frequency = epoch_size,
        cv_source = test_source,
        cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        # cv_frequency = epoch_size,
        restore = restore)

    # Train all minibatches
    training_session.train()

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    while True:
        data = test_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data: break
        local_mb_samples=data[network['label']].num_samples
        metric_numer += trainer.test_minibatch(data) * local_mb_samples
        metric_denom += local_mb_samples
        minibatch_index += 1

    fin_msg = "Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom)
    progress_printer.end_progress_print(fin_msg)

    print("")
    print(fin_msg)
    print("")

    return metric_numer/metric_denom


# Train and evaluate the network.
def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
                           log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False):
                           restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=True):
    _cntk_py.set_computation_network_trace_level(0)

    progress_printer = ProgressPrinter(

@@ -207,35 +203,46 @@ def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, mini
    trainer = create_trainer(network, epoch_size, num_quantization_bits)
    train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)


if __name__=='__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('-datadir', help='specify the location of your data');
    parser.add_argument('-logdir', help='specify where the training log will be saved');
    parser.add_argument('-outputdir', help='specify where the output model/checkpoint files shall be saved');
    parser.add_argument('-datadir', '--datadir', help='Data directory where the ImageNet dataset is located', required=False, default=data_path)
    parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
    parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
    parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='112')
    parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='256')
    parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='1281167')
    parser.add_argument('-q', '--quantized_bits', help='Number of quantized bits used for gradient aggregation', type=int, required=False, default='32')
    parser.add_argument('-r', '--restart', help='Indicates whether to restart from scratch (instead of restarting from the checkpoint file by default)', action='store_true')
    parser.add_argument('-device', '--device', type=int, help="Force the script to run on a specified device", required=False, default=None)

    args = vars(parser.parse_args())

    if args['datadir'] != None:
        data_path = args['datadir']

    if args['logdir'] != None:
        log_dir = args['logdir']

    if args['outputdir'] != None:
    if args['outputdir'] is not None:
        model_path = args['outputdir'] + "/models"
    if args['datadir'] is not None:
        data_path = args['datadir']
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        cntk.device.set_default_device(cntk.device.gpu(args['device']))

    train_data=os.path.join(data_path, 'train_map.txt')
    test_data=os.path.join(data_path, 'val_map.txt')

    alexnet_train_and_eval(train_data, test_data,
        num_quantization_bits=32,
        max_epochs=112,
        log_to_file=log_dir,
        num_mbs_per_log=500,
        gen_heartbeat=True)
    Communicator.finalize()
    try:
        alexnet_train_and_eval(train_data, test_data,
                               minibatch_size=args['minibatch_size'],
                               epoch_size=args['epoch_size'],
                               num_quantization_bits=args['quantized_bits'],
                               max_epochs=args['num_epochs'],
                               restore=not args['restart'],
                               log_to_file=args['logdir'],
                               num_mbs_per_log=200,
                               gen_heartbeat=True)
    finally:
        cntk.distributed.Communicator.finalize()
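For reference, the manual evaluation loop deleted above (its role is now covered by the training session's cross-validation source) computed a sample-weighted average of per-minibatch error rates. The same aggregation in isolation, with hypothetical (error, sample-count) pairs:

```python
# trainer.test_minibatch returns the mean error over one minibatch, so the
# removed loop weighted each result by its minibatch's sample count.
batches = [(0.25, 256), (0.30, 256), (0.20, 128)]  # hypothetical (mean error, #samples)
metric_numer = sum(err * n for err, n in batches)
metric_denom = sum(n for _, n in batches)
print("Final Results: errs = {:0.2f}% * {}".format(metric_numer * 100.0 / metric_denom, metric_denom))
```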
@@ -45,6 +45,7 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
        cntk.io.ImageDeserializer(map_file, cntk.io.StreamDefs(
            features = cntk.io.StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels = cntk.io.StreamDef(field='label', shape=num_classes))),     # and second as 'label'
        randomize=train,
        epoch_size=total_number_of_samples,
        multithreaded_deserializer = True)

@@ -105,15 +106,15 @@ def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_
        l2_regularization_weight=l2_reg_weight)

    if block_size != None:
        learner = cntk.distributed.block_momentum_distributed_learner(local_learner, block_size=block_size)
        parameter_learner = cntk.distributed.block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        learner = cntk.distributed.data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)
        parameter_learner = cntk.distributed.data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)

    # Create trainer
    return cntk.Trainer(network['output'], network['ce'], network['pe'], learner)
    return cntk.Trainer(network['output'], network['ce'], network['pe'], parameter_learner)

# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, epoch_size):
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {

@@ -125,20 +126,23 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
        training_minibatch_source = train_source,
        trainer = trainer,
        model_inputs_to_mb_source_mapping = input_map,
        mb_size_schedule = cntk.minibatch_size_schedule(64),
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        progress_printer = progress_printer,
        # checkpoint_frequency = epoch_size,
        checkpoint_filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
        # save_all_checkpoints = False,
        progress_frequency=epoch_size,
        cv_source = test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
        restore=False)
        cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        # cv_frequency = epoch_size,
        restore=restore)

    # Train all minibatches
    training_session.train()

# Train and evaluate the network.
def convnet_cifar10_dataaug(train_data, test_data, mean_data, epoch_size=50000, num_quantization_bits=32,
                            block_size=3200, warm_up=0, max_epochs=2, log_to_file=None,
def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64, epoch_size=50000, num_quantization_bits=32,
                            block_size=3200, warm_up=0, max_epochs=2, restore=False, log_to_file=None,
                            num_mbs_per_log=None, gen_heartbeat=False):
    _cntk_py.set_computation_network_trace_level(0)

@@ -154,7 +158,7 @@ def convnet_cifar10_dataaug(train_data, test_data, mean_data, epoch_size=50000,
    trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up)
    train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, progress_printer, epoch_size)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)


if __name__=='__main__':

@@ -165,20 +169,25 @@ if __name__=='__main__':
    parser.add_argument('-datadir', '--datadir', help='Data directory where the CIFAR dataset is located', required=False, default=data_path)
    parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
    parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
    parser.add_argument('-e', '--epochs', help='Total number of epochs to train', type=int, required=False, default='160')
    parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='160')
    parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='64')
    parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='50000')
    parser.add_argument('-q', '--quantized_bits', help='Number of quantized bits used for gradient aggregation', type=int, required=False, default='32')
    parser.add_argument('-a', '--distributed_after', help='Number of samples to train with before running distributed', type=int, required=False, default='0')
    parser.add_argument('-b', '--block_samples', type=int, help="Number of samples per block for block momentum (BM) distributed learner (if 0 BM learner is not used)", required=False, default=None)
    parser.add_argument('-r', '--restart', help='Indicates whether to restart from scratch (instead of restarting from the checkpoint file by default)', action='store_true')
    parser.add_argument('-device', '--device', type=int, help="Force the script to run on a specified device", required=False, default=None)

    args = vars(parser.parse_args())

    if args['outputdir'] is not None:
        model_path = args['outputdir'] + "/models"
    if args['device'] is not None:
        cntk.device.set_default_device(cntk.device.gpu(args['device']))
    if args['datadir'] is not None:
        data_path = args['datadir']
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        cntk.device.set_default_device(cntk.device.gpu(args['device']))

    mean_data=os.path.join(data_path, 'CIFAR-10_mean.xml')
    train_data=os.path.join(data_path, 'train_map.txt')

@@ -186,14 +195,16 @@ if __name__=='__main__':

    try:
        convnet_cifar10_dataaug(train_data, test_data, mean_data,
                                epoch_size=50000,
                                minibatch_size=args['minibatch_size'],
                                epoch_size=args['epoch_size'],
                                num_quantization_bits=args['quantized_bits'],
                                block_size=args['block_samples'],
                                warm_up=args['distributed_after'],
                                max_epochs=args['epochs'],
                                max_epochs=args['num_epochs'],
                                restore=not args['restart'],
                                log_to_file=args['logdir'],
                                num_mbs_per_log=10,
                                gen_heartbeat=True)
                                num_mbs_per_log=100,
                                gen_heartbeat=False)
    finally:
        cntk.distributed.Communicator.finalize()
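With the new arguments, the learner selection above is driven entirely from the command line: `-b` (block_samples) switches on the block-momentum learner, while leaving it at its default of None falls through to data-parallel SGD with `-q` quantization bits. Hypothetical invocations (worker count assumed): `mpiexec -n 4 python ConvNet_CIFAR10_DataAug_Distributed.py -b 3200` for block momentum, or `mpiexec -n 4 python ConvNet_CIFAR10_DataAug_Distributed.py -q 1` for 1-bit quantized data-parallel SGD.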
@@ -1,4 +1,4 @@
# Node: ResNet-50 with ImageNet -- 101 layers bottleneck ResNet for image classification
# ResNet-101 with ImageNet -- 101 layers bottleneck ResNet for image classification
# Reference: "Deep Residual Learning for Image Recognition" https://arxiv.org/abs/1512.03385

command = TrainNetwork:BNStatistics:Eval

@@ -1,4 +1,4 @@
# ConvNet applied on CIFAR-10 dataset, with data augmentation (translation and flipping).
# ResNet110 applied on CIFAR-10 dataset, with data augmentation (translation and flipping).

command = TrainConvNet:Eval

@@ -1,4 +1,4 @@
# Node: ResNet-50 with ImageNet -- 152 layers bottleneck ResNet for image classification
# ResNet-152 with ImageNet -- 152 layers bottleneck ResNet for image classification
# Reference: "Deep Residual Learning for Image Recognition" https://arxiv.org/abs/1512.03385

command = TrainNetwork:BNStatistics:Eval

@@ -1,4 +1,4 @@
# ConvNet applied on CIFAR-10 dataset, with data augmentation (translation and flipping).
# ResNet20 applied on CIFAR-10 dataset, with data augmentation (translation and flipping).

command = TrainConvNet:Eval

@@ -1,4 +1,4 @@
# Node: ResNet-50 with ImageNet -- 50 layers bottleneck ResNet for image classification
# ResNet-50 with ImageNet -- 50 layers bottleneck ResNet for image classification
# Reference: "Deep Residual Learning for Image Recognition" https://arxiv.org/abs/1512.03385

command = TrainNetwork:BNStatistics:Eval
@@ -0,0 +1,19 @@
# CNTK Examples: Image/Classification/VGG

## BrainScript

### VGG16_ImageNet.cntk

This is the VGG model that contains 16 layers, which was referred to as `ConvNet configuration D` in the [original paper](https://arxiv.org/pdf/1409.1556v6.pdf).

Run the example from the current folder using:

`cntk configFile=VGG16_ImageNet.cntk`

### VGG19_ImageNet.cntk

This is the VGG model that contains 19 layers, which was referred to as `ConvNet configuration E` in the [original paper](https://arxiv.org/pdf/1409.1556v6.pdf).

Run the example from the current folder using:

`cntk configFile=VGG19_ImageNet.cntk`
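One usage note beyond the README (an assumption about the CNTK command line, not stated in this diff): top-level settings in these configs can typically be overridden with extra `name=value` arguments, e.g. `cntk configFile=VGG16_ImageNet.cntk deviceId=0 command=Train` to force a particular GPU and run only the Train action defined in the config below.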
@@ -0,0 +1,167 @@
# VGG16 with ImageNet -- 16 layers ConvNet for image classification
# Reference: "Very Deep Convolutional Networks for Large-Scale Image Recognition" https://arxiv.org/abs/1409.1556

RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

precision = "float"
deviceId = "Auto"

command = Train:Test

parallelTrain = "true"
traceLevel = 1
numMBsToShowResult = 500

modelPath = "$ModelDir$/VGG16"
stderr = "$OutputDir$/VGG16"

ImageH = 224
ImageW = 224
ImageC = 3
NumLabels = 1000

parallelTrain = true
hyperCompressMemory = true

################################
Train = {
    action = "train"

    BrainScriptNetworkBuilder = {
        imageShape = $ImageH$:$ImageW$:$ImageC$
        labelDim = $NumLabels$

        model = Sequential (
            ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {128, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {128, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            DenseLayer {4096, activation=ReLU} : Dropout :
            DenseLayer {4096, activation=ReLU} : Dropout :
            LinearLayer {labelDim}
        )

        # inputs
        features = Input {imageShape}
        featNorm = features - Splice(Constant(104):Constant(117):Constant(124), axis=3)
        labels = Input {labelDim}

        # apply model to features
        z = model (featNorm)

        # loss and error computation
        ce = CrossEntropyWithSoftmax (labels, z)
        errs = ClassificationError (labels, z)
        top5Errs = ClassificationError (labels, z, topN=5) # only used in Eval action

        # declare special nodes
        featureNodes = (features)
        labelNodes = (labels)
        criterionNodes = (ce)
        evaluationNodes = (errs)
        outputNodes = (z)
    }

    SGD = {
        epochSize = 0
        minibatchSize = 128
        # CNTK weights new gradient by (1-momentum) for unit gain, thus we divide Caffe's learning rate by (1-momentum)
        learningRatesPerMB = 0.1*20:0.01*20:0.001*20:0.0001*10:0.00001
        momentumPerMB = 0.9
        maxEpochs = 80
        gradUpdateType = None
        L2RegWeight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe
        dropoutRate = 0.5

        # TODO: try fewer bits?
        ParallelTrain = {
            parallelizationMethod = "DataParallelSGD"
            distributedMBReading = "true"
            parallelizationStartEpoch = 1
            DataParallelSGD = {
                gradientBits = 32
            }
        }

        numMBsToShowResult = 250
    }

    # Reader
    reader = {
        verbosity = 0 ; randomize = true
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
            file = "$DataDir$/train_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop" ; cropType = "randomSide" ; sideRatio = 0.4375:0.875 ; jitterType = "uniRatio" } : # [256, 512] jitter in scale
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = $NumLabels$ }
            }
        })
    }

    cvreader = {
        verbosity = 0 ; randomize = false
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
            file = "$DataDir$/val_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop" ; cropType = "Center" ; sideRatio = 0.5833333 } : # 384 crop to 224
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = $NumLabels$ }
            }
        })
    }
}

################################
Test = {
    action=test
    minibatchSize=128
    evalNodeNames = errs:top5Errs # also test top-5 error rate

    # Reader
    reader = {
        verbosity = 0
        randomize = false

        deserializers = (
        {
            type = "ImageDeserializer" ; module = "ImageReader"
            file="$DataDir$/val_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop"; cropType = "center"; sideRatio = 0.5833333 } : # 384 crop to 224
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = 1000}
            }
        })
    }
}
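The crop ratios in the readers above decode as target-side over source-side fractions; that reading is an inference from the inline comments, not stated explicitly in the config. A quick check of the arithmetic:

```python
# sideRatio = 0.4375:0.875 -> random scale jitter between 512- and 256-pixel sides
print(224 / 512)  # 0.4375     (lower bound of the training jitter range)
print(224 / 256)  # 0.875      (upper bound of the training jitter range)
print(224 / 384)  # 0.5833333  (fixed center-crop ratio: "384 crop to 224")
```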
@@ -0,0 +1,170 @@
# VGG19 with ImageNet -- 19 layers ConvNet for image classification
# Reference: "Very Deep Convolutional Networks for Large-Scale Image Recognition" https://arxiv.org/abs/1409.1556

RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

precision = "float"
deviceId = "Auto"

command = Train:Test

parallelTrain = "true"
traceLevel = 1
numMBsToShowResult = 500

modelPath = "$ModelDir$/VGG19"
stderr = "$OutputDir$/VGG19"

ImageH = 224
ImageW = 224
ImageC = 3
NumLabels = 1000

parallelTrain = true
hyperCompressMemory = true

################################
Train = {
    action = "train"

    BrainScriptNetworkBuilder = {
        imageShape = $ImageH$:$ImageW$:$ImageC$
        labelDim = $NumLabels$

        model = Sequential (
            ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {128, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {128, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {256, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            ConvolutionalLayer {512, (3:3), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            DenseLayer {4096, activation=ReLU} : Dropout :
            DenseLayer {4096, activation=ReLU} : Dropout :
            LinearLayer {labelDim}
        )

        # inputs
        features = Input {imageShape}
        featNorm = features - Splice(Constant(104):Constant(117):Constant(124), axis=3)
        labels = Input {labelDim}

        # apply model to features
        z = model (featNorm)

        # loss and error computation
        ce = CrossEntropyWithSoftmax (labels, z)
        errs = ClassificationError (labels, z)
        top5Errs = ClassificationError (labels, z, topN=5) # only used in Eval action

        # declare special nodes
        featureNodes = (features)
        labelNodes = (labels)
        criterionNodes = (ce)
        evaluationNodes = (errs)
        outputNodes = (z)
    }

    SGD = {
        epochSize = 0
        minibatchSize = 128
        # CNTK weights new gradient by (1-momentum) for unit gain, thus we divide Caffe's learning rate by (1-momentum)
        learningRatesPerMB = 0.1*20:0.01*20:0.001*20:0.0001*10:0.00001
        momentumPerMB = 0.9
        maxEpochs = 80
        gradUpdateType = None
        L2RegWeight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe
        dropoutRate = 0.5

        # TODO: try fewer bits?
        ParallelTrain = {
            parallelizationMethod = "DataParallelSGD"
            distributedMBReading = "true"
            parallelizationStartEpoch = 1
            DataParallelSGD = {
                gradientBits = 32
            }
        }

        numMBsToShowResult = 250
    }

    # Reader
    reader = {
        verbosity = 0 ; randomize = true
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
            file = "$DataDir$/train_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop" ; cropType = "randomSide" ; sideRatio = 0.4375:0.875 ; jitterType = "uniRatio" } : # [256, 512] jitter in scale
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = $NumLabels$ }
            }
        })
    }

    cvreader = {
        verbosity = 0 ; randomize = false
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
            file = "$DataDir$/val_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop" ; cropType = "Center" ; sideRatio = 0.5833333 } : # 384 crop to 224
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = $NumLabels$ }
            }
        })
    }
}

################################
Test = {
    action=test
    minibatchSize=128
    evalNodeNames = errs:top5Errs # also test top-5 error rate

    # Reader
    reader = {
        verbosity = 0
        randomize = false

        deserializers = (
        {
            type = "ImageDeserializer" ; module = "ImageReader"
            file="$DataDir$/val_map.txt"
            input = {
                features = { transforms = (
                    { type = "Crop"; cropType = "center"; sideRatio = 0.5833333 } : # 384 crop to 224
                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
                    { type = "Transpose" }
                )}
                labels = { labelDim = 1000}
            }
        })
    }
}
@@ -1,7 +0,0 @@
m1=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m1)

# Add top-5 error prediction node.
ErrTop5 = ClassificationError(labels, OutputNodes.z, Const(5), tag = "eval")

SaveModel(m1, $NewModel$, format=cntk)

(File diff not shown because of its large size.)
@@ -1,55 +0,0 @@
# Fully-connected layer with ReLU activation.
DnnReLULayer(inDim, outDim, x, wScale, bValue)
[
    W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
    b = Parameter(outDim, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
    y = RectifiedLinear(z)
]

# Fully-connected layer with batch normalization and ReLU activation.
DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
[
    W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
    b = Parameter(outDim, 1, init = fixedValue, value = bValue)
    sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
    m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    t = Times(W, x)
    bn = BatchNormalization(t, sc, b, m, v, spatial = false)
    y = RectifiedLinear(bn)
]

# Fully-connected layer.
DnnLayer(inDim, outDim, x, wScale, bValue)
[
    W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
    b = Parameter(outDim, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
]

# Convolutional layer with ReLU activation.
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = "cudnn")
    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
    z = Plus(c, b);
    y = RectifiedLinear(z);
]

# Convolutional layer with batch normalization and ReLU activation.
ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue)
[
    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    b = Parameter(outMap, 1, init = fixedValue, value = bValue)
    sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
    m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
    bn = BatchNormalization(c, sc, b, m, v, spatial = true, imageLayout = "cudnn")
    y = RectifiedLinear(bn);
]
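These hand-written NDL macros are superseded in this commit by the built-in BrainScript layer library used in the new VGG configs: the `ConvReLULayer` macro corresponds to a `ConvolutionalLayer {..., pad = true} : ReLU` pair, and the explicit `Parameter`/`Times`/`Plus` macros (`DnnLayer`, `DnnReLULayer`) to `LinearLayer` and `DenseLayer {..., activation=ReLU}`.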
@@ -0,0 +1,27 @@
# CNTK Examples: Image/Classification/VGG

## Python

### VGG16_ImageNet_Distributed.py

This is the VGG model that contains 16 layers, which was referred to as `ConvNet configuration D` in the [original paper](https://arxiv.org/pdf/1409.1556v6.pdf).

Run the example from the current folder using:

`python VGG16_ImageNet_Distributed.py`

To run it in a distributed manner, please check [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines#32-python). For example, the command for distributed training on the same machine (with multiple GPUs) on Windows is:

`mpiexec -n <#workers> python VGG16_ImageNet_Distributed.py`

### VGG19_ImageNet_Distributed.py

This is the VGG model that contains 19 layers, which was referred to as `ConvNet configuration E` in the [original paper](https://arxiv.org/pdf/1409.1556v6.pdf).

Run the example from the current folder using:

`python VGG19_ImageNet_Distributed.py`

To run it in a distributed manner, please check [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines#32-python). For example, the command for distributed training on the same machine (with multiple GPUs) on Windows is:

`mpiexec -n <#workers> python VGG19_ImageNet_Distributed.py`
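Beyond the bare invocation above, the scripts' argument parsers (defined below) expose the training knobs as flags; a hypothetical combined run might be `mpiexec -n 4 python VGG16_ImageNet_Distributed.py -q 1 -m 128 -datadir <path-to-ImageNet>` for 1-bit gradient quantization with an explicit minibatch size and data directory.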
@@ -0,0 +1,242 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

from __future__ import print_function
import os
import math
import argparse
import numpy as np
import cntk
import _cntk_py

from cntk.utils import *
from cntk.ops import *
from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
from cntk.blocks import Placeholder, Block
from cntk.layers import Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options
from cntk.models import Sequential, LayerStack
from cntk.initializer import normal

# default paths relative to the current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "ImageNet")
model_path = os.path.join(abs_path, "Models")
log_dir = None

# model dimensions
image_height = 224
image_width = 224
num_channels = 3 # RGB
num_classes = 1000
model_name = "VGG16.model"

cntk.cntk_py.enable_hyper_memory_compress()

# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." %map_file)

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            ImageDeserializer.crop(crop_type='randomside', side_ratio='0.4375:0.875', jitter_type='uniratio') # train uses jitter
        ]
    else:
        transforms += [
            ImageDeserializer.crop(crop_type='center', side_ratio=0.5833333) # test has no jitter
        ]

    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
    ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels = StreamDef(field='label', shape=num_classes))),     # and second as 'label'
        randomize = is_training,
        epoch_size=total_number_of_samples,
        multithreaded_deserializer = True)

# Create the network.
def create_vgg16():

    # Input variables denoting the features and label data
    feature_var = input_variable((num_channels, image_height, image_width))
    label_var = input_variable((num_classes))

    # apply model to input
    # remove mean value
    input = minus(feature_var, constant([[[104]], [[117]], [[124]]]), name='mean_removed_input')

    with default_options(activation=None, pad=True, bias=True):
        z = Sequential([
            # we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
            LayerStack(2, lambda i: [
                Convolution2D((3,3), 64, name='conv1_{}'.format(i)),
                Activation(activation=relu, name='relu1_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool1'),

            LayerStack(2, lambda i: [
                Convolution2D((3,3), 128, name='conv2_{}'.format(i)),
                Activation(activation=relu, name='relu2_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool2'),

            LayerStack(3, lambda i: [
                Convolution2D((3,3), 256, name='conv3_{}'.format(i)),
                Activation(activation=relu, name='relu3_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool3'),

            LayerStack(3, lambda i: [
                Convolution2D((3,3), 512, name='conv4_{}'.format(i)),
                Activation(activation=relu, name='relu4_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool4'),

            LayerStack(3, lambda i: [
                Convolution2D((3,3), 512, name='conv5_{}'.format(i)),
                Activation(activation=relu, name='relu5_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool5'),

            Dense(4096, name='fc6'),
            Activation(activation=relu, name='relu6'),
            Dropout(0.5, name='drop6'),
            Dense(4096, name='fc7'),
            Activation(activation=relu, name='relu7'),
            Dropout(0.5, name='drop7'),
            Dense(num_classes, name='fc8')
        ])(input)

    # loss and metric
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)
    pe5 = classification_error(z, label_var, topN=5)

    log_number_of_parameters(z) ; print()

    return {
        'feature': feature_var,
        'label': label_var,
        'ce' : ce,
        'pe' : pe,
        'pe5': pe5,
        'output': z
    }

# Create trainer
def create_trainer(network, epoch_size, num_quantization_bits):
    # Set learning parameters
    lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
    lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight)
    # Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], network['ce'], network['pe'], parameter_learner)

# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source = train_source,
        trainer = trainer,
        model_inputs_to_mb_source_mapping = input_map,
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        progress_printer = progress_printer,
        # checkpoint_frequency = epoch_size,
        checkpoint_filename = os.path.join(model_path, model_name),
        # save_all_checkpoints = True,
        progress_frequency = epoch_size,
        cv_source = test_source,
        cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        # cv_frequency = epoch_size,
        restore = restore)

    # Train all minibatches
    training_session.train()

# Train and evaluate the network.
def vgg16_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=128, epoch_size = 1281167, max_epochs=80,
                         restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False):
    _cntk_py.set_computation_network_trace_level(0)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_vgg16()
    trainer = create_trainer(network, epoch_size, num_quantization_bits)
    train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)


if __name__=='__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('-datadir', '--datadir', help='Data directory where the ImageNet dataset is located', required=False, default=data_path)
    parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
    parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
    parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='80')
    parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='128')
    parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='1281167')
    parser.add_argument('-q', '--quantized_bits', help='Number of quantized bits used for gradient aggregation', type=int, required=False, default='32')
    parser.add_argument('-r', '--restart', help='Indicates whether to restart from scratch (instead of restarting from the checkpoint file by default)', action='store_true')
    parser.add_argument('-device', '--device', type=int, help="Force the script to run on a specified device", required=False, default=None)

    args = vars(parser.parse_args())

    if args['outputdir'] is not None:
        model_path = args['outputdir'] + "/models"
    if args['datadir'] is not None:
        data_path = args['datadir']
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        cntk.device.set_default_device(cntk.device.gpu(args['device']))

    train_data=os.path.join(data_path, 'train_map.txt')
    test_data=os.path.join(data_path, 'val_map.txt')

    try:
        vgg16_train_and_eval(train_data, test_data,
                             minibatch_size=args['minibatch_size'],
                             epoch_size=args['epoch_size'],
                             num_quantization_bits=args['quantized_bits'],
                             max_epochs=args['num_epochs'],
                             restore=not args['restart'],
                             log_to_file=args['logdir'],
                             num_mbs_per_log=200,
                             gen_heartbeat=True)
    finally:
        cntk.distributed.Communicator.finalize()
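Two routes to Caffe-compatible updates appear in this commit: the BrainScript configs keep unit gain and divide the learning rate by (1-momentum), while these Python scripts set `unit_gain=False` and reuse Caffe's rates unchanged. The per-epoch schedule above expands as a plain list whose last value persists; a minimal sketch of that expansion (no CNTK required, the helper is hypothetical):

```python
lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]

def lr_for_epoch(epoch):
    # Schedules repeat their final value once the list is exhausted.
    return lr_per_mb[min(epoch, len(lr_per_mb) - 1)]

assert lr_for_epoch(0) == 0.01       # epochs 0-19
assert lr_for_epoch(25) == 0.001     # epochs 20-39
assert lr_for_epoch(79) == 0.000001  # tail value for the remaining epochs
```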
@@ -0,0 +1,242 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

from __future__ import print_function
import os
import math
import argparse
import numpy as np
import cntk
import _cntk_py

from cntk.utils import *
from cntk.ops import *
from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
from cntk.blocks import Placeholder, Block
from cntk.layers import Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options
from cntk.models import Sequential, LayerStack
from cntk.initializer import normal

# default paths relative to the current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "ImageNet")
model_path = os.path.join(abs_path, "Models")
log_dir = None

# model dimensions
image_height = 224
image_width = 224
num_channels = 3 # RGB
num_classes = 1000
model_name = "VGG19.model"

cntk.cntk_py.enable_hyper_memory_compress()

# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." %map_file)

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            ImageDeserializer.crop(crop_type='randomside', side_ratio='0.4375:0.875', jitter_type='uniratio') # train uses jitter
        ]
    else:
        transforms += [
            ImageDeserializer.crop(crop_type='center', side_ratio=0.5833333) # test has no jitter
        ]

    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
    ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
            labels = StreamDef(field='label', shape=num_classes))),     # and second as 'label'
        randomize = is_training,
        epoch_size=total_number_of_samples,
        multithreaded_deserializer = True)

# Create the network.
def create_vgg19():

    # Input variables denoting the features and label data
    feature_var = input_variable((num_channels, image_height, image_width))
    label_var = input_variable((num_classes))

    # apply model to input
    # remove mean value
    input = minus(feature_var, constant([[[104]], [[117]], [[124]]]), name='mean_removed_input')

    with default_options(activation=None, pad=True, bias=True):
        z = Sequential([
            # we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
            LayerStack(2, lambda i: [
                Convolution2D((3,3), 64, name='conv1_{}'.format(i)),
                Activation(activation=relu, name='relu1_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool1'),

            LayerStack(2, lambda i: [
                Convolution2D((3,3), 128, name='conv2_{}'.format(i)),
                Activation(activation=relu, name='relu2_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool2'),

            LayerStack(4, lambda i: [
                Convolution2D((3,3), 256, name='conv3_{}'.format(i)),
                Activation(activation=relu, name='relu3_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool3'),

            LayerStack(4, lambda i: [
                Convolution2D((3,3), 512, name='conv4_{}'.format(i)),
                Activation(activation=relu, name='relu4_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool4'),

            LayerStack(4, lambda i: [
                Convolution2D((3,3), 512, name='conv5_{}'.format(i)),
                Activation(activation=relu, name='relu5_{}'.format(i)),
            ]),
            MaxPooling((2,2), (2,2), name='pool5'),

            Dense(4096, name='fc6'),
            Activation(activation=relu, name='relu6'),
            Dropout(0.5, name='drop6'),
            Dense(4096, name='fc7'),
            Activation(activation=relu, name='relu7'),
            Dropout(0.5, name='drop7'),
            Dense(num_classes, name='fc8')
        ])(input)

    # loss and metric
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)
    pe5 = classification_error(z, label_var, topN=5)

    log_number_of_parameters(z) ; print()

    return {
        'feature': feature_var,
        'label': label_var,
        'ce' : ce,
        'pe' : pe,
        'pe5': pe5,
        'output': z
    }

# Create trainer
def create_trainer(network, epoch_size, num_quantization_bits):
    # Set learning parameters
    lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
    lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight)
    # Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], network['ce'], network['pe'], parameter_learner)

# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source = train_source,
        trainer = trainer,
        model_inputs_to_mb_source_mapping = input_map,
        mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        progress_printer = progress_printer,
        # checkpoint_frequency = epoch_size,
        checkpoint_filename = os.path.join(model_path, model_name),
        # save_all_checkpoints = True,
        progress_frequency = epoch_size,
        cv_source = test_source,
        cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
        # cv_frequency = epoch_size,
        restore = restore)

    # Train all minibatches
    training_session.train()

# Train and evaluate the network.
def vgg19_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=128, epoch_size = 1281167, max_epochs=80,
                         restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False):
    _cntk_py.set_computation_network_trace_level(0)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_vgg19()
    trainer = create_trainer(network, epoch_size, num_quantization_bits)
    train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)


if __name__=='__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('-datadir', '--datadir', help='Data directory where the ImageNet dataset is located', required=False, default=data_path)
    parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
    parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
    parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='80')
    parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='128')
    parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='1281167')
    parser.add_argument('-q', '--quantized_bits', help='Number of quantized bits used for gradient aggregation', type=int, required=False, default='32')
    parser.add_argument('-r', '--restart', help='Indicates whether to restart from scratch (instead of restarting from the checkpoint file by default)', action='store_true')
    parser.add_argument('-device', '--device', type=int, help="Force the script to run on a specified device", required=False, default=None)

    args = vars(parser.parse_args())

    if args['outputdir'] is not None:
        model_path = args['outputdir'] + "/models"
    if args['datadir'] is not None:
        data_path = args['datadir']
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        cntk.device.set_default_device(cntk.device.gpu(args['device']))

    train_data=os.path.join(data_path, 'train_map.txt')
    test_data=os.path.join(data_path, 'val_map.txt')

    try:
        vgg19_train_and_eval(train_data, test_data,
                             minibatch_size=args['minibatch_size'],
                             epoch_size=args['epoch_size'],
                             num_quantization_bits=args['quantized_bits'],
                             max_epochs=args['num_epochs'],
                             restore=not args['restart'],
                             log_to_file=args['logdir'],
                             num_mbs_per_log=200,
                             gen_heartbeat=True)
    finally:
        cntk.distributed.Communicator.finalize()
@@ -0,0 +1,25 @@
# CNTK Examples: Image/Classification/VGG

## Overview

|Data:     |The ILSVRC2012 dataset (http://www.image-net.org/challenges/LSVRC/2012/) for image classification.
|:---------|:---
|Purpose   |This folder contains examples that demonstrate how to use CNTK to define the VGG network (https://arxiv.org/abs/1409.1556) for image classification.
|Network   |VGG.
|Training  |Stochastic gradient descent with momentum.
|Comments  |See below.

## Running the example

### Getting the data
We use the ILSVRC2012 dataset to demonstrate how to train the VGG model, which was developed by the [Visual Geometry Group at the University of Oxford](http://www.robots.ox.ac.uk/~vgg/research/very_deep/) and took second place in the ILSVRC-2014 challenge. VGG has remained a very popular model thanks to its simple architecture and high accuracy.

The ILSVRC2012 dataset is not included in the CNTK distribution. You can obtain it through http://image-net.org.

## Details

We provide examples for both Python and BrainScript.

### [Python](./Python)

### [BrainScript](./BrainScript)
@@ -1,76 +0,0 @@
load=ndlMacros
run=DNN

ndlMacros = [
    ImageW = 224
    ImageH = 224
    ImageC = 3
    LabelDim = 1000

    features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
    labels = Input(LabelDim, tag = label)

    # Kernels width and height.
    kW = 3
    kH = 3
    # Kernel stride.
    hs = 1
    vs = 1

    # Pooling settings.
    poolW = 2
    poolH = 2
    poolhs = 2
    poolvs = 2

    # Initial parameter values.
    convWScale = 0.01 #7
    convBValue = 0
    fc1WScale = 0.01 #8
    fc1BValue = 0
    fc2WScale = 0.01 #3.2
    fc2BValue = 0
    fc3WScale = 0.01 #3.2
    fc3BValue = 0
]

DNN=[
    cMap1 = 64
    conv1 = ConvReLULayer(features, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue)

    pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap2 = 128
    conv2 = ConvReLULayer(pool1, cMap2, 576, kW, kH, hs, vs, convWScale, convBValue)

    pool2 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap3 = 256
    conv3 = ConvReLULayer(pool2, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue)
    conv4 = ConvReLULayer(conv3, cMap3, 2304, kW, kH, hs, vs, convWScale, convBValue)

    pool3 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap5 = 512
    conv5 = ConvReLULayer(pool3, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)
    conv6 = ConvReLULayer(conv5, cMap5, 4608, kW, kH, hs, vs, convWScale, convBValue)

    pool4 = MaxPooling(conv6, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap6 = 512
    conv7 = ConvReLULayer(pool4, cMap6, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv8 = ConvReLULayer(conv7, cMap6, 4608, kW, kH, hs, vs, convWScale, convBValue)

    pool5 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    hiddenDim = 4096
    h1 = DnnReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)
    h1_d = Dropout(h1)
    h2 = DnnReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
    h2_d = Dropout(h2)
    ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)

    CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
    Err = ClassificationError(labels, ol, tag = Eval)
    OutputNodes = ol
]
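
A reading aid, not part of the commit: in these NDL model files the third argument to ConvReLULayer appears to be the flattened per-output kernel input count, kW * kH * (input feature maps), and the first fully connected layer consumes the flattened pool5 tensor, which is consistent with every constant used:

$$27 = 3 \cdot 3 \cdot 3, \qquad 576 = 3 \cdot 3 \cdot 64, \qquad 1152 = 3 \cdot 3 \cdot 128, \qquad 2304 = 3 \cdot 3 \cdot 256, \qquad 4608 = 3 \cdot 3 \cdot 512,$$

$$25088 = 512 \cdot 7 \cdot 7, \qquad 7 = 224 / 2^5 \ \text{after five } 2 \times 2 \text{ poolings}.$$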
@@ -1,109 +0,0 @@
# Note: This sample uses the deprecated NdlNetworkBuilder.
# An updated version using BrainScript is coming soon.
# Please find updated samples on Github, https://github.com/Microsoft/CNTK/tree/master/Examples/...
#
RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

ndlMacros="$ConfigDir$/Macros.ndl"

precision="float"
deviceId="Auto"

command=Train:AddTop5Eval:Test

stderr="$OutputDir$/VGG_A"
traceLevel=1
numMBsToShowResult=500

Train=[
    action="train"
    modelPath="$ModelDir$/VGG_A"
    traceLevel=1

    NDLNetworkBuilder=[
        networkDescription="$ConfigDir$/VGG_A.ndl"
    ]

    SGD=[
        epochSize=0
        minibatchSize=32
        learningRatesPerMB=0.01*20:0.003*12:0.001*28:0.0003
        momentumPerMB=0.9
        maxEpochs=70
        gradUpdateType="None"
        L2RegWeight=0.0005
        dropoutRate=0*5:0.5

        numMBsToShowResult=10
    ]

    reader=[
        readerType="ImageReader"
        # Map file which maps images to labels using the following format:
        # <full path to image><tab><numerical label (0-based class id)>
        # Example:
        # C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
        file="$ConfigDir$/train_map.txt"
        # Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
        randomize="Auto"
        features=[
            # Below are the required parameters.
            width=224
            height=224
            channels=3
            # Below are the optional parameters.
            # Possible values: Center, Random. Default: Center
            cropType="RandomSide"
            # Horizontal random flip, will be enabled by default because cropType=RandomSide
            #hflip="true"
            # Crop scale side ratio. Examples: sideRatio=0.9, sideRatio=0.7:0.9.
            sideRatio=0.875
            # Crop scale ratio jitter type.
            # Possible values: None, UniRatio. Default: None
            jitterType="UniRatio"
            # Interpolation to use when scaling image to width x height size.
            # Possible values: nearest, linear, cubic, lanczos. Default: linear.
            interpolations="Linear"
            # Stores mean values for each pixel in OpenCV matrix XML format.
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]

AddTop5Eval=[
    action="edit"
    CurModel="$ModelDir$/VGG_A"
    NewModel="$ModelDir$/VGG_A.Top5"
    editPath="$ConfigDir$/CreateEvalModel.mel"
]

Test=[
    action="test"
    modelPath="$ModelDir$/VGG_A.Top5"
    # Set minibatch size for testing.
    minibatchSize=32

    reader=[
        readerType="ImageReader"
        file="$ConfigDir$/val_map.txt"
        randomize="None"
        features=[
            width=224
            height=224
            channels=3
            cropType="Center"
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]
@@ -1,84 +0,0 @@
load=ndlMacros
run=DNN

ndlMacros = [
    ImageW = 224
    ImageH = 224
    ImageC = 3
    LabelDim = 1000

    features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
    labels = Input(LabelDim, tag = label)

    # Kernels width and height.
    kW = 3
    kH = 3
    # Kernel stride.
    hs = 1
    vs = 1

    # Pooling settings.
    poolW = 2
    poolH = 2
    poolhs = 2
    poolvs = 2

    # Initial parameter values.
    convWScale = 7.07
    convBValue = 0
    fc1WScale = 3.0
    fc1BValue = 1
    fc2WScale = 3.0
    fc2BValue = 1
    fc3WScale = 1.0
    fc3BValue = 1
]

DNN=[
    cMap1 = 64
    conv1 = ConvReLULayer(features, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue)
    conv2 = ConvReLULayer(conv1, cMap1, 576, kW, kH, hs, vs, convWScale, convBValue)

    pool1 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap3 = 128
    conv3 = ConvReLULayer(pool1, cMap3, 576, kW, kH, hs, vs, convWScale, convBValue)
    conv4 = ConvReLULayer(conv3, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue)

    pool2 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap5 = 256
    conv5 = ConvReLULayer(pool2, cMap5, 1152, kW, kH, hs, vs, convWScale, convBValue)
    conv6 = ConvReLULayer(conv5, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)
    conv7 = ConvReLULayer(conv6, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)
    conv8 = ConvReLULayer(conv7, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)

    pool3 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap9 = 512
    conv9 = ConvReLULayer(pool3, cMap9, 2304, kW, kH, hs, vs, convWScale, convBValue)
    conv10 = ConvReLULayer(conv9, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv11 = ConvReLULayer(conv10, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv12 = ConvReLULayer(conv11, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue)

    pool4 = MaxPooling(conv12, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap13 = 512
    conv13 = ConvReLULayer(pool4, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv14 = ConvReLULayer(conv13, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv15 = ConvReLULayer(conv14, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)
    conv16 = ConvReLULayer(conv15, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)

    pool5 = MaxPooling(conv16, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    hiddenDim = 4096
    h1 = DnnReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)
    h1_d = Dropout(h1)
    h2 = DnnReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
    h2_d = Dropout(h2)
    ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)

    CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
    Err = ClassificationError(labels, ol, tag = Eval)
    OutputNodes = ol
]
@@ -1,85 +0,0 @@
load=ndlMacros
run=DNN

ndlMacros = [
    ImageW = 224
    ImageH = 224
    ImageC = 3
    LabelDim = 1000

    features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
    labels = Input(LabelDim, tag = label)

    # Kernels width and height.
    kW = 3
    kH = 3
    # Kernel stride.
    hs = 1
    vs = 1

    # Pooling settings.
    poolW = 2
    poolH = 2
    poolhs = 2
    poolvs = 2

    # Initial parameter values.
    convWScale = 7.07
    convBValue = 0
    scValue = 0.03
    fc1WScale = 3.0
    fc1BValue = 1
    fc2WScale = 3.0
    fc2BValue = 1
    fc3WScale = 1.0
    fc3BValue = 1
]

DNN=[
    cMap1 = 64
    conv1 = ConvBNReLULayer(features, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv2 = ConvBNReLULayer(conv1, cMap1, 576, kW, kH, hs, vs, convWScale, convBValue, scValue)

    pool1 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap3 = 128
    conv3 = ConvBNReLULayer(pool1, cMap3, 576, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv4 = ConvBNReLULayer(conv3, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue, scValue)

    pool2 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap5 = 256
    conv5 = ConvBNReLULayer(pool2, cMap5, 1152, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv6 = ConvBNReLULayer(conv5, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv7 = ConvBNReLULayer(conv6, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv8 = ConvBNReLULayer(conv7, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)

    pool3 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap9 = 512
    conv9 = ConvBNReLULayer(pool3, cMap9, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv10 = ConvBNReLULayer(conv9, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv11 = ConvBNReLULayer(conv10, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv12 = ConvBNReLULayer(conv11, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)

    pool4 = MaxPooling(conv12, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    cMap13 = 512
    conv13 = ConvBNReLULayer(pool4, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv14 = ConvBNReLULayer(conv13, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv15 = ConvBNReLULayer(conv14, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
    conv16 = ConvBNReLULayer(conv15, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)

    pool5 = MaxPooling(conv16, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")

    hiddenDim = 4096
    h1 = DnnBNReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)
    h1_d = Dropout(h1)
    h2 = DnnBNReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
    h2_d = Dropout(h2)
    ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)

    CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
    Err = ClassificationError(labels, ol, tag = Eval)
    OutputNodes = ol
]
@@ -1,118 +0,0 @@
# Note: This sample uses the deprecated NdlNetworkBuilder.
# An updated version using BrainScript is coming soon.
# Please find updated samples on Github, https://github.com/Microsoft/CNTK/tree/master/Examples/...
#
RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

ndlMacros="$ConfigDir$/Macros.ndl"

precision="float"
deviceId="Auto"

command=Train:AddTop5Eval:Test

parallelTrain="false"

stderr="$OutputDir$/VGG_E_BN"
traceLevel=1

Train=[
    action="train"
    modelPath="$ModelDir$/VGG_E_BN"

    NDLNetworkBuilder=[
        networkDescription="$ConfigDir$/VGG_E_BN.ndl"
    ]

    SGD=[
        epochSize=0
        minibatchSize=16
        learningRatesPerMB=0.01*20:0.003*12:0.001*28:0.0003
        momentumPerMB=0.9
        maxEpochs=70
        gradUpdateType="None"
        L2RegWeight=0.0005
        dropoutRate=0*5:0.5

        ParallelTrain=[
            parallelizationMethod="DataParallelSGD"
            distributedMBReading="true"
            parallelizationStartEpoch=1
            DataParallelSGD=[
                gradientBits=32
            ]
        ]

        numMBsToShowResult=10
    ]

    reader=[
        readerType="ImageReader"
        # Map file which maps images to labels using the following format:
        # <full path to image><tab><numerical label (0-based class id)>
        # Example:
        # C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
        file="$DataDir$/train_map.txt"
        # Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
        randomize="Auto"
        features=[
            # Below are the required parameters.
            width=224
            height=224
            channels=3
            # Below are the optional parameters.
            # Possible values: Center, Random. Default: Center
            cropType="RandomSide"
            # Horizontal random flip, will be enabled because cropType=RandomSide
            #hflip="true"
            # Crop scale side ratio. Examples: sideRatio=0.9, sideRatio=0.7:0.9.
            sideRatio=0.875
            # Crop scale ratio jitter type.
            # Possible values: None, UniRatio. Default: None
            jitterType="UniRatio"
            # Interpolation to use when scaling image to width x height size.
            # Possible values: nearest, linear, cubic, lanczos. Default: linear.
            interpolations="Linear"
            # Stores mean values for each pixel in OpenCV matrix XML format.
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]

AddTop5Eval=[
    action="edit"
    CurModel="$ModelDir$/VGG_E_BN"
    NewModel="$ModelDir$/VGG_E_BN.Top5"
    editPath="$ConfigDir$/CreateEvalModel.mel"
]

Test=[
    action="test"
    modelPath="$ModelDir$/VGG_E_BN.Top5"
    # Set minibatch size for testing.
    minibatchSize=16

    reader=[
        readerType="ImageReader"
        file="$DataDir$/val_map.txt"
        randomize="None"
        features=[
            width=224
            height=224
            channels=3
            cropType="Center"
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]
@@ -1,118 +0,0 @@
# Note: This sample uses the deprecated NdlNetworkBuilder.
# An updated version using BrainScript is coming soon.
# Please find updated samples on Github, https://github.com/Microsoft/CNTK/tree/master/Examples/...
#
RootDir = "."

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"

ndlMacros="$ConfigDir$/Macros.ndl"

precision="float"
deviceId="Auto"

command=Train:AddTop5Eval:Test

parallelTrain="false"

stderr="$OutputDir$/VGG_E"
traceLevel=1

Train=[
    action="train"
    modelPath="$ModelDir$/VGG_E"

    NDLNetworkBuilder=[
        networkDescription="$ConfigDir$/VGG_E.ndl"
    ]

    SGD=[
        epochSize=0
        minibatchSize=16
        learningRatesPerMB=0.01*20:0.003*12:0.001*28:0.0003
        momentumPerMB=0.9
        maxEpochs=70
        gradUpdateType="None"
        L2RegWeight=0.0005
        dropoutRate=0*5:0.5

        ParallelTrain=[
            parallelizationMethod="DataParallelSGD"
            distributedMBReading="true"
            parallelizationStartEpoch=1
            DataParallelSGD=[
                gradientBits=32
            ]
        ]

        numMBsToShowResult=10
    ]

    reader=[
        readerType="ImageReader"
        # Map file which maps images to labels using the following format:
        # <full path to image><tab><numerical label (0-based class id)>
        # Example:
        # C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
        file="$DataDir$/train_map.txt"
        # Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
        randomize="Auto"
        features=[
            # Below are the required parameters.
            width=224
            height=224
            channels=3
            # Below are the optional parameters.
            # Possible values: Center, Random. Default: Center
            cropType="RandomSide"
            # Horizontal random flip, will be enabled because cropType=RandomSide
            #hflip="true"
            # Crop scale side ratio. Examples: sideRatio=0.9, sideRatio=0.7:0.9.
            sideRatio=0.875
            # Crop scale ratio jitter type.
            # Possible values: None, UniRatio. Default: None
            jitterType="UniRatio"
            # Interpolation to use when scaling image to width x height size.
            # Possible values: nearest, linear, cubic, lanczos. Default: linear.
            interpolations="Linear"
            # Stores mean values for each pixel in OpenCV matrix XML format.
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]

AddTop5Eval=[
    action="edit"
    CurModel="$ModelDir$/VGG_E"
    NewModel="$ModelDir$/VGG_E.Top5"
    editPath="$ConfigDir$/CreateEvalModel.mel"
]

Test=[
    action="test"
    modelPath="$ModelDir$/VGG_E.Top5"
    # Set minibatch size for testing.
    minibatchSize=16

    reader=[
        readerType="ImageReader"
        file="$DataDir$/val_map.txt"
        randomize="None"
        features=[
            width=224
            height=224
            channels=3
            cropType="Center"
            meanFile="$ConfigDir$/ImageNet1K_mean.xml"
        ]
        labels=[
            labelDim=1000
        ]
    ]
]
@@ -3880,7 +3880,9 @@ namespace CNTK
    static MomentumSchedule DefaultVarianceMomentum = MomentumAsTimeConstantSchedule(2 * 3600 * 100);

    ///
    /// Create an instance of the CNTK built-in Adam learner (only the low-memory variant is supported at the moment).
    /// Create an instance of the Adam learner as described in the original paper.
    /// For historical reasons, the legacy implementation of AdamLearner is FSAdaGrad. To keep the interface backward compatible, we
    /// switch to the original Adam only when lowMemory = false, and keep the legacy logic when it is left at its default, i.e. true.
    ///
    CNTK_API LearnerPtr AdamLearner(const std::vector<Parameter>& parameters,
                                    const LearningRateSchedule& learningRateSchedule,
@@ -543,6 +543,48 @@ namespace CNTK
            s_targetAdagradAvDenom, momentum, varMomentum, UseUnitGainMomentum());
    }

    LearnerAdam::LearnerAdam(const vector<Parameter>& parameters,
                             const LearningRateSchedule& learningRateSchedule,
                             const MomentumSchedule& momentumSchedule,
                             bool unitGain,
                             const MomentumSchedule& varianceMomentumSchedule,
                             AdditionalLearningOptions additionalOptions)
        : LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule,
                             unitGain, additionalOptions, /*allocateSmoothGradients*/ false),
        m_varianceMomentumSchedule(varianceMomentumSchedule)
    {
        for (const auto& parameter : parameters)
        {
            // A single buffer of shape {rows, 2*cols} holds both Adam moments for this parameter.
            const auto shape = GetMatrixShape(parameter);
            NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
            m_smoothedGradientValues.emplace(parameter, view);
            m_smoothedCounts.emplace(parameter, 0.0);
        }
    }

    /*virtual*/ void LearnerAdam::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
                                         const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
    {
        DISPATCH_TO_TYPED_UPDATE_FUNCTION;
    }

    template <typename ElementType>
    void LearnerAdam::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
                             const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
        GET_WRITABLE_MATRICES;

        const auto learningRate = LearningRate(trainingSampleCount);
        const auto momentum = MomentumValueForMB(trainingSampleCount);

        const auto varMomentum = VarianceMomentumValueForMB(trainingSampleCount);

        double& smoothedCount = m_smoothedCounts.at(parameter);

        smoothedGradientMatrix->AdamUpdate(*gradientMatrix, *parameterMatrix, smoothedCount, learningRate,
                                           momentum, varMomentum, UseUnitGainMomentum());
    }

    LearnerRMSProp::LearnerRMSProp(const vector<Parameter>& parameters,
                                   const LearningRateSchedule& learningRateSchedule,
                                   double gamma, double inc, double dec, double max, double min,
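
A minimal sketch, not part of the commit, of the packed-moment layout implied by the { shape[0], 2 * shape[1] } smoothed-gradient view allocated above; it matches the Data() / Data() + n pointer arithmetic used by the CPU and GPU Adam routines later in this diff. The struct and its names are hypothetical illustrations only:

#include <cstddef>
#include <vector>

// Both Adam moments for one parameter share a single buffer of twice the
// parameter's width: [ v_0 .. v_{n-1} | m_0 .. m_{n-1} ].
template <typename ElemType>
struct PackedAdamState
{
    std::vector<ElemType> buffer; // size 2 * n
    std::size_t n;                // number of parameter elements

    ElemType* secondMoment() { return buffer.data(); }     // corresponds to Data()
    ElemType* firstMoment()  { return buffer.data() + n; } // corresponds to Data() + n
};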
@@ -623,16 +665,21 @@ namespace CNTK
    LearnerPtr AdamLearner(const vector<Parameter>& parameters,
                           const LearningRateSchedule& learningRateSchedule,
                           const MomentumSchedule& momentumSchedule,
                           bool unitGain,
                           bool unitGain, /*=true*/
                           const MomentumSchedule& varianceMomentumSchedule, /*= MomentumAsTimeConstantSchedulePerSample(2 * 3600 * 100)*/
                           bool lowMemory, /*= true*/
                           AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
        // TODO: For historical reasons, the legacy AdamLearner uses the FSAdaGrad implementation instead of the original paper's.
        // To keep the interface backward compatible, the new Adam is enabled only when lowMemory is false.
        if (!lowMemory)
        {
            return MakeSharedObject<LearnerAdam>(parameters, learningRateSchedule, momentumSchedule, unitGain, varianceMomentumSchedule, additionalOptions);
        }
        else
        {
            return MakeSharedObject<LearnerFSAdaGrad>(parameters, learningRateSchedule, momentumSchedule, unitGain, varianceMomentumSchedule, additionalOptions);
        }
    }

    LearnerPtr AdaGradLearner(const vector<Parameter>& parameters,
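
A usage sketch, not part of the commit: constructing a learner that takes the new LearnerAdam path through the public factory shown above. The wrapper function MakePaperAdam is hypothetical; every CNTK name used is taken from this diff:

#include "CNTKLibrary.h"

using namespace CNTK;

// Hypothetical helper: build an Adam learner that follows the paper's algorithm.
LearnerPtr MakePaperAdam(const std::vector<Parameter>& parameters,
                         const LearningRateSchedule& learningRateSchedule,
                         const MomentumSchedule& momentumSchedule)
{
    // Variance momentum (beta2) default, mirroring the declaration in CNTKLibrary.h.
    MomentumSchedule varianceMomentum = MomentumAsTimeConstantSchedule(2 * 3600 * 100);

    // lowMemory = false dispatches to LearnerAdam (the paper's algorithm);
    // the default, true, keeps the legacy FSAdaGrad behavior.
    return AdamLearner(parameters, learningRateSchedule, momentumSchedule,
                       /*unitGain=*/true, varianceMomentum,
                       /*lowMemory=*/false, AdditionalLearningOptions());
}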
@@ -234,6 +234,36 @@ namespace CNTK
        MomentumSchedule m_varianceMomentumSchedule;
    };

    class LearnerAdam : public LearnerMomentumSGD
    {
    public:

        LearnerAdam(const std::vector<Parameter>& parameters,
                    const LearningRateSchedule& learningRateSchedule,
                    const MomentumSchedule& momentumSchedule,
                    bool unitGain,
                    const MomentumSchedule& varianceMomentumSchedule,
                    AdditionalLearningOptions additionalOptions);

    protected:

        virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;

        template <typename ElementType>
        void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;

    private:

        // Returns the current per-minibatch variance momentum value.
        double VarianceMomentumValueForMB(size_t minibatchSize) const
        {
            return MomentumValueForMB(m_varianceMomentumSchedule, minibatchSize);
        }

        mutable std::unordered_map<Parameter, double> m_smoothedCounts;
        MomentumSchedule m_varianceMomentumSchedule;
    };

    class LearnerRMSProp : public LearnerBase
    {
    public:
@@ -137,7 +137,6 @@ namespace CNTK
            (op == PrimitiveOpType::CrossEntropyWithSoftmax) ||
            (op == PrimitiveOpType::ClassificationError) ||
            (op == PrimitiveOpType::Logistic) ||
            (op == PrimitiveOpType::CosDistance) ||
            (op == PrimitiveOpType::LambdaRank) ||
            (op == PrimitiveOpType::NDCG))
        {
@@ -432,7 +432,20 @@ template <class ElemType>
    InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter
    let& index  = InputRef(INDEXDATA) .Value(); // column indices to copy from
    let& source = InputRef(SOURCEDATA).Value(); // source data to copy

#ifdef _MSC_VER
    auto& outputValuePtrRef = ValuePtrRef();
#else
    auto& outputValuePtrRef = this->template ValuePtrRef();
#endif
    // If the source is sparse but the pre-allocated output is dense, re-allocate the output in the source's sparse format.
    if ((source.GetMatrixType() == SPARSE) && (outputValuePtrRef->GetMatrixType() != SPARSE))
        outputValuePtrRef = std::make_shared<Matrix<ElemType>>(outputValuePtrRef->GetNumRows(),
                                                               outputValuePtrRef->GetNumCols(),
                                                               outputValuePtrRef->GetPreferredDeviceId(),
                                                               source.GetMatrixType(),
                                                               source.GetFormat());

    auto& output = Value(); // output goes here
    output.DoGatherColumnsOf(/*beta=*/0, index, source, /*alpha=*/1);
}
@@ -493,6 +506,19 @@ template <class ElemType>
    InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter
    let& index  = InputRef(INDEXDATA) .Value(); // column indices to copy from
    let& source = InputRef(SOURCEDATA).Value(); // source data to copy

#ifdef _MSC_VER
    auto& outputValuePtrRef = ValuePtrRef();
#else
    auto& outputValuePtrRef = this->template ValuePtrRef();
#endif
    // If the source is sparse but the pre-allocated output is dense, re-allocate the output in the source's sparse format.
    if ((source.GetMatrixType() == SPARSE) && (outputValuePtrRef->GetMatrixType() != SPARSE))
        outputValuePtrRef = std::make_shared<Matrix<ElemType>>(outputValuePtrRef->GetNumRows(),
                                                               outputValuePtrRef->GetNumCols(),
                                                               outputValuePtrRef->GetPreferredDeviceId(),
                                                               source.GetMatrixType(),
                                                               source.GetFormat());

    auto& output = Value(); // output goes here
    output.DoScatterColumnsOf(/*beta=*/0, index, source, /*alpha=*/1);
}
@@ -69,6 +69,8 @@ public:
    if (flags & CopyNodeFlags::copyNodeValue)
    {
        auto node = dynamic_pointer_cast<ReshapeNode<ElemType>>(nodeP);
        node->m_beginDimParameter = m_beginDimParameter;
        node->m_endDimParameter = m_endDimParameter;
        node->m_replacementSampleLayout = m_replacementSampleLayout;
    }
}
@@ -1246,6 +1246,41 @@ void CPUMatrix<ElemType>::FSAdagrad(CPUMatrix<ElemType>& gradients,
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::Adam(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
    ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum)
{
    size_t numColsNeeded = 2 * gradients.GetNumCols();
    auto unitGainFactor = ElemType(unitGainMomentum ? (1.0 - momentum) : 1.0);

    if (IsEmpty() || (GetNumCols() < numColsNeeded))
    {
        RequireSize(gradients.GetNumRows(), numColsNeeded);
        SetValue(0.0);
    }

    assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded));

    size_t n = gradients.GetNumElements();
    ElemType* grad = gradients.Data();
    ElemType* smoothAda = Data();     // second raw moment (v), first half of the state buffer
    ElemType* smoothMom = Data() + n; // first raw moment (m), second half of the state buffer
    ElemType* val = functionValues.Data();
#pragma omp parallel for
    // TODO: Unroll 4 times for better performance, leveraging vectorization
    for (long i = 0; i < (long) n; i++)
    {
        ElemType g = grad[i];
        ElemType adaSqr = adaWeight * smoothAda[i] + (1.0f - adaWeight) * g * g;
        smoothAda[i] = adaSqr;
        ElemType ada = sqrt(adaSqr);
        ElemType w = adaMul * (ElemType)(1.0 / (ada + 1e-8));
        g = momentum * smoothMom[i] + unitGainFactor * g;
        smoothMom[i] = g;
        val[i] -= g * w * learnRatePerSample;
    }
}

template <class ElemType>
ElemType CPUMatrix<ElemType>::RmsProp(CPUMatrix<ElemType>& gradients,
    ElemType RMS_GAMMA,
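
A reading aid, not part of the commit: with $\beta_1$ = momentum, $\beta_2$ = adaWeight, $c$ = adaMul (the bias-correction factor supplied by the caller), $u = 1-\beta_1$ under unit-gain momentum and $u = 1$ otherwise, and $\eta$ = learnRatePerSample, each iteration of the loop above computes

$$v_i \leftarrow \beta_2 v_i + (1-\beta_2)\, g_i^2, \qquad m_i \leftarrow \beta_1 m_i + u\, g_i, \qquad \theta_i \leftarrow \theta_i - \eta\, c\, \frac{m_i}{\sqrt{v_i} + 10^{-8}}.$$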
@@ -95,7 +95,10 @@ public:

    void FSAdagrad(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
                   ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);

    void Adam(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
              ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);

    ElemType RmsProp(CPUMatrix<ElemType>& gradients,
                     ElemType RMS_GAMMA,
                     ElemType RMS_WGT_INC,
@@ -1413,6 +1413,31 @@ void GPUMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>& gradients,
        learnRatePerSample, momentum, adaWeight, adaMul, unitGainMomentum);
}

template <class ElemType>
void GPUMatrix<ElemType>::Adam(GPUMatrix<ElemType>& gradients,
                               GPUMatrix<ElemType>& functionValues,
                               ElemType learnRatePerSample,
                               ElemType momentum,
                               ElemType adaWeight,
                               ElemType adaMul,
                               bool unitGainMomentum)
{
    size_t numColsNeeded = 2 * gradients.GetNumCols();

    if (IsEmpty() || (GetNumCols() < numColsNeeded))
    {
        RequireSize(gradients.GetNumRows(), numColsNeeded);
        SetValue(0.0);
    }

    assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded));

    size_t n = gradients.GetNumElements();
    int blocksPerGrid = (int) ((n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
    _adam<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(n, gradients.Data(), Data(), Data() + n, functionValues.Data(),
        learnRatePerSample, momentum, adaWeight, adaMul, unitGainMomentum);
}

template <class ElemType>
ElemType GPUMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>& gradients,
    ElemType RMS_GAMMA,
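
A small reading aid, not part of the commit: the blocksPerGrid expression is the standard integer ceiling division, which guarantees a thread for every element even when n is not a multiple of the block size:

$$\text{blocksPerGrid} = \left\lceil \frac{n}{T} \right\rceil = \left\lfloor \frac{n + T - 1}{T} \right\rfloor, \qquad T = \text{GridDim::maxThreadsPerBlock}.$$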
@@ -224,10 +224,13 @@ public:
    }

    ElemType Adagrad(GPUMatrix<ElemType>& gradients, const bool needAveMultiplier);

    void FSAdagrad(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
                   ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);

    void Adam(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
              ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);

    ElemType RmsProp(GPUMatrix<ElemType>& gradients,
                     ElemType RMS_GAMMA,
                     ElemType RMS_WGT_INC,
@@ -5132,6 +5132,66 @@ __global__ void _maskColumnsValue(ElemType* a, const char* columnsMask, CUDA_LON
        a[IDX2C(rowIdx, colIdx, numRows)] = val;
    }
}

template <class ElemType>
__global__ void _adam(CUDA_LONG size, ElemType* grad, ElemType* smoothAda, ElemType* smoothMom, ElemType* val,
    ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum)
{
    const ElemType unitGainFactor = unitGainMomentum ? (1.0 - mom) : 1.0;
    CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride)
    {
        ElemType g = grad[idx];
        ElemType adaSqr = adaWeight * smoothAda[idx] + (1.0f - adaWeight) * g * g;
        smoothAda[idx] = adaSqr;
        ElemType w;
        if (sizeof(ElemType) == sizeof(double))
        {
            w = adaMul * rsqrt(adaSqr + 1e-8);
        }
        else
        {
            w = adaMul * rsqrtf(adaSqr + 1e-8);
        }

        g = mom * smoothMom[idx] + unitGainFactor * g;
        smoothMom[idx] = g;
        g = lr * g * w;
        val[idx] -= g;
    }
}

template <class ElemType>
__global__ void _adam4BlockSparseCol(CUDA_LONG size,
    ElemType* grad_bsc, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
    ElemType* smoothAda, ElemType* smoothMom, ElemType* val,
    ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum)
{
    const ElemType unitGainFactor = unitGainMomentum ? (1.0 - mom) : 1.0;
    CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride)
    {
        ElemType g = _getvalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, idx);
        ElemType adaSqr = adaWeight * smoothAda[idx] + (1.0f - adaWeight) * g * g;
        smoothAda[idx] = adaSqr;
        ElemType w;
        if (sizeof(ElemType) == sizeof(double))
        {
            w = adaMul * rsqrt(adaSqr + 1e-8);
        }
        else
        {
            w = adaMul * rsqrtf(adaSqr + 1e-8);
        }

        g = mom * smoothMom[idx] + unitGainFactor * g;
        smoothMom[idx] = g;
        g = lr * g * w;
        val[idx] -= g;
    }
}
}
}
}
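
A reading aid, not part of the commit: both kernels above use the grid-stride loop idiom, so a thread with global index $t$ (out of $S = \text{blockDim.x} \cdot \text{gridDim.x}$ threads) processes the index set

$$\{\, t,\; t+S,\; t+2S,\; \dots \,\} \cap [0, \text{size}),$$

which keeps the kernels correct for any launch configuration, including grids with fewer threads than elements.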
@@ -1148,6 +1148,7 @@ void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPU
    {
        RuntimeError("Only support c += alpha * a operation");
    }

    int blocksPerGrid = (int) ceil(1.0 * cRows / GridDim::maxThreadsPerBlock);
    SyncGuard syncGuard;
    for (int rowInB = 0; rowInB < l; rowInB++)
@@ -1546,6 +1547,39 @@ void GPUSparseMatrix<ElemType>::FSAdagrad(
        learnRatePerSample, momentum, adaWeight, adaMul, unitGainMomentum);
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::Adam(
    GPUMatrix<ElemType>& c,
    GPUMatrix<ElemType>& functionValues,
    ElemType learnRatePerSample,
    ElemType momentum,
    ElemType adaWeight,
    ElemType adaMul,
    bool unitGainMomentum)
{
    if (GetFormat() != MatrixFormat::matrixFormatSparseBlockCol)
    {
        NOT_IMPLEMENTED;
    }

    size_t numColsNeeded = 2 * GetNumCols();

    if (c.IsEmpty() || (c.GetNumCols() < numColsNeeded))
    {
        c.RequireSize(GetNumRows(), numColsNeeded);
        c.SetValue(0.0);
    }

    assert((c.GetNumRows() == GetNumRows()) && (c.GetNumCols() == numColsNeeded));

    size_t n = GetNumElements();
    int blocksPerGrid = (int) ((n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
    _adam4BlockSparseCol<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(
        n, Data(), ColOrRow2BlockId(), GetNumRows(),
        c.Data(), c.Data() + n, functionValues.Data(),
        learnRatePerSample, momentum, adaWeight, adaMul, unitGainMomentum);
}

template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>& c,
    ElemType RMS_GAMMA,
@@ -412,6 +412,7 @@ public:
    ElemType Adagrad(GPUMatrix<ElemType>& c, const bool needAveMultiplier);
    void FSAdagrad(GPUMatrix<ElemType>& c, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);
    ElemType RmsProp(GPUMatrix<ElemType>& c, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);
    void Adam(GPUMatrix<ElemType>& c, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum);

    static void Multiply(const GPUSparseMatrix<ElemType>& S, const GPUMatrix<ElemType>& D, GPUMatrix<ElemType>& C);
    static void Multiply(const GPUMatrix<ElemType>& D, const GPUSparseMatrix<ElemType>& S, GPUMatrix<ElemType>& C);
@@ -1089,6 +1089,9 @@ Matrix<ElemType>& Matrix<ElemType>::DoGatherColumnsOf(ElemType beta, const Matri
{
    DecideAndMoveToRightDevice(*this, idx, a); // TODO: only move target if beta != 0

    if (a.GetMatrixType() != this->GetMatrixType())
        RuntimeError("Matrix::DoGatherColumnsOf: The source and target matrices must have same storage type (SPARSE/DENSE).");

    DISPATCH_MATRIX_ON_FLAG(&a, this,
        { m_CPUMatrix->DoGatherColumnsOf(beta, *idx.m_CPUMatrix, *a.m_CPUMatrix, alpha); },
        { m_GPUMatrix->DoGatherColumnsOf(beta, *idx.m_GPUMatrix, *a.m_GPUMatrix, alpha); },
@@ -1101,8 +1104,7 @@ Matrix<ElemType>& Matrix<ElemType>::DoGatherColumnsOf(ElemType beta, const Matri
            CPUSparseMatrix<ElemType> tempA(a.GetFormat(), a.GetNumRows(), a.GetNumCols(), a.m_GPUSparseMatrix->GetNumNZElements());
            a.m_GPUSparseMatrix->CopyToCPUSparseMatrix(tempA);

            CPUSparseMatrix<ElemType> tempThis(m_GPUSparseMatrix->GetFormat(), m_GPUSparseMatrix->GetNumRows(), m_GPUSparseMatrix->GetNumCols(), m_GPUSparseMatrix->GetNumNZElements());
            m_GPUSparseMatrix->CopyToCPUSparseMatrix(tempThis);

            tempThis.DoGatherColumnsOf(beta, *tempIdx.m_CPUMatrix, tempA, alpha);
@@ -1121,11 +1123,27 @@ Matrix<ElemType>& Matrix<ElemType>::DoScatterColumnsOf(ElemType beta, const Matr
{
    DecideAndMoveToRightDevice(*this, idx, a); // TODO: only move target if beta != 0

    if (a.GetMatrixType() != this->GetMatrixType())
        RuntimeError("Matrix::DoScatterColumnsOf: The source and target matrices must have same storage type (SPARSE/DENSE).");

    DISPATCH_MATRIX_ON_FLAG(&a, this,
        { m_CPUMatrix->DoScatterColumnsOf(beta, *idx.m_CPUMatrix, *a.m_CPUMatrix, alpha); },
        { m_GPUMatrix->DoScatterColumnsOf(beta, *idx.m_GPUMatrix, *a.m_GPUMatrix, alpha); },
        { m_CPUSparseMatrix->DoScatterColumnsOf(beta, *idx.m_CPUMatrix, *a.m_CPUSparseMatrix, alpha); },
        {
            // TODO: replace by a more performant version directly on the GPU that does not require the round-trip over the CPU.

            Matrix<ElemType> tempIdx(CPUDEVICE); tempIdx.AssignValuesOf(idx);

            CPUSparseMatrix<ElemType> tempA(a.GetFormat(), a.GetNumRows(), a.GetNumCols(), a.m_GPUSparseMatrix->GetNumNZElements());
            a.m_GPUSparseMatrix->CopyToCPUSparseMatrix(tempA);

            CPUSparseMatrix<ElemType> tempThis(m_GPUSparseMatrix->GetFormat(), m_GPUSparseMatrix->GetNumRows(), m_GPUSparseMatrix->GetNumCols(), m_GPUSparseMatrix->GetNumNZElements());
            m_GPUSparseMatrix->CopyToCPUSparseMatrix(tempThis);

            tempThis.DoScatterColumnsOf(beta, *tempIdx.m_CPUMatrix, tempA, alpha);
            m_GPUSparseMatrix->SetValue(tempThis);
        });

    return *this;
}
@@ -1672,6 +1690,40 @@ void Matrix<ElemType>::FSAdagradUpdate(size_t mbSize,
    // Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}

///
// Implements the original Adam algorithm according to the paper.
// Ref: ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION, https://arxiv.org/pdf/1412.6980.pdf
///
template <class ElemType>
void Matrix<ElemType>::AdamUpdate(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, double& smoothedCount,
    const double learnRatePerSample, const double meanMomentum, const double varMomentum, bool unitGainMomentum)
{
    smoothedCount++;
    // Bias correction
    let biasCorrection = (ElemType)(sqrt(1 - pow(varMomentum, smoothedCount)) / (1 - pow(meanMomentum, smoothedCount)));

    DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients,
        {
            m_CPUMatrix->Adam(*gradients.m_CPUMatrix, *functionValues.m_CPUMatrix,
                (ElemType)learnRatePerSample, (ElemType)meanMomentum, (ElemType)varMomentum,
                biasCorrection, unitGainMomentum);
            SetDataLocation(CPU);
        },
        {
            m_GPUMatrix->Adam(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix,
                (ElemType)learnRatePerSample, (ElemType)meanMomentum, (ElemType)varMomentum,
                biasCorrection, unitGainMomentum);
            SetDataLocation(GPU);
        },
        { NOT_IMPLEMENTED; },
        {
            gradients.m_GPUSparseMatrix->Adam(*m_GPUMatrix, *functionValues.m_GPUMatrix,
                (ElemType)learnRatePerSample, (ElemType)meanMomentum,
                (ElemType)varMomentum, biasCorrection, unitGainMomentum);
            SetDataLocation(GPU);
        });

    // Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}

template <class ElemType>
ElemType Matrix<ElemType>::RmsProp(Matrix<ElemType>& gradients,
    ElemType RMS_GAMMA,
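
A reading aid, not part of the commit: with $\beta_1$ = meanMomentum, $\beta_2$ = varMomentum, and $t$ = smoothedCount, the single factor computed above folds the paper's two bias corrections, $\hat m_t = m_t/(1-\beta_1^t)$ and $\hat v_t = v_t/(1-\beta_2^t)$, into one multiplier that the element-wise kernels receive as adaMul:

$$c_t = \frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}}, \qquad \eta\,\frac{\hat m_t}{\sqrt{\hat v_t}+\epsilon} \;\approx\; \eta\, c_t\, \frac{m_t}{\sqrt{v_t}+\epsilon},$$

where the approximation only shifts the placement of the small constant $\epsilon$.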
@@ -215,6 +215,10 @@ public:
    Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, double& smoothedCount,
    const double learnRatePerSample, const double targetAdagradAvDenom,
    const double meanMomentum, const double varMomentum, bool unitGainMomentum = true);

    void AdamUpdate(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, double& smoothedCount,
        const double learnRatePerSample, const double meanMomentum, const double varMomentum, bool unitGainMomentum = true);

    ElemType RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);

    void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true); // by default we only reallocate if we need to grow
@@ -261,6 +261,11 @@ void GPUSparseMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>&, GPUMatrix<ElemTy
{
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::Adam(GPUMatrix<ElemType>& c, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum)
{
}

template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>&, ElemType, ElemType, ElemType, ElemType, ElemType, const bool)
{
@@ -1077,6 +1082,13 @@ void GPUMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>& gradients, GPUMatrix<El
{
}

template <class ElemType>
void GPUMatrix<ElemType>::Adam(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
    ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum)
{
}

template <class ElemType>
ElemType GPUMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier)
{
@@ -7,50 +7,28 @@
import numpy as np
import os
import sys
import signal
import subprocess
import re
import pytest
from cntk.ops.tests.ops_test_utils import cntk_device
from cntk.cntk_py import DeviceKind_GPU
from cntk.device import set_default_device
from cntk.io import ReaderConfig, ImageDeserializer
from cntk import distributed

abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(abs_path)
sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "AlexNet", "Python"))
example_dir = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "AlexNet", "Python")
sys.path.append(example_dir)
from prepare_test_data import prepare_ImageNet_data
from AlexNet_ImageNet_Distributed import alexnet_train_and_eval
from ConvNet_CIFAR10_DataAug_Distributed_test import mpiexec_test
script_under_test = os.path.join(example_dir, "AlexNet_ImageNet_Distributed.py")

#TOLERANCE_ABSOLUTE = 2E-1

def test_alexnet_error(device_id):
    if cntk_device(device_id).type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')
    set_default_device(cntk_device(device_id))

    base_path = prepare_ImageNet_data()
    # change dir to locate data.zip correctly
    os.chdir(base_path)

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(1) # BUGBUG: has no effect at present # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    # for test purposes we train and test on the same data
    train_data = os.path.join(base_path, 'val1024_map.txt')
    test_data = os.path.join(base_path, 'val1024_map.txt')

    test_error = alexnet_train_and_eval(train_data, test_data,
                                        num_quantization_bits=32,
                                        minibatch_size=16,
                                        epoch_size=64,
                                        max_epochs=2)
    distributed.Communicator.finalize()
    # expected_test_error = 0.0

    # We are removing the tolerance on the error because running with a small epoch size has huge variance in accuracy. We will add
    # the tolerance back once the convolution operator is deterministic.

    # assert np.allclose(test_error, expected_test_error,
    #                    atol=TOLERANCE_ABSOLUTE)

def test_alexnet_imagenet_distributed(device_id):
    params = [ "-n", "2",
               "-m", "8",
               "-e", "16",
               "-datadir", prepare_ImageNet_data(),
               "-q", "32",
               "-r",
               "-device", "0" ]
    mpiexec_test(device_id, script_under_test, params, 0.99, True)
@@ -16,28 +16,15 @@ from cntk.cntk_py import DeviceKind_GPU
from cntk.device import set_default_device

abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(abs_path)
example_dir = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "ConvNet", "Python")

sys.path.append(example_dir)
from prepare_test_data import prepare_CIFAR10_data
script_under_test = os.path.join(example_dir, "ConvNet_CIFAR10_DataAug_Distributed.py")

TOLERANCE_ABSOLUTE = 2E-1
TIMEOUT_SECONDS = 300

def data_set_directory():
    try:
        base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
                                 *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
        # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt
        # and CIFAR-10_mean.xml in the base_path.
    except KeyError:
        base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)
    os.chdir(os.path.join(base_path, '..'))
    return base_path

def mpiexec_test(device_id, script, params, expected_test_error, match_exactly=True, per_minibatch_tolerance=TOLERANCE_ABSOLUTE, error_tolerance=TOLERANCE_ABSOLUTE):
    if cntk_device(device_id).type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')

@@ -56,6 +43,7 @@ def mpiexec_test(device_id, script, params, expected_test_error, match_exactly=T
    results = re.findall("Cross Validation \[.+?\]: Minibatch\[.+?\]: errs = (.+?)%", str_out)

    assert len(results) == 2
    print(results)

    if match_exactly:
        assert results[0] == results[1]

@@ -65,23 +53,32 @@ def mpiexec_test(device_id, script, params, expected_test_error, match_exactly=T
    assert np.allclose(float(results[0])/100, expected_test_error, atol=error_tolerance)

def test_cifar_convnet_distributed(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
    params = [ "-n", "2",
               "-m", "64",
               "-e", "3200",
               "-datadir", prepare_CIFAR10_data(),
               "-q", "32",
               "-r",
               "-device", "0" ]
    mpiexec_test(device_id, script_under_test, params, 0.617)
    mpiexec_test(device_id, script_under_test, params, 0.75, True)

def test_cifar_convnet_distributed_1bitsgd(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
    params = [ "-n", "2",
               "-m", "64",
               "-e", "3200",
               "-datadir", prepare_CIFAR10_data(),
               "-q", "1",
               "-r",
               "-device", "0" ]
    mpiexec_test(device_id, script_under_test, params, 0.617)
    mpiexec_test(device_id, script_under_test, params, 0.75, True)


def test_cifar_convnet_distributed_block_momentum(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
               "-b", "3200",
    params = [ "-n", "2",
               "-m", "64",
               "-e", "3200",
               "-datadir", prepare_CIFAR10_data(),
               "-b", "1600",
               "-r",
               "-device", "0" ]
    mpiexec_test(device_id, script_under_test, params, 0.6457, False, 10)
    mpiexec_test(device_id, script_under_test, params, 0.78, False, 10)
@@ -30,8 +30,7 @@ def test_cifar_convnet_error(device_id):
    # change dir to locate data.zip correctly
    os.chdir(base_path)

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    from _cntk_py import set_fixed_random_seed, force_deterministic_algorithms
    set_fixed_random_seed(1) # BUGBUG: has no effect at present # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now
@@ -15,13 +15,14 @@ example_dir = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image"
sys.path.append(example_dir)
sys.path.append(abs_path)

from ConvNet_CIFAR10_DataAug_Distributed_test import mpiexec_test, data_set_directory
from prepare_test_data import prepare_CIFAR10_data
from ConvNet_CIFAR10_DataAug_Distributed_test import mpiexec_test

script_under_test = os.path.join(example_dir, "TrainResNet_CIFAR10_Distributed.py")

def test_cifar_resnet_distributed(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
               "-datadir", prepare_CIFAR10_data(),
               "-q", "32",
               "-es", "512",
               "-device", "0" ]

@@ -29,7 +30,7 @@ def test_cifar_resnet_distributed(device_id):

def test_cifar_resnet_distributed_1bitsgd(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
               "-datadir", prepare_CIFAR10_data(),
               "-q", "1",
               "-es", "512",
               "-device", "0" ]

@@ -38,7 +39,7 @@ def test_cifar_resnet_distributed_1bitsgd(device_id):

def test_cifar_resnet_distributed_block_momentum(device_id):
    params = [ "-e", "2",
               "-datadir", data_set_directory(),
               "-datadir", prepare_CIFAR10_data(),
               "-b", "3200",
               "-es", "512",
               "-device", "0" ]
@@ -0,0 +1,34 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import numpy as np
import os
import sys
import signal
import subprocess
import re
import pytest
from cntk.ops.tests.ops_test_utils import cntk_device
from cntk.cntk_py import DeviceKind_GPU
from cntk.device import set_default_device

abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(abs_path)
example_dir = os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "VGG", "Python")
sys.path.append(example_dir)
from prepare_test_data import prepare_ImageNet_data
from ConvNet_CIFAR10_DataAug_Distributed_test import mpiexec_test
script_under_test = os.path.join(example_dir, "VGG16_ImageNet_Distributed.py")

# def test_vgg16_imagenet_distributed(device_id):
#     params = [ "-n", "2",
#                "-m", "1",
#                "-e", "2",
#                "-datadir", prepare_ImageNet_data(),
#                "-q", "32",
#                "-r",
#                "-device", "0" ]
#     mpiexec_test(device_id, script_under_test, params, 0.99, True)
@@ -29,16 +29,19 @@ def prepare_CIFAR10_data():

def prepare_ImageNet_data():
    base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            *"../../../../Examples/Image/DataSets/ImageNet".split("/"))
                            *"../../../../Examples/Image/DataSets/ImageNet/test_data".split("/"))
    base_path = os.path.normpath(base_path)
    if not os.path.isdir(base_path):
        os.mkdir(base_path)

    # If the map files don't exist locally, copy them to the local location
    if not os.path.isfile(os.path.join(base_path, 'val1024_map.txt')):
    if not (os.path.isfile(os.path.join(base_path, 'train_map.txt')) and os.path.isfile(os.path.join(base_path, 'val_map.txt'))):
        # copy from backup location
        base_path_bak = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
                                     *"Image/ImageNet/2012/v0".split("/"))
        base_path_bak = os.path.normpath(base_path_bak)

        copyfile(os.path.join(base_path_bak, 'val1024_map.txt'), os.path.join(base_path, 'val1024_map.txt'))
        copyfile(os.path.join(base_path_bak, 'val1024_map.txt'), os.path.join(base_path, 'train_map.txt'))
        copyfile(os.path.join(base_path_bak, 'val1024_map.txt'), os.path.join(base_path, 'val_map.txt'))
        copyfile(os.path.join(base_path_bak, 'val1024.zip'), os.path.join(base_path, 'val1024.zip'))
    return base_path

@@ -1,43 +0,0 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import numpy as np
import os
import sys
import platform
from cntk.io import ReaderConfig, ImageDeserializer, FULL_DATA_SWEEP
from cntk import distributed
from cntk.device import set_default_device, gpu

abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(abs_path)
sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "Examples", "Image", "Classification", "ConvNet", "Python"))
from prepare_test_data import prepare_CIFAR10_data
from ConvNet_CIFAR10_DataAug_Distributed import convnet_cifar10_dataaug

def run_cifar_convnet_distributed():
    base_path = prepare_CIFAR10_data()
    # change dir to locate data.zip correctly
    os.chdir(base_path)

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(1) # BUGBUG: has no effect at present # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    train_data = os.path.join(base_path, 'train_map.txt')
    mean_data = os.path.join(base_path, 'CIFAR-10_mean.xml')
    test_data = os.path.join(base_path, 'test_map.txt')

    num_quantization_bits = 32
    return convnet_cifar10_dataaug(train_data, test_data, mean_data, num_quantization_bits, epoch_size=512, max_epochs=2)

if __name__=='__main__':
    assert distributed.Communicator.rank() < distributed.Communicator.num_workers()
    set_default_device(gpu(0)) # force using GPU-0 in test for speed
    run_cifar_convnet_distributed()
    distributed.Communicator.finalize()
(File diff not shown because it is too large)

@@ -898,6 +898,36 @@ BOOST_FIXTURE_TEST_CASE(CPUMatrixSeedingDouble, RandomSeedFixture)
    BOOST_CHECK(m1.IsEqualTo(m2));
}

BOOST_FIXTURE_TEST_CASE(CPUMatrixAdam, RandomSeedFixture)
{
    CPUMatrix<double> adamMatrix;
    CPUMatrix<double> gradients(2, 1);
    CPUMatrix<double> parameters(2, 1);
    CPUMatrix<double> expectedParameters(2, 1);
    CPUMatrix<double> expectedStates(2, 2);
    double gradientValues[] = { 0.1, -0.1 };
    double paramValues[] = { 0.1, 0.1 };
    double expectedValues[] = { -0.05811338, 0.25811338 };
    double expectedStateValues[] = { 1e-5, 0.01, 1e-5, -0.01 };
    gradients.SetValue(2, 1, gradientValues, matrixFormatRowMajor);
    parameters.SetValue(2, 1, paramValues, matrixFormatRowMajor);
    expectedParameters.SetValue(2, 1, expectedValues, matrixFormatRowMajor);
    expectedStates.SetValue(2, 2, expectedStateValues, matrixFormatRowMajor);
    adamMatrix.Adam(gradients, parameters, 0.1, 0.9, 0.999, 0.5, true);

    BOOST_CHECK(parameters.IsEqualTo(expectedParameters, 1e-6));
    BOOST_CHECK(adamMatrix.IsEqualTo(expectedStates, 1e-6));

    double expectedValues2[] = { -0.27059249, 0.47059249 };
    double expectedStateValues2[] = { 2e-05, 0.019, 2e-05, -0.019 };
    expectedParameters.SetValue(2, 1, expectedValues2, matrixFormatRowMajor);
    expectedStates.SetValue(2, 2, expectedStateValues2, matrixFormatRowMajor);
    adamMatrix.Adam(gradients, parameters, 0.1, 0.9, 0.999, 0.5, true);

    BOOST_CHECK(parameters.IsEqualTo(expectedParameters, 1e-6));
    BOOST_CHECK(adamMatrix.IsEqualTo(expectedStates, 1e-6));
}

BOOST_AUTO_TEST_SUITE_END()
}
} } }

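The expected values above follow from a unit-gain Adam step: the smoothed gradient is m = momentum*m + (1-momentum)*g, the smoothed square is v = adaWeight*v + (1-adaWeight)*g^2, and the parameter moves by lr*adaMul*m/(sqrt(v)+eps). A minimal NumPy sketch, assuming this form and a denominator epsilon of 1e-8 (both assumptions, chosen to match the CPU numbers):

import numpy as np

def adam_step(p, g, m, v, lr=0.1, momentum=0.9, ada_weight=0.999, ada_mul=0.5, eps=1e-8):
    # mirrors Adam(gradients, parameters, 0.1, 0.9, 0.999, 0.5, true) above (assumed semantics)
    m = momentum * m + (1 - momentum) * g         # unit-gain smoothed gradient
    v = ada_weight * v + (1 - ada_weight) * g**2  # smoothed squared gradient
    p = p - lr * ada_mul * m / (np.sqrt(v) + eps)
    return p, m, v

p, g = np.array([0.1, 0.1]), np.array([0.1, -0.1])
m, v = np.zeros(2), np.zeros(2)
p, m, v = adam_step(p, g, m, v)
print(p)  # ~[-0.05811338, 0.25811338], matching expectedValues
p, m, v = adam_step(p, g, m, v)
print(p)  # ~[-0.27059249, 0.47059249], matching expectedValues2
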
@@ -537,6 +537,34 @@ BOOST_FIXTURE_TEST_CASE(GPUMatrixCurandSeedingDouble, RandomSeedFixture)
    BOOST_CHECK(m1.IsEqualTo(m2));
}

BOOST_FIXTURE_TEST_CASE(GPUMatrixAdam, RandomSeedFixture)
{
    GPUMatrix<double> adamMatrix(c_deviceIdZero);
    GPUMatrix<double> gradients(2, 1, c_deviceIdZero);
    GPUMatrix<double> parameters(2, 1, c_deviceIdZero);
    GPUMatrix<double> expectedParameters(2, 1, c_deviceIdZero);
    GPUMatrix<double> expectedStates(2, 2, c_deviceIdZero);
    double gradientValues[] = { 0.1, -0.1 };
    double paramValues[] = { 0.1, 0.1 };
    double expectedValues[] = { -0.05803489, 0.25803488 };
    double expectedStateValues[] = { 1e-5, 0.01, 1e-5, -0.01 };
    gradients.SetValue(2, 1, c_deviceIdZero, gradientValues, matrixFormatRowMajor);
    parameters.SetValue(2, 1, c_deviceIdZero, paramValues, matrixFormatRowMajor);
    expectedParameters.SetValue(2, 1, c_deviceIdZero, expectedValues, matrixFormatRowMajor);
    expectedStates.SetValue(2, 2, c_deviceIdZero, expectedStateValues, matrixFormatRowMajor);
    adamMatrix.Adam(gradients, parameters, 0.1, 0.9, 0.999, 0.5, true);
    BOOST_CHECK(parameters.IsEqualTo(expectedParameters, 1e-6));
    BOOST_CHECK(adamMatrix.IsEqualTo(expectedStates, 1e-6));

    double expectedValues2[] = { -0.27046135, 0.47046134 };
    double expectedStateValues2[] = { 2e-05, 0.019, 2e-05, -0.019 };
    expectedParameters.SetValue(2, 1, c_deviceIdZero, expectedValues2, matrixFormatRowMajor);
    expectedStates.SetValue(2, 2, c_deviceIdZero, expectedStateValues2, matrixFormatRowMajor);
    adamMatrix.Adam(gradients, parameters, 0.1, 0.9, 0.999, 0.5, true);
    BOOST_CHECK(parameters.IsEqualTo(expectedParameters, 1e-6));
    BOOST_CHECK(adamMatrix.IsEqualTo(expectedStates, 1e-6));
}

#if 0 // Temporarily disabling
BOOST_FIXTURE_TEST_CASE(GPUMatrixLargeInequality, RandomSeedFixture)
{

@@ -95,6 +95,15 @@ void TestFSAdaGradLearner(size_t numParameters, size_t numMinibatches, bool unit
    TestUpdate<ElementType>(learner, shape, numMinibatches, device);
}

template <typename ElementType>
void TestAdamLearner(size_t numParameters, size_t numMinibatches, bool unitGainMomentum, const DeviceDescriptor& device)
{
    NDShape shape = CreateShape(rng() % maxNumAxes + 1, maxDimSize);
    auto parameters = CreateParameters<ElementType>(shape, numParameters, device);
    auto learner = AdamLearner(parameters, LearningRatePerSampleSchedule({ 0.5 }), MomentumAsTimeConstantSchedule({ 10.0, 100.0, 1000.0 }), unitGainMomentum, MomentumPerSampleSchedule(0.99), false);
    TestUpdate<ElementType>(learner, shape, numMinibatches, device);
}

template <typename ElementType>
void TestRMSPropLearner(size_t numParameters, size_t numMinibatches, const DeviceDescriptor& device)
{
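For reference on the schedules used here: a momentum time constant tc corresponds to a per-sample momentum of exp(-1/tc) in CNTK's convention, so the schedule { 10.0, 100.0, 1000.0 } above ramps the effective momentum toward 1. A quick illustrative check:

import math
for tc in (10.0, 100.0, 1000.0):
    print(tc, math.exp(-1.0 / tc))  # per-sample momentum: ~0.9048, ~0.9900, ~0.9990
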
@@ -335,6 +344,8 @@ void LearnerTests()
    TestMomentumSGDLearner<float>(numParameters, numMinibatches, unitGain, device);
    TestNesterovLearner<float>(numParameters, numMinibatches, unitGain, device);
    TestFSAdaGradLearner<double>(numParameters, numMinibatches, unitGain, device);
    TestAdamLearner<float>(numParameters, numMinibatches, unitGain, device);
    TestAdamLearner<double>(numParameters, numMinibatches, unitGain, device);
}
}
}
}

@@ -76,7 +76,7 @@
    "name": "Fast R-CNN",
    "url": "https://github.com/Microsoft/CNTK/wiki/Object-Detection-using-Fast-R-CNN",
    "description": "Train object detection from images by adapting pre-trained classification models on arbitrarily sized regions of interest using ROI pooling.",
    "language": ["BrainScript"],
    "language": ["Python", "BrainScript"],
    "type": ["Tutorial", "Recipe"]
},
{

@@ -204,7 +204,7 @@
    "name": "VGG",
    "url": "https://github.com/Microsoft/CNTK/tree/master/Examples/Image/Classification/VGG",
    "description": "Deep CNN from University of Oxford. This was the winning model for the ILSVRC2014 localization task.",
    "language": ["BrainScript"],
    "language": ["Python", "BrainScript"],
    "type": ["Recipe"]
},
{

(File diff hidden because one or more lines are too long)

@@ -292,3 +292,10 @@ class Value(cntk_py.Value):
    '''
    return self.shape[0]

def user_function(user_func):
    '''
    Wraps the passed Function to create a composite representing the
    composite Function graph rooted at the passed root Function.
    '''
    from . import as_composite
    return as_composite(user_func)

@@ -42,6 +42,11 @@ def depth_first_search(node, visitor):
        pass

    if visitor(node):
        if node.is_parameter:
            node = node.as_parameter()
        elif node.is_constant:
            node = node.as_constant()

        accum.append(node)

    visited.add(node)

@@ -349,7 +349,10 @@ def Recurrence(over, go_backwards=False, initial_state=initial_state_default_or_
    f_x_h_c = over(x, prev_state) # apply the recurrent over
    # this returns a Function (x, (h_prev, c_prev)) -> (h, c)
    h_c = f_x_h_c.outputs
    replacements = { value_forward: value for (value_forward, value) in zip(list(_as_tuple(state_forward)), h_c) }
    if type(state_forward) is tuple and len(state_forward) > 1:
        replacements = { value_forward: value for (value_forward, value) in zip(list(_as_tuple(state_forward)), h_c) }
    else:
        replacements = { (state_forward,)[0]: h_c[0] }
    f_x_h_c.replace_placeholders(replacements)  # resolves state_forward := h_c
    h = f_x_h_c.outputs[0]  # 'h' is a Variable (the output of a Function that computed it)
    if _trace_layers:

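The branch above exists because for a single-state cell (such as a GRU) state_forward is one placeholder rather than a tuple. A stand-alone sketch of the dict-building logic, with strings standing in for CNTK variables and _as_tuple assumed to wrap non-tuples into a 1-tuple:

def _as_tuple(x):
    return x if isinstance(x, tuple) else (x,)

def build_replacements(state_forward, h_c):
    # mirror of the replacement logic in Recurrence above
    if type(state_forward) is tuple and len(state_forward) > 1:
        return {fwd: val for (fwd, val) in zip(list(_as_tuple(state_forward)), h_c)}
    else:
        return {(state_forward,)[0]: h_c[0]}

print(build_replacements(('h_fwd', 'c_fwd'), ('h', 'c')))  # two states, e.g. LSTM
print(build_replacements('s_fwd', ('s',)))                 # single state, e.g. GRU
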
@@ -564,9 +564,6 @@ def adam_sgd(parameters, lr, momentum, unit_gain=default_unit_gain_value(),
    <http://arxiv.org/abs/1412.6980>`_. International Conference for
    Learning Representations, 2015.
    '''
    if not low_memory:
        raise NotImplementedError('adam: low_memory=True currently required')

    _verify_learning_rate_type(lr)
    _verify_momentum_type(momentum)
    _verify_momentum_type(variance_momentum)

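For context, a hedged usage sketch of this learner; the cntk.learner module path is an assumption based on the era of this commit, and z stands for a hypothetical model Function:

from cntk.learner import adam_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType

lr = learning_rate_schedule(0.007, UnitType.sample)
mom = momentum_as_time_constant_schedule(1100)
# low_memory=True is, per the check above, currently required:
# learner = adam_sgd(z.parameters, lr=lr, momentum=mom)
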
@@ -647,11 +647,7 @@ class UserFunction(Function):

    '''
    def __init__(self, inputs, name=''):
        # FIXME we need to save a reference here so that the function does not
        # disappear
        self.var_inputs = inputs

        super(Function, self).__init__(inputs, name)
        super(UserFunction, self).__init__(inputs, name)

        # Memory management for user defined functions has to be controlled by
        # the C++ side. For more information:

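The super() fix above matters: super(Function, self) starts the MRO lookup after Function and therefore skips Function.__init__ entirely, while super(UserFunction, self) actually reaches it. A self-contained toy showing the difference (class names here are illustrative, not the CNTK classes):

class Function:
    def __init__(self):
        print('Function.__init__ runs')

class UserFunction(Function):
    def __init__(self, correct=True):
        if correct:
            super(UserFunction, self).__init__()  # lookup starts after UserFunction -> Function
        else:
            super(Function, self).__init__()      # lookup starts after Function -> object, skips it

UserFunction(correct=True)   # prints 'Function.__init__ runs'
UserFunction(correct=False)  # prints nothing
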
@@ -753,11 +749,22 @@ class UserFunction(Function):
        outputs.extend(self.infer_outputs())

    def infer_outputs(self):
        raise NotImplementedError('infer_outputs has to be overridden')
        '''
        Returns a list of all output variables this user-defined function
        outputs.

        Output variables are created by
        :meth:`~cntk.ops.functions.output_variable`.
        '''
        raise NotImplementedError('infer_outputs has to be overwritten')

    def op_name(self):
        '''
        Returns the operator name.
        '''
        return 'UserFunction'


@typemap
def load_model(filename, device=None):
    '''

@@ -0,0 +1,29 @@
# Copyright (c) Microsoft. All rights reserved.

# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

"""
Unit tests for the cosine distance class.
"""

import numpy as np
import pytest
from .. import *
from ...axis import Axis
from ... import sequence

def test_cosine_distance():
    a = np.reshape(np.arange(25.0, dtype=np.float32), (5,5))
    b = np.reshape(np.arange(0, 5, dtype=np.float32), (1,5))

    src = input_variable(shape=(5), dynamic_axes=[ Axis.default_batch_axis(), Axis("Seq")])
    tgt = input_variable(shape=(5))
    tgt_br = sequence.broadcast_as(tgt, src)
    cos_seq = cosine_distance(src, tgt_br)
    assert len(cos_seq.dynamic_axes)==2
    assert cos_seq.dynamic_axes[1].name=="Seq"
    val = cos_seq.eval({src:[a], tgt:[b]})
    expected = [[ 1., 0.914659, 0.878459, 0.86155, 0.851852]]
    print(np.allclose(val, expected))

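The expected values can be verified with plain NumPy, since each sequence step is simply the cosine between one row of a and b (an illustrative check, not part of the commit):

import numpy as np
a = np.arange(25.0).reshape(5, 5)
b = np.arange(5.0)
cos = a @ b / (np.linalg.norm(a, axis=1) * np.linalg.norm(b))
print(np.round(cos, 6))  # [1. 0.914659 0.878459 0.86155 0.851852]
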
@@ -14,7 +14,7 @@ import pytest
from .ops_test_utils import unittest_helper, _test_unary_op, _test_binary_op, AA, I, precision, PRECISION_TO_TYPE, cntk_device
import cntk as C
from cntk.axis import Axis
from ...utils import sanitize_dtype_cntk
from ...utils import sanitize_dtype_cntk, one_hot
from .. import constant

EPS_IN_LOG = 1e-37  # 1e-37 is the highest guaranteed precision

@@ -395,3 +395,39 @@ def test_op_gather_derived_dynamic_axes_equivalence(device_id, precision):
    res = z.eval({a: input_data1, b: input_data2})
    expected_forward = [[[3.]]]
    assert np.array_equal(res, expected_forward)


def test_op_gather_sparse(device_id):
    from .. import sequence, times

    input_sparse_indices = [[1, 3, 5], [2, 4]]
    vocab_size = 6
    input_data = one_hot(input_sparse_indices, vocab_size)

    a = I(shape=(vocab_size,), is_sparse=True, name='a')

    a_last = sequence.last(a)
    a_last_dense = times(a_last, np.eye(vocab_size))
    res = a_last_dense.eval({a : input_data})
    assert np.array_equal(res, [[[0, 0, 0, 0, 0, 1]], [[0, 0, 0, 0, 1, 0]]])

    a_last_2 = sequence.slice(a, -2, 0)
    a_last_2_dense = times(a_last_2, np.eye(vocab_size))
    res = a_last_2_dense.eval({a : input_data})
    assert np.array_equal(res, [[[0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1]], [[0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0]]])


def test_op_scatter_sparse(device_id):
    from .. import sequence, times

    input_sparse_indices = [[1, 3, 5], [2, 4]]
    vocab_size = 6
    input_data = one_hot(input_sparse_indices, vocab_size)

    a = I(shape=(vocab_size,), is_sparse=True, name='a')

    a_last_scatter = sequence.scatter(sequence.last(a), sequence.is_first(a))
    a_last_scatter_dense = times(a_last_scatter, np.eye(vocab_size))
    res = a_last_scatter_dense.eval({a : input_data})
    assert np.array_equal(res[0], np.asarray([[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]))
    assert np.array_equal(res[1], np.asarray([[0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]]))

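A dense NumPy picture of what these tests expect; the real one_hot above produces sparse CNTK data, so np.eye is only a stand-in:

import numpy as np
vocab_size = 6
input_sparse_indices = [[1, 3, 5], [2, 4]]
dense = [np.eye(vocab_size)[seq] for seq in input_sparse_indices]  # one-hot rows per sequence
print([seq[-1] for seq in dense])   # last step per sequence, as sequence.last(a) gathers
print([seq[-2:] for seq in dense])  # last two steps, as sequence.slice(a, -2, 0)
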
@@ -46,7 +46,8 @@ def linear_layer(input_var, output_dim):

def dense_layer(input, output_dim, nonlinearity):
    r = linear_layer(input, output_dim)
    r = nonlinearity(r)
    if isinstance(nonlinearity, UserFunction):
        r = user_function(nonlinearity(r))
    return r

def fully_connected_classifier_net(input, num_output_classes, hidden_layer_dim,

@@ -25,7 +25,8 @@ class MyPlus(UserFunction):
        self.backward_calls = 0

    def infer_outputs(self):
        return [output_variable(self.inputs[0].shape, self.inputs[0].dtype, self.inputs[0].dynamic_axes)]
        return [output_variable(self.inputs[0].shape,
            self.inputs[0].dtype, self.inputs[0].dynamic_axes)]

    def forward(self, arguments, device=None, outputs_to_retain=None):
        assert len(self.inputs)==2

@@ -45,7 +46,7 @@ def test_ext_eval_1():
    dim = 4
    p = parameter(shape=(dim,), init=10, name='p')
    i = input_variable(dim, needs_gradient=True, name='i_var')
    m = MyPlus(i, constant(3))
    m = user_function(MyPlus(i, constant(3)))
    z = m+p

    input_data = np.random.rand(dim)

@@ -56,7 +57,7 @@ def test_ext_eval_2_only_param():
    dim = 4
    p = parameter(shape=(dim,), init=10, name='p')
    i = input_variable(dim, needs_gradient=True, name='i_var')
    m = MyPlus(p, constant(3))
    m = user_function(MyPlus(p, constant(3)))
    # combine does not work
    # z = combine([m.output])
    z = m+i

@@ -68,7 +69,7 @@ def test_ext_eval_2_only_param():

def test_ext_eval_3_no_input():
    dim = 4
    p = parameter(shape=(dim,), init=10, name='p')
    m = MyPlus(p, constant(3))
    m = user_function(MyPlus(p, constant(3)))
    z = m+0

    result = z.eval()

@@ -79,7 +80,7 @@ def test_ext_eval_4_a_inside_graph():
    dim = 4
    p_init = 10
    p = parameter(shape=(dim,), init=p_init, name='p')
    m = MyPlus(p, constant(3))
    m = user_function(MyPlus(p, constant(3)))
    z = p * m

    result = z.eval()

@@ -90,7 +91,7 @@ def test_ext_eval_4_b_inside_graph():
    dim = 4
    p_init = 10
    p = parameter(shape=(dim,), init=p_init, name='p')
    z = p * MyPlus(p, constant(3))
    z = user_function(p * MyPlus(p, constant(3)))

    result = z.eval()
    # No batch dimension since we have no input

@@ -100,14 +101,14 @@ def test_ext_eval_5_times():
    dim = 2
    p_init = 10
    p = parameter(shape=(dim,), init=p_init, name='p')
    m = MyPlus(p, constant(3))
    m = user_function(MyPlus(p, constant(3)))
    z = times(m, parameter(shape=(2,50), init=2))

    result = z.eval()
    # No batch dimension since we have no input
    assert np.allclose(result, ((p_init*np.ones_like(result))+3)*2*2)

def test_ext_clone():
def test_ext_eval_6_clone():
    dim = 4
    i = input_variable(dim, needs_gradient=True, name='i_var')
    m = i + 3

@@ -115,20 +116,34 @@ def test_ext_clone():
    p = parameter(shape=(dim,), init=10, name='p')
    z = m + p

    m_udf = MyPlus(i, constant(3))
    m_udf = user_function(MyPlus(i, constant(3)))
    z_clone = z.clone('share', {m : m_udf})

    input_data = np.random.rand(dim)
    result = z_clone.eval([input_data])
    assert np.allclose(result[0][0], input_data+3+10)

def test_ext_eval_7_placeholder():
    dim = 4
    p = parameter(shape=(dim,), init=10, name='p')
    i = input_variable(dim, needs_gradient=True, name='i_var')
    pl = placeholder_variable()
    m = user_function(MyPlus(pl, constant(3)))
    z = m+p
    z.replace_placeholder(i)

    input_data = np.random.rand(dim)
    result = z.eval([input_data])
    assert np.allclose(result[0][0], input_data+3+10)

def test_ext_train():
    dim = 4

    p = parameter(shape=(dim,), init=10)
    i = input_variable(dim, needs_gradient=True, name='i_var')
    m = MyPlus(i, constant(3))
    z = m+p
    # keeping m unwrapped since we need to access its member variables
    z = user_function(m)+p

    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)

@@ -172,7 +187,7 @@ def test_ext_backpropstate(payload):

    p = parameter(shape=(dim,), init=10)
    in1 = input_variable(dim, needs_gradient=True, name='i_var')
    m = TestBackPropState(in1, payload)
    m = user_function(TestBackPropState(in1, payload))
    z = m+p

    lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)

@@ -223,6 +238,7 @@ def test_ext_lambdafunc():
    m = LambdaFunc(k,
                   when=lambda arg: np.sum(arg)>1,
                   execute=cb.inc)
    m = user_function(m)
    z = m+0

    momentum_time_constant = momentum_as_time_constant_schedule(1100)

@@ -262,7 +278,7 @@ def test_udf_plus_and_last():
    x = input_variable(shape=(2,))
    y = input_variable(shape=(2,), dynamic_axes=[Axis.default_batch_axis()])

    func = as_composite(PlusAndLast(x, y))
    func = user_function(PlusAndLast(x, y))

    dt_precision = np.float32
    operand1 = [AA([[1., 2.], [3., 4.]], dtype=dt_precision)]

@@ -132,6 +132,26 @@ class Variable(VariableMixin, TensorOpsMixin, cntk_py.Variable):
        super(Variable, self).__init__(shape, is_sparse, dtype, needs_gradient, name,
                                       dynamic_axes)

    @typemap
    def as_parameter(self):
        '''
        Converts this instance into a :class:`Parameter`
        '''
        if not self.is_parameter:
            raise TypeError('cannot be converted into a Parameter')

        return cntk_py.Parameter(self)

    @typemap
    def as_constant(self):
        '''
        Converts this instance into a :class:`Constant`
        '''
        if not self.is_constant:
            raise TypeError('cannot be converted into a Constant')

        return cntk_py.Constant(self)


class Parameter(VariableMixin, TensorOpsMixin, cntk_py.Parameter):
    '''

@@ -20,15 +20,13 @@ def _graph_dict():

    d['i1'] = input_variable(
        shape=(2, 3), dynamic_axes=input_dynamic_axes, name='i1')
    d['i2'] = input_variable(
        shape=(2, 3), dynamic_axes=input_dynamic_axes, name='i2')
    d['c1'] = constant(shape=(2, 3), value=6, name='c1')
    d['p1'] = parameter(shape=(3, 2), init=7, name='p1')

    d['p1'] = parameter(shape=(3, 2), name='p1')

    d['op1'] = plus(d['i1'], d['i2'], name='op1')
    d['op1'] = plus(d['i1'], d['c1'], name='op1')
    d['op2'] = times(d['op1'], d['p1'], name='op2')

    #d['slice'] = slice(d['i2'], Axis.default_dynamic_axis(), 0, 3)
    #d['slice'] = slice(d['c1'], Axis.default_dynamic_axis(), 0, 3)
    #label_sentence_start = sequence.first(raw_labels)

    # no name

@@ -49,9 +47,9 @@ def _simple_dict():
    d = {}

    d['i1'] = input_variable(shape=(2, 3), name='i1')
    d['i2'] = input_variable(shape=(2, 3), name='i2')
    d['p1'] = parameter(shape=(3, 2), name='p1')
    d['op1'] = plus(d['i1'], d['i2'], name='op1')
    d['c1'] = constant(shape=(2, 3), value=6, name='c1')
    d['p1'] = parameter(shape=(3, 2), init=7, name='p1')
    d['op1'] = plus(d['i1'], d['c1'], name='op1')
    d['op2'] = times(d['op1'], d['p1'], name='op2')
    d['root'] = d['op2']

@@ -64,7 +62,7 @@ def _simple_dict():

def test_find_nodes():
    d = _graph_dict()

    for name in ['i1', 'i2', 'p1', 'op1', 'op2', 'past']:
    for name in ['i1', 'c1', 'p1', 'op1', 'op2', 'past']:
        n = find_all_with_name(d['root'], name)
        assert len(n) == 1, name
        assert n[0].name == name, name

@@ -89,6 +87,17 @@ def test_find_nodes():

    assert find_by_name(d['root'], 'none') is None

def test_find_nodes_returning_proper_types():
    d = _graph_dict()

    c1 = find_by_name(d['root'], 'c1')
    assert isinstance(c1, Constant)
    assert np.allclose(c1.value, np.zeros((2,3))+6)

    p1 = find_by_name(d['root'], 'p1')
    assert isinstance(p1, Parameter)
    assert np.allclose(p1.value, np.zeros((3,2))+7)


def test_plot():
    d = _simple_dict()

@@ -107,4 +116,4 @@ def test_depth_first_search():

    found = depth_first_search(d['op2'], lambda x:True)
    found_names = [v.name for v in found]
    assert found_names == ['op2', 'op1', 'i1', 'i2', 'p1']
    assert found_names == ['op2', 'op1', 'i1', 'c1', 'p1']

@@ -8,6 +8,10 @@ import numpy as np
import pytest

from ..layers import *
from ..blocks import init_default_or_glorot_uniform, Parameter, _INFERRED, Placeholder
from ..utils import _as_tuple
from ..ops import sigmoid, times, tanh, element_times, plus, combine, input_variable
from ..axis import Axis

def test_layers_name(device_id):
    from cntk import placeholder_variable, combine

@@ -19,3 +23,45 @@ def test_layers_name(device_id):
    q = Convolution((3,3), 3, name='conv33')(I)
    assert(q.root_function.name == 'conv33')

def gru_cell(shape, init=init_default_or_glorot_uniform, name=''): # (x, (h,c))
    shape = _as_tuple(shape)

    if len(shape) != 1:
        raise ValueError("gru_cell: shape must be vectors (rank-1 tensors)")

    # determine stacking dimensions
    cell_shape_stacked = shape * 2 # patched dims with stack_axis duplicated 4 times

    # parameters
    Wz = Parameter(cell_shape_stacked, init=init, name='Wz')
    Wr = Parameter(cell_shape_stacked, init=init, name='Wr')
    Wh = Parameter(cell_shape_stacked, init=init, name='Wh')
    Uz = Parameter(_INFERRED + shape, init=init, name='Uz')
    Ur = Parameter(_INFERRED + shape, init=init, name='Ur')
    Uh = Parameter(_INFERRED + shape, init=init, name='Uh')

    def create_s_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return Placeholder(shape=shape, name='S') # (h, c)

    # parameters to model function
    x = Placeholder(name='gru_block_arg')
    prev_status = create_s_placeholder()

    # formula of model function
    Sn_1 = prev_status

    z = sigmoid(times(x, Uz, name='x*Uz') + times(Sn_1, Wz, name='Sprev*Wz'), name='z')
    r = sigmoid(times(x, Ur, name='x*Ur') + times(Sn_1, Wr, name='Sprev*Wr'), name='r')
    h = tanh(times(x, Uh, name='x*Uh') + times(element_times(Sn_1, r, name='Sprev*r'), Wh), name='h')
    s = plus(element_times((1-z), h, name='(1-z)*h'), element_times(z, Sn_1, name='z*SPrev'), name=name)
    apply_x_s = combine([s])
    apply_x_s.create_placeholder = create_s_placeholder
    return apply_x_s

def test_recurrence():
    r = Recurrence(gru_cell(5), go_backwards=False)
    a = input_variable(shape=(5,), dynamic_axes=[Axis.default_batch_axis(), Axis('Seq')])
    x = np.reshape(np.arange(0,25, dtype=np.float32), (1,5,5))
    rt = r(a).eval({a:x})
    print(rt)

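The gru_cell above implements the standard GRU equations: z = sigmoid(x*Uz + s*Wz), r = sigmoid(x*Ur + s*Wr), h = tanh(x*Uh + (s.r)*Wh), s' = (1-z).h + z.s. A plain-NumPy sketch of one recurrence over the same 5-step test input, with random weights standing in for the CNTK Parameters:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

dim = 5
rng = np.random.RandomState(0)
Uz, Ur, Uh = (rng.randn(dim, dim) * 0.1 for _ in range(3))
Wz, Wr, Wh = (rng.randn(dim, dim) * 0.1 for _ in range(3))

def gru_step(x, s_prev):
    z = sigmoid(x @ Uz + s_prev @ Wz)        # update gate
    r = sigmoid(x @ Ur + s_prev @ Wr)        # reset gate
    h = np.tanh(x @ Uh + (s_prev * r) @ Wh)  # candidate state
    return (1 - z) * h + z * s_prev          # new state

s = np.zeros(dim)
for x in np.arange(25.0, dtype=np.float32).reshape(5, 5):
    s = gru_step(x, s)
print(s)  # final state after the 5-step sequence
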
@@ -34,6 +34,8 @@ def sanitize_precision(precision):
        return np.float32
    elif precision in [cntk_py.DataType_Double, 'double', 'float64', np.float64]:
        return np.float64
    elif precision in [cntk_py.DataType_Unknown]:
        return None
    else:
        raise ValueError('precision value: "%s" is not supported' % precision)

@@ -112,6 +114,7 @@ def sanitize_input(arg, fallback_dtype=np.float32, reshape=None):
    ``arg`` is a number or NumPy array. Variable otherwise.
    """

    from cntk.ops.functions import UserFunction
    from cntk.ops.variables import Constant, Variable, Parameter
    from cntk.ops.functions import Function
    from cntk.ops import constant

@@ -483,6 +486,8 @@ def sanitize_dtype_cntk(dtype):
        return cntk_py.DataType_Float
    elif dtype == np.float64:
        return cntk_py.DataType_Double
    elif dtype == object:
        return cntk_py.DataType_Unknown
    else:
        raise ValueError('data type "%s" is not supported' % dtype)

@@ -48,7 +48,10 @@ tuple, strings, etc.)::

This can now be used as a normal operator like::

    s = MySigmoid(prev_node)
    s = user_function(MySigmoid(prev_node))

Note that we cannot pass the `UserFunction` instance directly into the graph;
it represents a primitive function, which we have to pass through `user_function()`.

In case the operator is initialized with multiple inputs, ``forward()``'s
``argument`` will be a list of those inputs::

@@ -133,7 +136,7 @@ interesting behavior, for instance::

    debug_node = LambdaFunc(node,
                            when=lambda arg: np.var(arg)>1,
                            execute=lambda arg: pdb.set_trace())
    # out = ... using debug_node ...
    # out = ... using user_function(debug_node) ...
    # ... training out

Now, if the variance of the input tensor exceeds 1, we will be put into