diff --git a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
index 4443c5c39..fd22eca09 100644
--- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
+++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
@@ -750,7 +750,7 @@ namespace CNTK
         ///
         /// Destruct 'this' Value object.
         ///
-        CNTK_API virtual ~Value();
+        virtual ~Value();
 
         ///
         /// Returns the descriptor of the device that 'this' Value resides on
@@ -796,28 +796,28 @@ namespace CNTK
         ///
         /// Returns the NDArrayView object corresponding to the data contents of 'this' Value object.
         ///
-        CNTK_API virtual NDArrayViewPtr Data() const;
+        virtual NDArrayViewPtr Data() const;
 
         ///
         /// Returns the NDMask object corresponding to the mask associated with 'this' Value object.
         ///
-        CNTK_API virtual NDMaskPtr Mask() const;
+        virtual NDMaskPtr Mask() const;
 
         ///
         /// Creates a new Value with newly allocated storage on the same device as 'this' Value and copies 'this' Value's contents into the newly allocated Value.
         ///
-        CNTK_API virtual ValuePtr DeepClone(bool readOnly = false) const;
+        virtual ValuePtr DeepClone(bool readOnly = false) const;
 
         ///
         /// Creates a new Value which is an alias of 'this' Value.
         ///
-        CNTK_API virtual ValuePtr Alias(bool readOnly = false) const;
+        virtual ValuePtr Alias(bool readOnly = false) const;
 
         ///
         /// Copies the contents of the 'source' Value to 'this' Value.
         /// The shapes of the 'source' Value's data and mask must be identical to 'this' Value's data and mask.
         ///
-        CNTK_API virtual void CopyFrom(const Value& source);
+        virtual void CopyFrom(const Value& source);
 
     private:
         // Disallow copy and move construction and assignment
@@ -2025,10 +2025,10 @@ namespace CNTK
         /// and the user is responsible for ensuring that the contents of the inputs and outputs are unchanged until after any uses of the BackPropState instance
         /// for backpropagating gradients through this function.
         ///
-        CNTK_API virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
-                                                  std::unordered_map<Variable, ValuePtr>& outputs,
-                                                  const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice(),
-                                                  const std::unordered_set<Variable>& outputsToRetainBackwardStateFor = {}) = 0;
+        virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
+                                         std::unordered_map<Variable, ValuePtr>& outputs,
+                                         const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice(),
+                                         const std::unordered_set<Variable>& outputsToRetainBackwardStateFor = {}) = 0;
 
         ///
         /// Backpropagates supplied 'rootGradientValues' for one or more of the output variables of the Function, to produce gradient Values
@@ -2039,9 +2039,9 @@ namespace CNTK
         /// The 'state' parameter is an instance of a BackPropState obtained from a previous call to the Forward method on 'this' Function for the
         /// computation that this gradient backpropagation corresponds to.
         ///
-        CNTK_API virtual void Backward(const BackPropStatePtr& state,
-                                       const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
-                                       std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) = 0;
+        virtual void Backward(const BackPropStatePtr& state,
+                              const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
+                              std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) = 0;
 
     public:
 
@@ -2621,7 +2621,7 @@ namespace CNTK
         // Method to update the parameters associated with this learner. By returning false, this method indicates that
        // learning has stopped for all of the parameters associated with this learner
        //
-        CNTK_API virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) = 0;
+        virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) = 0;
 
        ///
        /// Returns the set of parameters associated with this learner.
        ///
@@ -2633,7 +2633,7 @@ namespace CNTK
 
        // TODO: move the following two methods into ISerializable interface, make
        // Learner (and all other entities that need checkpointing capability) implement it.
-        CNTK_API virtual Dictionary GetCheckpointState() const
+        virtual Dictionary GetCheckpointState() const
        {
            Dictionary baseCheckpointState;
            baseCheckpointState[LearningRateAttributeName] = m_learningRate;
@@ -2644,7 +2644,7 @@ namespace CNTK
        ///
        /// Optionally overridable method to restore the learner's state from a previous checkpoint.
        ///
-        CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& checkpoint)
+        virtual void RestoreFromCheckpoint(const Dictionary& checkpoint)
        {
            if (checkpoint.Contains(LearningRateAttributeName))
                m_learningRate = checkpoint[LearningRateAttributeName].Value<double>();
@@ -2655,8 +2655,8 @@ namespace CNTK
        ///
        virtual ~Learner() {}
 
-        CNTK_API virtual void ResetLearningRate(double learningRate) { m_learningRate = learningRate; }
-        CNTK_API virtual double LearningRate() const { return m_learningRate; }
+        virtual void ResetLearningRate(double learningRate) { m_learningRate = learningRate; }
+        virtual double LearningRate() const { return m_learningRate; }
 
    protected:
        Learner(const std::vector<Parameter>& parameters, double learningRate)
diff --git a/Tests/UnitTests/V2LibraryTests/CifarResNet.cpp b/Tests/UnitTests/V2LibraryTests/CifarResNet.cpp
index cf0e734c5..871d0b4ed 100644
--- a/Tests/UnitTests/V2LibraryTests/CifarResNet.cpp
+++ b/Tests/UnitTests/V2LibraryTests/CifarResNet.cpp
@@ -112,7 +112,7 @@ FunctionPtr ResNetClassifier(Variable input, size_t numOutputClasses, const Devi
    auto pool = Pooling(rn3_3, PoolingType::Average, { poolW, poolH, 1 }, { poolhStride, poolvStride, 1 });
 
    // Output DNN layer
-    auto outTimesParams = Parameter(NDArrayView::RandomNormal<float>({ numOutputClasses, 1, 1, cMap3 }, 0.0, fc1WScale, 1, device));
+    auto outTimesParams = Parameter({ numOutputClasses, 1, 1, cMap3 }, DataType::Float, GlorotUniformInitializer(1, 0, fc1WScale), device);
    auto outBiasParams = Parameter({ numOutputClasses }, (float)fc1BValue, device);
 
    return Plus(Times(outTimesParams, pool), outBiasParams, outputName);
diff --git a/Tests/UnitTests/V2LibraryTests/Common.h b/Tests/UnitTests/V2LibraryTests/Common.h
index 2836922c9..f63c0a543 100644
--- a/Tests/UnitTests/V2LibraryTests/Common.h
+++ b/Tests/UnitTests/V2LibraryTests/Common.h
@@ -123,10 +123,10 @@ inline CNTK::FunctionPtr FullyConnectedLinearLayer(CNTK::Variable input, size_t
    assert(input.Shape().Rank() == 1);
    size_t inputDim = input.Shape()[0];
 
-    auto timesParam = CNTK::Parameter(CNTK::NDArrayView::RandomUniform<float>({ outputDim, inputDim }, -0.05, 0.05, 1, device));
+    auto timesParam = CNTK::Parameter({ outputDim, inputDim }, CNTK::DataType::Float, CNTK::GlorotUniformInitializer(), device);
    auto timesFunction = CNTK::Times(timesParam, input);
 
-    auto plusParam = CNTK::Parameter(CNTK::NDArrayView::RandomUniform<float>({ outputDim }, -0.05, 0.05, 1, device));
+    auto plusParam = CNTK::Parameter({ outputDim }, 0.0f, device);
    return CNTK::Plus(plusParam, timesFunction, outputName);
 }
 
@@ -159,11 +159,11 @@ std::pair<CNTK::FunctionPtr, CNTK::FunctionPtr> LSTMPCellWithSelfStabilization(C
    unsigned long seed = 1;
 
    auto createProjectionParam = [device, &seed](size_t outputDim, size_t inputDim) {
-        return CNTK::Parameter({ outputDim, inputDim }, CNTK::AsDataType<ElementType>(), CNTK::UniformInitializer(1, seed++), device);
+        return CNTK::Parameter({ outputDim, inputDim }, CNTK::AsDataType<ElementType>(), CNTK::GlorotUniformInitializer(1, 0, 1, seed++), device);
    };
 
    auto createDiagWeightParam = [device, &seed](size_t dim) {
-        return CNTK::Parameter({ dim }, CNTK::AsDataType<ElementType>(), CNTK::UniformInitializer(1, seed++), device);
+        return CNTK::Parameter({ dim }, CNTK::AsDataType<ElementType>(), CNTK::GlorotUniformInitializer(1, 0, 1, seed++), device);
    };
 
    auto stabilizedPrevOutput = Stabilize<ElementType>(prevOutput, device);
diff --git a/Tests/UnitTests/V2LibraryTests/Seq2Seq.cpp b/Tests/UnitTests/V2LibraryTests/Seq2Seq.cpp
index b2dbf1770..334bc4598 100644
--- a/Tests/UnitTests/V2LibraryTests/Seq2Seq.cpp
+++ b/Tests/UnitTests/V2LibraryTests/Seq2Seq.cpp
@@ -38,8 +38,8 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
    bool forceEmbedding = useSparseInputs;
 
    /* Embeddings */
-    auto inputEmbeddingWeights = Parameter(NDArrayView::RandomUniform<float>({ inputEmbeddingDim, inputVocabDim }, -0.05, 0.05, 1, device));
-    auto labelEmbeddingWeights = Parameter(NDArrayView::RandomUniform<float>({ labelEmbeddingDim, labelVocabDim }, -0.05, 0.05, 1, device));
+    auto inputEmbeddingWeights = Parameter({ inputEmbeddingDim, inputVocabDim }, DataType::Float, GlorotUniformInitializer(), device);
+    auto labelEmbeddingWeights = Parameter({ labelEmbeddingDim, labelVocabDim }, DataType::Float, GlorotUniformInitializer(), device);
 
    auto inputEmbedding = (!forceEmbedding && (inputVocabDim <= inputEmbeddingDim)) ? inputSequence : Times(inputEmbeddingWeights, inputSequence);
    auto labelEmbedding = (!forceEmbedding && (labelVocabDim <= labelEmbeddingDim)) ? labelSequence : Times(labelEmbeddingWeights, labelSequence);
 
@@ -111,7 +111,7 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
    auto decoderDim = hiddenDim;
 
    /* Softmax output layer */
-    auto outputLayerProjWeights = Parameter(NDArrayView::RandomUniform<float>({ labelVocabDim, decoderDim }, -0.05, 0.05, 1, device));
+    auto outputLayerProjWeights = Parameter({ labelVocabDim, decoderDim }, DataType::Float, GlorotUniformInitializer(), device);
    auto biasWeights = Parameter({ labelVocabDim }, 0.0f, device);
 
    auto z = Plus(Times(outputLayerProjWeights, Stabilize<float>(decoderOutput, device)), biasWeights, L"classifierOutput");
diff --git a/Tests/UnitTests/V2LibraryTests/SequenceClassification.cpp b/Tests/UnitTests/V2LibraryTests/SequenceClassification.cpp
index b16f8fb30..aa3eadef9 100644
--- a/Tests/UnitTests/V2LibraryTests/SequenceClassification.cpp
+++ b/Tests/UnitTests/V2LibraryTests/SequenceClassification.cpp
@@ -11,7 +11,7 @@ FunctionPtr Embedding(const Variable& input, size_t embeddingDim, const DeviceDe
    assert(input.Shape().Rank() == 1);
    size_t inputDim = input.Shape()[0];
 
-    auto embeddingParameters = Parameter(CNTK::NDArrayView::RandomUniform<float>({ embeddingDim, inputDim }, -0.05, 0.05, 1, device));
+    auto embeddingParameters = Parameter({ embeddingDim, inputDim }, DataType::Float, GlorotUniformInitializer(), device);
    return Times(embeddingParameters, input);
 }
diff --git a/bindings/python/cntk/ops/variables.py b/bindings/python/cntk/ops/variables.py
index 6713dab53..4aa5395d1 100644
--- a/bindings/python/cntk/ops/variables.py
+++ b/bindings/python/cntk/ops/variables.py
@@ -48,6 +48,7 @@ class Parameter(TensorOpsMixin,Parameter):
            data_type = str(value.dtype)
 
        if initializer is not None:
+            shape = utils.sanitize_shape(shape)
            data_type = utils.sanitize_dtype_cntk(data_type)
            super(Parameter, self).__init__(shape, data_type, initializer,
                    device, name)
diff --git a/bindings/python/examples/CifarResNet/CifarResNet.py b/bindings/python/examples/CifarResNet/CifarResNet.py
index 62d81bef1..a3cb6f2cc 100644
--- a/bindings/python/examples/CifarResNet/CifarResNet.py
+++ b/bindings/python/examples/CifarResNet/CifarResNet.py
@@ -10,6 +10,7 @@ import os
 from cntk import Trainer, sgd_learner, DeviceDescriptor
 from cntk.ops import input_variable, constant, parameter, cross_entropy_with_softmax, combine, classification_error, times, pooling, AVG_POOLING
 from cntk.io import ReaderConfig, ImageDeserializer
+from cntk.initializer import glorot_uniform_initializer
 
 abs_path = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.join(abs_path, "..", ".."))
@@ -30,23 +31,16 @@ def create_mb_source(features_stream_name, labels_stream_name, image_height,
        raise RuntimeError("File '%s' or '%s' do not exist. Please run CifarDownload%s.py and CifarConverter%s.py from CIFAR-10 to fetch them"%(map_file, mean_file, cifar_py3, cifar_py3))
 
    image = ImageDeserializer(map_file)
-    image.map_features(feature_name,
+    image.map_features(features_stream_name,
            [ImageDeserializer.crop(crop_type='Random', ratio=0.8,
                jitter_type='uniRatio'),
             ImageDeserializer.scale(width=image_width, height=image_height,
                channels=num_channels, interpolations='linear'),
             ImageDeserializer.mean(mean_file)])
-    image.map_labels(label_name, num_classes)
+    image.map_labels(labels_stream_name, num_classes)
 
    rc = ReaderConfig(image, epoch_size=sys.maxsize)
-
-    input_streams_config = {features_stream_name: features_stream_config, labels_stream_name: labels_stream_config}
-    deserializer_config = {"type" : "ImageDeserializer", "file" : map_file, "input" : input_streams_config}
-
-    minibatch_config = {"epochSize" : sys.maxsize, "deserializers" : [deserializer_config]}
-    print(minibatch_config)
-
-    return minibatch_source(minibatch_config)
+    return rc.minibatch_source()
 
 def get_projection_map(out_dim, in_dim):
    if in_dim > out_dim:
@@ -99,13 +93,13 @@ def resnet_classifer(input, num_classes):
    poolv_stride = 1
 
    pool = pooling(rn3_3, AVG_POOLING, (1, poolh, poolw), (1, poolv_stride, poolh_stride))
-    out_times_params = parameter(shape=(c_map3, 1, 1, num_classes))
-    out_bias_params = parameter(shape=(num_classes))
+    out_times_params = parameter(shape=(c_map3, 1, 1, num_classes), initializer=glorot_uniform_initializer())
+    out_bias_params = parameter(shape=(num_classes), value=0)
    t = times(pool, out_times_params)
    return t + out_bias_params
 
 # Trains a residual network model on the Cifar image dataset
-def cifar_resnet():
+def cifar_resnet(base_path):
    image_height = 32
    image_width = 32
    num_channels = 3
@@ -113,7 +107,7 @@
    feats_stream_name = 'features'
    labels_stream_name = 'labels'
    minibatch_source = create_mb_source(feats_stream_name, labels_stream_name,
-                    image_height, image_width, num_channels, num_classes)
+                    image_height, image_width, num_channels, num_classes, base_path)
 
    features_si = minibatch_source.stream_info(feats_stream_name)
    labels_si = minibatch_source.stream_info(labels_stream_name)
diff --git a/bindings/python/examples/MNIST/SimpleMNIST.py b/bindings/python/examples/MNIST/SimpleMNIST.py
index 9bd68c40b..408534245 100644
--- a/bindings/python/examples/MNIST/SimpleMNIST.py
+++ b/bindings/python/examples/MNIST/SimpleMNIST.py
@@ -47,7 +47,7 @@ def simple_mnist():
    labels_si = mb_source.stream_info(labels_stream_name)
 
    # Instantiate the trainer object to drive the model training
-    trainer = Trainer(netout, ce, pe, [sgd_learner(netout.owner.parameters(),
+    trainer = Trainer(netout, ce, pe, [sgd_learner(netout.parameters(),
        lr=0.003125)])
 
    # Get minibatches of images to train with and perform model training
diff --git a/bindings/python/examples/common/nn.py b/bindings/python/examples/common/nn.py
index 9f4fada43..fa7b9c29f 100644
--- a/bindings/python/examples/common/nn.py
+++ b/bindings/python/examples/common/nn.py
@@ -9,6 +9,7 @@ import sys
 import os
 from cntk.ops import *
 from cntk.utils import sanitize_dtype_cntk, get_train_eval_criterion, get_train_loss
+from cntk.initializer import glorot_uniform_initializer
 
 def linear_layer(input_var, output_dim):
    try:
@@ -18,8 +19,8 @@ def linear_layer(input_var, output_dim):
        shape = input_var.shape()
 
    input_dim = shape[0]
-    times_param = parameter(shape=(input_dim, output_dim))
-    bias_param = parameter(shape=(output_dim))
+    times_param = parameter(shape=(input_dim, output_dim), initializer=glorot_uniform_initializer())
+    bias_param = parameter(shape=(output_dim), value=0)
 
    t = times(input_var, times_param)
    return bias_param + t
@@ -44,12 +45,12 @@ def conv_bn_layer(input, out_feature_map_count, kernel_width, kernel_height, h_s
        shape = input_var.shape()
    num_in_channels = shape[0]
    #TODO: use RandomNormal to initialize, needs to be exposed in the python api
-    conv_params = parameter(shape=(num_in_channels, kernel_height, kernel_width, out_feature_map_count))
+    conv_params = parameter(shape=(num_in_channels, kernel_height, kernel_width, out_feature_map_count), initializer=glorot_uniform_initializer(output_rank=-1, filter_rank=2))
    conv_func = convolution(conv_params, input, (num_in_channels, v_stride, h_stride))
 
    #TODO: initialize using b_value and sc_value, needs to be exposed in the python api
-    bias_params = parameter(shape=(out_feature_map_count))
-    scale_params = parameter(shape=(out_feature_map_count))
+    bias_params = parameter(shape=(out_feature_map_count), value=b_value)
+    scale_params = parameter(shape=(out_feature_map_count), value=sc_value)
    running_mean = constant((out_feature_map_count), 0.0)
    running_invstd = constant((out_feature_map_count), 0.0)
    return batch_normalization(conv_func, scale_params, bias_params, running_mean, running_invstd, True, bn_time_const, 0.0, 0.000000001)
@@ -74,8 +75,8 @@ def proj_layer(w_proj, input, h_stride, v_stride, b_value, sc_value, bn_time_con
    conv_func = convolution(w_proj, input, (num_in_channels, v_stride, h_stride))
    out_feature_map_count = w_proj.shape()[-1];
    #TODO: initialize using b_value and sc_value, needs to be exposed in the python api
-    bias_params = parameter(shape=(out_feature_map_count))
-    scale_params = parameter(shape=(out_feature_map_count))
+    bias_params = parameter(shape=(out_feature_map_count), value=b_value)
+    scale_params = parameter(shape=(out_feature_map_count), value=sc_value)
    running_mean = constant((out_feature_map_count), 0.0)
    running_invstd = constant((out_feature_map_count), 0.0)
    return batch_normalization(conv_func, scale_params, bias_params, running_mean, running_invstd, True, bn_time_const)
@@ -91,7 +92,7 @@ def resnet_node2_inc(input, out_feature_map_count, kernel_width, kernel_height,
 
 def embedding(input, embedding_dim):
    input_dim = input.shape()[0];
-    embedding_parameters = parameter(shape=(input_dim, embedding_dim))
+    embedding_parameters = parameter(shape=(input_dim, embedding_dim), initializer=glorot_uniform_initializer())
    return times(input, embedding_parameters)
 
 def select_last(operand):
@@ -110,28 +111,28 @@ def LSTMP_cell_with_self_stabilization(input, prev_output, prev_cell_state):
    output_dim = prev_output.shape()[0];
    cell_dim = prev_cell_state.shape()[0];
 
-    Wxo = parameter(shape=(input_dim, cell_dim))
-    Wxi = parameter(shape=(input_dim, cell_dim))
-    Wxf = parameter(shape=(input_dim, cell_dim))
-    Wxc = parameter(shape=(input_dim, cell_dim))
+    Wxo = parameter(shape=(input_dim, cell_dim), initializer=glorot_uniform_initializer())
+    Wxi = parameter(shape=(input_dim, cell_dim), initializer=glorot_uniform_initializer())
+    Wxf = parameter(shape=(input_dim, cell_dim), initializer=glorot_uniform_initializer())
+    Wxc = parameter(shape=(input_dim, cell_dim), initializer=glorot_uniform_initializer())
 
    Bo = parameter(shape=(cell_dim), value=0)
    Bc = parameter(shape=(cell_dim), value=0)
    Bi = parameter(shape=(cell_dim), value=0)
    Bf = parameter(shape=(cell_dim), value=0)
 
-    Whi = parameter(shape=(output_dim, cell_dim))
-    Wci = parameter(shape=(cell_dim))
+    Whi = parameter(shape=(output_dim, cell_dim), initializer=glorot_uniform_initializer())
+    Wci = parameter(shape=(cell_dim), initializer=glorot_uniform_initializer())
 
-    Whf = parameter(shape=(output_dim, cell_dim))
-    Wcf = parameter(shape=(cell_dim))
+    Whf = parameter(shape=(output_dim, cell_dim), initializer=glorot_uniform_initializer())
+    Wcf = parameter(shape=(cell_dim), initializer=glorot_uniform_initializer())
 
-    Who = parameter(shape=(output_dim, cell_dim))
-    Wco = parameter(shape=(cell_dim))
+    Who = parameter(shape=(output_dim, cell_dim), initializer=glorot_uniform_initializer())
+    Wco = parameter(shape=(cell_dim), initializer=glorot_uniform_initializer())
 
-    Whc = parameter(shape=(output_dim, cell_dim))
+    Whc = parameter(shape=(output_dim, cell_dim), initializer=glorot_uniform_initializer())
 
-    Wmr = parameter(shape=(cell_dim, output_dim))
+    Wmr = parameter(shape=(cell_dim, output_dim), initializer=glorot_uniform_initializer())
 
    # Stabilization by routing input through an extra scalar parameter
    sWxo = parameter(value=0)
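
Note on the pattern this change adopts: rather than materializing a random NDArrayView up front (NDArrayView::RandomUniform / RandomNormal) and wrapping it in a Parameter, parameters are now constructed from a shape, a DataType, and a deferred initializer, so Glorot (Xavier) fan-in/fan-out scaling replaces the fixed U(-0.05, 0.05) range on both the C++ and Python sides. A minimal C++ sketch of the new style, using only the Parameter constructor and GlorotUniformInitializer() overloads that appear in this diff (the helper name and dimensions below are illustrative, not part of the change):

    #include "CNTKLibrary.h"
    using namespace CNTK;

    // Hypothetical helper: builds a projection weight matrix whose entries are
    // drawn from U(-x, x), with x derived from the parameter's fan-in/fan-out
    // rather than from a hard-coded range.
    Parameter MakeProjectionWeights(size_t outputDim, size_t inputDim,
                                    const DeviceDescriptor& device)
    {
        return Parameter({ outputDim, inputDim }, DataType::Float,
                         GlorotUniformInitializer(), device);
    }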