CNTK v2 library: Fixed some python wrappers

This commit is contained in:
Amit Agarwal 2016-09-05 13:22:51 -07:00
Parent d610b2c639
Commit 3b763f7d53
8 changed files with 215 additions and 47 deletions

View file

@@ -18,7 +18,7 @@ namespace CNTK
     }

     /*static*/ std::atomic<bool> DeviceDescriptor::s_defaultDeviceFrozen(false);
-    /*static*/ std::shared_ptr<DeviceDescriptor> DeviceDescriptor::s_defaultDevice(new DeviceDescriptor(DeviceDescriptor::GPUDevice(0)));
+    /*static*/ std::shared_ptr<DeviceDescriptor> DeviceDescriptor::s_defaultDevice(new DeviceDescriptor(DeviceDescriptor::CPUDevice()));

     /*static*/ DeviceDescriptor DeviceDescriptor::DefaultDevice()
     {
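This hunk flips the library's built-in default from GPU 0 to the CPU, so scripts that relied on the implicit GPU default must now pick a device themselves. A minimal sketch using the Python names that appear later in this commit (DeviceDescriptor.gpudevice, DeviceDescriptor.cpudevice, DeviceDescriptor.set_default_device):

    # Sketch: select the compute device explicitly instead of relying on the
    # library default, which this commit changes from GPU 0 to CPU.
    from cntk import DeviceDescriptor

    target_device = DeviceDescriptor.gpudevice(0)  # or DeviceDescriptor.cpudevice()
    DeviceDescriptor.set_default_device(target_device)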

View file

@@ -35,61 +35,61 @@ def combine(operands, name=''):

 ################################################################################
 # evaluation ops
 ################################################################################

-def cross_entropy_with_softmax(target_vector, output_vector, name=''):
+def cross_entropy_with_softmax(output_vector, target_vector, name=''):
     '''
     This operation computes the cross entropy over the softmax of the `output_vector`.
     It expects the `output_vector` to be unscaled, and it computes softmax over
     the `output_vector` internally. Any `output_vector` input over which softmax is
     already computed before passing to this operator will be incorrect.

-    :math:`cross\_entropy\_with\_softmax(t, o) = {-{\sum_{i \in \{1,len(t)\}} t_i \log(softmax(o_i)) }}`
+    :math:`cross\_entropy\_with\_softmax(o, t) = {-{\sum_{i \in \{1,len(t)\}} t_i \log(softmax(o_i)) }}`

     Example:
-        >>> C.eval(C.cross_entropy_with_softmax([0., 0., 0., 1.], [1., 1., 1., 50.]))
+        >>> C.eval(C.cross_entropy_with_softmax([1., 1., 1., 50.], [0., 0., 0., 1.]))
         #[0.]

-        >>> C.eval(C.cross_entropy_with_softmax([0.35, 0.15, 0.05, 0.45], [1., 2., 3., 4.]))
+        >>> C.eval(C.cross_entropy_with_softmax([1., 2., 3., 4.], [0.35, 0.15, 0.05, 0.45]))
         #[1.84]

     Args:
+        output_vector: the unscaled computed output values from the network
         target_vector: usually it is a one-hot vector where the hot bit corresponds to the label index.
            But it can be any probability distribution over the labels.
-        output_vector: the unscaled computed output values from the network
         name (str): the name of the node in the network
     Returns:
         :class:`cntk.Function`
     '''
     from cntk import cross_entropy_with_softmax
-    target_vector = sanitize_input(target_vector, get_data_type(output_vector))
     output_vector = sanitize_input(output_vector, get_data_type(target_vector))
-    return cross_entropy_with_softmax(target_vector, output_vector, name).output()
+    target_vector = sanitize_input(target_vector, get_data_type(output_vector))
+    return cross_entropy_with_softmax(output_vector, target_vector, name).output()

-def square_error(target_matrix, output_matrix, name=''):
+def squared_error(output_matrix, target_matrix, name=''):
     '''
     This operation computes the sum of the squared difference between elements
     in the two input matrices. The result is a scalar (i.e., one by one matrix).
     This is often used as a training criterion node.

     Example:
-        >>> C.eval(C.square_error([4., 6.], [2., 1.]))
+        >>> C.eval(C.squared_error([2., 1.], [4., 6.]))
         #[29.]

         >>> C.eval(C.squared_error([1., 2.], [1., 2.]))
         #[0.]

     Args:
-        target_matrix: target matrix, it is usually a one-hot vector where the hot bit corresponds to the label index
         output_matrix: the output values from the network
+        target_matrix: target matrix, it is usually a one-hot vector where the hot bit corresponds to the label index
         name (str): the name of the node in the network
     Returns:
         :class:`cntk.Function`
     '''
-    from cntk import square_error
-    target_matrix = sanitize_input(target_matrix, get_data_type(output_matrix))
+    from cntk import squared_error
     output_matrix = sanitize_input(output_matrix, get_data_type(target_matrix))
-    return square_error(target_matrix, output_matrix, name).output()
+    target_matrix = sanitize_input(target_matrix, get_data_type(output_matrix))
+    return squared_error(output_matrix, target_matrix, name).output()

-def classification_error(target_vector, output_vector, name=''):
+def classification_error(output_vector, target_vector, name=''):
     '''
     This operation computes the prediction error. It finds the index of the highest
     value in the output_vector and compares it to the actual ground truth label
@@ -99,23 +99,23 @@ def classification_error(target_vector, output_vector, name=''):
     defined for it.

     Example:
-        >>> C.eval(C.classification_error([0., 0., 0., 1.], [1., 2., 3., 4.]))
+        >>> C.eval(C.classification_error([1., 2., 3., 4.], [0., 0., 0., 1.]))
         #[0.]

-        >>> C.eval(C.classification_error([0., 0., 1., 0.], [1., 2., 3., 4.]))
+        >>> C.eval(C.classification_error([1., 2., 3., 4.], [0., 0., 1., 0.]))
         #[1.]

     Args:
-        target_vector: it is a one-hot vector where the hot bit corresponds to the label index
         output_vector: the output values from the network
+        target_vector: it is a one-hot vector where the hot bit corresponds to the label index
         name (str): the name of the node in the network
     Returns:
         :class:`cntk.Function`
     '''
     from cntk import classification_error
-    target_vector = sanitize_input(target_vector, get_data_type(output_vector))
     output_vector = sanitize_input(output_vector, get_data_type(target_vector))
-    return classification_error(target_vector, output_vector, name).output()
+    target_vector = sanitize_input(target_vector, get_data_type(output_vector))
+    return classification_error(output_vector, target_vector, name).output()

 ################################################################################
 # convolution ops
 ################################################################################
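Taken together, the three wrappers above now take their arguments in (output, target) order instead of (target, output), matching the order used by the underlying library calls. A quick sketch of the new calling convention, reusing the values from the docstrings (assumes import cntk as C):

    import cntk as C

    # Network output comes first, ground-truth target second.
    C.eval(C.cross_entropy_with_softmax([1., 1., 1., 50.], [0., 0., 0., 1.]))  # ~[0.]
    C.eval(C.squared_error([2., 1.], [4., 6.]))                                # [29.]
    C.eval(C.classification_error([1., 2., 3., 4.], [0., 0., 1., 0.]))         # [1.]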
@@ -903,7 +903,7 @@ def cond(flag, value_if_true, value_if_false, name=''):

 # TODO: add default value for initial_state. It should be a constant scalar
 # (0.0), using the default device
-def future_value(initial_state, x, time_step=1, name=''):
+def future_value(x, initial_state=None, time_step=1, name=''):
     '''
     This function returns the future value w.r.t. `x`. It is most often used when
     creating RNNs. The resulting tensor has the same shape as the input but is
@@ -915,20 +915,26 @@ def future_value(initial_state, x, time_step=1, name=''):

     Example:
         TBA

     Args:
+        x: the tensor (or its name) from which the future value is obtained.
         initial_state: tensor or scalar representing the initial value to be
         used when the input tensor is shifted in time.
-        x: the tensor (or its name) from which the future value is obtained.
         time_step (int): the number of time steps to look into the future (default 1)
         name (str): the name of the node in the network
     Returns:
         :class:`cntk.Function`
     '''
+    from ..utils import sanitize_dtype_cntk
+    from ..cntk_py import Constant
     from cntk import future_value
+
+    if initial_state is None:
+        initial_state = Constant.scalar(sanitize_dtype_cntk(np.float32), 0.0)
+
     x = sanitize_input(x)
-    return future_value(initial_state, x, time_step, name).output()
+    return future_value(x, initial_state, time_step, name).output()

-def past_value(initial_state, x, time_step=1, default_hidden_activation=0.1, name=''):
+def past_value(x, initial_state=None, time_step=1, name=''):
     '''
     This function returns the past value w.r.t. `x`. It is most often used when
     creating RNNs. The resulting tensor has the same shape as the input but is
@@ -940,18 +946,24 @@ def past_value(initial_state, x, time_step=1, default_hidden_activation=0.1, name=''):

     Example:
         TBA

     Args:
+        x: the tensor (or its name) from which the past value is obtained
         initial_state: tensor or scalar representing the initial value to be
         used when the input tensor is shifted in time.
-        x: the tensor (or its name) from which the past value is obtained
         time_step (int): the number of time steps to look into the past (default 1)
         name (str): the name of the node in the network
     Returns:
         :class:`cntk.Function`
     '''
+    from ..utils import sanitize_dtype_cntk
+    from ..cntk_py import Constant
     from cntk import past_value
+
+    if initial_state is None:
+        initial_state = Constant.scalar(sanitize_dtype_cntk(np.float32), 0.0)
+
     x = sanitize_input(x)
-    return past_value(initial_state, x, time_step, name).output()
+    return past_value(x, initial_state, time_step, name).output()

 ################################################################################
 # reshaping ops
 ################################################################################
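With initial_state now optional on both ops, recurrences no longer have to thread an explicit zero constant through every call. A minimal sketch of the new signatures (x is a hypothetical input tensor; the default initial state is the scalar constant 0.0 constructed above):

    h_prev  = past_value(x)                 # initial_state defaults to a 0.0 scalar
    h_prev2 = past_value(x, time_step=2)    # look two steps into the past
    h_next  = future_value(x)               # same default in the other direction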

View file

@@ -7,7 +7,7 @@
 import numpy as np
 import sys
 import os
-from cntk import learning_rates_per_sample, Trainer, sgdlearner, create_minibatch_source, get_train_loss, get_train_eval_criterion, cntk_device
+from cntk import learning_rates_per_sample, Trainer, sgdlearner, create_minibatch_source, get_train_loss, get_train_eval_criterion, DeviceDescriptor
 from cntk.ops import input_variable, constant, parameter, cross_entropy_with_softmax, combine, classification_error, times, pooling, AVG_POOLING
 from examples.common.nn import conv_bn_relu_layer, conv_bn_layer, resnet_node2, resnet_node2_inc
@@ -103,7 +103,7 @@ def cifar_resnet():
     # Input variables denoting the features and label data
     image_input = input_variable((num_channels, image_height, image_width), features_si.m_element_type)
-    label_var = input_variable((num_classes), features_si.m_element_type, needs_gradient=False)
+    label_var = input_variable((num_classes), features_si.m_element_type)

     # Instantiate the resnet classification model
     classifier_output = resnet_classifer(image_input, num_classes)
@@ -124,14 +124,13 @@ def cifar_resnet():
         # Specify the mapping of input variables in the model to actual minibatch data to be trained with
         arguments = {image_input : mb[features_si].m_data, label_var : mb[labels_si].m_data}
         trainer.train_minibatch(arguments)
         print_training_progress(training_progress_output_freq, i, trainer)

 if __name__=='__main__':
     # Specify the target device to be used for computing
-    target_device = DeviceDescriptor.gpu_device(0)
+    target_device = DeviceDescriptor.gpudevice(0)
     DeviceDescriptor.set_default_device(target_device)

     cifar_resnet()

View file

@@ -19,11 +19,11 @@ def simple_mnist():
     hidden_layers_dim = 200

     # Input variables denoting the features and label data
-    input = input_variable(input_dim, np.float32, needs_gradient=False, name="features")
-    label = input_variable(num_output_classes, np.float32, needs_gradient=False, name="labels")
+    input = input_variable(input_dim, np.float32)
+    label = input_variable(num_output_classes, np.float32)
+    scaled_input = element_times(constant((), 0.00390625), input)

     # Instantiate the feedforward classification model
-    scaled_input = element_times(constant((), 0.00390625), input)
     netout = fully_connected_classifier_net(scaled_input, num_output_classes, hidden_layers_dim, num_hidden_layers, sigmoid)

     ce = cross_entropy_with_softmax(netout, label)
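One detail worth noting in the hunk above: the scaling constant 0.00390625 is exactly 1/256, so element_times(constant((), 0.00390625), input) maps byte-valued pixels in [0, 255] into [0, 1). A one-line sanity check:

    assert 0.00390625 == 1.0 / 256.0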
@@ -61,7 +61,7 @@ def simple_mnist():

 if __name__=='__main__':
     # Specify the target device to be used for computing
-    target_device = DeviceDescriptor_cpudevice()
+    target_device = DeviceDescriptor.gpudevice(0)
     DeviceDescriptor.set_default_device(target_device)

     simple_mnist()
simple_mnist()

View file

@@ -0,0 +1,151 @@
+# Copyright (c) Microsoft. All rights reserved.
+# Licensed under the MIT license. See LICENSE.md file in the project root
+# for full license information.
+# ==============================================================================
+
+import numpy as np
+import sys
+import os
+import time
+
+import cntk
+from cntk import learning_rates_per_sample, DeviceDescriptor, Trainer, sgdlearner, Axis, get_train_loss, get_train_eval_criterion
+from cntk.ops import variable, input_variable, cross_entropy_with_softmax, classification_error
+from examples.common.nn import LSTMP_component_with_self_stabilization, embedding, fully_connected_linear_layer, select_last
+
+# Creates and trains a sequence to sequence translation model
+def train_sequence_to_sequence_translator():
+
+    input_vocab_dim = 69
+    label_vocab_dim = 69
+
+    hidden_dim = 512
+    num_layers = 2
+
+    # Source and target inputs to the model
+    input_dynamic_axes = [ Axis('inputAxis'), Axis.default_batch_axis() ]
+    raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes = input_dynamic_axes)
+
+    label_dynamic_axes = [ Axis('labelAxis'), Axis.default_batch_axis() ]
+    raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes = label_dynamic_axes)
+
+    input_sequence = raw_input
+
+    # Drop the sentence start token from the label, for decoder training
+    label_sequence = cntk.ops.slice(raw_labels, label_dynamic_axes[0], 1, 0)
+    label_sentence_start = Sequence.first(raw_labels)
+
+    is_first_label = Sequence.is_first(label_sequence)
+    label_sentence_start_scattered = Sequence.scatter(label_sentence_start, is_first_label)
+    # Encoder
+    encoderOutputH = Stabilize<float>(inputEmbedding, device);
+    futureValueRecurrenceHook = [](const Variable& x) { return FutureValue(x); };
+    for (size_t i = 0; i < num_layers; ++i)
+        std::tie(encoderOutputH, encoderOutputC) = LSTMPComponentWithSelfStabilization<float>(encoderOutputH, hidden_dim, hidden_dim, futureValueRecurrenceHook, futureValueRecurrenceHook, device);
+
+    thoughtVectorH = Sequence::First(encoderOutputH);
+    thoughtVectorC = Sequence::First(encoderOutputC);
+
+    thoughtVectorBroadcastH = Sequence::BroadcastAs(thoughtVectorH, labelEmbedding);
+    thoughtVectorBroadcastC = Sequence::BroadcastAs(thoughtVectorC, labelEmbedding);
+
+    /* Decoder */
+    bool addBeamSearchReorderingHook = false;
+    beamSearchReorderHook = Constant({ 1, 1 }, 1.0f);
+    decoderHistoryFromGroundTruth = labelEmbedding;
+    decoderInput = ElementSelect(is_first_label, label_sentence_startEmbeddedScattered, PastValue(decoderHistoryFromGroundTruth));
+
+    decoderOutputH = Stabilize<float>(decoderInput, device);
+    FunctionPtr decoderOutputC;
+    pastValueRecurrenceHookWithBeamSearchReordering = [addBeamSearchReorderingHook, beamSearchReorderHook](const FunctionPtr& operand) {
+        return PastValue(addBeamSearchReorderingHook ? Times(operand, beamSearchReorderHook) : operand);
+    };
+
+    for (size_t i = 0; i < num_layers; ++i)
+    {
+        std::function<FunctionPtr(const Variable&)> recurrenceHookH, recurrenceHookC;
+        if (i == 0)
+        {
+            recurrenceHookH = pastValueRecurrenceHookWithBeamSearchReordering;
+            recurrenceHookC = pastValueRecurrenceHookWithBeamSearchReordering;
+        }
+        else
+        {
+            isFirst = Sequence::IsFirst(labelEmbedding);
+            recurrenceHookH = [labelEmbedding, thoughtVectorBroadcastH, isFirst, addBeamSearchReorderingHook, beamSearchReorderHook](const FunctionPtr& operand) {
+                return ElementSelect(isFirst, thoughtVectorBroadcastH, PastValue(addBeamSearchReorderingHook ? Times(operand, beamSearchReorderHook) : operand));
+            };
+            recurrenceHookC = [labelEmbedding, thoughtVectorBroadcastC, isFirst, addBeamSearchReorderingHook, beamSearchReorderHook](const FunctionPtr& operand) {
+                return ElementSelect(isFirst, thoughtVectorBroadcastC, PastValue(addBeamSearchReorderingHook ? Times(operand, beamSearchReorderHook) : operand));
+            };
+        }
+
+        std::tie(decoderOutputH, decoderOutputC) = LSTMPComponentWithSelfStabilization<float>(decoderOutputH, hidden_dim, hidden_dim, recurrenceHookH, recurrenceHookC, device);
+    }
+
+    decoderOutput = decoderOutputH;
+    decoderDim = hidden_dim;
+
+    /* Softmax output layer */
+    outputLayerProjWeights = Parameter(NDArrayView::RandomUniform<float>({ label_vocab_dim, decoderDim }, -0.05, 0.05, 1, device));
+    biasWeights = Parameter({ label_vocab_dim }, 0.0f, device);
+
+    z = Plus(Times(outputLayerProjWeights, Stabilize<float>(decoderOutput, device)), biasWeights, L"classifierOutput");
+    ce = CrossEntropyWithSoftmax(z, label_sequence, L"lossFunction");
+    errs = ClassificationError(z, label_sequence, L"classificationError");
+    input_dim = 2000
+    cell_dim = 25
+    hidden_dim = 25
+    embedding_dim = 50
+    num_output_classes = 5
+
+    # Input variables denoting the features and label data
+    features = variable(shape=input_dim, is_sparse=True, name="features")
+    label = variable(num_output_classes, dynamic_axes = [Axis.default_batch_axis()], name="labels")
+
+    # Instantiate the sequence classification model
+    classifier_output = LSTM_sequence_classifer_net(features, num_output_classes, embedding_dim, hidden_dim, cell_dim)
+
+    ce = cross_entropy_with_softmax(classifier_output, label)
+    pe = classification_error(classifier_output, label)
+
+    rel_path = r"../../../../Tests/EndToEndTests/Text/SequenceClassification/Data/Train.ctf"
+    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
+
+    mb_source = text_minibatch_source(path, [ ( 'features', input_dim, True, 'x' ), ( 'labels', num_output_classes, False, 'y' ) ], 0)
+    features_si = mb_source.stream_info(features)
+    labels_si = mb_source.stream_info(label)
+
+    # Instantiate the trainer object to drive the model training
+    lr = learning_rates_per_sample(0.0005)
+    trainer = Trainer(classifier_output, ce, pe, [sgdlearner(classifier_output.owner.parameters(), lr)])
+
+    # Get minibatches of sequences to train with and perform model training
+    minibatch_size = 200
+    training_progress_output_freq = 1
+    i = 0
+    while True:
+        mb = mb_source.get_next_minibatch(minibatch_size)
+        if len(mb) == 0:
+            break
+
+        # Specify the mapping of input variables in the model to actual minibatch data to be trained with
+        arguments = {features : mb[features_si].m_data, label : mb[labels_si].m_data}
+        trainer.train_minibatch(arguments)
+
+        print_training_progress(training_progress_output_freq, i, trainer)
+        i += 1
+
+if __name__=='__main__':
+    # Specify the target device to be used for computing
+    target_device = DeviceDescriptor.cpudevice()
+    DeviceDescriptor.set_default_device(target_device)
+
+    train_sequence_to_sequence_translator()
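The decoder section of this new file is still the C++ reference code. As a rough, purely illustrative idea of how its first step might look in the Python API, using only ops that appear in this commit (cond from the ops module and the new past_value default); the variable choices here are assumptions, not part of the commit:

    # Hypothetical port sketch, not in this commit: take the sentence-start
    # token on the first step, otherwise the previous decoder-history value.
    decoder_input = cond(is_first_label,
                         label_sentence_start_scattered,
                         past_value(label_sequence))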

View file

@@ -27,11 +27,10 @@ def train_sequence_classifier():
     hidden_dim = 25
     embedding_dim = 50
     num_output_classes = 5
-    feature_stream_name = 'features'
-    labels_stream_name = 'labels'

     # Input variables denoting the features and label data
-    features = input_variable(shape=input_dim, is_sparse=True, name=feature_stream_name)
-    label = input_variable(num_output_classes, dynamic_axes = [Axis.default_batch_axis()], name=labels_stream_name)
+    features = input_variable(shape=input_dim, is_sparse=True)
+    label = input_variable(num_output_classes, dynamic_axes = [Axis.default_batch_axis()])

     # Instantiate the sequence classification model
     classifier_output = LSTM_sequence_classifer_net(features, num_output_classes, embedding_dim, hidden_dim, cell_dim)
@@ -41,12 +40,13 @@ def train_sequence_classifier():
     rel_path = r"../../../../Tests/EndToEndTests/Text/SequenceClassification/Data/Train.ctf"
     path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

+    feature_stream_name = 'features'
+    labels_stream_name = 'labels'
     mb_source = text_format_minibatch_source(path, [
         StreamConfiguration( feature_stream_name, input_dim, True, 'x' ),
         StreamConfiguration( labels_stream_name, num_output_classes, False, 'y')], 0)
     features_si = mb_source.stream_info(features)
     labels_si = mb_source.stream_info(label)
@@ -66,13 +66,14 @@ def train_sequence_classifier():
         # Specify the mapping of input variables in the model to actual minibatch data to be trained with
         arguments = {features : mb[features_si].m_data, label : mb[labels_si].m_data}
         trainer.train_minibatch(arguments)
         print_training_progress(i, trainer, training_progress_output_freq)
         i += 1

 if __name__=='__main__':
     # Specify the target device to be used for computing
-    target_device = DeviceDescriptor_cpudevice()
+    target_device = DeviceDescriptor.cpudevice()
     DeviceDescriptor.set_default_device(target_device)

     train_sequence_classifier()

View file

@@ -33,7 +33,6 @@ def ffnet():
     feature_stream_name = 'features'
     labels_stream_name = 'labels'

     mb_source = text_format_minibatch_source(path, [
         StreamConfiguration( feature_stream_name, input_dim ),
         StreamConfiguration( labels_stream_name, num_output_classes)])
@@ -58,11 +57,9 @@ def ffnet():
         trainer.train_minibatch(arguments)
         print_training_progress(i, trainer, training_progress_output_freq)

 if __name__=='__main__':
     # Specify the target device to be used for computing
-    target_device = DeviceDescriptor_cpudevice()
+    target_device = DeviceDescriptor.cpudevice()
     DeviceDescriptor.set_default_device(target_device)

     ffnet()

View file

@@ -81,6 +81,14 @@ def embedding(input, embedding_dim):
 def select_last(operand):
     return slice(operand, Axis.default_dynamic_axis(), -1, 0)

+def stabilize(operand):
+    scalar_constant = 4.0
+    f = Constant.scalar(scalar_constant)
+    fInv = Constant.scalar(f.get_data_type(), 1.0 / scalar_constant)
+
+    beta = element_times(fInv, log(Constant.scalar(f.get_data_type(), 1.0) + exp(element_times(f, parameter(shape=(), dtype=f.get_data_type(), init_value=0.99537863)))))
+    return element_times(beta, operand)
+
 def LSTMP_cell_with_self_stabilization(input, prev_output, prev_cell_state):
     input_dim = input.shape()[0]
     output_dim = prev_output.shape()[0]
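The stabilize helper added above computes a learned scalar multiplier beta = (1/f) * log(1 + exp(f * p)), with f = 4 and a learnable parameter p initialized to 0.99537863. That initial value is chosen so beta starts out at roughly 1.0, which makes the stabilizer an identity map at the beginning of training. A quick numeric check (plain numpy, independent of CNTK):

    import numpy as np
    f, p = 4.0, 0.99537863
    beta0 = (1.0 / f) * np.log(1.0 + np.exp(f * p))
    print(beta0)  # ~1.00006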
@@ -173,8 +181,8 @@ def LSTMP_component_with_self_stabilization(input, output_dim, cell_dim):
     dc = placeholder_variable(shape=(cell_dim))

     LSTMCell = LSTMP_cell_with_self_stabilization(input, dh, dc)
-    actualDh = past_value(LSTMCell[0], constant((), 0.0), 1);
-    actualDc = past_value(LSTMCell[1], constant((), 0.0), 1);
+    actualDh = past_value(LSTMCell[0])
+    actualDc = past_value(LSTMCell[1])

     # Form the recurrence loop by replacing the dh and dc placeholders with the actualDh and actualDc
     return LSTMCell[0].owner.replace_placeholders({ dh : actualDh, dc : actualDc})