Adapting examples; set MinibatchData.data to be a property

Willi Richert 2017-03-29 10:41:04 +02:00
Parent 1368b2de3f
Commit 9adade3311
17 changed files with 557 additions and 613 deletions

View file

@ -279,7 +279,7 @@ def train(train_reader, valid_reader, vocab, i2w, s2smodel, max_epochs, epoch_si
# This decodes the test set and counts the string error rate.
def evaluate_decoding(reader, s2smodel, i2w):
model_decoding = create_model_greedy(s2smodel) # wrap the greedy decoder around the model
progress_printer = ProgressPrinter(tag='Evaluation')
@ -301,7 +301,7 @@ def evaluate_decoding(reader, s2smodel, i2w):
num_total += len(outputs)
num_wrong += sum([label != output for output, label in zip(outputs, labels)])
rate = num_wrong / num_total
print("string error rate of {:.1f}% in {} samples".format(100 * rate, num_total))
return rate
@ -321,7 +321,7 @@ def Evaluator(model, criterion):
parameters |= set(model.parameters)
if metric:
parameters |= set(metric.parameters)
dummy_learner = momentum_sgd(tuple(parameters),
lr = learning_rate_schedule(1, UnitType.minibatch),
momentum = momentum_as_time_constant_schedule(0))
return Trainer(model, (loss, metric), dummy_learner)
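For reference, the dummy-learner pattern in this hunk can be wrapped in a small helper. A minimal sketch, assuming the learner and schedule helpers used above are importable from the top-level cntk package and that only test_minibatch() will ever be called on the returned Trainer:

import cntk as C

def make_evaluator(model, loss, metric):
    # Trainer requires a learner, so hand it a throw-away one; since only
    # test_minibatch() is called, no parameters are ever updated.
    parameters = set(model.parameters) | set(loss.parameters) | set(metric.parameters)
    dummy_learner = C.momentum_sgd(tuple(parameters),
                                   lr=C.learning_rate_schedule(1, C.UnitType.minibatch),
                                   momentum=C.momentum_as_time_constant_schedule(0))
    return C.Trainer(model, (loss, metric), dummy_learner)

# evaluator = make_evaluator(model, loss, metric)
# metric_value = evaluator.test_minibatch({...})   # feed a minibatch mapping here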
@ -382,10 +382,10 @@ def translate(tokens, model_decoding, vocab, i2w, show_attention=False, max_labe
# print out translation and stop at the sequence-end tag
prediction = np.argmax(pred, axis=-1)
translation = [i2w[i] for i in prediction]
# show attention window (requires matplotlib, seaborn, and pandas)
if use_attention and show_attention:
#att_value = model_decoding.attention_model.attention_weights(query)
# BUGBUG: fails with "Forward: Feature Not Implemented"
q = combine([model_decoding.attention_model.attention_weights])
@ -440,7 +440,7 @@ def get_vocab(path):
vocab = [w.strip() for w in open(path).readlines()]
i2w = { i:w for i,w in enumerate(vocab) }
w2i = { w:i for i,w in enumerate(vocab) }
return (vocab, i2w, w2i)
# Given a vocab and tensor, print the output
@ -454,9 +454,9 @@ def debug_attention(model, input):
words_p = q(input)
words = words_p[0]
p = words_p[1]
len = words.shape[attention_axis-1]
seq_len = words[0].shape[attention_axis-1]
span = 7 #attention_span #7 # test sentence is 7 tokens long
p_sq = np.squeeze(p[0,:len,:span,0,:]) # (batch, len, attention_span, 1, vector_dim)
p_sq = np.squeeze(p[0][:seq_len,:span,0,:]) # (batch, len, attention_span, 1, vector_dim)
opts = np.get_printoptions()
np.set_printoptions(precision=5)
print(p_sq)
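The len/seq_len change above reflects eval() now returning a list with one NumPy array per sequence instead of a single padded batch array. A plain-NumPy illustration with hypothetical shapes:

import numpy as np

# old convention: one padded array with an explicit batch axis
padded = np.zeros((2, 7, 20, 1, 128), np.float32)   # (batch, len, attention_span, 1, vector_dim)
old_slice = np.squeeze(padded[0, :7, :7, 0, :])

# new convention: a list holding one array per sequence, no padding required
per_sequence = [np.zeros((7, 20, 1, 128), np.float32),
                np.zeros((5, 20, 1, 128), np.float32)]
new_slice = np.squeeze(per_sequence[0][:7, :7, 0, :])

assert old_slice.shape == new_slice.shape == (7, 7, 128)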
@ -477,7 +477,7 @@ if __name__ == '__main__':
# create inputs and create model
model = create_model()
# train
train_reader = create_reader(os.path.join(DATA_DIR, TRAINING_DATA), True)
valid_reader = create_reader(os.path.join(DATA_DIR, VALIDATION_DATA), True)
@ -489,7 +489,7 @@ if __name__ == '__main__':
# test string error rate on decoded output
test_reader = create_reader(os.path.join(DATA_DIR, TESTING_DATA), False)
evaluate_decoding(test_reader, model, i2w)
# test same metric same as in training on test set
test_reader = create_reader(os.path.join(DATA_DIR, TESTING_DATA), False)
evaluate_metric(test_reader, model)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -707,11 +707,11 @@
"onehot = np.zeros([len(w),len(query_dict)], np.float32)\n",
"for t in range(len(w)):\n",
" onehot[t,w[t]] = 1\n",
"pred = model.eval({model.arguments[0]:[onehot]})\n",
"pred = model.eval({model.arguments[0]:[onehot]})[0]\n",
"print(pred.shape)\n",
"best = np.argmax(pred,axis=2)\n",
"print(best[0])\n",
"list(zip(seq.split(),[slots_wl[s] for s in best[0]]))"
"best = np.argmax(pred,axis=1)\n",
"print(best)\n",
"list(zip(seq.split(),[slots_wl[s] for s in best]))"
]
},
{
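Put together, the updated cell amounts to the following sketch; model, query_dict, slots_wl, and seq are assumed to exist as in the tutorial:

import numpy as np

w = [query_dict[word] for word in seq.split()]            # words -> indices
onehot = np.zeros((len(w), len(query_dict)), np.float32)  # one row per token
for t, idx in enumerate(w):
    onehot[t, idx] = 1

pred = model.eval({model.arguments[0]: [onehot]})[0]      # first (and only) sequence
best = np.argmax(pred, axis=1)                            # best slot label per token
print(list(zip(seq.split(), [slots_wl[s] for s in best])))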

File diff suppressed because one or more lines are too long

View file

@ -842,9 +842,9 @@
" words_p = q(input)\n",
" words = words_p[0]\n",
" p = words_p[1]\n",
" len = words.shape[attention_axis-1]\n",
" seq_len = words[0].shape[attention_axis-1]\n",
" span = 7 #attention_span #7 # test sentence is 7 tokens long\n",
" p_sq = np.squeeze(p[0,:len,:span,0,:]) # (batch, len, attention_span, 1, vector_dim)\n",
" p_sq = np.squeeze(p[0][:seq_len,:span,0,:]) # (batch, len, attention_span, 1, vector_dim)\n",
" opts = np.get_printoptions()\n",
" np.set_printoptions(precision=5)\n",
" print(p_sq)\n",
@ -897,7 +897,7 @@
" [ 0.1439 0.14321 0.14322 0.14308 0.14287 0.14216 0.14156]\n",
" [ 0.1439 0.14321 0.14322 0.14308 0.14287 0.14216 0.14156]]\n",
" Minibatch[ 181- 210]: loss = 3.143627 * 1565, metric = 82.30% * 1565;\n",
" Minibatch[ 211- 240]: loss = 3.186274 * 1583, metric = 83.39% * 1583;\n",
" Minibatch[ 211- 240]: loss = 3.186273 * 1583, metric = 83.39% * 1583;\n",
" Minibatch[ 241- 270]: loss = 3.128010 * 1562, metric = 83.03% * 1562;\n",
" Minibatch[ 271- 300]: loss = 3.152663 * 1551, metric = 83.69% * 1551;\n",
"['<s> A B A D I </s>']\n",
@ -921,7 +921,7 @@
" [ 0.14417 0.14347 0.14339 0.14316 0.14279 0.14191 0.14111]\n",
" [ 0.14417 0.14348 0.14339 0.14316 0.14279 0.14191 0.14111]]\n",
" Minibatch[ 391- 420]: loss = 3.126911 * 1601, metric = 82.26% * 1601;\n",
"Finished Epoch[1 of 300]: [Training] loss = 3.279741 * 22067, metric = 84.28% * 22067 92.880s (237.6 samples/s);\n",
"Finished Epoch[1 of 300]: [Training] loss = 3.279741 * 22067, metric = 84.28% * 22067 260.163s ( 84.8 samples/s);\n",
"Saving final model to 'model_0.cmf'\n",
"1 epochs complete.\n"
]
@ -932,6 +932,15 @@
"train(train_reader, valid_reader, vocab, i2w, model, max_epochs=1, epoch_size=25000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
@ -1290,15 +1299,6 @@
"\n",
"With the above model, you have the basics for training a powerful sequence-to-sequence model with attention in a number of distinct domains. The only major changes required are preparing a dataset with pairs input and output sequences and in general the rest of the building blocks will remain the same. Good luck, and have fun!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -788,7 +788,7 @@
" output = loaded_model.eval(arguments)\n",
"\n",
" # return softmax probabilities\n",
" sm = softmax(output[0,0])\n",
" sm = softmax(output[0][0])\n",
" return sm.eval()\n",
" except FileNotFoundError:\n",
" print(\"Could not open (skipping file): \", image_path)\n",

View file

@ -227,7 +227,7 @@ class Value(cntk_py.Value):
else:
super(Value, self).__init__(ndav)
def as_sequences(self, variable):
def as_sequences(self, variable=None):
'''
Convert a Value to a sequence of NumPy arrays that have their masked
entries removed.
@ -238,6 +238,9 @@ class Value(cntk_py.Value):
returned. Otherwise, the arrays will be returned directly.
'''
if self.is_sparse():
if variable is None:
raise ValueError('cannot convert sparse value to sequences '
'without the corresponding variable')
network = _sparse_to_dense_network_cache(variable.shape)
warnings.warn('converting Value object to CSR format might be slow')
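With the new keyword default, the calling convention suggested by this hunk looks roughly like this; val is assumed to be a Value obtained elsewhere (for example from MinibatchData.data below) and features its associated input variable:

sequences = val.as_sequences(features)   # one NumPy array per sequence
sequences = val.as_sequences()           # also fine for dense data
# for sparse data the variable is required; omitting it raises ValueError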

View file

@ -3,13 +3,9 @@
# for full license information.
# ==============================================================================
import sys
import numbers
import collections
import copy
import numpy as np
from numbers import Number
from scipy import sparse
from .. import cntk_py
from ..axis import Axis

View file

@ -4,6 +4,7 @@
# for full license information.
# ==============================================================================
import warnings
from .. import cntk_py, Value
from ..tensor import ArrayMixin
from cntk.internal import typemap
@ -49,6 +50,25 @@ class MinibatchData(cntk_py.MinibatchData, ArrayMixin):
'''
return self.data.as_sequences(variable)
@property
def data(self):
'''
The Value representation of the minibatch.
'''
return super(MinibatchData, self).data()
@property
def value(self):
'''
The value of the minibatch as a NumPy array.
'''
warnings.warn('the .value property is deprecated. Please use '
'.asarray() or .as_sequences() to get the NumPy '
'representations or .data to get the Value '
'representation', RuntimeWarning)
return self.as_sequences()
@property
def shape(self):
'''
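A minimal end-to-end sketch of the new property; the file name 'train.ctf', the stream alias 'x', and the dimension 100 are placeholders:

import cntk as C
from cntk.io import MinibatchSource, CTFDeserializer, StreamDefs, StreamDef

features = C.input_variable(100, is_sparse=True)
source = MinibatchSource(CTFDeserializer('train.ctf', StreamDefs(
    features=StreamDef(field='x', shape=100, is_sparse=True))))

mb = source.next_minibatch(32)
batch = mb[source.streams.features]        # a MinibatchData instance
value = batch.data                         # .data is now a property returning the Value
arrays = batch.as_sequences(features)      # list of NumPy arrays, one per sequence
print(batch.shape, batch.num_sequences)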

View file

@ -1,43 +1,43 @@
:orphan:
Concepts
========
There is a common property in key machine learning models, such as deep neural
networks (DNNs), convolutional neural networks (CNNs), and recurrent neural
networks (RNNs). All of these models can be described as *computational networks*.
The directed edges of these *computational networks* are vectors, matrices, or in
general n-dimensional arrays (tensors) which represent input data and model
parameters. The vertices are *functions* (also called operations) that are
performing a computation on these input tensors.
Tensors
-------
The underlying data structure in CNTK is that of a *tensor*. It is a
multidimensional array on which computations can be performed. Every dimension in
these arrays is referred to as an *axis* to distinguish it from the scalar size
of every axis. So, a matrix has two *axes* which both have a certain
*dimension* corresponding to the number of rows and columns of the *axes*.
Using tensors makes the framework generic in that it can be used e.g. for
classification problems where the inputs are vectors, black-and-white
images (input is a matrix of points), color images (includes a separate dimension
for r, g, and b) or videos (has an extra time dimension).
- Tensors have a *shape* which describes the dimensions of their axes. E.g. a shape ``[2,3,4]``
would refer to a tensor with three axes that have, respectively, 2, 3, and 4
dimensions.
- CNTK allows for the last axis to be a *dynamic axis*, i.e. an axis whose size
might vary between input samples. This allows for easily
modelling sequences (for recurrent networks) without needing to introduce masks
or padding. See below for a detailed explanation.
- All data inside of a tensor is of a certain data type. Right now, CNTK
implements *float* (32 bit) and *double* (64 bit) precision floating point types,
and all tensors in a network have the same type.
- Tensors come either in *dense* or *sparse* form. Sparse tensors should be used
@ -45,39 +45,39 @@ for r, g, and b) or videos (has an extra time dimension).
tensors, however, the data ingestion of sparse tensors is only supported via
the reader framework and not yet through NumPy.
Tensors are introduced in CNTK in one of three places:
- **Inputs**: These represent data inputs to the computation which are usually
bound to a data reader. Data inputs are organized as (mini) batches and
therefore receive an extra minibatch dimension. In addition, inputs can have a
"ragged" axis called "dynamic axis" which is used to model sequential data. See
below for details.
- **Parameters**: Parameters are weight tensors that make up the bulk of the
actual model. Parameters are initialized using a constant (e.g. all 0's,
randomly generated data, or initialized from a file) and are updated during
*backpropagation* in a training run.
- **Constants**: Constants are very similar to parameters, but they are not
taking part in backpropagation.
All of these represent the *leaf nodes* in the network, or, in other words, the
input parameters of the function that the network represents.
To introduce a tensor, simply use one of the methods in the cntk namespace. Once
introduced, overloaded operators can be applied to them to form an operator graph::
import cntk as C
# Create an input with the shape (2,3,*)
>>> x = C.input_variable((2,3), name='features')
# Create a constant scalar with value 2
>>> c = C.constant(value=2)
# Create a parameter of shape (2,3), randomly initialized
>>> w = C.parameter((2,3))
# Set up some test input data to check the operators.
# We specify a full batch having a sequence with one element, which is a
@ -85,30 +85,31 @@ introduced, overloaded operators can be applied to them to form an operator grap
>>> test_input = [[ np.asarray([[10,20,30],[40,50,60]]) ]]
# Elementwise multiplication operation
>>> op = x * c
# Evaluate the op using test_input
>>> print(op.eval({ x: test_input }))
[[[[ 20. 40. 60.]
   [ 80. 100. 120.]]]]
[array([[[ 20., 40., 60.],
        [ 80., 100., 120.]]], dtype=float32)]

# Same as above (2 will be converted to constant)
>>> op2 = x * 2
>>> print(op2.eval({ x: test_input }))
[[[[ 20. 40. 60.]
   [ 80. 100. 120.]]]]
[array([[[ 20., 40., 60.],
        [ 80., 100., 120.]]], dtype=float32)]

# Elementwise multiplication of two 2x3 matrices
>>> op3 = x * [[1,2,3], [4,5,6]]
>>> print(op3.eval({ x: test_input}))
[[[[ 10. 40. 90.]
   [ 160. 250. 360.]]]]
[array([[[ 10., 40., 90.],
        [ 160., 250., 360.]]], dtype=float32)]
Broadcasting
~~~~~~~~~~~~
For operations that require the tensor dimensions of their arguments to match,
*broadcasting* is applied automatically whenever a tensor dimension is 1.
Examples are elementwise product or plus operations.
E.g. the following are equivalent:
@ -117,5 +118,5 @@ E.g. the following are equivalent:
>>> C.element_times([2,3], [2,2]).eval()
array([ 4., 6.], dtype=float32)
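Following the rule stated above (broadcasting applies whenever a tensor dimension is 1), a minimal sketch that multiplies a (2,3) constant by a (1,3) row vector; the values are arbitrary:

import numpy as np
import cntk as C

a = C.constant(value=np.asarray([[1, 2, 3], [4, 5, 6]], dtype=np.float32))   # shape (2,3)
b = C.constant(value=np.asarray([[10, 20, 30]], dtype=np.float32))           # shape (1,3)
print(C.element_times(a, b).eval())   # the size-1 axis of b is broadcast over both rows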

View file

@ -22,24 +22,29 @@ more common case) is as follows:
>>> x0 = np.asarray([[2., 1.]], dtype=np.float32)
>>> y0 = np.asarray([[4., 6.]], dtype=np.float32)
>>> cntk.squared_error(x, y).eval({x:x0, y:y0})
array([[ 29.]], dtype=float32)
[array([ 29.], dtype=float32)]
In the above example we are first setting up two input variables with shape ``(1, 2)``. We then set up a ``squared_error`` node with those two variables as
inputs. Within the ``eval()`` method we can set up the input mapping of the data for those two variables. In this case we pass in two NumPy arrays.
The squared error is then of course ``(2-4)**2 + (1-6)**2 = 29``.
As the graph nodes implement the NumPy array interface, you can easily access
their content and use them in other NumPy operations:
Most of the data containers like parameters, constants, values, etc. implement
the ``asarray()`` method, which returns the underlying data as a NumPy array.
>>> import cntk as C
>>> c = C.constant(3, shape=(2,3))
>>> np.asarray(c)
>>> c.asarray()
array([[ 3., 3., 3.],
[ 3., 3., 3.]], dtype=float32)
>>> np.ones_like(c)
>>> np.ones_like(c.asarray())
array([[ 1., 1., 1.],
[ 1., 1., 1.]], dtype=float32)
For values that have a sequence axis, ``asarray()`` cannot work, since it requires
the shape to be rectangular and sequences usually have different
lengths. In that case, ``as_sequences(var)`` returns a list of NumPy arrays,
where every NumPy array has the shape of the static axes of ``var``.
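A short sketch of that difference, assuming an input with the default dynamic (sequence) axis and the list-of-sequences feeding convention used above:

import numpy as np
import cntk as C

x = C.input_variable(1)                                  # one feature per sequence step
op = x * 2
seq_a = np.arange(3, dtype=np.float32).reshape(3, 1)     # 3 steps
seq_b = np.arange(5, dtype=np.float32).reshape(5, 1)     # 5 steps
result = op.eval({x: [seq_a, seq_b]})
# result should be a list of two arrays, shaped (3, 1) and (5, 1); the ragged
# lengths are why as_sequences(var) rather than asarray() is needed here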
Overview and first run
----------------------