Adapting examples; set MinibatchData.data to be a property

Willi Richert 2017-03-29 10:41:04 +02:00
Parent 1368b2de3f
Commit 9adade3311
17 changed files with 557 additions and 613 deletions

View file

@ -279,7 +279,7 @@ def train(train_reader, valid_reader, vocab, i2w, s2smodel, max_epochs, epoch_si
# This decodes the test set and counts the string error rate.
def evaluate_decoding(reader, s2smodel, i2w):
model_decoding = create_model_greedy(s2smodel) # wrap the greedy decoder around the model
progress_printer = ProgressPrinter(tag='Evaluation')
@ -301,7 +301,7 @@ def evaluate_decoding(reader, s2smodel, i2w):
num_total += len(outputs)
num_wrong += sum([label != output for output, label in zip(outputs, labels)])
rate = num_wrong / num_total
print("string error rate of {:.1f}% in {} samples".format(100 * rate, num_total))
return rate
@ -321,7 +321,7 @@ def Evaluator(model, criterion):
parameters |= set(model.parameters)
if metric:
parameters |= set(metric.parameters)
dummy_learner = momentum_sgd(tuple(parameters),
lr = learning_rate_schedule(1, UnitType.minibatch),
momentum = momentum_as_time_constant_schedule(0))
return Trainer(model, (loss, metric), dummy_learner)
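For reference, the dummy-learner pattern in this hunk can be wrapped in a small helper. A minimal sketch, assuming the learner and schedule helpers used above are importable from the top-level cntk package and that only test_minibatch() will ever be called on the returned Trainer:

import cntk as C

def make_evaluator(model, loss, metric):
    # Trainer requires a learner, so hand it a throw-away one; since only
    # test_minibatch() is called, no parameters are ever updated.
    parameters = set(model.parameters) | set(loss.parameters) | set(metric.parameters)
    dummy_learner = C.momentum_sgd(tuple(parameters),
                                   lr=C.learning_rate_schedule(1, C.UnitType.minibatch),
                                   momentum=C.momentum_as_time_constant_schedule(0))
    return C.Trainer(model, (loss, metric), dummy_learner)

# evaluator = make_evaluator(model, loss, metric)
# metric_value = evaluator.test_minibatch({...})   # feed a minibatch mapping here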
@ -382,10 +382,10 @@ def translate(tokens, model_decoding, vocab, i2w, show_attention=False, max_labe
# print out translation and stop at the sequence-end tag
prediction = np.argmax(pred, axis=-1)
translation = [i2w[i] for i in prediction]
# show attention window (requires matplotlib, seaborn, and pandas)
if use_attention and show_attention:
#att_value = model_decoding.attention_model.attention_weights(query)
# BUGBUG: fails with "Forward: Feature Not Implemented"
q = combine([model_decoding.attention_model.attention_weights])
@ -440,7 +440,7 @@ def get_vocab(path):
vocab = [w.strip() for w in open(path).readlines()]
i2w = { i:w for i,w in enumerate(vocab) }
w2i = { w:i for i,w in enumerate(vocab) }
return (vocab, i2w, w2i)
# Given a vocab and tensor, print the output
@ -454,9 +454,9 @@ def debug_attention(model, input):
words_p = q(input)
words = words_p[0]
p = words_p[1]
len = words.shape[attention_axis-1]
seq_len = words[0].shape[attention_axis-1]
span = 7 #attention_span #7 # test sentence is 7 tokens long
p_sq = np.squeeze(p[0,:len,:span,0,:]) # (batch, len, attention_span, 1, vector_dim)
p_sq = np.squeeze(p[0][:seq_len,:span,0,:]) # (batch, len, attention_span, 1, vector_dim)
opts = np.get_printoptions()
np.set_printoptions(precision=5)
print(p_sq)
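The len/seq_len change above reflects eval() now returning a list with one NumPy array per sequence instead of a single padded batch array. A plain-NumPy illustration with hypothetical shapes:

import numpy as np

# old convention: one padded array with an explicit batch axis
padded = np.zeros((2, 7, 20, 1, 128), np.float32)   # (batch, len, attention_span, 1, vector_dim)
old_slice = np.squeeze(padded[0, :7, :7, 0, :])

# new convention: a list holding one array per sequence, no padding required
per_sequence = [np.zeros((7, 20, 1, 128), np.float32),
                np.zeros((5, 20, 1, 128), np.float32)]
new_slice = np.squeeze(per_sequence[0][:7, :7, 0, :])

assert old_slice.shape == new_slice.shape == (7, 7, 128)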
@ -477,7 +477,7 @@ if __name__ == '__main__':
# create inputs and create model
model = create_model()
# train
train_reader = create_reader(os.path.join(DATA_DIR, TRAINING_DATA), True)
valid_reader = create_reader(os.path.join(DATA_DIR, VALIDATION_DATA), True)
@ -489,7 +489,7 @@ if __name__ == '__main__':
# test string error rate on decoded output
test_reader = create_reader(os.path.join(DATA_DIR, TESTING_DATA), False)
evaluate_decoding(test_reader, model, i2w)
# test same metric same as in training on test set
test_reader = create_reader(os.path.join(DATA_DIR, TESTING_DATA), False)
evaluate_metric(test_reader, model)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -707,11 +707,11 @@
"onehot = np.zeros([len(w),len(query_dict)], np.float32)\n",
"for t in range(len(w)):\n",
" onehot[t,w[t]] = 1\n",
"pred = model.eval({model.arguments[0]:[onehot]})\n",
"pred = model.eval({model.arguments[0]:[onehot]})[0]\n",
"print(pred.shape)\n",
"best = np.argmax(pred,axis=2)\n",
"print(best[0])\n",
"list(zip(seq.split(),[slots_wl[s] for s in best[0]]))"
"best = np.argmax(pred,axis=1)\n",
"print(best)\n",
"list(zip(seq.split(),[slots_wl[s] for s in best]))"
]
},
{
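Put together, the updated cell amounts to the following sketch; model, query_dict, slots_wl, and seq are assumed to exist as in the tutorial:

import numpy as np

w = [query_dict[word] for word in seq.split()]            # words -> indices
onehot = np.zeros((len(w), len(query_dict)), np.float32)  # one row per token
for t, idx in enumerate(w):
    onehot[t, idx] = 1

pred = model.eval({model.arguments[0]: [onehot]})[0]      # first (and only) sequence
best = np.argmax(pred, axis=1)                            # best slot label per token
print(list(zip(seq.split(), [slots_wl[s] for s in best])))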

File diff suppressed because one or more lines are too long

View file

@ -842,9 +842,9 @@
" words_p = q(input)\n",
" words = words_p[0]\n",
" p = words_p[1]\n",
" len = words.shape[attention_axis-1]\n",
" seq_len = words[0].shape[attention_axis-1]\n",
" span = 7 #attention_span #7 # test sentence is 7 tokens long\n",
" p_sq = np.squeeze(p[0,:len,:span,0,:]) # (batch, len, attention_span, 1, vector_dim)\n",
" p_sq = np.squeeze(p[0][:seq_len,:span,0,:]) # (batch, len, attention_span, 1, vector_dim)\n",
" opts = np.get_printoptions()\n",
" np.set_printoptions(precision=5)\n",
" print(p_sq)\n",
@ -897,7 +897,7 @@
" [ 0.1439 0.14321 0.14322 0.14308 0.14287 0.14216 0.14156]\n",
" [ 0.1439 0.14321 0.14322 0.14308 0.14287 0.14216 0.14156]]\n",
" Minibatch[ 181- 210]: loss = 3.143627 * 1565, metric = 82.30% * 1565;\n",
" Minibatch[ 211- 240]: loss = 3.186274 * 1583, metric = 83.39% * 1583;\n",
" Minibatch[ 211- 240]: loss = 3.186273 * 1583, metric = 83.39% * 1583;\n",
" Minibatch[ 241- 270]: loss = 3.128010 * 1562, metric = 83.03% * 1562;\n",
" Minibatch[ 271- 300]: loss = 3.152663 * 1551, metric = 83.69% * 1551;\n",
"['<s> A B A D I </s>']\n",
@ -921,7 +921,7 @@
" [ 0.14417 0.14347 0.14339 0.14316 0.14279 0.14191 0.14111]\n",
" [ 0.14417 0.14348 0.14339 0.14316 0.14279 0.14191 0.14111]]\n",
" Minibatch[ 391- 420]: loss = 3.126911 * 1601, metric = 82.26% * 1601;\n",
"Finished Epoch[1 of 300]: [Training] loss = 3.279741 * 22067, metric = 84.28% * 22067 92.880s (237.6 samples/s);\n",
"Finished Epoch[1 of 300]: [Training] loss = 3.279741 * 22067, metric = 84.28% * 22067 260.163s ( 84.8 samples/s);\n",
"Saving final model to 'model_0.cmf'\n",
"1 epochs complete.\n"
]
@ -932,6 +932,15 @@
"train(train_reader, valid_reader, vocab, i2w, model, max_epochs=1, epoch_size=25000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
@ -1290,15 +1299,6 @@
"\n",
"With the above model, you have the basics for training a powerful sequence-to-sequence model with attention in a number of distinct domains. The only major changes required are preparing a dataset with pairs input and output sequences and in general the rest of the building blocks will remain the same. Good luck, and have fun!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -788,7 +788,7 @@
" output = loaded_model.eval(arguments)\n",
"\n",
" # return softmax probabilities\n",
" sm = softmax(output[0,0])\n",
" sm = softmax(output[0][0])\n",
" return sm.eval()\n",
" except FileNotFoundError:\n",
" print(\"Could not open (skipping file): \", image_path)\n",

View file

@ -227,7 +227,7 @@ class Value(cntk_py.Value):
else:
super(Value, self).__init__(ndav)
def as_sequences(self, variable):
def as_sequences(self, variable=None):
'''
Convert a Value to a sequence of NumPy arrays that have their masked
entries removed.
@ -238,6 +238,9 @@ class Value(cntk_py.Value):
returned. Otherwise, the arrays will be returned directly.
'''
if self.is_sparse():
if variable is None:
raise ValueError('cannot convert sparse value to sequences '
'without the corresponding variable')
network = _sparse_to_dense_network_cache(variable.shape)
warnings.warn('converting Value object to CSR format might be slow')
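With the new keyword default, the calling convention suggested by this hunk looks roughly like this; val is assumed to be a Value obtained elsewhere (for example from MinibatchData.data below) and features its associated input variable:

sequences = val.as_sequences(features)   # one NumPy array per sequence
sequences = val.as_sequences()           # also fine for dense data
# for sparse data the variable is required; omitting it raises ValueError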

View file

@ -3,13 +3,9 @@
# for full license information.
# ==============================================================================
import sys
import numbers
import collections
import copy
import numpy as np
from numbers import Number
from scipy import sparse
from .. import cntk_py
from ..axis import Axis

View file

@ -4,6 +4,7 @@
# for full license information.
# ==============================================================================
import warnings
from .. import cntk_py, Value
from ..tensor import ArrayMixin
from cntk.internal import typemap
@ -49,6 +50,25 @@ class MinibatchData(cntk_py.MinibatchData, ArrayMixin):
'''
return self.data.as_sequences(variable)
@property
def data(self):
'''
The Value representation of the minibatch.
'''
return super(MinibatchData, self).data()
@property
def value(self):
'''
The value of the minibatch as a NumPy array.
'''
warnings.warn('the .value property is deprecated. Please use '
'.asarray() or .as_sequences() to get the NumPy '
'representations or .data to get the Value '
'representation', RuntimeWarning)
return self.as_sequences()
@property
def shape(self):
'''
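A minimal end-to-end sketch of the new property; the file name 'train.ctf', the stream alias 'x', and the dimension 100 are placeholders:

import cntk as C
from cntk.io import MinibatchSource, CTFDeserializer, StreamDefs, StreamDef

features = C.input_variable(100, is_sparse=True)
source = MinibatchSource(CTFDeserializer('train.ctf', StreamDefs(
    features=StreamDef(field='x', shape=100, is_sparse=True))))

mb = source.next_minibatch(32)
batch = mb[source.streams.features]        # a MinibatchData instance
value = batch.data                         # .data is now a property returning the Value
arrays = batch.as_sequences(features)      # list of NumPy arrays, one per sequence
print(batch.shape, batch.num_sequences)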

View file

@ -1,43 +1,43 @@
:orphan:
Concepts
========
There is a common property in key machine learning models, such as deep neural
networks (DNNs), convolutional neural networks (CNNs), and recurrent neural
networks (RNNs). All of these models can be described as *computational networks*.
The directed edges of these *computational networks* are vectors, matrices, or in
general n-dimensional arrays (tensors) which represent input data and model
parameters. The vertices are *functions* (also called operations) that are
performing a computation on these input tensors.
Tensors
-------
The underlying data structure in CNTK is that of a *tensor*. It is a
multidimensional array on which computations can be performed. Every dimension in
these arrays is referred to as an *axis* to distinguish it from the scalar size
of every axis. So, a matrix has two *axes* which both have a certain
*dimension* corresponding to the number of rows and columns of the *axes*.
Using tensors makes the framework generic in that it can be used e.g. for
classification problems where the inputs are vectors, black-and-white
images (input is a matrix of points), color images (includes a separate dimension
for r, g, and b) or videos (has an extra time dimension).
- Tensors have a *shape* which describes the dimensions of their axes. E.g. a shape ``[2,3,4]``
would refer to a tensor with three axes that have, respectively, 2, 3, and 4
dimensions.
- CNTK allows for the last axis to be a *dynamic axis*, i.e. an axis whose size
might vary between input samples. This allows for easily
modelling sequences (for recurrent networks) without needing to introduce masks
or padding. See below for a detailed explanation.
- All data inside of a tensor is of a certain data type. Right now, CNTK
implements *float* (32 bit) and *double* (64 bit) precision floating point types,
and all tensors in a network have the same type.
- Tensors come either in *dense* or *sparse* form. Sparse tensors should be used
@ -45,39 +45,39 @@ for r, g, and b) or videos (has an extra time dimension).
tensors, however, the data ingestion of sparse tensors is only supported via
the reader framework and not yet through NumPy.
Tensors are introduced in CNTK in one of three places:
- **Inputs**: These represent data inputs to the computation which are usually
bound to a data reader. Data inputs are organized as (mini) batches and
therefore receive an extra minibatch dimension. In addition, inputs can have a
"ragged" axis called "dynamic axis" which is used to model sequential data. See
below for details.
- **Parameters**: Parameters are weight tensors that make up the bulk of the
actual model. Parameters are initialized using a constant (e.g. all 0's,
randomly generated data, or initialized from a file) and are updated during
*backpropagation* in a training run.
- **Constants**: Constants are very similar to parameters, but they are not
taking part in backpropagation.
All of these represent the *leaf nodes* in the network, or, in other words, the
input parameters of the function that the network represents.
To introduce a tensor, simply use one of the methods in the cntk namespace. Once
introduced, overloaded operators can be applied to them to form an operator graph::
import cntk as C
# Create an input with the shape (2,3,*)
>>> x = C.input_variable((2,3), name='features')
# Create a constant scalar with value 2
>>> c = C.constant(value=2)
# Create a parameter of shape (2,3), randomly initialized
>>> w = C.parameter((2,3))
# Set up some test input data to check the operators.
# We specify a full batch having a sequence with one element, which is a
@ -85,30 +85,31 @@ introduced, overloaded operators can be applied to them to form an operator grap
>>> test_input = [[ np.asarray([[10,20,30],[40,50,60]]) ]]
# Elementwise multiplication operation
>>> op = x * c
# Evaluate the op using test_input
>>> print(op.eval({ x: test_input }))
[[[[ 20. 40. 60.]
   [ 80. 100. 120.]]]]
[array([[[ 20., 40., 60.],
        [ 80., 100., 120.]]], dtype=float32)]

# Same as above (2 will be converted to constant)
>>> op2 = x * 2
>>> print(op2.eval({ x: test_input }))
[[[[ 20. 40. 60.]
   [ 80. 100. 120.]]]]
[array([[[ 20., 40., 60.],
        [ 80., 100., 120.]]], dtype=float32)]

# Elementwise multiplication of two 2x3 matrices
>>> op3 = x * [[1,2,3], [4,5,6]]
>>> print(op3.eval({ x: test_input}))
[[[[ 10. 40. 90.]
   [ 160. 250. 360.]]]]
[array([[[ 10., 40., 90.],
        [ 160., 250., 360.]]], dtype=float32)]
Broadcasting
~~~~~~~~~~~~
For operations that require the tensor dimensions of their arguments to match,
*broadcasting* is applied automatically whenever a tensor dimension is 1.
Examples are elementwise product or plus operations.
E.g. the following are equivalent:
@ -117,5 +118,5 @@ E.g. the following are equivalent:
>>> C.element_times([2,3], [2,2]).eval()
array([ 4., 6.], dtype=float32)
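Following the rule stated above (broadcasting applies whenever a tensor dimension is 1), a minimal sketch that multiplies a (2,3) constant by a (1,3) row vector; the values are arbitrary:

import numpy as np
import cntk as C

a = C.constant(value=np.asarray([[1, 2, 3], [4, 5, 6]], dtype=np.float32))   # shape (2,3)
b = C.constant(value=np.asarray([[10, 20, 30]], dtype=np.float32))           # shape (1,3)
print(C.element_times(a, b).eval())   # the size-1 axis of b is broadcast over both rows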

View file

@ -22,24 +22,29 @@ more common case) is as follows:
>>> x0 = np.asarray([[2., 1.]], dtype=np.float32)
>>> y0 = np.asarray([[4., 6.]], dtype=np.float32)
>>> cntk.squared_error(x, y).eval({x:x0, y:y0})
array([[ 29.]], dtype=float32)
[array([ 29.], dtype=float32)]
In the above example we are first setting up two input variables with shape ``(1, 2)``. We then set up a ``squared_error`` node with those two variables as
inputs. Within the ``eval()`` method we can set up the input mapping of the data for those two variables. In this case we pass in two NumPy arrays.
The squared error is then of course ``(2-4)**2 + (1-6)**2 = 29``.
As the graph nodes implement the NumPy array interface, you can easily access
their content and use them in other NumPy operations:
Most of the data containers like parameters, constants, values, etc. implement
the ``asarray()`` method, which returns the underlying data as a NumPy array.
>>> import cntk as C
>>> c = C.constant(3, shape=(2,3))
>>> np.asarray(c)
>>> c.asarray()
array([[ 3., 3., 3.],
[ 3., 3., 3.]], dtype=float32)
>>> np.ones_like(c)
>>> np.ones_like(c.asarray())
array([[ 1., 1., 1.],
[ 1., 1., 1.]], dtype=float32)
For values that have a sequence axis, ``asarray()`` cannot work, since it requires
the shape to be rectangular and sequences usually have different
lengths. In that case, ``as_sequences(var)`` returns a list of NumPy arrays,
where every NumPy array has the shape of the static axes of ``var``.
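A short sketch of that difference, assuming an input with the default dynamic (sequence) axis and the list-of-sequences feeding convention used above:

import numpy as np
import cntk as C

x = C.input_variable(1)                                  # one feature per sequence step
op = x * 2
seq_a = np.arange(3, dtype=np.float32).reshape(3, 1)     # 3 steps
seq_b = np.arange(5, dtype=np.float32).reshape(5, 1)     # 5 steps
result = op.eval({x: [seq_a, seq_b]})
# result should be a list of two arrays, shaped (3, 1) and (5, 1); the ragged
# lengths are why as_sequences(var) rather than asarray() is needed here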
Overview and first run
----------------------