Updating the CNTK_202 notebook and test

REDMOND\sayanpa 2017-04-26 11:14:46 -07:00
Parent 48aec50aa9
Commit d08da86eca
2 changed files: 298 additions and 257 deletions

View file

@ -34,6 +34,6 @@ def test_cntk_202_language_understanding_trainerror(nb):
pass
except KeyError:
pass
expectedMetrics = [2.8, 2.1, 2.4, 2.1]
expectedMetrics = [0.3, 0.3, 0.4, 0.3]
# TODO tighten tolerances
assert numpy.allclose(expectedMetrics, metrics, atol=0.2)
assert numpy.allclose(expectedMetrics, metrics, atol=0.15)

View file

@ -4,13 +4,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hands-On Lab: Language Understanding with Recurrent Networks\n",
"# CNTK 202: Language Understanding with Recurrent Networks\n",
"\n",
"This hands-on lab shows how to implement a recurrent network to process text,\n",
"This tutorial shows how to implement a recurrent network to process text,\n",
"for the [Air Travel Information Services](https://catalog.ldc.upenn.edu/LDC95S26) \n",
"(ATIS) task of slot tagging (tag individual words to their respective classes, \n",
"where the classes are provided as labels in the training data set).\n",
"We will start with a straight-forward embedding of the words followed by a recurrent LSTM.\n",
"We will start with a straight-forward (linear) embedding of the words followed by a recurrent LSTM.\n",
"This will then be extended to include neighboring words and run bidirectionally.\n",
"Lastly, we will turn this system into an intent classifier. \n",
"\n",
@ -60,10 +60,10 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Reusing locally cached: query.wl\n",
"Reusing locally cached: atis.test.ctf\n",
"Reusing locally cached: atis.train.ctf\n",
"Reusing locally cached: slots.wl\n"
"Reusing locally cached: atis.test.ctf\n",
"Reusing locally cached: slots.wl\n",
"Reusing locally cached: query.wl\n"
]
}
],
@ -125,12 +125,6 @@
"import numpy as np\n",
"\n",
"import cntk as C\n",
"from cntk.logging import ProgressPrinter, log_number_of_parameters\n",
"from cntk.io import MinibatchSource, CTFDeserializer\n",
"from cntk.io import StreamDef, StreamDefs, INFINITELY_REPEAT\n",
"from cntk import *\n",
"from cntk.learners import fsadagrad, learning_rate_schedule\n",
"from cntk.layers import * # CNTK Layers library\n",
"\n",
"# Select the right target device when this notebook is being tested:\n",
"if 'TEST_DEVICE' in os.environ:\n",
@ -154,7 +148,7 @@
"specific item of information (slot), and which one.\n",
"\n",
"The data in your working folder has already been converted into the \"CNTK Text Format.\"\n",
"Let's look at an example from the test-set file `atis.test.ctf`:\n",
"Let us look at an example from the test-set file `atis.test.ctf`:\n",
"\n",
" 19 |S0 178:1 |# BOS |S1 14:1 |# flight |S2 128:1 |# O\n",
" 19 |S0 770:1 |# show |S2 128:1 |# O\n",
@ -233,12 +227,16 @@
"emb_dim = 150\n",
"hidden_dim = 300\n",
"\n",
"# Create the containers for input feature (x) and the label (y)\n",
"x = C.sequence.input(vocab_size)\n",
"y = C.sequence.input(num_labels)\n",
"\n",
"def create_model():\n",
" with default_options(initial_state=0.1):\n",
" return Sequential([\n",
" Embedding(emb_dim, name='embed'),\n",
" Recurrence(LSTM(hidden_dim), go_backwards=False),\n",
" Dense(num_labels, name='classify')\n",
" with C.layers.default_options(initial_state=0.1):\n",
" return C.layers.Sequential([\n",
" C.layers.Embedding(emb_dim, name='embed'),\n",
" C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),\n",
" C.layers.Dense(num_labels, name='classify')\n",
" ])"
]
},
@ -274,9 +272,9 @@
],
"source": [
"# peek\n",
"model = create_model()\n",
"print(model.embed.E.shape)\n",
"print(model.classify.b.value)"
"z = create_model()\n",
"print(z.embed.E.shape)\n",
"print(z.classify.b.value)"
]
},
{
@ -285,7 +283,28 @@
"collapsed": false
},
"source": [
"As you can see the attributes of the model are fully accessible from Python. The model has 3 layers. The first layer is an embedding and you can access its parameter `E` (where the embeddings are stored) like any other attribute of a Python object. Its shape contains a `-1` which indicates that this parameter is not fully specified yet. When we decide what data we will run through this network (very shortly) the shape will be the size of the input vocabulary. We also print the bias term in the last layer. Bias terms are by default initialized to 0 (but there's also a way to change that).\n"
"The module attributes are fully accessible from Python. The model has 3 layers. The first layer is an embedding and you can access its parameter `E` (where the embeddings are stored) like any other attribute of a Python object. Its shape contains a `-1` which indicates that this parameter is not fully specified yet. When we decide what data we will run through this network the shape will be the size of the input vocabulary. We also print the bias term in the last layer. Bias terms are by default initialized to 0 (but there's also a way to change that). As you create the model, one should name the layer component and then access the parameters as shown here. "
]
},
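As a small aside, the following sketch (an illustration under the definitions above, not an original notebook cell) shows why naming layers is convenient: each named layer becomes an attribute of the model function, and all learnable parameters can also be enumerated in one pass.

```python
# Sketch: inspect a freshly created (still unbound) model.
z = create_model()

print(z.embed.E.shape)     # (-1, 150): the input dimension is not known yet
print(z.classify.b.value)  # bias terms default to 0

# All learnable parameters of the composed function can be enumerated as well:
for p in z.parameters:
    print(p.shape)
```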
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(943, 150)\n"
]
}
],
"source": [
"# Pass an input and check the dimension\n",
"z = create_model()\n",
"print(z(x).embed.E.shape)"
]
},
{
@ -332,23 +351,23 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def create_reader(path, is_training):\n",
" return MinibatchSource(CTFDeserializer(path, StreamDefs(\n",
" query = StreamDef(field='S0', shape=vocab_size, is_sparse=True),\n",
" intent_unused = StreamDef(field='S1', shape=num_intents, is_sparse=True), \n",
" slot_labels = StreamDef(field='S2', shape=num_labels, is_sparse=True)\n",
" )), randomize=is_training, max_sweeps = INFINITELY_REPEAT if is_training else 1)"
" return C.io.MinibatchSource(C.io.CTFDeserializer(path, C.io.StreamDefs(\n",
" query = C.io.StreamDef(field='S0', shape=vocab_size, is_sparse=True),\n",
" intent_unused = C.io.StreamDef(field='S1', shape=num_intents, is_sparse=True), \n",
" slot_labels = C.io.StreamDef(field='S2', shape=num_labels, is_sparse=True)\n",
" )), randomize=is_training, max_sweeps = C.io.INFINITELY_REPEAT if is_training else 1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {
"collapsed": false
},
@ -356,10 +375,10 @@
{
"data": {
"text/plain": [
"dict_keys(['query', 'slot_labels', 'intent_unused'])"
"dict_keys(['slot_labels', 'intent_unused', 'query'])"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -376,22 +395,9 @@
"source": [
"### Trainer\n",
"\n",
"We also must define the training criterion (loss function), and also an error metric to track. Below we make extensive use of `placeholders`. Remember that the code we have been writing is not actually executing any heavy computation it is just specifying the function we want to compute on data during training/testing. And in the same way that it is convenient to have names for arguments when you write a regular function in a programming language, it is convenient to have placeholders that refer to arguments (or local computations that need to be reused). Eventually, some other code will replace these placeholders with other known quantities in the same way that in a programming language the function will be called with concrete values bound to its arguments. "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def create_criterion_function(model):\n",
" labels = C.placeholder(name='labels')\n",
" ce = cross_entropy_with_softmax(model, labels)\n",
" errs = classification_error (model, labels)\n",
" return combine ([ce, errs]) # (features, labels) -> (loss, metric)"
"We also must define the training criterion (loss function), and also an error metric to track. In most tutorials, we know the input dimensions and the corresponding labels. We directly create the loss and the error functions. In this tutorial we will do the same. However, we take a brief detour and learn about placeholders. This concept would be useful for Task 3. \n",
"\n",
"**Learning note**: Introduction to `placeholder`: Remember that the code we have been writing is not actually executing any heavy computation it is just specifying the function we want to compute on data during training/testing. And in the same way that it is convenient to have names for arguments when you write a regular function in a programming language, it is convenient to have placeholders that refer to arguments (or local computations that need to be reused). Eventually, some other code will replace these placeholders with other known quantities in the same way that in a programming language the function will be called with concrete values bound to its arguments. Here is an example below that illustrates the use of `placeholder`."
]
},
{
@ -400,15 +406,65 @@
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Composite(Combine): Input('Input2293', [#, *], [129]), Placeholder('labels', [???], [???]) -> Output('Block2263_Output_0', [#, *], [1]), Output('Block2283_Output_0', [#, *], [])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def create_criterion_function(model):\n",
" labels = C.placeholder(name='labels')\n",
" ce = C.cross_entropy_with_softmax(model, labels)\n",
" errs = C.classification_error (model, labels)\n",
" return C.combine ([ce, errs]) # (features, labels) -> (loss, metric)\n",
"\n",
"criterion = create_criterion_function(create_model())\n",
"criterion.replace_placeholders({criterion.placeholders[0]: C.sequence.input(num_labels)})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"While the cell above works well when one has input parameters defined at network creation, it compromises readability. Hence we prefer creating functions as shown below"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def train(reader, model, max_epochs=16):\n",
" # criterion: (model args, labels) -> (loss, metric)\n",
" # here (query, slot_labels) -> (ce, errs)\n",
" criterion = create_criterion_function(model)\n",
"\n",
" criterion.replace_placeholders({criterion.placeholders[0]: C.sequence.input(vocab_size),\n",
" criterion.placeholders[1]: C.sequence.input(num_labels)})\n",
"def create_criterion_function_preferred(model, labels):\n",
" ce = C.cross_entropy_with_softmax(model, labels)\n",
" errs = C.classification_error (model, labels)\n",
" return ce, errs # (model, labels) -> (loss, error metric)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def train(reader, model_func, max_epochs=10):\n",
" \n",
" # Instantiate the model function; x is the input (feature) variable \n",
" model = model_func(x)\n",
" \n",
" # Instantiate the loss and error function\n",
" loss, label_error = create_criterion_function_preferred(model, y)\n",
"\n",
" # training config\n",
" epoch_size = 18000 # 18000 samples is half the dataset size \n",
@ -420,37 +476,43 @@
" # (we don't run this many epochs, but if we did, these are good values)\n",
" lr_per_sample = [0.003]*4+[0.0015]*24+[0.0003]\n",
" lr_per_minibatch = [x * minibatch_size for x in lr_per_sample]\n",
" lr_schedule = learning_rate_schedule(lr_per_minibatch, UnitType.minibatch, epoch_size)\n",
" lr_schedule = C.learning_rate_schedule(lr_per_minibatch, C.UnitType.minibatch, epoch_size)\n",
" \n",
" # Momentum\n",
" momentum_as_time_constant = momentum_as_time_constant_schedule(700)\n",
" # Momentum schedule\n",
" momentum_as_time_constant = C.momentum_as_time_constant_schedule(700)\n",
" \n",
" # We use a variant of the FSAdaGrad optimizer which is known to work well on this dataset\n",
" # We use a the Adam optimizer which is known to work well on this dataset\n",
" # Feel free to try other optimizers from \n",
" # https://www.cntk.ai/pythondocs/cntk.learner.html#module-cntk.learner\n",
" learner = fsadagrad(criterion.parameters,\n",
" lr=lr_schedule, momentum=momentum_as_time_constant,\n",
" gradient_clipping_threshold_per_sample=15, gradient_clipping_with_truncation=True)\n",
" learner = C.adam(parameters=model.parameters,\n",
" lr=lr_schedule,\n",
" momentum=momentum_as_time_constant,\n",
" gradient_clipping_threshold_per_sample=15, \n",
" gradient_clipping_with_truncation=True)\n",
"\n",
" # trainer\n",
" progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)\n",
" #progress_printer = ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs) # more detailed logging\n",
" trainer = Trainer(model, criterion, learner, progress_printer)\n",
" # Setup the progress updater\n",
" progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)\n",
" \n",
" # Uncomment below for more detailed logging\n",
" #progress_printer = ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs) \n",
"\n",
" # Instantiate the trainer\n",
" trainer = C.Trainer(model, (loss, label_error), learner, progress_printer)\n",
"\n",
" # process minibatches and perform model training\n",
" log_number_of_parameters(model)\n",
" C.logging.log_number_of_parameters(model)\n",
"\n",
" t = 0\n",
" for epoch in range(max_epochs): # loop over epochs\n",
" epoch_end = (epoch+1) * epoch_size\n",
" while t < epoch_end: # loop over minibatches on the epoch\n",
" data = reader.next_minibatch(minibatch_size, input_map={ # fetch minibatch\n",
" criterion.arguments[0]: reader.streams.query,\n",
" criterion.arguments[1]: reader.streams.slot_labels\n",
" x: reader.streams.query,\n",
" y: reader.streams.slot_labels\n",
" })\n",
" trainer.train_minibatch(data) # update model with it\n",
" t += data[criterion.arguments[1]].num_samples # samples so far\n",
" trainer.summarize_training_progress()\n"
" trainer.train_minibatch(data) # update model with it\n",
" t += data[y].num_samples # samples so far\n",
" trainer.summarize_training_progress()"
]
},
{
@ -464,7 +526,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {
"collapsed": false,
"scrolled": false
@ -475,31 +537,27 @@
"output_type": "stream",
"text": [
"Training 721479 parameters in 6 parameter tensors.\n",
"Finished Epoch[1 of 16]: [Training] loss = 1.097468 * 18010, metric = 0.00% * 18010 3.835s (4696.2 samples/s);\n",
"Finished Epoch[2 of 16]: [Training] loss = 0.443539 * 18051, metric = 0.00% * 18051 3.464s (5211.0 samples/s);\n",
"Finished Epoch[3 of 16]: [Training] loss = 0.294534 * 17941, metric = 0.00% * 17941 3.383s (5303.3 samples/s);\n",
"Finished Epoch[4 of 16]: [Training] loss = 0.213083 * 18059, metric = 0.00% * 18059 3.455s (5226.9 samples/s);\n",
"Finished Epoch[5 of 16]: [Training] loss = 0.158128 * 17957, metric = 0.00% * 17957 3.430s (5235.3 samples/s);\n",
"Finished Epoch[6 of 16]: [Training] loss = 0.143503 * 18021, metric = 0.00% * 18021 3.435s (5246.3 samples/s);\n",
"Finished Epoch[7 of 16]: [Training] loss = 0.117660 * 17980, metric = 0.00% * 17980 3.362s (5348.0 samples/s);\n",
"Finished Epoch[8 of 16]: [Training] loss = 0.121787 * 18025, metric = 0.00% * 18025 3.423s (5265.8 samples/s);\n",
"Finished Epoch[9 of 16]: [Training] loss = 0.082948 * 17956, metric = 0.00% * 17956 3.401s (5279.6 samples/s);\n",
"Finished Epoch[10 of 16]: [Training] loss = 0.084909 * 18039, metric = 0.00% * 18039 3.401s (5304.0 samples/s);\n",
"Finished Epoch[11 of 16]: [Training] loss = 0.091183 * 17966, metric = 0.00% * 17966 3.314s (5421.2 samples/s);\n",
"Finished Epoch[12 of 16]: [Training] loss = 0.065457 * 18041, metric = 0.00% * 18041 3.370s (5353.4 samples/s);\n",
"Finished Epoch[13 of 16]: [Training] loss = 0.069261 * 17984, metric = 0.00% * 17984 3.529s (5096.1 samples/s);\n",
"Finished Epoch[14 of 16]: [Training] loss = 0.069089 * 17976, metric = 0.00% * 17976 3.338s (5385.3 samples/s);\n",
"Finished Epoch[15 of 16]: [Training] loss = 0.061216 * 18030, metric = 0.00% * 18030 3.407s (5292.0 samples/s);\n",
"Finished Epoch[16 of 16]: [Training] loss = 0.052744 * 18014, metric = 0.00% * 18014 3.482s (5173.5 samples/s);\n"
"Learning rate per minibatch: 0.21\n",
"Finished Epoch[1 of 10]: [Training] loss = 0.787482 * 18010, metric = 15.61% * 18010 4.638s (3883.1 samples/s);\n",
"Finished Epoch[2 of 10]: [Training] loss = 0.223525 * 18051, metric = 5.25% * 18051 4.401s (4101.6 samples/s);\n",
"Finished Epoch[3 of 10]: [Training] loss = 0.154852 * 17941, metric = 3.68% * 17941 4.495s (3991.3 samples/s);\n",
"Finished Epoch[4 of 10]: [Training] loss = 0.106380 * 18059, metric = 2.64% * 18059 4.469s (4040.9 samples/s);\n",
"Learning rate per minibatch: 0.105\n",
"Finished Epoch[5 of 10]: [Training] loss = 0.069279 * 17957, metric = 1.65% * 17957 4.545s (3950.9 samples/s);\n",
"Finished Epoch[6 of 10]: [Training] loss = 0.061887 * 18021, metric = 1.50% * 18021 4.378s (4116.3 samples/s);\n",
"Finished Epoch[7 of 10]: [Training] loss = 0.054078 * 17980, metric = 1.29% * 17980 4.127s (4356.7 samples/s);\n",
"Finished Epoch[8 of 10]: [Training] loss = 0.050230 * 18025, metric = 1.30% * 18025 4.501s (4004.7 samples/s);\n",
"Finished Epoch[9 of 10]: [Training] loss = 0.030962 * 17956, metric = 0.86% * 17956 3.940s (4557.4 samples/s);\n",
"Finished Epoch[10 of 10]: [Training] loss = 0.033263 * 18039, metric = 0.90% * 18039 4.045s (4459.6 samples/s);\n"
]
}
],
"source": [
"def do_train():\n",
" global model\n",
" model = create_model()\n",
" global z\n",
" z = create_model()\n",
" reader = create_reader(data['train']['file'], is_training=True)\n",
" train(reader, model)\n",
" train(reader, z)\n",
"do_train()"
]
},
@ -508,8 +566,8 @@
"metadata": {},
"source": [
"This shows how learning proceeds over epochs (passes through the data).\n",
"For example, after four epochs, the loss, which is the cross-entropy criterion, has reached 0.22 as measured on the ~18000 samples of this epoch,\n",
"and that the error rate is 5.0% on those same 18000 training samples.\n",
"For example, after four epochs, the loss, which is the cross-entropy criterion, has reached 0.11 as measured on the ~18000 samples of this epoch,\n",
"and that the error rate is 2.6% on those same 18000 training samples.\n",
"\n",
"The epoch size is the number of samples--counted as *word tokens*, not sentences--to\n",
"process between model checkpoints.\n",
@ -517,7 +575,7 @@
"Once the training has completed (a little less than 2 minutes on a Titan-X or a Surface Book),\n",
"you will see an output like this\n",
"```\n",
"Finished Epoch [16]: [Training] loss = 0.058111 * 18014, metric = 1.3% * 18014\n",
"Finished Epoch [10]: [Training] loss = 0.033263 * 18039, metric = 0.9% * 18039\n",
"```\n",
"which is the loss (cross entropy) and the metric (classification error) averaged over the final epoch.\n",
"\n",
@ -541,33 +599,39 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def evaluate(reader, model):\n",
" criterion = create_criterion_function(model)\n",
" criterion.replace_placeholders({criterion.placeholders[0]: C.sequence.input(num_labels)})\n",
"def evaluate(reader, model_func):\n",
" \n",
" # Create the containers for input feature (x) and the label (y)\n",
" x = C.sequence.input(vocab_size)\n",
" y = C.sequence.input(num_labels) \n",
" \n",
" # Instantiate the model function; x is the input (feature) variable \n",
" model = model_func(x)\n",
" \n",
" # Create the loss and error functions\n",
" loss, label_error = create_criterion_function_preferred(model, y)\n",
"\n",
" # process minibatches and perform evaluation\n",
" lr_schedule = learning_rate_schedule(1, UnitType.minibatch)\n",
" momentum_as_time_constant = momentum_as_time_constant_schedule(0)\n",
" dummy_learner = fsadagrad(criterion.parameters, \n",
" lr=lr_schedule, momentum=momentum_as_time_constant)\n",
" progress_printer = ProgressPrinter(tag='Evaluation', num_epochs=0)\n",
" evaluator = Trainer(model, criterion, dummy_learner, progress_printer)\n",
" progress_printer = C.logging.ProgressPrinter(tag='Evaluation', num_epochs=0)\n",
"\n",
" while True:\n",
" minibatch_size = 500\n",
" data = reader.next_minibatch(minibatch_size, input_map={ # fetch minibatch\n",
" criterion.arguments[0]: reader.streams.query,\n",
" criterion.arguments[1]: reader.streams.slot_labels\n",
" x: reader.streams.query,\n",
" y: reader.streams.slot_labels\n",
" })\n",
" if not data: # until we hit the end\n",
" break\n",
"\n",
" evaluator = C.eval.Evaluator(loss, progress_printer)\n",
" evaluator.test_minibatch(data)\n",
" \n",
" evaluator.summarize_test_progress()\n"
]
},
@ -580,7 +644,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"metadata": {
"collapsed": false
},
@ -589,58 +653,41 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Finished Evaluation [1]: Minibatch[1-23]: metric = 2.75% * 10984;\n"
"Finished Evaluation [1]: Minibatch[1-23]: metric = 0.31% * 10984;\n"
]
},
{
"data": {
"text/plain": [
"array([ -6.17747717e-02, 1.67661443e-01, 2.67170966e-01,\n",
" -1.74591824e-01, -8.95597786e-02, -1.41288519e-01,\n",
" -1.65670067e-01, -3.37033689e-01, -2.43425101e-01,\n",
" -4.46879327e-01, -1.68832064e-01, -3.02033067e-01,\n",
" -1.76276788e-01, -2.74571389e-01, 1.39392257e-01,\n",
" 8.47953185e-02, -5.46203911e-01, -1.13420285e-01,\n",
" 2.31410667e-01, -6.03151739e-01, 1.89564034e-01,\n",
" 1.24402806e-01, -2.14793026e-01, -4.18093294e-01,\n",
" -3.74919385e-01, 1.71167374e-01, -2.51746088e-01,\n",
" -1.43728435e-01, -9.00750905e-02, -3.50904576e-02,\n",
" -1.18319996e-01, -1.28747627e-01, 5.05147576e-02,\n",
" -1.58297736e-02, -2.74065882e-01, 8.33015382e-01,\n",
" 2.71681130e-01, 2.87652642e-01, 5.84618635e-02,\n",
" -8.77579302e-02, -5.99530458e-01, 5.89624830e-02,\n",
" 4.13840532e-01, 2.55903602e-01, 5.06808400e-01,\n",
" -1.23152020e-03, -9.60507244e-02, -1.53484166e-01,\n",
" 2.22164527e-01, -1.28024921e-01, -1.65894926e-01,\n",
" -1.12369537e-01, -1.55051336e-01, 2.41953477e-01,\n",
" 1.22186437e-01, -4.66029584e-01, 9.18036997e-02,\n",
" -2.58443505e-01, -2.13418171e-01, -1.93984643e-01,\n",
" -4.68383253e-01, -3.08870435e-01, -2.98281580e-01,\n",
" -4.93319303e-01, -4.19607669e-01, -4.19408053e-01,\n",
" 2.76399136e-01, -2.33146876e-01, -4.22037452e-01,\n",
" -5.73692262e-01, -5.13007641e-01, -4.26108152e-01,\n",
" -2.96079516e-01, -4.26696450e-01, -5.30181289e-01,\n",
" -4.90620822e-01, -1.50063947e-01, -8.53931606e-02,\n",
" -2.88960963e-01, -3.32533240e-01, -1.82914421e-01,\n",
" -6.08625636e-02, 1.39558926e-01, -3.55813682e-01,\n",
" 2.61406936e-02, -3.05180758e-01, -2.52452403e-01,\n",
" -3.59835297e-01, -3.46008271e-01, -7.74578974e-02,\n",
" -4.68599826e-01, -4.85255606e-02, 8.09183344e-03,\n",
" -1.18216865e-01, -2.91808456e-01, -3.23003262e-01,\n",
" -5.39603047e-02, -2.29025170e-01, -2.62914300e-01,\n",
" -2.39585713e-01, 3.86748970e-01, -4.06216830e-01,\n",
" -7.60369599e-02, -1.73274621e-01, -4.33509350e-01,\n",
" -1.12799823e-01, -5.64482391e-01, -8.78849402e-02,\n",
" -1.08817495e-01, 1.01803876e-01, -1.71952978e-01,\n",
" -1.97624609e-01, -3.14228594e-01, -5.24510920e-01,\n",
" -4.37267244e-01, -2.05273271e-01, -4.08936590e-01,\n",
" -4.90692288e-01, -4.09333557e-01, 5.79499081e-02,\n",
" -6.00475848e-01, -2.65847117e-01, -5.08210480e-01,\n",
" -3.80596757e-01, 7.19741860e-04, -3.00036907e-01,\n",
" -2.07646489e-01, -1.52841002e-01, 4.92504120e-01], dtype=float32)"
"array([-0.03496236, -0.10450421, -0.05187245, 0.00315526, -0.03009692,\n",
" -0.00325531, -0.01932414, -0.07887443, 0.00732914, -0.04862531,\n",
" 0.00964272, 0.03630733, -0.06187062, 0.01710773, -0.03572332,\n",
" -0.02587368, -0.10132953, -0.07231853, 0.00561448, -0.12136029,\n",
" -0.0191984 , -0.10075094, 0.04011377, 0.02298491, -0.00923854,\n",
" 0.009691 , 0.0097783 , -0.06790726, -0.00056537, -0.07296011,\n",
" -0.04167816, 0.01845982, -0.03122563, 0.02366866, 0.05708881,\n",
" 0.07603112, 0.02723131, -0.02462533, -0.04524489, -0.0555689 ,\n",
" -0.10288978, -0.07713016, 0.06880754, -0.05441185, -0.07832464,\n",
" -0.0793924 , 0.01641271, 0.01577672, -0.00071338, -0.08438327,\n",
" -0.04914922, -0.07169627, 0.00832607, -0.04796996, 0.06898551,\n",
" 0.02447084, -0.06484294, -0.04137091, -0.02579352, -0.03093053,\n",
" 0.02354763, -0.00530715, 0.04292115, -0.01890544, -0.00403343,\n",
" 0.00891688, -0.07418539, -0.05906755, -0.01341417, -0.11452992,\n",
" -0.0419256 , -0.04264174, -0.01602072, -0.01675532, -0.08315146,\n",
" 0.00711557, 0.00813489, 0.04468995, 0.01084688, -0.02816842,\n",
" -0.08520442, -0.04315411, -0.04462098, -0.18218465, -0.09093954,\n",
" -0.03417725, -0.04131652, 0.02496975, -0.05490182, -0.08519867,\n",
" -0.03578508, -0.10880258, -0.09149118, -0.14748524, -0.04622766,\n",
" -0.1308063 , 0.0984766 , -0.03518417, -0.06054863, -0.06158594,\n",
" -0.01769979, -0.07496347, -0.08180398, -0.11138144, -0.01515801,\n",
" 0.02478061, -0.10648742, -0.08123869, -0.08370569, -0.04487203,\n",
" -0.09033024, -0.01348432, -0.01635014, -0.0092629 , 0.01535111,\n",
" -0.09441991, 0.02300696, -0.00615608, 0.01199632, -0.15448859,\n",
" -0.11188693, -0.07685462, -0.06171533, 0.02005708, -0.03582112,\n",
" -0.04907253, -0.03334751, -0.09121794, 0.09192816], dtype=float32)"
]
},
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@ -648,9 +695,9 @@
"source": [
"def do_test():\n",
" reader = create_reader(data['test']['file'], is_training=False)\n",
" evaluate(reader, model)\n",
" evaluate(reader, z)\n",
"do_test()\n",
"model.classify.b.value"
"z.classify.b.value"
]
},
{
@ -662,7 +709,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 14,
"metadata": {
"collapsed": false
},
@ -672,7 +719,7 @@
"output_type": "stream",
"text": [
"[178, 429, 444, 619, 937, 851, 752, 179]\n",
"(1, 8, 129)\n",
"(8, 129)\n",
"[128 128 128 48 110 128 78 128]\n"
]
},
@ -689,7 +736,7 @@
" ('EOS', 'O')]"
]
},
"execution_count": 12,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -708,7 +755,9 @@
"onehot = np.zeros([len(w),len(query_dict)], np.float32)\n",
"for t in range(len(w)):\n",
" onehot[t,w[t]] = 1\n",
"pred = model.eval({model.arguments[0]:[onehot]})[0]\n",
"\n",
"#x = C.sequence.input(vocab_size)\n",
"pred = z(x).eval({x:[onehot]})[0]\n",
"print(pred.shape)\n",
"best = np.argmax(pred,axis=1)\n",
"print(best)\n",
@ -795,7 +844,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 15,
"metadata": {
"collapsed": false
},
@ -803,14 +852,15 @@
"source": [
"# Your task: Add batch normalization\n",
"def create_model():\n",
" with default_options(initial_state=0.1):\n",
" return Sequential([\n",
" Embedding(emb_dim),\n",
" Recurrence(LSTM(hidden_dim), go_backwards=False),\n",
" Dense(num_labels)\n",
" with C.layers.default_options(initial_state=0.1):\n",
" return C.layers.Sequential([\n",
" C.layers.Embedding(emb_dim),\n",
" C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),\n",
" C.layers.Dense(num_labels)\n",
" ])\n",
"\n",
"# Enable these when done:\n",
"z = create_model()\n",
"#do_train()\n",
"#do_test()"
]
@ -841,7 +891,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 16,
"metadata": {
"collapsed": false
},
@ -849,14 +899,15 @@
"source": [
"# Your task: Add lookahead\n",
"def create_model():\n",
" with default_options(initial_state=0.1):\n",
" return Sequential([\n",
" Embedding(emb_dim),\n",
" Recurrence(LSTM(hidden_dim), go_backwards=False),\n",
" Dense(num_labels)\n",
" with C.layers.default_options(initial_state=0.1):\n",
" return C.layers.Sequential([\n",
" C.layers.Embedding(emb_dim),\n",
" C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),\n",
" C.layers.Dense(num_labels)\n",
" ])\n",
" \n",
"# Enable these when done:\n",
"z = create_model()\n",
"#do_train()\n",
"#do_test()"
]
@ -928,7 +979,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 17,
"metadata": {
"collapsed": false
},
@ -936,11 +987,11 @@
"source": [
"# Your task: Add bidirectional recurrence\n",
"def create_model():\n",
" with default_options(initial_state=0.1): \n",
" return Sequential([\n",
" Embedding(emb_dim),\n",
" Recurrence(LSTM(hidden_dim), go_backwards=False),\n",
" Dense(num_labels)\n",
" with C.layers.default_options(initial_state=0.1): \n",
" return C.layers.Sequential([\n",
" C.layers.Embedding(emb_dim),\n",
" C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),\n",
" C.layers.Dense(num_labels)\n",
" ])\n",
"\n",
"# Enable these when done:\n",
@ -952,7 +1003,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Works like a charm! This model achieves 2.1%, a tiny bit better than the lookahead model above.\n",
"Works like a charm! This model achieves 0.32%, better than the lookahead model above.\n",
"The bidirectional model has 40% less parameters than the lookahead one. However, if you go back and look closely\n",
"you may find that the lookahead one trained about 30% faster.\n",
"This is because the lookahead model has both less horizontal dependencies (one instead of two\n",
@ -968,7 +1019,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 18,
"metadata": {
"collapsed": false
},
@ -977,36 +1028,32 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Training 722379 parameters in 10 parameter tensors.\n",
"Finished Epoch[1 of 16]: [Training] loss = 0.442527 * 18010, metric = 0.00% * 18010 4.111s (4380.9 samples/s);\n",
"Finished Epoch[2 of 16]: [Training] loss = 0.162579 * 18051, metric = 0.00% * 18051 3.543s (5094.8 samples/s);\n",
"Finished Epoch[3 of 16]: [Training] loss = 0.118221 * 17941, metric = 0.00% * 17941 3.453s (5195.8 samples/s);\n",
"Finished Epoch[4 of 16]: [Training] loss = 0.091876 * 18059, metric = 0.00% * 18059 3.578s (5047.2 samples/s);\n",
"Finished Epoch[5 of 16]: [Training] loss = 0.050012 * 17957, metric = 0.00% * 17957 3.531s (5085.5 samples/s);\n",
"Finished Epoch[6 of 16]: [Training] loss = 0.048105 * 18021, metric = 0.00% * 18021 3.466s (5199.4 samples/s);\n",
"Finished Epoch[7 of 16]: [Training] loss = 0.043658 * 17980, metric = 0.00% * 17980 3.395s (5296.0 samples/s);\n",
"Finished Epoch[8 of 16]: [Training] loss = 0.036579 * 18025, metric = 0.00% * 18025 3.464s (5203.5 samples/s);\n",
"Finished Epoch[9 of 16]: [Training] loss = 0.025926 * 17956, metric = 0.00% * 17956 3.461s (5188.1 samples/s);\n",
"Finished Epoch[10 of 16]: [Training] loss = 0.026058 * 18039, metric = 0.00% * 18039 3.445s (5236.3 samples/s);\n",
"Finished Epoch[11 of 16]: [Training] loss = 0.027655 * 17966, metric = 0.00% * 17966 3.443s (5218.1 samples/s);\n",
"Finished Epoch[12 of 16]: [Training] loss = 0.017355 * 18041, metric = 0.00% * 18041 3.400s (5306.2 samples/s);\n",
"Finished Epoch[13 of 16]: [Training] loss = 0.019586 * 17984, metric = 0.00% * 17984 3.559s (5053.1 samples/s);\n",
"Finished Epoch[14 of 16]: [Training] loss = 0.019649 * 17976, metric = 0.00% * 17976 3.391s (5301.1 samples/s);\n",
"Finished Epoch[15 of 16]: [Training] loss = 0.019151 * 18030, metric = 0.00% * 18030 3.277s (5502.0 samples/s);\n",
"Finished Epoch[16 of 16]: [Training] loss = 0.013417 * 18014, metric = 0.00% * 18014 3.500s (5146.9 samples/s);\n",
"Finished Evaluation [1]: Minibatch[1-23]: metric = 2.08% * 10984;\n"
"Training 721479 parameters in 6 parameter tensors.\n",
"Learning rate per minibatch: 0.21\n",
"Finished Epoch[1 of 10]: [Training] loss = 0.776891 * 18010, metric = 15.20% * 18010 4.448s (4049.0 samples/s);\n",
"Finished Epoch[2 of 10]: [Training] loss = 0.227451 * 18051, metric = 5.15% * 18051 4.320s (4178.5 samples/s);\n",
"Finished Epoch[3 of 10]: [Training] loss = 0.152620 * 17941, metric = 3.55% * 17941 4.297s (4175.2 samples/s);\n",
"Finished Epoch[4 of 10]: [Training] loss = 0.105561 * 18059, metric = 2.57% * 18059 4.554s (3965.5 samples/s);\n",
"Learning rate per minibatch: 0.105\n",
"Finished Epoch[5 of 10]: [Training] loss = 0.066330 * 17957, metric = 1.53% * 17957 4.477s (4010.9 samples/s);\n",
"Finished Epoch[6 of 10]: [Training] loss = 0.060731 * 18021, metric = 1.45% * 18021 4.353s (4139.9 samples/s);\n",
"Finished Epoch[7 of 10]: [Training] loss = 0.050680 * 17980, metric = 1.26% * 17980 4.018s (4474.9 samples/s);\n",
"Finished Epoch[8 of 10]: [Training] loss = 0.045435 * 18025, metric = 1.21% * 18025 3.931s (4585.3 samples/s);\n",
"Finished Epoch[9 of 10]: [Training] loss = 0.030330 * 17956, metric = 0.86% * 17956 4.399s (4081.8 samples/s);\n",
"Finished Epoch[10 of 10]: [Training] loss = 0.032149 * 18039, metric = 0.89% * 18039 4.035s (4470.6 samples/s);\n",
"Finished Evaluation [1]: Minibatch[1-23]: metric = 0.37% * 10984;\n"
]
}
],
"source": [
"def create_model():\n",
" with default_options(initial_state=0.1):\n",
" return Sequential([\n",
" Embedding(emb_dim),\n",
" BatchNormalization(),\n",
" Recurrence(LSTM(hidden_dim), go_backwards=False),\n",
" BatchNormalization(),\n",
" Dense(num_labels)\n",
" with C.layers.default_options(initial_state=0.1):\n",
" return C.layers.Sequential([\n",
" C.layers.Embedding(emb_dim),\n",
" #C.layers.BatchNormalization(),\n",
" C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),\n",
" #C.layers.BatchNormalization(),\n",
" C.layers.Dense(num_labels)\n",
" ])\n",
"\n",
"do_train()\n",
@ -1022,9 +1069,10 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 19,
"metadata": {
"collapsed": false
"collapsed": false,
"scrolled": true
},
"outputs": [
{
@ -1032,39 +1080,35 @@
"output_type": "stream",
"text": [
"Training 901479 parameters in 6 parameter tensors.\n",
"Finished Epoch[1 of 16]: [Training] loss = 1.042147 * 18010, metric = 0.00% * 18010 3.837s (4693.8 samples/s);\n",
"Finished Epoch[2 of 16]: [Training] loss = 0.367830 * 18051, metric = 0.00% * 18051 3.642s (4956.3 samples/s);\n",
"Finished Epoch[3 of 16]: [Training] loss = 0.240927 * 17941, metric = 0.00% * 17941 3.681s (4873.9 samples/s);\n",
"Finished Epoch[4 of 16]: [Training] loss = 0.162869 * 18059, metric = 0.00% * 18059 3.922s (4604.5 samples/s);\n",
"Finished Epoch[5 of 16]: [Training] loss = 0.117314 * 17957, metric = 0.00% * 17957 4.501s (3989.6 samples/s);\n",
"Finished Epoch[6 of 16]: [Training] loss = 0.104019 * 18021, metric = 0.00% * 18021 4.452s (4047.8 samples/s);\n",
"Finished Epoch[7 of 16]: [Training] loss = 0.091837 * 17980, metric = 0.00% * 17980 3.672s (4896.5 samples/s);\n",
"Finished Epoch[8 of 16]: [Training] loss = 0.085473 * 18025, metric = 0.00% * 18025 3.660s (4924.9 samples/s);\n",
"Finished Epoch[9 of 16]: [Training] loss = 0.055418 * 17956, metric = 0.00% * 17956 3.668s (4895.3 samples/s);\n",
"Finished Epoch[10 of 16]: [Training] loss = 0.056879 * 18039, metric = 0.00% * 18039 3.616s (4988.7 samples/s);\n",
"Finished Epoch[11 of 16]: [Training] loss = 0.059523 * 17966, metric = 0.00% * 17966 3.561s (5045.2 samples/s);\n",
"Finished Epoch[12 of 16]: [Training] loss = 0.038848 * 18041, metric = 0.00% * 18041 3.594s (5019.8 samples/s);\n",
"Finished Epoch[13 of 16]: [Training] loss = 0.042280 * 17984, metric = 0.00% * 17984 3.670s (4900.3 samples/s);\n",
"Finished Epoch[14 of 16]: [Training] loss = 0.046722 * 17976, metric = 0.00% * 17976 3.591s (5005.8 samples/s);\n",
"Finished Epoch[15 of 16]: [Training] loss = 0.037370 * 18030, metric = 0.00% * 18030 3.629s (4968.3 samples/s);\n",
"Finished Epoch[16 of 16]: [Training] loss = 0.034207 * 18014, metric = 0.00% * 18014 3.650s (4935.3 samples/s);\n",
"Finished Evaluation [1]: Minibatch[1-23]: metric = 2.18% * 10984;\n"
"Learning rate per minibatch: 0.21\n",
"Finished Epoch[1 of 10]: [Training] loss = 0.735173 * 18010, metric = 14.15% * 18010 4.754s (3788.4 samples/s);\n",
"Finished Epoch[2 of 10]: [Training] loss = 0.199272 * 18051, metric = 4.64% * 18051 4.375s (4125.9 samples/s);\n",
"Finished Epoch[3 of 10]: [Training] loss = 0.133776 * 17941, metric = 2.98% * 17941 4.592s (3907.0 samples/s);\n",
"Finished Epoch[4 of 10]: [Training] loss = 0.089028 * 18059, metric = 2.07% * 18059 4.787s (3772.5 samples/s);\n",
"Learning rate per minibatch: 0.105\n",
"Finished Epoch[5 of 10]: [Training] loss = 0.050588 * 17957, metric = 1.25% * 17957 4.312s (4164.4 samples/s);\n",
"Finished Epoch[6 of 10]: [Training] loss = 0.045545 * 18021, metric = 0.98% * 18021 4.382s (4112.5 samples/s);\n",
"Finished Epoch[7 of 10]: [Training] loss = 0.042307 * 17980, metric = 0.90% * 17980 4.535s (3964.7 samples/s);\n",
"Finished Epoch[8 of 10]: [Training] loss = 0.031965 * 18025, metric = 0.82% * 18025 4.537s (3972.9 samples/s);\n",
"Finished Epoch[9 of 10]: [Training] loss = 0.019296 * 17956, metric = 0.45% * 17956 4.385s (4094.9 samples/s);\n",
"Finished Epoch[10 of 10]: [Training] loss = 0.019359 * 18039, metric = 0.50% * 18039 5.191s (3475.1 samples/s);\n",
"Finished Evaluation [1]: Minibatch[1-23]: metric = 0.46% * 10984;\n"
]
}
],
"source": [
"def OneWordLookahead():\n",
" x = C.placeholder()\n",
" apply_x = splice (x, sequence.future_value(x))\n",
" apply_x = C.splice(x, C.sequence.future_value(x))\n",
" return apply_x\n",
"\n",
"def create_model():\n",
" with default_options(initial_state=0.1):\n",
" return Sequential([\n",
" Embedding(emb_dim),\n",
" with C.layers.default_options(initial_state=0.1):\n",
" return C.layers.Sequential([\n",
" C.layers.Embedding(emb_dim),\n",
" OneWordLookahead(),\n",
" Recurrence(LSTM(hidden_dim), go_backwards=False),\n",
" Dense(num_labels) \n",
" C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),\n",
" C.layers.Dense(num_labels) \n",
" ])\n",
"\n",
"do_train()\n",
@ -1080,7 +1124,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 20,
"metadata": {
"collapsed": false
},
@ -1090,40 +1134,37 @@
"output_type": "stream",
"text": [
"Training 541479 parameters in 9 parameter tensors.\n",
"Finished Epoch[1 of 16]: [Training] loss = 1.066098 * 18010, metric = 0.00% * 18010 6.280s (2867.8 samples/s);\n",
"Finished Epoch[2 of 16]: [Training] loss = 0.398866 * 18051, metric = 0.00% * 18051 6.111s (2953.9 samples/s);\n",
"Finished Epoch[3 of 16]: [Training] loss = 0.256761 * 17941, metric = 0.00% * 17941 6.036s (2972.3 samples/s);\n",
"Finished Epoch[4 of 16]: [Training] loss = 0.179482 * 18059, metric = 0.00% * 18059 6.233s (2897.3 samples/s);\n",
"Finished Epoch[5 of 16]: [Training] loss = 0.130943 * 17957, metric = 0.00% * 17957 6.140s (2924.6 samples/s);\n",
"Finished Epoch[6 of 16]: [Training] loss = 0.115195 * 18021, metric = 0.00% * 18021 6.050s (2978.7 samples/s);\n",
"Finished Epoch[7 of 16]: [Training] loss = 0.095112 * 17980, metric = 0.00% * 17980 6.009s (2992.2 samples/s);\n",
"Finished Epoch[8 of 16]: [Training] loss = 0.094233 * 18025, metric = 0.00% * 18025 6.143s (2934.2 samples/s);\n",
"Finished Epoch[9 of 16]: [Training] loss = 0.062660 * 17956, metric = 0.00% * 17956 6.010s (2987.7 samples/s);\n",
"Finished Epoch[10 of 16]: [Training] loss = 0.063548 * 18039, metric = 0.00% * 18039 6.121s (2947.1 samples/s);\n",
"Finished Epoch[11 of 16]: [Training] loss = 0.063781 * 17966, metric = 0.00% * 17966 6.054s (2967.6 samples/s);\n",
"Finished Epoch[12 of 16]: [Training] loss = 0.046256 * 18041, metric = 0.00% * 18041 6.104s (2955.6 samples/s);\n",
"Finished Epoch[13 of 16]: [Training] loss = 0.048350 * 17984, metric = 0.00% * 17984 6.316s (2847.4 samples/s);\n",
"Finished Epoch[14 of 16]: [Training] loss = 0.054151 * 17976, metric = 0.00% * 17976 6.018s (2987.0 samples/s);\n",
"Finished Epoch[15 of 16]: [Training] loss = 0.041818 * 18030, metric = 0.00% * 18030 6.241s (2889.0 samples/s);\n",
"Finished Epoch[16 of 16]: [Training] loss = 0.040104 * 18014, metric = 0.00% * 18014 6.159s (2924.8 samples/s);\n",
"Finished Evaluation [1]: Minibatch[1-23]: metric = 2.04% * 10984;\n"
"Learning rate per minibatch: 0.21\n",
"Finished Epoch[1 of 10]: [Training] loss = 0.773435 * 18010, metric = 14.75% * 18010 8.725s (2064.2 samples/s);\n",
"Finished Epoch[2 of 10]: [Training] loss = 0.189657 * 18051, metric = 4.34% * 18051 7.606s (2373.3 samples/s);\n",
"Finished Epoch[3 of 10]: [Training] loss = 0.113594 * 17941, metric = 2.65% * 17941 7.401s (2424.1 samples/s);\n",
"Finished Epoch[4 of 10]: [Training] loss = 0.066348 * 18059, metric = 1.59% * 18059 8.079s (2235.3 samples/s);\n",
"Learning rate per minibatch: 0.105\n",
"Finished Epoch[5 of 10]: [Training] loss = 0.040841 * 17957, metric = 0.96% * 17957 7.584s (2367.7 samples/s);\n",
"Finished Epoch[6 of 10]: [Training] loss = 0.037693 * 18021, metric = 0.83% * 18021 7.506s (2400.9 samples/s);\n",
"Finished Epoch[7 of 10]: [Training] loss = 0.034889 * 17980, metric = 0.76% * 17980 7.626s (2357.7 samples/s);\n",
"Finished Epoch[8 of 10]: [Training] loss = 0.026481 * 18025, metric = 0.66% * 18025 7.511s (2399.8 samples/s);\n",
"Finished Epoch[9 of 10]: [Training] loss = 0.014968 * 17956, metric = 0.35% * 17956 7.319s (2453.3 samples/s);\n",
"Finished Epoch[10 of 10]: [Training] loss = 0.015767 * 18039, metric = 0.45% * 18039 7.795s (2314.2 samples/s);\n",
"Finished Evaluation [1]: Minibatch[1-23]: metric = 0.32% * 10984;\n"
]
}
],
"source": [
"def BiRecurrence(fwd, bwd):\n",
" F = Recurrence(fwd)\n",
" G = Recurrence(bwd, go_backwards=True)\n",
" F = C.layers.Recurrence(fwd)\n",
" G = C.layers.Recurrence(bwd, go_backwards=True)\n",
" x = C.placeholder()\n",
" apply_x = splice (F(x), G(x))\n",
" apply_x = C.splice(F(x), G(x))\n",
" return apply_x \n",
"\n",
"def create_model():\n",
" with default_options(initial_state=0.1):\n",
" return Sequential([\n",
" Embedding(emb_dim),\n",
" BiRecurrence(LSTM(hidden_dim//2), LSTM(hidden_dim//2)),\n",
" Dense(num_labels)\n",
" with C.layers.default_options(initial_state=0.1):\n",
" return C.layers.Sequential([\n",
" C.layers.Embedding(emb_dim),\n",
" BiRecurrence(C.layers.LSTM(hidden_dim//2), \n",
" C.layers.LSTM(hidden_dim//2)),\n",
" C.layers.Dense(num_labels)\n",
" ])\n",
"\n",
"do_train()\n",
@ -1143,7 +1184,7 @@
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},