1. correct typo in the notebook
2. add headers to all the python files
3. add comments for train.py to explain what it does
Liqun Shao 2019-06-07 15:27:01 -04:00
Parent 01fdc9c82a
Commit 349958bafb
7 changed files with 30 additions and 78 deletions

View file

@@ -77,7 +77,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook serves as an introduction to an end-to-end NLP solution for sentence similarity building one of the advanced models - GenSen on AzureML platform. We show the advantages of AzureML when training large NLP models with GPU.\n",
"This notebook serves as an introduction to an end-to-end NLP solution for sentence similarity building one of the advanced models, GenSen, on the AzureML platform. We show the advantages of AzureML when training large NLP models with GPU.\n",
"\n",
"For more information on **AzureML**, see these resources:\n",
"* [Quickstart notebook](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-create-workspace-with-python)\n",
@@ -285,7 +285,7 @@
"metadata": {},
"source": [
"## 1.1 Load SNLI Dataset\n",
"We provide a function `load_pandas_df` which\n",
"We provide a function `load_pandas_df` which:\n",
"* Downloads the SNLI zipfile at the specified directory location\n",
"* Extracts the file based on the specified split\n",
"* Loads the split as a pandas dataframe"
@@ -958,7 +958,7 @@
"source": [
"**Horovod on AzureML**\n",
"\n",
"Horovod is a distributed training framework for TensorFlow, PyTorch etc. to make distributed Deap Learning fast and easy to use. We have created 2 nodes in the GPU cluster on AzureML. By using Horovod, we can use those two machines to train the model in parallel. In theory, the model trains faster on AzureML than on VM which uses single machine because it converges faster which we will get lower loss. However, by using more nodes, the model may take more time in communicating with each node. The communication time could be ignored when the model is trained on the large datasets.\n",
"Horovod is a distributed training framework for TensorFlow, PyTorch etc. to make distributed Deep Learning fast and easy to use. We have created 2 nodes in the GPU cluster on AzureML. By using Horovod, we can use those two machines to train the model in parallel. In theory, the model trains faster on AzureML than on VM which uses single machine because it converges faster which we will get lower loss. However, by using more nodes, the model may take more time in communicating with each node. The communication time could be ignored when the model is trained on the large datasets.\n",
"\n",
"AzureML can automatically create figures on the loss and time, which is eaiser to track the performance as in the following figure:\n",
"![best_val_loss](https://nlpbp.blob.core.windows.net/images/best_val_loss.PNG)**Validation Loss**"

View file

@@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""A Gated Recurrent Unit (GRU) cell with peepholes."""
import math
import torch

View file

@@ -1,5 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Creates a GenSen model from a MultiSeq2Seq model."""
import os
import pickle

View file

@@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""GenSen Encoder"""
import h5py
from sklearn.linear_model import LinearRegression
@@ -367,76 +370,3 @@ class GenSenSingle(nn.Module):
            return h.data.cpu().numpy(), h_t.data.cpu().numpy()
        else:
            return h, h_t
if __name__ == "__main__":
    # Sentences need to be lowercased.
    sentences = [
        "hello world .",
        "the quick brown fox jumped over the lazy dog .",
        "this is a sentence .",
    ]
    vocab = [
        "the",
        "quick",
        "brown",
        "fox",
        "jumped",
        "over",
        "lazy",
        "dog",
        "hello",
        "world",
        ".",
        "this",
        "is",
        "a",
        "sentence",
        "<s>",
        "</s>",
        "<pad>",
        "<unk>",
    ]
    ###########################
    ##### GenSenSingle ########
    ###########################
    gensen_1 = GenSenSingle(
        model_folder="./data/models",
        filename_prefix="nli_large_bothskip",
        pretrained_emb="./data/embedding/glove.840B.300d.h5",
    )
    reps_h, reps_h_t = gensen_1.get_representation(
        sentences, pool="last", return_numpy=True
    )
    # reps_h contains the hidden states for all words in all sentences (padded to the max length of sentences) (batch_size x seq_len x 2048)
    # reps_h_t contains only the last hidden state for all sentences in the minibatch (batch_size x 2048)
    print(reps_h.shape, reps_h_t.shape)
    # gensen_1 = GenSenSingle(
    #     model_folder='./data/models/example',
    #     filename_prefix='gensen.model',
    #     pretrained_emb='./data/embedding/glove.840B.300d.h5'
    # )
    # reps_h, reps_h_t = gensen_1.get_representation(
    #     sentences, pool='last', return_numpy=True
    # )
    # # reps_h contains the hidden states for all words in all sentences (padded to the max length of sentences) (batch_size x seq_len x 2048)
    # # reps_h_t contains only the last hidden state for all sentences in the minibatch (batch_size x 2048)
    # print(reps_h.shape, reps_h_t.shape)
    """
    gensen_2 = GenSenSingle(
        model_folder='./data/models',
        filename_prefix='nli_large_bothskip_parse',
        pretrained_emb='./data/embedding/glove.840B.300d.h5'
    )
    gensen = GenSen(gensen_1, gensen_2)
    reps_h, reps_h_t = gensen.get_representation(
        sentences, pool='last', return_numpy=True
    )
    # reps_h contains the hidden states for all words in all sentences (padded to the max length of sentences) (batch_size x seq_len x 2048)
    # reps_h_t contains only the last hidden state for all sentences in the minibatch (batch_size x 4096)
    print reps_h.shape, reps_h_t.shape
    """

View file

@@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Parent model for Multitask Training."""
import torch
import torch.nn as nn
@@ -7,7 +10,7 @@ from conditional_gru import ConditionalGRU
class MultitaskModel(nn.Module):
r"""A Multi Task Sequence to Sequence (Seq2Seq) model with GRUs.
"""A Multi Task Sequence to Sequence (Seq2Seq) model with GRUs.
Auxiliary NLI task trained jointly as well.
Ref: Multi-Task Sequence to Sequence Learning

View file

@@ -1,6 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Run script."""
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
The GenSen training process follows these steps:
1. Create or load the dataset vocabulary
2. Train on the training dataset, one update per minibatch (batch size = 48)
3. Evaluate on the validation dataset every 10 epochs
4. Find the local minimum of the validation loss
5. Save the best model and stop the training process
"""
import logging
import argparse
import os
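To make the training flow in the docstring above concrete, here is an illustrative, self-contained sketch of the evaluate-every-10-epochs / keep-the-best-model pattern. It uses a toy linear model and random data; it is not the actual train.py logic, only the control flow the docstring describes.

import torch
import torch.nn as nn

# Toy stand-ins so the loop is runnable; the real script builds a MultitaskModel
# and draws minibatches of 48 examples from the GenSen corpora.
model = nn.Linear(10, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()
train_batches = [(torch.randn(48, 10), torch.randn(48, 1)) for _ in range(20)]
val_batches = [(torch.randn(48, 10), torch.randn(48, 1)) for _ in range(5)]

def validate():
    model.eval()
    with torch.no_grad():
        return sum(loss_fn(model(x), y).item() for x, y in val_batches) / len(val_batches)

best_val_loss = float("inf")
patience, bad_evals = 3, 0                # stop after 3 evaluations without improvement
max_epochs = 100

for epoch in range(max_epochs):
    model.train()
    for x, y in train_batches:            # one parameter update per minibatch of 48
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:             # evaluate on the validation set every 10 epochs
        val_loss = validate()
        if val_loss < best_val_loss:      # new minimum of the validation loss
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_model.pt")
            bad_evals = 0
        else:
            bad_evals += 1
            if bad_evals >= patience:     # validation loss stopped improving: stop training
                break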

View file

@@ -1,5 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Minibatching utilities."""
import itertools
import operator