diff --git a/scenarios/sentence_similarity/gensen_aml_deep_dive.ipynb b/scenarios/sentence_similarity/gensen_aml_deep_dive.ipynb
index 8a40dc7..4148272 100644
--- a/scenarios/sentence_similarity/gensen_aml_deep_dive.ipynb
+++ b/scenarios/sentence_similarity/gensen_aml_deep_dive.ipynb
@@ -132,6 +132,13 @@
     "scrolled": true
    },
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Failure while loading azureml_run_type_providers. Failed to load entrypoint hyperdrive = azureml.train.hyperdrive:HyperDriveRun._from_run_dto with exception module 'azureml.train.hyperdrive' has no attribute 'HyperDriveRun'.\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -188,7 +195,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 2,
    "metadata": {
     "scrolled": true
    },
@@ -242,7 +249,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -285,7 +292,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -301,9 +308,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 5,
    "metadata": {
-    "scrolled": true
+    "scrolled": false
    },
    "outputs": [
     {
@@ -484,7 +491,7 @@
        "4  2267923837.jpg#2r1e     entailment    NaN    NaN    NaN    NaN  "
       ]
      },
-     "execution_count": 22,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -511,7 +518,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -591,7 +598,7 @@
        "4                         There are children present  "
       ]
      },
-     "execution_count": 23,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -622,7 +629,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -643,7 +650,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 8,
    "metadata": {
     "scrolled": true
    },
@@ -654,7 +661,7 @@
        "'../../data\\\\clean/snli_1.0'"
       ]
      },
-     "execution_count": 25,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -682,7 +689,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -691,7 +698,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -713,29 +720,29 @@
    "source": [
     "**Prerequisites:**\n",
     "\n",
-    "Upload the all the local files under `data_folder` to the path `./data/processed/` on the default datastore.\n",
+    "Upload the all the local files under `data_folder` to the default datastore.\n",
     "\n",
     "**Note: To download data required to train a GenSen model in the original paper, run code [here](https://github.com/Maluuba/gensen/blob/master/get_data.sh). By training on the original datasets (training time around 20 hours), it will reproduce the results in the [paper](https://arxiv.org/abs/1804.00079). For simplicity, we will train on a smaller dataset, which is SNLI preprocessed in [1 Data Loading and Preprocessing](#1-Data-Loading-and-Preprocessing) for showcasing the example.**"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "$AZUREML_DATAREFERENCE_6faee69b569b4268b8bf027b0bb4fd73"
+       "$AZUREML_DATAREFERENCE_liqungensen"
       ]
      },
-     "execution_count": 29,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "ds.upload(src_dir=data_folder, target_path=\"data/processed\", overwrite=True, show_progress=False)"
+    "ds.upload(src_dir=data_folder, overwrite=True, show_progress=False)"
    ]
   },
   {
@@ -769,7 +776,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -777,13 +784,13 @@
      "output_type": "stream",
      "text": [
       "Found existing compute target.\n",
-      "{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-06-14T20:39:31.676000+00:00', 'errors': None, 'creationTime': '2019-06-03T21:18:34.507970+00:00', 'modifiedTime': '2019-06-03T21:18:50.790782+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 8, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
+      "{'currentNodeCount': 2, 'targetNodeCount': 2, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 2, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-06-21T20:14:04.778000+00:00', 'errors': None, 'creationTime': '2019-06-19T02:57:39.833104+00:00', 'modifiedTime': '2019-06-19T02:58:11.339451+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
      ]
     }
    ],
    "source": [
     "# choose a name for your cluster\n",
-    "cluster_name = \"gpugensen\"\n",
+    "cluster_name = \"gensen-mlflow\"\n",
     "\n",
     "try:\n",
     "    compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
@@ -791,7 +798,7 @@
     "except ComputeTargetException:\n",
     "    print('Creating a new compute target...')\n",
     "    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
-    "                                                           max_nodes=8)\n",
+    "                                                           max_nodes=2)\n",
     "\n",
     "    # create the cluster\n",
     "    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
@@ -814,7 +821,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -836,19 +843,11 @@
    "metadata": {},
    "source": [
     "### 2.3.1 Prepare Training Script\n",
-    "Now you will need to create your training script. In this tutorial, the script for distributed training of GENSEN is already provided for you at `train.py`. In practice, you should be able to take any custom PyTorch training script as is and run it with Azure ML without having to modify your code.\n",
+    "Now you will need to create your training script. In this tutorial, the script for distributed training of GENSEN is already provided for you at `gensen_train.py`. \n",
     "\n",
-    "However, if you would like to use Azure ML's [metric logging](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#logging) capabilities, you will have to add a small amount of Azure ML logic inside your training script. In this example, at each logging interval, we will log the loss for that minibatch to our Azure ML run.\n",
+    "In this example, we use MLflow to log your metrics. We also use the [Azure ML-Mlflow](https://pypi.org/project/azureml-mlflow/) package to log these metrics to the Azure Portal. This is done with no change to the provided training script!\n",
     "\n",
-    "To do so, in `train.py`, we will first access the Azure ML `Run` object within the script:\n",
-    "```Python\n",
-    "from azureml.core.run import Run\n",
-    "run = Run.get_context()\n",
-    "```\n",
-    "Later within the script, we log the loss metric to our run:\n",
-    "```Python\n",
-    "run.log('loss', loss.item())\n",
-    "```"
+    "In this example the script provided logs the loss for that minibatch to our Azure ML portal."
    ]
   },
   {
@@ -872,7 +871,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -881,7 +880,7 @@
        "'../../utils_nlp/gensen/gensen_config.json'"
       ]
      },
-     "execution_count": 36,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -902,7 +901,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -917,12 +916,12 @@
     "### 2.3.3 Create a PyTorch Estimator\n",
     "The Azure ML SDK's PyTorch estimator enables you to easily submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch).\n",
     "\n",
-    "`sample_config.json` defines all the hyperparameters and paths when training GenSen model. The trained model will be saved in `data/models/example` to Azure Blob Storage. **Remember to clean `data/models/example` folder in order to save new models.**"
+    "`gensen_config.json` defines all the hyperparameters and paths when training GenSen model. The trained model will be saved in `models` to Azure Blob Storage. **Remember to clean `models` folder in order to save new models.**"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -942,11 +941,12 @@
     "                    script_params=script_params,\n",
     "                    compute_target=compute_target,\n",
     "                    entry_script='utils_nlp/gensen/gensen_train.py',\n",
-    "                    node_count=4,\n",
+    "                    node_count=2,\n",
     "                    process_count_per_node=1,\n",
     "                    distributed_training=MpiConfiguration(),\n",
     "                    use_gpu=True,\n",
-    "                    conda_packages=['scikit-learn=0.20.3', 'h5py', 'nltk']\n",
+    "                    conda_packages=['scikit-learn=0.20.3', 'h5py', 'nltk'],\n",
+    "                    pip_packages=['azureml-mlflow>=1.0.43.1','numpy>=1.16.0']\n",
     "                   )"
    ]
   },
@@ -979,7 +979,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 9,
    "metadata": {
     "scrolled": true
    },
@@ -988,7 +988,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Submitting C:\\Users\\lishao\\Project\\Rotation2\\NLP directory for run. The size of the directory >= 25 MB, so it can take a few minutes.\n"
+      "Submitting E:\\Projects\\NLP-BP\\temp\\nlp directory for run. The size of the directory >= 25 MB, so it can take a few minutes.\n"
      ]
     },
     {
@@ -996,9 +996,9 @@
      "output_type": "stream",
      "text": [
       "Run(Experiment: pytorch-gensen,\n",
-      "Id: pytorch-gensen_1560797674_e36e44f4,\n",
+      "Id: pytorch-gensen_1561150688_f84eab04,\n",
       "Type: azureml.scriptrun,\n",
-      "Status: Preparing)\n"
+      "Status: Queued)\n"
      ]
     }
    ],
@@ -1019,7 +1019,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Horovod on AzureML**\n",
+    "#### Horovod on AzureML\n",
     "\n",
     "[Horovod](https://github.com/horovod/horovod) is a distributed training framework for TensorFlow, PyTorch etc. to make distributed Deep Learning fast and easy to use. We have created 2 nodes in the GPU cluster on AzureML. By using Horovod, we can use those two machines to train the model in parallel. In theory, the model trains faster on AzureML than on VM which uses single machine because it converges faster which we will get lower loss. However, by using more nodes, the model may take more time in communicating with each node. The communication time could be ignored when the model is trained on the large datasets.\n",
     "\n",
@@ -1031,7 +1031,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Interpret the Training Results**\n",
+    "#### Interpret the Training Results\n",
     "\n",
     "The following chart shows the model validation loss (the less loss, the better performance) with different nodes with AmlCompute:\n",
     "\n",
@@ -1042,28 +1042,29 @@
     "From the chart, we can tell training with more nodes, the performance is getting better with lower loss."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The Azureml Widget allows an easy way to stream updates of the logged metrics right into your notebook. To use this feature install the widget by running the commands below. \n",
+    "\n",
+    "```\n",
+    "conda install ipywidgets\n",
+    "\n",
+    "jupyter nbextension install --py --user azureml.widgets\n",
+    "\n",
+    "jupyter nbextension enable azureml.widgets --user --py\n",
+    "\n",
+    "```"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": null,
    "metadata": {
-    "scrolled": true
+    "scrolled": false
    },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "19d55fcc0871444da604b1d828d9eac4",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "RunDetails(run).show()"
    ]
@@ -1085,7 +1086,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "run.wait_for_completion(show_output=True) # this provides a verbose log"
@@ -1114,6 +1117,35 @@
     " ```"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.3.6 Clean up after training\n",
+    "\n",
+    "We finally delete the training script `gensen_train.py` and config file `gensen_config.json` from the project directory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gensen_train = os.path.join(project_folder,'utils_nlp/gensen/gensen_train.py')\n",
+    "gensen_config = os.path.join(project_folder,'utils_nlp/gensen/gensen_config.json')\n",
+    "\n",
+    "if os.path.isfile(gensen_train):\n",
+    "    os.remove(gensen_train)\n",
+    "else:\n",
+    "    print(\"Error: %s file not found\" % gensen_train)\n",
+    "    \n",
+    "if os.path.isfile(gensen_config):\n",
+    "    os.remove(gensen_config)\n",
+    "else:\n",
+    "    print(\"Error: %s file not found\" % gensen_config)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1135,7 +1167,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 130,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1164,7 +1196,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1194,54 +1226,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 132,
+   "execution_count": null,
    "metadata": {
     "scrolled": false
    },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c61e610d4601486e9f41fd852320b47b",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO',…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5c47f13e11c646cd865d4f286b70ab0c",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "RunDetails(hyperdrive_run).show()"
    ]
@@ -1331,9 +1320,9 @@
    }
   ],
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python NLP CPU",
    "language": "python",
-   "name": "python3"
+   "name": "nlp_cpu"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/scenarios/sentence_similarity/gensen_config.json b/scenarios/sentence_similarity/gensen_config.json
index f7e47a7..54aff67 100644
--- a/scenarios/sentence_similarity/gensen_config.json
+++ b/scenarios/sentence_similarity/gensen_config.json
@@ -15,20 +15,20 @@
   },
   "data": {"paths": [
         {
-            "train_src": "data/processed/snli_1.0_train.txt.s1.tok",
-            "train_trg": "data/processed/snli_1.0_train.txt.s2.tok",
-            "val_src": "data/processed/snli_1.0_dev.txt.s1.tok",
-            "val_trg": "data/processed/snli_1.0_dev.txt.s1.tok",
+            "train_src": "snli_1.0_train.txt.s1.tok",
+            "train_trg": "snli_1.0_train.txt.s2.tok",
+            "val_src": "snli_1.0_dev.txt.s1.tok",
+            "val_trg": "snli_1.0_dev.txt.s1.tok",
             "taskname": "snli"
         }
     ],
         "max_src_length": 90,
         "max_trg_length": 90,
         "task": "multi-seq2seq-nli",
-        "save_dir": "data/models/example",
-        "nli_train": "data/processed/snli_1.0_train.txt.clean.noblank",
-        "nli_dev": "data/processed/snli_1.0_dev.txt.clean.noblank",
-        "nli_test": "data/processed/snli_1.0_test.txt.clean.noblank"
+        "save_dir": "models/",
+        "nli_train": "snli_1.0_train.txt.clean.noblank",
+        "nli_dev": "snli_1.0_dev.txt.clean.noblank",
+        "nli_test": "snli_1.0_test.txt.clean.noblank"
 	},
     "model": {
     	"dim_src": 2048,
diff --git a/scenarios/sentence_similarity/gensen_train.py b/scenarios/sentence_similarity/gensen_train.py
index 7b704a1..253c75c 100644
--- a/scenarios/sentence_similarity/gensen_train.py
+++ b/scenarios/sentence_similarity/gensen_train.py
@@ -15,20 +15,20 @@ AzureML provides AI Compute to train the model and track the performance.
 This training process is based on GPU only.
 
 """
-import logging
 import argparse
-import os
 import json
+import logging
+import os
 import time
 
+import horovod.torch as hvd
+import mlflow
 import numpy as np
 import torch
 import torch.backends.cudnn as cudnn
 import torch.nn as nn
 import torch.nn.functional as f
 import torch.optim as optim
-from azureml.core.run import Run
-import horovod.torch as hvd
 
 from utils_nlp.gensen.multi_task_model import MultitaskModel
 from utils_nlp.gensen.utils import (
@@ -37,10 +37,8 @@ from utils_nlp.gensen.utils import (
     compute_validation_loss,
 )
 
-# get the Azure ML run object
-run = Run.get_context()
-
 cudnn.benchmark = True
+logger = logging.getLogger(__name__)
 
 hvd.init()
 if torch.cuda.is_available():
@@ -172,7 +170,9 @@ def evaluate(
         # Horovod: print output only on first rank.
         if hvd.rank() == 0:
             # log the best val accuracy to AML run
-            run.log("Best Validation Loss", np.float(validation_loss))
+            logging.info(
+                "Best Validation Loss: {}".format(np.float(validation_loss))
+            )
 
         # If the validation loss is small enough, and it starts to go up.
         # Should stop training.
@@ -182,8 +182,6 @@ def evaluate(
             min_val_loss_epoch = monitor_epoch
             model_state = model.state_dict()
 
-        run.log("Validation Loss", validation_loss)
-        print(monitor_epoch, min_val_loss_epoch, min_val_loss)
         logging.info(
             "Monitor epoch: %d Validation Loss:  %.3f Min Validation Epoch: "
             "%d Loss : %.3f "
@@ -275,312 +273,333 @@ def train(config, data_folder, learning_rate=0.0001):
         config(dict): Loaded json file as a python object.
         data_folder(str): Path to the folder containing the data.
         learning_rate(float): Learning rate for the model.
-
     """
     owd = os.getcwd()
+    os.chdir(data_folder)
 
     try:
-        save_dir = config["data"]["save_dir"]
+        with mlflow.start_run():
+            save_dir = config["data"]["save_dir"]
+            if not os.path.exists("./log"):
+                os.makedirs("./log")
 
-        os.chdir(data_folder)
+            os.makedirs(save_dir, exist_ok=True)
 
-        if not os.path.exists("./log"):
-            os.makedirs("./log")
+            setup_logging(config)
 
-        os.makedirs(save_dir, exist_ok=True)
+            batch_size = config["training"]["batch_size"]
+            src_vocab_size = config["model"]["n_words_src"]
+            trg_vocab_size = config["model"]["n_words_trg"]
+            max_len_src = config["data"]["max_src_length"]
+            max_len_trg = config["data"]["max_trg_length"]
+            model_state = {}
 
-        setup_logging(config)
+            train_src = [item["train_src"] for item in config["data"]["paths"]]
+            train_trg = [item["train_trg"] for item in config["data"]["paths"]]
+            tasknames = [item["taskname"] for item in config["data"]["paths"]]
 
-        batch_size = config["training"]["batch_size"]
-        src_vocab_size = config["model"]["n_words_src"]
-        trg_vocab_size = config["model"]["n_words_trg"]
-        max_len_src = config["data"]["max_src_length"]
-        max_len_trg = config["data"]["max_trg_length"]
-        model_state = {}
-
-        train_src = [item["train_src"] for item in config["data"]["paths"]]
-        train_trg = [item["train_trg"] for item in config["data"]["paths"]]
-        tasknames = [item["taskname"] for item in config["data"]["paths"]]
-
-        # Keep track of indicies to train forward and backward jointly
-        if (
-            "skipthought_next" in tasknames
-            and "skipthought_previous" in tasknames
-        ):
-            skipthought_idx = tasknames.index("skipthought_next")
-            skipthought_backward_idx = tasknames.index("skipthought_previous")
-            paired_tasks = {
-                skipthought_idx: skipthought_backward_idx,
-                skipthought_backward_idx: skipthought_idx,
-            }
-        else:
-            paired_tasks = None
-            skipthought_idx = None
-            skipthought_backward_idx = None
-
-        train_iterator = BufferedDataIterator(
-            train_src,
-            train_trg,
-            src_vocab_size,
-            trg_vocab_size,
-            tasknames,
-            save_dir,
-            buffer_size=1e6,
-            lowercase=True,
-            seed=(hvd.rank() + 1) * 12345,
-        )
-
-        nli_iterator = NLIIterator(
-            train=config["data"]["nli_train"],
-            dev=config["data"]["nli_dev"],
-            test=config["data"]["nli_test"],
-            vocab_size=-1,
-            vocab=os.path.join(save_dir, "src_vocab.pkl"),
-            seed=(hvd.rank() + 1) * 12345,
-        )
-
-        src_vocab_size = len(train_iterator.src[0]["word2id"])
-        trg_vocab_size = len(train_iterator.trg[0]["word2id"])
-
-        # Logging set up.
-        logging.info("Finished creating iterator ...")
-        log_config(config)
-        logging.info(
-            "Found %d words in source : "
-            % (len(train_iterator.src[0]["id2word"]))
-        )
-        for idx, taskname in enumerate(tasknames):
-            logging.info(
-                "Found %d target words in task %s "
-                % (len(train_iterator.trg[idx]["id2word"]), taskname)
-            )
-        logging.info("Found %d words in src " % src_vocab_size)
-        logging.info("Found %d words in trg " % trg_vocab_size)
-
-        weight_mask = torch.ones(trg_vocab_size).cuda()
-        weight_mask[train_iterator.trg[0]["word2id"]["<pad>"]] = 0
-        loss_criterion = nn.CrossEntropyLoss(weight=weight_mask).cuda()
-        nli_criterion = nn.CrossEntropyLoss().cuda()
-
-        model = MultitaskModel(
-            src_emb_dim=config["model"]["dim_word_src"],
-            trg_emb_dim=config["model"]["dim_word_trg"],
-            src_vocab_size=src_vocab_size,
-            trg_vocab_size=trg_vocab_size,
-            src_hidden_dim=config["model"]["dim_src"],
-            trg_hidden_dim=config["model"]["dim_trg"],
-            bidirectional=config["model"]["bidirectional"],
-            pad_token_src=train_iterator.src[0]["word2id"]["<pad>"],
-            pad_token_trg=train_iterator.trg[0]["word2id"]["<pad>"],
-            nlayers_src=config["model"]["n_layers_src"],
-            dropout=config["model"]["dropout"],
-            num_tasks=len(train_iterator.src),
-            paired_tasks=paired_tasks,
-        ).cuda()
-
-        optimizer = setup_horovod(model, learning_rate=learning_rate)
-        logging.info(model)
-
-        n_gpus = config["training"]["n_gpus"]
-        model = torch.nn.DataParallel(model, device_ids=range(n_gpus))
-
-        task_losses = [[] for _ in tasknames]
-        task_idxs = [0 for _ in tasknames]
-        nli_losses = []
-        updates = 0
-        nli_ctr = 0
-        nli_epoch = 0
-        monitor_epoch = 0
-        nli_mbatch_ctr = 0
-        mbatch_times = []
-        min_val_loss = 10000000
-        min_val_loss_epoch = -1
-        rng_num_tasks = len(tasknames) - 1 if paired_tasks else len(tasknames)
-        logging.info("Commencing Training ...")
-        start = time.time()
-        while True:
-            # Train NLI once every 10 minibatches of other tasks
-            if nli_ctr % 10 == 0:
-                minibatch = nli_iterator.get_parallel_minibatch(
-                    nli_mbatch_ctr, batch_size * n_gpus
+            # Keep track of indicies to train forward and backward jointly
+            if (
+                "skipthought_next" in tasknames
+                and "skipthought_previous" in tasknames
+            ):
+                skipthought_idx = tasknames.index("skipthought_next")
+                skipthought_backward_idx = tasknames.index(
+                    "skipthought_previous"
                 )
-                optimizer.zero_grad()
-                class_logits = model(
-                    minibatch, -1, return_hidden=False, paired_trg=None
-                )
-
-                loss = nli_criterion(
-                    class_logits.contiguous().view(-1, class_logits.size(1)),
-                    minibatch["labels"].contiguous().view(-1),
-                )
-
-                # nli_losses.append(loss.data[0])
-                nli_losses.append(loss.item())
-                loss.backward()
-                torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
-                optimizer.step()
-
-                # For AML.
-                run.log("loss", loss.item())
-
-                nli_mbatch_ctr += batch_size * n_gpus
-                if nli_mbatch_ctr >= len(nli_iterator.train_lines):
-                    nli_mbatch_ctr = 0
-                    nli_epoch += 1
+                paired_tasks = {
+                    skipthought_idx: skipthought_backward_idx,
+                    skipthought_backward_idx: skipthought_idx,
+                }
             else:
-                # Sample a random task
-                task_idx = np.random.randint(low=0, high=rng_num_tasks)
+                paired_tasks = None
+                skipthought_idx = None
+                skipthought_backward_idx = None
 
-                # Get a minibatch corresponding to the sampled task
-                minibatch = train_iterator.get_parallel_minibatch(
-                    task_idx,
-                    task_idxs[task_idx],
-                    batch_size * n_gpus,
-                    max_len_src,
-                    max_len_trg,
+            train_iterator = BufferedDataIterator(
+                train_src,
+                train_trg,
+                src_vocab_size,
+                trg_vocab_size,
+                tasknames,
+                save_dir,
+                buffer_size=1e6,
+                lowercase=True,
+                seed=(hvd.rank() + 1) * 12345,
+            )
+
+            nli_iterator = NLIIterator(
+                train=config["data"]["nli_train"],
+                dev=config["data"]["nli_dev"],
+                test=config["data"]["nli_test"],
+                vocab_size=-1,
+                vocab=os.path.join(save_dir, "src_vocab.pkl"),
+                seed=(hvd.rank() + 1) * 12345,
+            )
+
+            src_vocab_size = len(train_iterator.src[0]["word2id"])
+            trg_vocab_size = len(train_iterator.trg[0]["word2id"])
+
+            # Logging set up.
+            logging.info("Finished creating iterator ...")
+            log_config(config)
+            logging.info(
+                "Found %d words in source : "
+                % (len(train_iterator.src[0]["id2word"]))
+            )
+            for idx, taskname in enumerate(tasknames):
+                logging.info(
+                    "Found %d target words in task %s "
+                    % (len(train_iterator.trg[idx]["id2word"]), taskname)
                 )
+            logging.info("Found %d words in src " % src_vocab_size)
+            logging.info("Found %d words in trg " % trg_vocab_size)
 
-                """Increment pointer into task and if current buffer is 
-                exhausted, fetch new buffer. """
-                task_idxs[task_idx] += batch_size * n_gpus
-                if task_idxs[task_idx] >= train_iterator.buffer_size:
-                    train_iterator.fetch_buffer(task_idx)
-                    task_idxs[task_idx] = 0
+            weight_mask = torch.ones(trg_vocab_size).cuda()
+            weight_mask[train_iterator.trg[0]["word2id"]["<pad>"]] = 0
+            loss_criterion = nn.CrossEntropyLoss(weight=weight_mask).cuda()
+            nli_criterion = nn.CrossEntropyLoss().cuda()
 
-                if task_idx == skipthought_idx:
-                    minibatch_back = train_iterator.get_parallel_minibatch(
-                        skipthought_backward_idx,
-                        task_idxs[skipthought_backward_idx],
+            model = MultitaskModel(
+                src_emb_dim=config["model"]["dim_word_src"],
+                trg_emb_dim=config["model"]["dim_word_trg"],
+                src_vocab_size=src_vocab_size,
+                trg_vocab_size=trg_vocab_size,
+                src_hidden_dim=config["model"]["dim_src"],
+                trg_hidden_dim=config["model"]["dim_trg"],
+                bidirectional=config["model"]["bidirectional"],
+                pad_token_src=train_iterator.src[0]["word2id"]["<pad>"],
+                pad_token_trg=train_iterator.trg[0]["word2id"]["<pad>"],
+                nlayers_src=config["model"]["n_layers_src"],
+                dropout=config["model"]["dropout"],
+                num_tasks=len(train_iterator.src),
+                paired_tasks=paired_tasks,
+            ).cuda()
+
+            optimizer = setup_horovod(model, learning_rate=learning_rate)
+            logging.info(model)
+
+            n_gpus = config["training"]["n_gpus"]
+            model = torch.nn.DataParallel(model, device_ids=range(n_gpus))
+
+            task_losses = [[] for _ in tasknames]
+            task_idxs = [0 for _ in tasknames]
+            nli_losses = []
+            updates = 0
+            nli_ctr = 0
+            nli_epoch = 0
+            monitor_epoch = 0
+            nli_mbatch_ctr = 0
+            mbatch_times = []
+            min_val_loss = 10000000
+            min_val_loss_epoch = -1
+            rng_num_tasks = (
+                len(tasknames) - 1 if paired_tasks else len(tasknames)
+            )
+            logging.info("OS Environ: \n {} \n\n".format(os.environ))
+            mlflow.log_param("learning_rate", learning_rate)
+            logging.info("Commencing Training ...")
+            start = time.time()
+            while True:
+                batch_start_time = time.time()
+                # Train NLI once every 10 minibatches of other tasks
+                if nli_ctr % 10 == 0:
+                    minibatch = nli_iterator.get_parallel_minibatch(
+                        nli_mbatch_ctr, batch_size * n_gpus
+                    )
+                    optimizer.zero_grad()
+                    class_logits = model(
+                        minibatch, -1, return_hidden=False, paired_trg=None
+                    )
+
+                    loss = nli_criterion(
+                        class_logits.contiguous().view(
+                            -1, class_logits.size(1)
+                        ),
+                        minibatch["labels"].contiguous().view(-1),
+                    )
+
+                    # nli_losses.append(loss.data[0])
+                    nli_losses.append(loss.item())
+                    loss.backward()
+                    torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
+                    optimizer.step()
+
+                    nli_mbatch_ctr += batch_size * n_gpus
+                    if nli_mbatch_ctr >= len(nli_iterator.train_lines):
+                        nli_mbatch_ctr = 0
+                        nli_epoch += 1
+                else:
+                    # Sample a random task
+                    task_idx = np.random.randint(low=0, high=rng_num_tasks)
+
+                    # Get a minibatch corresponding to the sampled task
+                    minibatch = train_iterator.get_parallel_minibatch(
+                        task_idx,
+                        task_idxs[task_idx],
                         batch_size * n_gpus,
                         max_len_src,
                         max_len_trg,
                     )
-                    task_idxs[skipthought_backward_idx] += batch_size * n_gpus
-                    if (
-                        task_idxs[skipthought_backward_idx]
-                        >= train_iterator.buffer_size
-                    ):
-                        train_iterator.fetch_buffer(skipthought_backward_idx)
-                        task_idxs[skipthought_backward_idx] = 0
 
-                    optimizer.zero_grad()
-                    decoder_logit, decoder_logit_2 = model(
-                        minibatch,
-                        task_idx,
-                        paired_trg=minibatch_back["input_trg"],
-                    )
+                    """Increment pointer into task and if current buffer is 
+                    exhausted, fetch new buffer. """
+                    task_idxs[task_idx] += batch_size * n_gpus
+                    if task_idxs[task_idx] >= train_iterator.buffer_size:
+                        train_iterator.fetch_buffer(task_idx)
+                        task_idxs[task_idx] = 0
 
-                    loss_f = loss_criterion(
-                        decoder_logit.contiguous().view(
-                            -1, decoder_logit.size(2)
-                        ),
-                        minibatch["output_trg"].contiguous().view(-1),
-                    )
+                    if task_idx == skipthought_idx:
+                        minibatch_back = train_iterator.get_parallel_minibatch(
+                            skipthought_backward_idx,
+                            task_idxs[skipthought_backward_idx],
+                            batch_size * n_gpus,
+                            max_len_src,
+                            max_len_trg,
+                        )
+                        task_idxs[skipthought_backward_idx] += (
+                            batch_size * n_gpus
+                        )
+                        if (
+                            task_idxs[skipthought_backward_idx]
+                            >= train_iterator.buffer_size
+                        ):
+                            train_iterator.fetch_buffer(
+                                skipthought_backward_idx
+                            )
+                            task_idxs[skipthought_backward_idx] = 0
 
-                    loss_b = loss_criterion(
-                        decoder_logit_2.contiguous().view(
-                            -1, decoder_logit_2.size(2)
-                        ),
-                        minibatch_back["output_trg"].contiguous().view(-1),
-                    )
+                        optimizer.zero_grad()
+                        decoder_logit, decoder_logit_2 = model(
+                            minibatch,
+                            task_idx,
+                            paired_trg=minibatch_back["input_trg"],
+                        )
 
-                    task_losses[task_idx].append(loss_f.data[0])
-                    task_losses[skipthought_backward_idx].append(
-                        loss_b.data[0]
-                    )
-                    loss = loss_f + loss_b
+                        loss_f = loss_criterion(
+                            decoder_logit.contiguous().view(
+                                -1, decoder_logit.size(2)
+                            ),
+                            minibatch["output_trg"].contiguous().view(-1),
+                        )
 
-                else:
-                    optimizer.zero_grad()
-                    decoder_logit = model(minibatch, task_idx)
+                        loss_b = loss_criterion(
+                            decoder_logit_2.contiguous().view(
+                                -1, decoder_logit_2.size(2)
+                            ),
+                            minibatch_back["output_trg"].contiguous().view(-1),
+                        )
 
-                    loss = loss_criterion(
-                        decoder_logit.contiguous().view(
-                            -1, decoder_logit.size(2)
-                        ),
-                        minibatch["output_trg"].contiguous().view(-1),
-                    )
+                        task_losses[task_idx].append(loss_f.data[0])
+                        task_losses[skipthought_backward_idx].append(
+                            loss_b.data[0]
+                        )
+                        loss = loss_f + loss_b
 
-                    task_losses[task_idx].append(loss.item())
+                    else:
+                        optimizer.zero_grad()
+                        decoder_logit = model(minibatch, task_idx)
 
-                loss.backward()
-                # For distributed optimizer need to sync before gradient
-                # clipping.
-                optimizer.synchronize()
+                        loss = loss_criterion(
+                            decoder_logit.contiguous().view(
+                                -1, decoder_logit.size(2)
+                            ),
+                            minibatch["output_trg"].contiguous().view(-1),
+                        )
 
-                torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
-                optimizer.step()
+                        task_losses[task_idx].append(loss.item())
 
-            end = time.time()
-            mbatch_times.append(end - start)
+                    loss.backward()
+                    # For distributed optimizer need to sync before gradient
+                    # clipping.
+                    optimizer.synchronize()
 
-            # Validations
-            if (
-                updates % config["management"]["monitor_loss"] == 0
-                and updates != 0
-            ):
-                monitor_epoch += 1
-                for idx, task in enumerate(tasknames):
-                    logging.info(
-                        "Seq2Seq Examples Processed : %d %s Loss : %.5f Num %s "
-                        "minibatches : %d"
-                        % (
-                            updates,
-                            task,
+                    torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
+                    optimizer.step()
+
+                end = time.time()
+                mbatch_times.append(end - batch_start_time)
+
+                # Validations
+                if (
+                    updates % config["management"]["monitor_loss"] == 0
+                    and updates != 0
+                ):
+                    monitor_epoch += 1
+                    for idx, task in enumerate(tasknames):
+                        logging.info(
+                            "Seq2Seq Examples Processed : %d %s Loss : %.5f Num %s "
+                            "minibatches : %d"
+                            % (
+                                updates,
+                                task,
+                                np.mean(task_losses[idx]),
+                                task,
+                                len(task_losses[idx]),
+                            )
+                        )
+                        mlflow.log_metric(
+                            "validation_loss",
                             np.mean(task_losses[idx]),
-                            task,
-                            len(task_losses[idx]),
+                            step=monitor_epoch,
+                        )
+
+                    logging.info(
+                        "Round: %d NLI Epoch : %d NLI Examples Processed : %d NLI "
+                        "Loss : %.5f "
+                        % (
+                            nli_ctr,
+                            nli_epoch,
+                            nli_mbatch_ctr,
+                            np.mean(nli_losses),
                         )
                     )
-                    run.log("Task Loss", np.mean(task_losses[idx]))
+                    mlflow.log_metric(
+                        "nli_loss", np.mean(nli_losses), step=nli_epoch
+                    )
 
-                logging.info(
-                    "Round: %d NLI Epoch : %d NLI Examples Processed : %d NLI "
-                    "Loss : %.5f "
-                    % (nli_ctr, nli_epoch, nli_mbatch_ctr, np.mean(nli_losses))
-                )
-                run.log("NLI Loss", np.mean(nli_losses))
-                logging.info(
-                    "Average time per mininbatch : %.5f"
-                    % (np.mean(mbatch_times))
-                )
-                run.log(
-                    "Average time per mininbatch : ", np.mean(mbatch_times)
-                )
-                task_losses = [[] for _ in tasknames]
-                mbatch_times = []
-                nli_losses = []
+                    logging.info(
+                        "Average time per mininbatch : %.5f"
+                        % (np.mean(mbatch_times))
+                    )
+                    mlflow.log_metric(
+                        "minibatch_avg_duration", np.mean(mbatch_times)
+                    )
 
-                # For validate and break if done.
-                logging.info("############################")
-                logging.info("##### Evaluating model #####")
-                logging.info("############################")
-                training_complete, min_val_loss_epoch, min_val_loss, model_state = evaluate(
-                    config=config,
-                    train_iterator=train_iterator,
-                    model=model,
-                    loss_criterion=loss_criterion,
-                    monitor_epoch=monitor_epoch,
-                    min_val_loss=min_val_loss,
-                    min_val_loss_epoch=min_val_loss_epoch,
-                    save_dir=save_dir,
-                    starting_time=start,
-                    model_state=model_state,
-                )
-                if training_complete:
-                    break
+                    task_losses = [[] for _ in tasknames]
+                    mbatch_times = []
+                    nli_losses = []
 
-                logging.info("Evaluating on NLI")
-                evaluate_nli(
-                    nli_iterator=nli_iterator,
-                    model=model,
-                    n_gpus=n_gpus,
-                    batch_size=batch_size,
-                )
+                    # For validate and break if done.
+                    logging.info("############################")
+                    logging.info("##### Evaluating model #####")
+                    logging.info("############################")
+                    training_complete, min_val_loss_epoch, min_val_loss, model_state = evaluate(
+                        config=config,
+                        train_iterator=train_iterator,
+                        model=model,
+                        loss_criterion=loss_criterion,
+                        monitor_epoch=monitor_epoch,
+                        min_val_loss=min_val_loss,
+                        min_val_loss_epoch=min_val_loss_epoch,
+                        save_dir=save_dir,
+                        starting_time=start,
+                        model_state=model_state,
+                    )
+                    if training_complete:
+                        break
 
-            updates += batch_size * n_gpus
-            nli_ctr += 1
-            logging.info("Updates: %d" % updates)
+                    logging.info("Evaluating on NLI")
+                    evaluate_nli(
+                        nli_iterator=nli_iterator,
+                        model=model,
+                        n_gpus=n_gpus,
+                        batch_size=batch_size,
+                    )
+
+                updates += batch_size * n_gpus
+                nli_ctr += 1
+                logging.info("Updates: %d" % updates)
     finally:
         os.chdir(owd)
 
diff --git a/tests/unit/test_word_embeddings.py b/tests/unit/test_word_embeddings.py
index c1eb0e3..6ac9391 100644
--- a/tests/unit/test_word_embeddings.py
+++ b/tests/unit/test_word_embeddings.py
@@ -38,6 +38,7 @@ def test_load_pretrained_vectors_word2vec():
 
     assert isinstance(load_word2vec(dir_path), Word2VecKeyedVectors)
 
+
 def test_load_pretrained_vectors_glove():
     dir_path = "temp_data/"
     file_path = os.path.join(
@@ -58,7 +59,8 @@ def test_load_pretrained_vectors_glove():
 
 def test_load_pretrained_vectors_fasttext():
     dir_path = "temp_data/"
-    file_path = os.path.join(os.path.join(dir_path, "fastText"), "wiki.simple.bin")
+    file_path = os.path.join(os.path.join(dir_path, "fastText"),
+                             "wiki.simple.bin")
 
     assert isinstance(load_fasttext(dir_path), FastText)
 
diff --git a/tools/generate_conda_file.py b/tools/generate_conda_file.py
index 95befec..428790a 100644
--- a/tools/generate_conda_file.py
+++ b/tools/generate_conda_file.py
@@ -54,9 +54,7 @@ CONDA_GPU = {
 }
 
 PIP_BASE = {
-    "azureml-sdk[notebooks,tensorboard]": (
-        "azureml-sdk[notebooks,tensorboard]==1.0.33"
-    ),
+    "azureml-sdk[notebooks,tensorboard]": "azureml-sdk[notebooks,tensorboard]==1.0.43",
     "azureml-dataprep": "azureml-dataprep==1.1.4",
     "black": "black>=18.6b4",
     "dask": "dask[dataframe]==1.2.2",
@@ -75,6 +73,7 @@ PIP_BASE = {
     "nltk": "nltk>=3.4",
     "pytorch-pretrained-bert": "pytorch-pretrained-bert>=0.6",
     "seqeval": "seqeval>=0.0.12",
+    "azureml-mlflow": "azureml-mlflow>=1.0.43.1",
 }
 
 PIP_GPU = {"horovod": "horovod>=0.16.1"}