update with the changes in staging branch
This commit is contained in:
Коммит
bd2b60d0bc
|
@ -0,0 +1,13 @@
|
|||
[bumpversion]
|
||||
current_version = 1.0.0
|
||||
commit = True
|
||||
tag = True
|
||||
message = "Bump version: {current_version} -> {new_version}"
|
||||
|
||||
[bumpversion:file:setup.py]
|
||||
search = version='{current_version}'
|
||||
replace = version='{new_version}'
|
||||
|
||||
[bumpversion:file:utils_nlp/__init__.py]
|
||||
search = __version__ = '{current_version}'
|
||||
replace = __version__ = '{new_version}'
|
|
@ -125,10 +125,18 @@ tools/repo_metrics/config.py
|
|||
*.pkl
|
||||
nlp_*.yaml
|
||||
nohup.out
|
||||
temp/
|
||||
tmp/
|
||||
|
||||
# Data
|
||||
data/
|
||||
sentence-similarity/data/
|
||||
*/question_answering/bidaf.tar.gz
|
||||
*/question_answering/bidafenv.yml
|
||||
*/question_answering/config.json
|
||||
*/question_answering/score.py
|
||||
*/question_answering/vocabulary/
|
||||
*/question_answering/weights.th
|
||||
|
||||
# AML Config
|
||||
aml_config/
|
||||
.azureml/
|
||||
|
|
32
NOTICE.txt
32
NOTICE.txt
|
@ -18,7 +18,7 @@ General Public License.
|
|||
|
||||
--
|
||||
|
||||
https://github.com/huggingface/pytorch-pretrained-BERT
|
||||
https://github.com/huggingface/pytorch-transformers
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
|
@ -428,3 +428,33 @@ https://github.com/stanfordnlp/glove
|
|||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
BSD License
|
||||
|
||||
For SentEval software
|
||||
|
||||
Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name Facebook nor the names of its contributors may be used to
|
||||
endorse or promote products derived from this software without specific
|
||||
prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
|
43
README.md
43
README.md
|
@ -1,14 +1,45 @@
|
|||
|
||||
| Branch | Status | | Branch | Status |
|
||||
| ------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/unit-test-master?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=22&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/unit-test-staging?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=21&branchName=staging) |
|
||||
|
||||
# NLP Best Practices
|
||||
|
||||
This repository contains examples and best practices for building NLP systems, provided as [Jupyter notebooks](scenarios) and [utility functions](utils_nlp). The focus of the repository is on state-of-the-art methods and common scenarios that are popular among researchers and practitioners working on problems involving text and language.
|
||||
This repository contains examples and best practices for building natural language processing (NLP) systems, provided as [Jupyter notebooks](scenarios) and [utility functions](utils_nlp). The focus of the repository is on state-of-the-art methods and common scenarios that are popular among researchers and practitioners working on problems involving text and language.
|
||||
|
||||
![](https://nlpbp.blob.core.windows.net/images/cognitive_services.PNG)
|
||||
## Overview
|
||||
|
||||
The goal of this repository is to build a comprehensive set of tools and examples that leverage recent advances in NLP algorithms, neural architectures, and distributed machine learning systems.
|
||||
The content is based on our past and potential future engagements with customers as well as collaboration with partners, researchers, and the open source community.
|
||||
|
||||
We’re hoping that the tools would significantly reduce the time from a business problem, or a research idea, to full implementation of a system. In addition, the example notebooks would serve as guidelines and showcase best practices and usage of the tools.
|
||||
|
||||
In an era of transfer learning, transformers, and deep architectures, we believe that pretrained models provide a unified solution to many real-world problems and allow handling different tasks and languages easily. We will, therefore, prioritize such models, as they achieve state-of-the-art results on several NLP benchmarks and can be used in a number of applications ranging from simple text classification to sophisticated intelligent chat bots.
|
||||
|
||||
> [*GLUE Leaderboard*](https://gluebenchmark.com/leaderboard)
|
||||
> [*SQuAD Leaderboard*](https://rajpurkar.github.io/SQuAD-explorer/)
|
||||
|
||||
## Content
|
||||
|
||||
The following is a summary of the scenarios covered in the repository. Each scenario is demonstrated in one or more Jupyter notebook examples that make use of the core code base of models and utilities.
|
||||
|
||||
| Scenario | Applications | Models |
|
||||
|---| ------------------------ | ------------------- |
|
||||
|[Text Classification](scenarios/text_classification) |Topic Classification|BERT|
|
||||
|[Named Entity Recognition](scenarios/named_entity_recognition) |Wikipedia NER |BERT|
|
||||
|[Entailment](scenarios/entailment)|MultiNLI Natural Language Inference|BERT|
|
||||
|[Question Answering](scenarios/question_answering) |SQuAD | BiDAF, BERT|
|
||||
|[Sentence Similarity](scenarios/sentence_similarity) |STS Benchmark |Representation: TF-IDF, Word Embeddings, Doc Embeddings<br>Metrics: Cosine Similarity, Word Mover's Distance|
|
||||
|[Embeddings](scenarios/embeddings)| Custom Embeddings Training|Word2Vec<br>fastText<br>GloVe|
|
||||
| [Annotation](scenarios/annotation) | Text annotation | Tutorial |
|
||||
|
||||
|
||||
|
||||
## Getting Started
|
||||
To get started, navigate to the [Setup Guide](SETUP.md), where you'll find instructions on how to setup your environment and dependencies.
|
||||
|
||||
## Contributing
|
||||
This project welcomes contributions and suggestions. Before contributing, please see our [contribution guidelines](CONTRIBUTING.md).
|
||||
|
||||
|
||||
## Build Status
|
||||
| Build Type | Branch | Status | | Branch | Status |
|
||||
| --- | --- | --- | --- | --- | --- |
|
||||
| **Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/cpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/cpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=staging) |
|
||||
| **Linux GPU** | master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/gpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/gpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=master) |
|
||||
|
|
27
SETUP.md
27
SETUP.md
|
@ -26,13 +26,10 @@ Depending on the type of NLP system and the notebook that needs to be run, there
|
|||
### Requirements
|
||||
|
||||
* A machine running Linux, MacOS or Windows.
|
||||
> NOTE: Windows machine are not **FULLY SUPPORTED**. Please use at your own risk.
|
||||
> NOTE: Windows machines are not **FULLY SUPPORTED**. Please use at your own risk.
|
||||
* Miniconda or Anaconda with Python version >= 3.6.
|
||||
* This is pre-installed on Azure DSVM such that one can run the following steps directly. To setup on your local machine, [Miniconda](https://docs.conda.io/en/latest/miniconda.html) is a quick way to get started.
|
||||
* It is recommended to update conda to the latest version: `conda update -n base -c defaults conda`
|
||||
* CUDA Toolkit >=9.2 (for GPU machines only)
|
||||
* On Windows: Download and install [toolkit](https://developer.nvidia.com/cuda-toolkit)
|
||||
* On Linux: *conda install cudatoolkit>=9.2*
|
||||
|
||||
|
||||
### Dependencies Setup
|
||||
|
@ -52,7 +49,7 @@ You can specify the environment name as well with the flag `-n`.
|
|||
Click on the following menus to see how to install the Python GPU environment:
|
||||
|
||||
<details>
|
||||
<summary><strong><em>Python GPU environment</em></strong></summary>
|
||||
<summary><strong><em>Python GPU environment on Linux, MacOS</em></strong></summary>
|
||||
|
||||
Assuming that you have a GPU machine, to install the Python GPU environment, which by default installs the CPU environment:
|
||||
|
||||
|
@ -62,6 +59,22 @@ Assuming that you have a GPU machine, to install the Python GPU environment, whi
|
|||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><strong><em>Python GPU environment on Windows</em></strong></summary>
|
||||
|
||||
Assuming that you have an Azure GPU DSVM machine, here are the steps to setup the Python GPU environment:
|
||||
1. Make sure you have CUDA Toolkit version 9.0 or above installed on your Windows machine. You can run the command below in your terminal to check.
|
||||
|
||||
nvcc --version
|
||||
If you don't have CUDA Toolkit or don't have the right version, please download it from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
|
||||
|
||||
2. Install the GPU environment.
|
||||
|
||||
cd nlp
|
||||
python tools/generate_conda_file.py --gpu
|
||||
conda env create -n nlp_gpu -f nlp_gpu.yaml
|
||||
|
||||
</details>
|
||||
|
||||
### Register Conda Environment in DSVM JupyterHub
|
||||
|
||||
|
@ -70,12 +83,12 @@ We can register our created conda environment to appear as a kernel in the Jupyt
|
|||
conda activate my_env_name
|
||||
python -m ipykernel install --user --name my_env_name --display-name "Python (my_env_name)"
|
||||
|
||||
If you are using the DSVM, you can [connect to JupyterHub](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#jupyterhub-and-jupyterlab) by browsing to `https://your-vm-ip:8000`.
|
||||
If you are using the DSVM, you can [connect to JupyterHub](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#jupyterhub-and-jupyterlab) by browsing to `https://your-vm-ip:8000`. If you are prompted to enter user name and password, enter the user name and password that you use to log in to your virtual machine.
|
||||
|
||||
## Install this repository via PIP
|
||||
A [setup.py](setup.py) file is provided in order to simplify the installation of the utilities in this repo from the main directory.
|
||||
|
||||
pip install -e utils_nlp
|
||||
pip install -e .
|
||||
|
||||
It is also possible to install directly from Github.
|
||||
|
||||
|
|
15
__init__.py
15
__init__.py
|
@ -1,15 +0,0 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
__title__ = "Microsoft NLP"
|
||||
__version__ = "2019.08"
|
||||
__author__ = "NLPDev Team at Microsoft"
|
||||
__license__ = "MIT"
|
||||
__copyright__ = "Copyright 2018-present Microsoft Corporation"
|
||||
|
||||
# Synonyms
|
||||
TITLE = __title__
|
||||
VERSION = __version__
|
||||
AUTHOR = __author__
|
||||
LICENSE = __license__
|
||||
COPYRIGHT = __copyright__
|
|
@ -1,2 +1,2 @@
|
|||
[tool.black]
|
||||
line-length = 79
|
||||
line-length = 100
|
||||
|
|
|
@ -1,19 +1,15 @@
|
|||
# NLP Scenarios
|
||||
|
||||
This folder contains examples and best practices, written in Jupyter notebooks, for building Natural Language Processing systems for different scenarios.
|
||||
This folder contains examples and best practices, written in Jupyter notebooks, for building Natural Language Processing systems for the following scenarios.
|
||||
|
||||
## Summary
|
||||
|
||||
The following is a summary of the scenarios covered in the best practice notebooks. Each scenario is demonstrated in one or more Jupyter notebook examples that make use of the core code base of models and utilities.
|
||||
|
||||
| Scenario | Applications | Models |
|
||||
|---| ------------------------ | ------------------- |
|
||||
|[Text Classification](text_classification) |Topic Classification|BERT|
|
||||
|[Named Entity Recognition](named_entity_recognition) |Wikipedia NER |BERT|
|
||||
|[Entailment](./entailment)|XNLI Natural Language Inference|BERT|
|
||||
|[Question Answering](question_answering) |SQuAD | BiDAF|
|
||||
|[Sentence Similarity](sentence_similarity) |STS Benchmark |Representation: TF-IDF, Word Embeddings, Doc Embeddings<br>Metrics: Cosine Similarity, Word Mover's Distance|
|
||||
|[Embeddings](embeddings)| Custom Embeddings Training|Word2Vec<br>fastText<br>GloVe|
|
||||
- [Text Classification](text_classification)
|
||||
- [Named Entity Recognition](named_entity_recognition)
|
||||
- [Entailment](entailment)
|
||||
- [Question Answering](question_answering)
|
||||
- [Sentence Similarity](sentence_similarity)
|
||||
- [Embeddings](embeddings)
|
||||
- [Annotation](annotation)
|
||||
|
||||
## Azure-enhanced notebooks
|
||||
|
||||
|
@ -31,8 +27,8 @@ The Azure products featured in the notebooks include:
|
|||
* Scaling up and out on Azure Machine Learning Compute
|
||||
* Deploying a web service to both Azure Container Instance and Azure Kubernetes Service
|
||||
|
||||
* [Azure Kubernetes Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#aks) - You can use Azure Machine Learning service to host your classification model in a web service deployment on Azure Kubernetes Service (AKS). AKS is good for high-scale production deployments and provides autoscaling, and fast response times.
|
||||
* [Azure Kubernetes Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#aks) - You can use Azure Machine Learning service to host your model in a web service deployment on Azure Kubernetes Service (AKS). AKS is good for high-scale production deployments and provides autoscaling, and fast response times.
|
||||
|
||||
* [Azure Container Instance](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#aci)- You can use Azure Machine Learning service to host your classification model in a web service deployment on Azure Container Instance (ACI). ACI is good for low scale, CPU-based workloads.
|
||||
* [Azure Container Instance](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#aci)- You can use Azure Machine Learning service to host your model in a web service deployment on Azure Container Instance (ACI). ACI is good for low scale, CPU-based workloads.
|
||||
|
||||
There may be other Azure service or products used in the notebooks. Introduction and/or reference of those will be provided in the notebooks.
|
||||
|
|
|
@ -0,0 +1,140 @@
|
|||
# Doccano: Text Annotation Tool
|
||||
|
||||
## What is Doccano?
|
||||
|
||||
[Doccano](https://github.com/chakki-works/doccano) is one of the best open source tools that provides text annotation features. The latest version supports annotation features for text classification, sequence labeling (NER) and sequence to sequence (machine translation, text summarization). There are many other open source and commercial annotation tools available. Below is a list of those tools:
|
||||
|
||||
- [Brat](https://brat.nlplab.org/) (open source)
|
||||
- [Anafora](https://github.com/weitechen/anafora) (open source)
|
||||
- [Prodigy](https://prodi.gy/) (commercial)
|
||||
- [LightTag](https://www.lighttag.io/) (commercial)
|
||||
|
||||
Doccano needs to be hosted somewhere so that we can collaborate on it. This tutorial walks through how to deploy Doccano on Azure and collaboratively annotate text data for natural language processing tasks.
|
||||
|
||||
## Deploy to Azure
|
||||
|
||||
Doccano can be deployed to Azure ([Web App for Containers](https://azure.microsoft.com/en-us/services/app-service/containers/) +
|
||||
[PostgreSQL database](https://azure.microsoft.com/en-us/services/postgresql/)) by clicking on the button below:
|
||||
|
||||
<p align="center">
|
||||
<a href="https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fchakki-works%2Fdoccano%2Fmaster%2Fazuredeploy.json"><img width=180 src="https://nlpbp.blob.core.windows.net/images/deploybutton.jpg" /></a>
|
||||
</p>
|
||||
|
||||
You will need to have an existing Azure subscription so that you can create all the Azure resources needed to deploy Doccano. Otherwise you can get a [free Azure account](https://azure.microsoft.com/en-us/offers/ms-azr-0044p/?WT.mc_id=medium-blog-abornst) and then click the deploy button above.
|
||||
|
||||
You will need to specify your subscription and resource group, fill in the setting details (App Name, Secret Key, etc.), and then deploy. It takes a few minutes to create all the needed Azure resources. Below is a screenshot of the deployment.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://nlpbp.blob.core.windows.net/images/deploy_to_azure.jpg" />
|
||||
</p>
|
||||
|
||||
## Tutorial
|
||||
|
||||
### Useful Links
|
||||
|
||||
#### Main Page
|
||||
|
||||
After the deployment you can navigate to following url where **{`appname`}** is the `App Name` you choose when deploy to Azure:
|
||||
|
||||
_**https://{appname}.azurewebsites.net**_
|
||||
|
||||
For example, if your appname is "**doccano**", then the link will be
|
||||
|
||||
_**https://doccano.azurewebsites.net**_
|
||||
|
||||
And we will use `doccano` as the app name for this tutorial.
|
||||
|
||||
#### Login Page
|
||||
|
||||
You can login by clicking the `login` button at the top right of the main page, or you can navigate to the page with the link
|
||||
|
||||
_**https://doccano.azurewebsites.net/login**_
|
||||
|
||||
Both will bring you in to the Doccano login page where you can login with the Admin user name and Admin password you configured in the deployment.
|
||||
|
||||
#### Admin Page
|
||||
|
||||
By default, only the Admin user is created for you after the deployment. You can add more users, groups and configure the Doccano service by navigating to the admin page.
|
||||
|
||||
_**https://doccano.azurewebsites.net/admin**_
|
||||
|
||||
<p align="center">
|
||||
<img src="https://nlpbp.blob.core.windows.net/images/admin_page.JPG" />
|
||||
</p>
|
||||
|
||||
### Create Project
|
||||
|
||||
The first step we need to do is to create a new project for annotation. And here we will use the NER annotation task for science fictions to give you a brief tutorial on Doccano.
|
||||
|
||||
After login with Admin user name and Admin password, you will be navigated to the main project list page of Doccano and there is no project.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://nlpbp.blob.core.windows.net/images/project_list.jpg" />
|
||||
</p>
|
||||
|
||||
To create your project, make sure you’re in the project list page and click `Create Project` button. As for this tutorial, we name the project as `sequence labeling for books`, write some description, then choose the sequence labeling task type.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://nlpbp.blob.core.windows.net/images/create_project.jpg" />
|
||||
</p>
|
||||
|
||||
### Import Data
|
||||
|
||||
After creating a project, we will see the "`Import Data`" page, or click `Import Data` button in the navigation bar. We should see the following screen:
|
||||
<p align="center">
|
||||
<img src="https://nlpbp.blob.core.windows.net/images/import_data.jpg" />
|
||||
</p>
|
||||
|
||||
We choose JSONL and click `Select a file` button. Select `books.json` and it would be loaded automatically. Below is the `books.json` file containing lots of science fictions description with different languages. We need to annotate some entities like people name, book title, date and so on.
|
||||
|
||||
```json
|
||||
{"text": "The Hitchhiker's Guide to the Galaxy (sometimes referred to as HG2G, HHGTTGor H2G2) is a comedy science fiction series created by Douglas Adams. Originally a radio comedy broadcast on BBC Radio 4 in 1978, it was later adapted to other formats, including stage shows, novels, comic books, a 1981 TV series, a 1984 video game, and 2005 feature film."}
|
||||
{"text": "《三体》是中国大陆作家刘慈欣于2006年5月至12月在《科幻世界》杂志上连载的一部长篇科幻小说,出版后成为中国大陆最畅销的科幻长篇小说之一。2008年,该书的单行本由重庆出版社出版。本书是三体系列(系列原名为:地球往事三部曲)的第一部,该系列的第二部《三体II:黑暗森林》已经于2008年5月出版。2010年11月,第三部《三体III:死神永生》出版发行。 2011年,“地球往事三部曲”在台湾陆续出版。小说的英文版获得美国科幻奇幻作家协会2014年度“星云奖”提名,并荣获2015年雨果奖最佳小说奖。"}
|
||||
{"text": "『銀河英雄伝説』(ぎんがえいゆうでんせつ)は、田中芳樹によるSF小説。また、これを原作とするアニメ、漫画、コンピュータゲーム、朗読、オーディオブック等の関連作品。略称は『銀英伝』(ぎんえいでん)。原作は累計発行部数が1500万部を超えるベストセラー小説である。1982年から2009年6月までに複数の版で刊行され、発行部数を伸ばし続けている。"}
|
||||
```
|
||||
|
||||
After importing the dataset, you should be able to see the dataset immediately.
|
||||
|
||||
### Define labels
|
||||
|
||||
Click `Labels` button in left bar to define our own labels. We should see the label editor page. In label editor page, you can create labels by specifying label text, shortcut key, background color and text color.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://nlpbp.blob.core.windows.net/images/define_labels.jpg" />
|
||||
</p>
|
||||
|
||||
### Annotation
|
||||
|
||||
Next, we are ready to annotate the texts. Just click the `Annotate Data` button in the navigation bar, we can start to annotate the documents. You can just select the text and then use the shortcut key that you have defined to label the entities.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://nlpbp.blob.core.windows.net/images/annotate.jpg" />
|
||||
</p>
|
||||
|
||||
### Export Data
|
||||
|
||||
After the annotation step, we can download the annotated data. Click the `Edit data` button in the navigation bar, and then click `Export Data`. You should see below screen:
|
||||
|
||||
<p align="center">
|
||||
<img src="https://nlpbp.blob.core.windows.net/images/export_data.jpg" />
|
||||
</p>
|
||||
|
||||
Here we choose JSONL file to download the data by clicking the button. Below is the annotated result for our tutorial project.
|
||||
|
||||
```json
|
||||
{"id": 1, "text": "The Hitchhiker's Guide to the Galaxy (sometimes referred to as HG2G, HHGTTGor H2G2) is a comedy science fiction series created by Douglas Adams. Originally a radio comedy broadcast on BBC Radio 4 in 1978, it was later adapted to other formats, including stage shows, novels, comic books, a 1981 TV series, a 1984 video game, and 2005 feature film.", "annotations": [{"label": 2, "start_offset": 0, "end_offset": 36, "user": 1}, {"label": 2, "start_offset": 63, "end_offset": 67, "user": 1}, {"label": 2, "start_offset": 69, "end_offset": 82, "user": 1}, {"label": 5, "start_offset": 89, "end_offset": 111, "user": 1}, {"label": 1, "start_offset": 130, "end_offset": 143, "user": 1}, {"label": 5, "start_offset": 158, "end_offset": 180, "user": 1}, {"label": 6, "start_offset": 184, "end_offset": 195, "user": 1}, {"label": 3, "start_offset": 199, "end_offset": 203, "user": 1}, {"label": 5, "start_offset": 254, "end_offset": 265, "user": 1}, {"label": 5, "start_offset": 267, "end_offset": 273, "user": 1}, {"label": 5, "start_offset": 275, "end_offset": 286, "user": 1}, {"label": 3, "start_offset": 290, "end_offset": 294, "user": 1}, {"label": 5, "start_offset": 295, "end_offset": 304, "user": 1}, {"label": 3, "start_offset": 308, "end_offset": 312, "user": 1}, {"label": 5, "start_offset": 313, "end_offset": 323, "user": 1}, {"label": 3, "start_offset": 329, "end_offset": 333, "user": 1}, {"label": 5, "start_offset": 334, "end_offset": 346, "user": 1}], "meta": {}, "annotation_approver": "admin"}
|
||||
{"id": 2, "text": "《三体》是中国大陆作家刘慈欣于2006年5月至12月在《科幻世界》杂志上连载的一部长篇科幻小说,出版后成为中国大陆最畅销的科幻长篇小说之一。2008年,该书的单行本由重庆出版社出版。本书是三体系列(系列原名为:地球往事三部曲)的第一部,该系列的第二部《三体II:黑暗森林》已经于2008年5月出版。2010年11月,第三部《三体III:死神永生》出版发行。 2011年,“地球往事三部曲”在台湾陆续出版。小说的英文版获得美国科幻奇幻作家协会2014年度“星云奖”提名,并荣获2015年雨果奖最佳小说奖。", "annotations": [{"label": 2, "start_offset": 1, "end_offset": 3, "user": 1}, {"label": 4, "start_offset": 5, "end_offset": 9, "user": 1}, {"label": 1, "start_offset": 11, "end_offset": 14, "user": 1}, {"label": 3, "start_offset": 15, "end_offset": 26, "user": 1}, {"label": 2, "start_offset": 28, "end_offset": 32, "user": 1}, {"label": 5, "start_offset": 41, "end_offset": 47, "user": 1}, {"label": 4, "start_offset": 53, "end_offset": 57, "user": 1}, {"label": 5, "start_offset": 61, "end_offset": 67, "user": 1}, {"label": 3, "start_offset": 70, "end_offset": 74, "user": 1}, {"label": 6, "start_offset": 83, "end_offset": 88, "user": 1}, {"label": 2, "start_offset": 105, "end_offset": 112, "user": 1}, {"label": 2, "start_offset": 94, "end_offset": 98, "user": 1}, {"label": 2, "start_offset": 126, "end_offset": 135, "user": 1}, {"label": 3, "start_offset": 139, "end_offset": 146, "user": 1}, {"label": 3, "start_offset": 149, "end_offset": 157, "user": 1}, {"label": 2, "start_offset": 162, "end_offset": 172, "user": 1}, {"label": 3, "start_offset": 179, "end_offset": 184, "user": 1}, {"label": 2, "start_offset": 186, "end_offset": 193, "user": 1}, {"label": 4, "start_offset": 195, "end_offset": 197, "user": 1}, {"label": 5, "start_offset": 202, "end_offset": 204, "user": 1}, {"label": 6, "start_offset": 210, "end_offset": 220, "user": 1}, {"label": 3, "start_offset": 220, "end_offset": 225, "user": 1}, {"label": 6, "start_offset": 227, "end_offset": 230, "user": 1}, {"label": 3, "start_offset": 237, "end_offset": 242, "user": 1}, {"label": 6, "start_offset": 242, "end_offset": 245, "user": 1}], "meta": {}, "annotation_approver": "admin"}
|
||||
{"id": 3, "text": "『銀河英雄伝説』(ぎんがえいゆうでんせつ)は、田中芳樹によるSF小説。また、これを原作とするアニメ、漫画、コンピュータゲーム、朗読、オーディオブック等の関連作品。略称は『銀英伝』(ぎんえいでん)。原作は累計発行部数が1500万部を超えるベストセラー小説である。1982年から2009年6月までに複数の版で刊行され、発行部数を伸ばし続けている。", "annotations": [{"label": 2, "start_offset": 1, "end_offset": 7, "user": 1}, {"label": 1, "start_offset": 23, "end_offset": 30, "user": 1}, {"label": 5, "start_offset": 30, "end_offset": 34, "user": 1}, {"label": 2, "start_offset": 85, "end_offset": 88, "user": 1}, {"label": 5, "start_offset": 50, "end_offset": 52, "user": 1}, {"label": 5, "start_offset": 63, "end_offset": 65, "user": 1}, {"label": 3, "start_offset": 130, "end_offset": 135, "user": 1}, {"label": 3, "start_offset": 137, "end_offset": 144, "user": 1}], "meta": {}, "annotation_approver": "admin"}
|
||||
```
|
||||
|
||||
Please note that in the exported JSON file, the label for each entity is an entity ID which is inconvenient if you want to consume the annotations somewhere else. Some post processing is needed if you want to have the entity type value instead of the type ID.
|
||||
|
||||
### View Statistics
|
||||
|
||||
One good thing of Doccano is that it also has dashboard to display annotation progress and label distributions. Click the `Edit data` button in the navigation bar, and then click `Statistics` on the left side of the menu.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://nlpbp.blob.core.windows.net/images/statistic.jpg" />
|
||||
</p>
|
||||
|
||||
Congratulations! You have just mastered how to use Doccano for a sequence labeling project.
|
|
@ -0,0 +1,12 @@
|
|||
# Natural Language Inference (NLI)
|
||||
Natural Language Inference (NLI) or Recognizing Textual Entailment (RTE) is the
|
||||
task of classifying a pair of premise and hypothesis sentences into three
|
||||
classes: contradiction, neutral, and entailment. For example,
|
||||
|
||||
|Premise|Hypothesis|Label|
|
||||
|-------|----------|-----|
|
||||
|A man inspects the uniform of a figure in some East Asian country.|The man is sleeping.|contradiction|
|
||||
|An older and younger man smiling.|Two men are smiling and laughing at the cats playing on the floor.|neutral|
|
||||
|A soccer game with multiple males playing.|Some men are playing a sport.|entailment|
|
||||
|
||||
NLI is one of many NLP tasks that require robust compositional sentence understanding, but it's simpler compared to other tasks like question answering and machine translation.
|
|
@ -0,0 +1,893 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"*Copyright (c) Microsoft Corporation. All rights reserved.* \n",
|
||||
"\n",
|
||||
"*Licensed under the MIT License.*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Natural Language Inference on MultiNLI Dataset using BERT"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Before You Start\n",
|
||||
"\n",
|
||||
"The running time shown in this notebook is on a Standard_NC24s_v3 Azure Deep Learning Virtual Machine with 4 NVIDIA Tesla V100 GPUs. If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \n",
|
||||
"The table below provides some reference running time on different machine configurations. \n",
|
||||
"\n",
|
||||
"|QUICK_RUN|Machine Configurations|Running time|\n",
|
||||
"|:---------|:----------------------|:------------|\n",
|
||||
"|True|4 **CPU**s, 14GB memory| ~ 15 minutes|\n",
|
||||
"|True|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 5 minutes|\n",
|
||||
"|False|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 10.5 hours|\n",
|
||||
"|False|4 NVIDIA Tesla V100 GPUs, 64GB GPU memory| ~ 2.5 hours|\n",
|
||||
"\n",
|
||||
"If you run into CUDA out-of-memory error, try reducing the `BATCH_SIZE` and `MAX_SEQ_LENGTH`, but note that model performance will be compromised. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\n",
|
||||
"QUICK_RUN = False\n",
|
||||
"\n",
|
||||
"TRAIN_DATA_USED_PERCENT = 1\n",
|
||||
"DEV_DATA_USED_PERCENT = 1\n",
|
||||
"NUM_EPOCHS = 2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if QUICK_RUN:\n",
|
||||
" TRAIN_DATA_USED_PERCENT = 0.001\n",
|
||||
" DEV_DATA_USED_PERCENT = 0.01\n",
|
||||
" NUM_EPOCHS = 1\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"if torch.cuda.is_available():\n",
|
||||
" BATCH_SIZE = 32\n",
|
||||
"else:\n",
|
||||
" BATCH_SIZE = 16"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary\n",
|
||||
    "In this notebook, we demonstrate using [BERT](https://arxiv.org/abs/1810.04805) to perform Natural Language Inference (NLI). We use the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset and the task is to classify sentence pairs into three classes: contradiction, entailment, and neutral. \n",
|
||||
"The figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. It concatenates the tokens in each sentence pairs and separates the sentences by the [SEP] token. A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task.\n",
|
||||
"<img src=\"https://nlpbp.blob.core.windows.net/images/bert_two_sentence.PNG\">"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"import random\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.metrics import classification_report\n",
|
||||
"from sklearn.preprocessing import LabelEncoder\n",
|
||||
"\n",
|
||||
"nlp_path = os.path.abspath('../../')\n",
|
||||
"if nlp_path not in sys.path:\n",
|
||||
" sys.path.insert(0, nlp_path)\n",
|
||||
"\n",
|
||||
"from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier\n",
|
||||
"from utils_nlp.models.bert.common import Language, Tokenizer\n",
|
||||
"from utils_nlp.dataset.multinli import load_pandas_df\n",
|
||||
"from utils_nlp.common.timer import Timer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configurations"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# set random seeds\n",
|
||||
"RANDOM_SEED = 42\n",
|
||||
"random.seed(RANDOM_SEED)\n",
|
||||
"np.random.seed(RANDOM_SEED)\n",
|
||||
"torch.manual_seed(RANDOM_SEED)\n",
|
||||
"num_cuda_devices = torch.cuda.device_count()\n",
|
||||
"if num_cuda_devices > 1:\n",
|
||||
" torch.cuda.manual_seed_all(RANDOM_SEED)\n",
|
||||
"\n",
|
||||
"# model configurations\n",
|
||||
"LANGUAGE = Language.ENGLISH\n",
|
||||
"TO_LOWER = True\n",
|
||||
"MAX_SEQ_LENGTH = 128\n",
|
||||
"\n",
|
||||
"# optimizer configurations\n",
|
||||
"LEARNING_RATE= 5e-5\n",
|
||||
"WARMUP_PROPORTION= 0.1\n",
|
||||
"\n",
|
||||
"# data configurations\n",
|
||||
"TEXT_COL = \"text\"\n",
|
||||
"LABEL_COL = \"gold_label\"\n",
|
||||
"\n",
|
||||
"CACHE_DIR = \"./temp\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Data\n",
|
||||
"The MultiNLI dataset comes with three subsets: train, dev_matched, dev_mismatched. The dev_matched dataset are from the same genres as the train dataset, while the dev_mismatched dataset are from genres not seen in the training dataset. \n",
|
||||
"The `load_pandas_df` function downloads and extracts the zip files if they don't already exist in `local_cache_path` and returns the data subset specified by `file_split`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_df = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"train\")\n",
|
||||
"dev_df_matched = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev_matched\")\n",
|
||||
"dev_df_mismatched = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev_mismatched\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dev_df_matched = dev_df_matched.loc[dev_df_matched['gold_label'] != '-']\n",
|
||||
"dev_df_mismatched = dev_df_mismatched.loc[dev_df_mismatched['gold_label'] != '-']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Training dataset size: 392702\n",
|
||||
"Development (matched) dataset size: 9815\n",
|
||||
"Development (mismatched) dataset size: 9832\n",
|
||||
"\n",
|
||||
" gold_label sentence1 \\\n",
|
||||
"0 neutral Conceptually cream skimming has two basic dime... \n",
|
||||
"1 entailment you know during the season and i guess at at y... \n",
|
||||
"2 entailment One of our number will carry out your instruct... \n",
|
||||
"3 entailment How do you know? All this is their information... \n",
|
||||
"4 neutral yeah i tell you what though if you go price so... \n",
|
||||
"\n",
|
||||
" sentence2 \n",
|
||||
"0 Product and geography are what make cream skim... \n",
|
||||
"1 You lose the things to the following level if ... \n",
|
||||
"2 A member of my team will execute your orders w... \n",
|
||||
"3 This information belongs to them. \n",
|
||||
"4 The tennis shoes have a range of prices. \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"Training dataset size: {}\".format(train_df.shape[0]))\n",
|
||||
"print(\"Development (matched) dataset size: {}\".format(dev_df_matched.shape[0]))\n",
|
||||
"print(\"Development (mismatched) dataset size: {}\".format(dev_df_mismatched.shape[0]))\n",
|
||||
"print()\n",
|
||||
"print(train_df[['gold_label', 'sentence1', 'sentence2']].head())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Concatenate the first and second sentences to form the input text."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>text</th>\n",
|
||||
" <th>gold_label</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>(Conceptually cream skimming has two basic dim...</td>\n",
|
||||
" <td>neutral</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>(you know during the season and i guess at at ...</td>\n",
|
||||
" <td>entailment</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>(One of our number will carry out your instruc...</td>\n",
|
||||
" <td>entailment</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>(How do you know? All this is their informatio...</td>\n",
|
||||
" <td>entailment</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>(yeah i tell you what though if you go price s...</td>\n",
|
||||
" <td>neutral</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" text gold_label\n",
|
||||
"0 (Conceptually cream skimming has two basic dim... neutral\n",
|
||||
"1 (you know during the season and i guess at at ... entailment\n",
|
||||
"2 (One of our number will carry out your instruc... entailment\n",
|
||||
"3 (How do you know? All this is their informatio... entailment\n",
|
||||
"4 (yeah i tell you what though if you go price s... neutral"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_df[TEXT_COL] = list(zip(train_df['sentence1'], train_df['sentence2']))\n",
|
||||
"dev_df_matched[TEXT_COL] = list(zip(dev_df_matched['sentence1'], dev_df_matched['sentence2']))\n",
|
||||
"dev_df_mismatched[TEXT_COL] = list(zip(dev_df_mismatched['sentence1'], dev_df_mismatched['sentence2']))\n",
|
||||
"train_df[[TEXT_COL, LABEL_COL]].head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_df = train_df.sample(frac=TRAIN_DATA_USED_PERCENT).reset_index(drop=True)\n",
|
||||
"dev_df_matched = dev_df_matched.sample(frac=DEV_DATA_USED_PERCENT).reset_index(drop=True)\n",
|
||||
"dev_df_mismatched = dev_df_mismatched.sample(frac=DEV_DATA_USED_PERCENT).reset_index(drop=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Tokenize and Preprocess\n",
|
||||
"Before training, we tokenize the sentence texts and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and testing sets."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 392702/392702 [03:25<00:00, 1907.47it/s]\n",
|
||||
"100%|██████████| 9815/9815 [00:05<00:00, 1961.13it/s]\n",
|
||||
"100%|██████████| 9832/9832 [00:05<00:00, 1837.42it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer= Tokenizer(LANGUAGE, to_lower=TO_LOWER, cache_dir=CACHE_DIR)\n",
|
||||
"\n",
|
||||
"train_tokens = tokenizer.tokenize(train_df[TEXT_COL])\n",
|
||||
"dev_matched_tokens = tokenizer.tokenize(dev_df_matched[TEXT_COL])\n",
|
||||
"dev_mismatched_tokens = tokenizer.tokenize(dev_df_mismatched[TEXT_COL])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In addition, we perform the following preprocessing steps in the cell below:\n",
|
||||
"\n",
|
||||
"* Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\n",
|
||||
"* Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n",
|
||||
"* Pad or truncate the token lists to the specified max length\n",
|
||||
"* Return mask lists that indicate paddings' positions\n",
|
||||
"* Return token type id lists that indicate which sentence the tokens belong to\n",
|
||||
"\n",
|
||||
"*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_token_ids, train_input_mask, train_token_type_ids = \\\n",
|
||||
" tokenizer.preprocess_classification_tokens(train_tokens, max_len=MAX_SEQ_LENGTH)\n",
|
||||
"dev_matched_token_ids, dev_matched_input_mask, dev_matched_token_type_ids = \\\n",
|
||||
" tokenizer.preprocess_classification_tokens(dev_matched_tokens, max_len=MAX_SEQ_LENGTH)\n",
|
||||
"dev_mismatched_token_ids, dev_mismatched_input_mask, dev_mismatched_token_type_ids = \\\n",
|
||||
" tokenizer.preprocess_classification_tokens(dev_mismatched_tokens, max_len=MAX_SEQ_LENGTH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"label_encoder = LabelEncoder()\n",
|
||||
"train_labels = label_encoder.fit_transform(train_df[LABEL_COL])\n",
|
||||
"num_labels = len(np.unique(train_labels))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train and Predict"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create Classifier"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"classifier = BERTSequenceClassifier(language=LANGUAGE,\n",
|
||||
" num_labels=num_labels,\n",
|
||||
" cache_dir=CACHE_DIR)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Train Classifier"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 0%| | 1/12272 [00:10<35:06:53, 10.30s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:1->1228/12272; average training loss:1.199178\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 10%|█ | 1229/12272 [07:20<1:03:16, 2.91it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:1229->2456/12272; average training loss:0.783637\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 20%|██ | 2457/12272 [14:28<55:44, 2.93it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:2457->3684/12272; average training loss:0.692243\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 30%|███ | 3685/12272 [21:37<48:36, 2.94it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:3685->4912/12272; average training loss:0.653206\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 40%|████ | 4913/12272 [28:45<41:36, 2.95it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:4913->6140/12272; average training loss:0.625751\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 50%|█████ | 6141/12272 [35:54<34:44, 2.94it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:6141->7368/12272; average training loss:0.605123\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 60%|██████ | 7369/12272 [42:58<27:46, 2.94it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:7369->8596/12272; average training loss:0.590521\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 70%|███████ | 8597/12272 [50:07<20:52, 2.93it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:8597->9824/12272; average training loss:0.577829\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 80%|████████ | 9825/12272 [57:14<13:46, 2.96it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:9825->11052/12272; average training loss:0.566418\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 90%|█████████ | 11053/12272 [1:04:20<06:53, 2.95it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:11053->12272/12272; average training loss:0.556558\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 100%|██████████| 12272/12272 [1:11:21<00:00, 2.88it/s]\n",
|
||||
"Iteration: 0%| | 1/12272 [00:00<1:12:29, 2.82it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:2/2; batch:1->1228/12272; average training loss:0.319802\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 10%|█ | 1229/12272 [07:09<1:02:29, 2.95it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:2/2; batch:1229->2456/12272; average training loss:0.331876\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 20%|██ | 2457/12272 [14:15<55:22, 2.95it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:2/2; batch:2457->3684/12272; average training loss:0.333463\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 30%|███ | 3685/12272 [21:21<48:41, 2.94it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:2/2; batch:3685->4912/12272; average training loss:0.331817\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 40%|████ | 4913/12272 [28:25<41:26, 2.96it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:2/2; batch:4913->6140/12272; average training loss:0.327940\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 50%|█████ | 6141/12272 [35:31<34:34, 2.96it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:2/2; batch:6141->7368/12272; average training loss:0.325802\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 60%|██████ | 7369/12272 [42:36<27:48, 2.94it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:2/2; batch:7369->8596/12272; average training loss:0.324641\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 70%|███████ | 8597/12272 [49:42<20:53, 2.93it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:2/2; batch:8597->9824/12272; average training loss:0.322036\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 80%|████████ | 9825/12272 [56:44<13:50, 2.95it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:2/2; batch:9825->11052/12272; average training loss:0.321205\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 90%|█████████ | 11053/12272 [1:03:49<06:54, 2.94it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:2/2; batch:11053->12272/12272; average training loss:0.319237\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 100%|██████████| 12272/12272 [1:10:52<00:00, 2.94it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Training time : 2.374 hrs\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with Timer() as t:\n",
|
||||
" classifier.fit(token_ids=train_token_ids,\n",
|
||||
" input_mask=train_input_mask,\n",
|
||||
" token_type_ids=train_token_type_ids,\n",
|
||||
" labels=train_labels,\n",
|
||||
" num_epochs=NUM_EPOCHS,\n",
|
||||
" batch_size=BATCH_SIZE,\n",
|
||||
" lr=LEARNING_RATE,\n",
|
||||
" warmup_proportion=WARMUP_PROPORTION)\n",
|
||||
"print(\"Training time : {:.3f} hrs\".format(t.interval / 3600))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Predict on Test Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 100%|██████████| 307/307 [00:40<00:00, 8.15it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Prediction time : 0.011 hrs\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with Timer() as t:\n",
|
||||
" predictions_matched = classifier.predict(token_ids=dev_matched_token_ids,\n",
|
||||
" input_mask=dev_matched_input_mask,\n",
|
||||
" token_type_ids=dev_matched_token_type_ids,\n",
|
||||
" batch_size=BATCH_SIZE)\n",
|
||||
"print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 100%|██████████| 308/308 [00:38<00:00, 8.30it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Prediction time : 0.011 hrs\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with Timer() as t:\n",
|
||||
" predictions_mismatched = classifier.predict(token_ids=dev_mismatched_token_ids,\n",
|
||||
" input_mask=dev_mismatched_input_mask,\n",
|
||||
" token_type_ids=dev_mismatched_token_type_ids,\n",
|
||||
" batch_size=BATCH_SIZE)\n",
|
||||
"print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
"contradiction 0.848 0.865 0.857 3213\n",
|
||||
" entailment 0.894 0.828 0.860 3479\n",
|
||||
" neutral 0.783 0.831 0.806 3123\n",
|
||||
"\n",
|
||||
" micro avg 0.841 0.841 0.841 9815\n",
|
||||
" macro avg 0.842 0.841 0.841 9815\n",
|
||||
" weighted avg 0.844 0.841 0.842 9815\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"predictions_matched = label_encoder.inverse_transform(predictions_matched)\n",
|
||||
"print(classification_report(dev_df_matched[LABEL_COL], predictions_matched, digits=3))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
"contradiction 0.862 0.863 0.863 3240\n",
|
||||
" entailment 0.878 0.853 0.865 3463\n",
|
||||
" neutral 0.791 0.815 0.803 3129\n",
|
||||
"\n",
|
||||
" micro avg 0.844 0.844 0.844 9832\n",
|
||||
" macro avg 0.844 0.844 0.844 9832\n",
|
||||
" weighted avg 0.845 0.844 0.845 9832\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"predictions_mismatched = label_encoder.inverse_transform(predictions_mismatched)\n",
|
||||
"print(classification_report(dev_df_mismatched[LABEL_COL], predictions_mismatched, digits=3))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"celltoolbar": "Tags",
|
||||
"kernelspec": {
|
||||
"display_name": "nlp_gpu",
|
||||
"language": "python",
|
||||
"name": "nlp_gpu"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -1,581 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Multi-lingual Inference on XNLI Dataset using BERT"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary\n",
|
||||
"In this notebook, we demostrate using the [Multi-lingual BERT model](https://github.com/google-research/bert/blob/master/multilingual.md) to do language inference in Chinese and Hindi. We use the [XNLI](https://github.com/facebookresearch/XNLI) dataset and the task is to classify sentence pairs into three classes: contradiction, entailment, and neutral. \n",
|
||||
"The figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. It concatenates the tokens in each sentence pairs and separates the sentences by the [SEP] token. A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task.\n",
|
||||
"<img src=\"https://nlpbp.blob.core.windows.net/images/bert_two_sentence.PNG\">"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"import random\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.metrics import classification_report\n",
|
||||
"from sklearn.preprocessing import LabelEncoder\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"nlp_path = os.path.abspath('../../')\n",
|
||||
"if nlp_path not in sys.path:\n",
|
||||
" sys.path.insert(0, nlp_path)\n",
|
||||
"\n",
|
||||
"from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier\n",
|
||||
"from utils_nlp.models.bert.common import Language, Tokenizer\n",
|
||||
"from utils_nlp.dataset.xnli import load_pandas_df\n",
|
||||
"from utils_nlp.common.timer import Timer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configurations\n",
|
||||
"Note that the running time shown in this notebook are on a Standard_NC12 Azure Deep Learning Virtual Machine with two NVIDIA Tesla K80 GPUs. If you want to run through the notebook quickly, you can change the `TRAIN_DATA_USED_PERCENT` to a small number, e.g. 0.01. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TRAIN_DATA_USED_PERCENT = 1.0\n",
|
||||
"\n",
|
||||
"# set random seeds\n",
|
||||
"RANDOM_SEED = 42\n",
|
||||
"random.seed(RANDOM_SEED)\n",
|
||||
"np.random.seed(RANDOM_SEED)\n",
|
||||
"torch.manual_seed(RANDOM_SEED)\n",
|
||||
"num_cuda_devices = torch.cuda.device_count()\n",
|
||||
"if num_cuda_devices > 1:\n",
|
||||
" torch.cuda.manual_seed_all(RANDOM_SEED)\n",
|
||||
"\n",
|
||||
"# model configurations\n",
|
||||
"LANGUAGE_CHINESE = Language.CHINESE\n",
|
||||
"LANGUAGE_MULTI = Language.MULTILINGUAL\n",
|
||||
"TO_LOWER = True\n",
|
||||
"MAX_SEQ_LENGTH = 128\n",
|
||||
"\n",
|
||||
"# training configurations\n",
|
||||
"NUM_GPUS = 2\n",
|
||||
"BATCH_SIZE = 32\n",
|
||||
"NUM_EPOCHS = 2\n",
|
||||
"\n",
|
||||
"# optimizer configurations\n",
|
||||
"LEARNING_RATE= 5e-5\n",
|
||||
"WARMUP_PROPORTION= 0.1\n",
|
||||
"\n",
|
||||
"# data configurations\n",
|
||||
"TEXT_COL = \"text\"\n",
|
||||
"LABEL_COL = \"label\"\n",
|
||||
"\n",
|
||||
"CACHE_DIR = \"./temp\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Data\n",
|
||||
"The XNLI dataset comes in two zip files: \n",
|
||||
"* XNLI-1.0.zip: dev and test datasets in 15 languages. The original English data was translated into other languages by human translators. \n",
|
||||
"* XNLI-MT-1.0.zip: training dataset in 15 languages. This dataset is machine translations of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset. It also contains English translations of the dev and test datasets, but not used in this notebook. \n",
|
||||
"\n",
|
||||
"The `load_pandas_df` function downloads and extracts the zip files if they don't already exist in `local_cache_path` and returns the data subset specified by `file_split` and `language`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_df_chinese = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"train\", language=\"zh\")\n",
|
||||
"dev_df_chinese = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev\", language=\"zh\")\n",
|
||||
"test_df_chinese = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"test\", language=\"zh\")\n",
|
||||
"\n",
|
||||
"train_df_hindi = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"train\", language=\"hi\")\n",
|
||||
"dev_df_hindi = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev\", language=\"hi\")\n",
|
||||
"test_df_hindi = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"test\", language=\"hi\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Chinese training dataset size: 392702\n",
|
||||
"Chinese dev dataset size: 2490\n",
|
||||
"Chinese test dataset size: 5010\n",
|
||||
"\n",
|
||||
"Hindi training dataset size: 392702\n",
|
||||
"Hindi dev dataset size: 2490\n",
|
||||
"Hindi test dataset size: 5010\n",
|
||||
"\n",
|
||||
" text label\n",
|
||||
"0 (从 概念 上 看 , 奶油 收入 有 两 个 基本 方面 产品 和 地理 ., 产品 和 ... neutral\n",
|
||||
"1 (你 知道 在 这个 季节 , 我 猜 在 你 的 水平 你 把 他们 丢到 下 一个 水平... entailment\n",
|
||||
"2 (我们 的 一个 号码 会 非常 详细 地 执行 你 的 指示, 我 团队 的 一个 成员 ... entailment\n",
|
||||
"3 (你 怎么 知道 的 ? 所有 这些 都 是 他们 的 信息 ., 这些 信息 属于 他们 .) entailment\n",
|
||||
"4 (是 啊 , 我 告诉 你 , 如果 你 去 买 一些 网球鞋 , 我 可以 看到 为什么 ... neutral\n",
|
||||
" text label\n",
|
||||
"0 (Conceptually क ् रीम एंजलिस में दो मूल आयाम ह... neutral\n",
|
||||
"1 (आप मौसम के दौरान जानते हैं और मैं अपने स ् तर... entailment\n",
|
||||
"2 (हमारे एक नंबर में से एक आपके निर ् देशों को म... entailment\n",
|
||||
"3 (आप कैसे जानते हैं ? ये सब उनकी जानकारी फिर से... entailment\n",
|
||||
"4 (हाँ मैं आपको बताता हूँ कि अगर आप उन टेनिस जूत... neutral\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"Chinese training dataset size: {}\".format(train_df_chinese.shape[0]))\n",
|
||||
"print(\"Chinese dev dataset size: {}\".format(dev_df_chinese.shape[0]))\n",
|
||||
"print(\"Chinese test dataset size: {}\".format(test_df_chinese.shape[0]))\n",
|
||||
"print()\n",
|
||||
"print(\"Hindi training dataset size: {}\".format(train_df_hindi.shape[0]))\n",
|
||||
"print(\"Hindi dev dataset size: {}\".format(dev_df_hindi.shape[0]))\n",
|
||||
"print(\"Hindi test dataset size: {}\".format(test_df_hindi.shape[0]))\n",
|
||||
"print()\n",
|
||||
"print(train_df_chinese.head())\n",
|
||||
"print(train_df_hindi.head())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_data_used_count = round(TRAIN_DATA_USED_PERCENT * train_df_chinese.shape[0])\n",
|
||||
"train_df_chinese = train_df_chinese.loc[:train_data_used_count]\n",
|
||||
"train_df_hindi = train_df_hindi.loc[:train_data_used_count]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Language Inference on Chinese\n",
|
||||
"For Chinese dataset, we use the `bert-base-chinese` model which was pretrained on Chinese dataset only. The `bert-base-multilingual-cased` model can also be used on Chinese, but the accuracy is 3% lower."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Tokenize and Preprocess\n",
|
||||
"Before training, we tokenize the sentence texts and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and testing sets."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 392702/392702 [02:26<00:00, 2682.67it/s]\n",
|
||||
"100%|██████████| 5010/5010 [00:01<00:00, 3122.04it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer_chinese = Tokenizer(LANGUAGE_CHINESE, to_lower=TO_LOWER, cache_dir=CACHE_DIR)\n",
|
||||
"\n",
|
||||
"train_tokens_chinese = tokenizer_chinese.tokenize(train_df_chinese[TEXT_COL])\n",
|
||||
"test_tokens_chinese= tokenizer_chinese.tokenize(test_df_chinese[TEXT_COL])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In addition, we perform the following preprocessing steps in the cell below:\n",
|
||||
"\n",
|
||||
"* Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\n",
|
||||
"* Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n",
|
||||
"* Pad or truncate the token lists to the specified max length\n",
|
||||
"* Return mask lists that indicate paddings' positions\n",
|
||||
"* Return token type id lists that indicate which sentence the tokens belong to\n",
|
||||
"\n",
|
||||
"*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_token_ids_chinese, train_input_mask_chinese, train_token_type_ids_chinese = \\\n",
|
||||
" tokenizer_chinese.preprocess_classification_tokens(train_tokens_chinese, max_len=MAX_SEQ_LENGTH)\n",
|
||||
"test_token_ids_chinese, test_input_mask_chinese, test_token_type_ids_chinese = \\\n",
|
||||
" tokenizer_chinese.preprocess_classification_tokens(test_tokens_chinese, max_len=MAX_SEQ_LENGTH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"label_encoder_chinese = LabelEncoder()\n",
|
||||
"train_labels_chinese = label_encoder_chinese.fit_transform(train_df_chinese[LABEL_COL])\n",
|
||||
"num_labels_chinese = len(np.unique(train_labels_chinese))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create Classifier"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"classifier_chinese = BERTSequenceClassifier(language=LANGUAGE_CHINESE,\n",
|
||||
" num_labels=num_labels_chinese,\n",
|
||||
" cache_dir=CACHE_DIR)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Train Classifier"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:1->1228/12271; loss:1.194384\n",
|
||||
"epoch:1/2; batch:1229->2456/12271; loss:0.863067\n",
|
||||
"epoch:1/2; batch:2457->3684/12271; loss:0.781256\n",
|
||||
"epoch:1/2; batch:3685->4912/12271; loss:1.067413\n",
|
||||
"epoch:1/2; batch:4913->6140/12271; loss:0.599279\n",
|
||||
"epoch:1/2; batch:6141->7368/12271; loss:0.471488\n",
|
||||
"epoch:1/2; batch:7369->8596/12271; loss:0.572327\n",
|
||||
"epoch:1/2; batch:8597->9824/12271; loss:0.689093\n",
|
||||
"epoch:1/2; batch:9825->11052/12271; loss:0.651702\n",
|
||||
"epoch:1/2; batch:11053->12271/12271; loss:0.431085\n",
|
||||
"epoch:2/2; batch:1->1228/12271; loss:0.255859\n",
|
||||
"epoch:2/2; batch:1229->2456/12271; loss:0.434052\n",
|
||||
"epoch:2/2; batch:2457->3684/12271; loss:0.433569\n",
|
||||
"epoch:2/2; batch:3685->4912/12271; loss:0.405915\n",
|
||||
"epoch:2/2; batch:4913->6140/12271; loss:0.636128\n",
|
||||
"epoch:2/2; batch:6141->7368/12271; loss:0.416685\n",
|
||||
"epoch:2/2; batch:7369->8596/12271; loss:0.265789\n",
|
||||
"epoch:2/2; batch:8597->9824/12271; loss:0.328964\n",
|
||||
"epoch:2/2; batch:9825->11052/12271; loss:0.436310\n",
|
||||
"epoch:2/2; batch:11053->12271/12271; loss:0.374193\n",
|
||||
"Training time : 8.050 hrs\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with Timer() as t:\n",
|
||||
" classifier_chinese.fit(token_ids=train_token_ids_chinese,\n",
|
||||
" input_mask=train_input_mask_chinese,\n",
|
||||
" token_type_ids=train_token_type_ids_chinese,\n",
|
||||
" labels=train_labels_chinese,\n",
|
||||
" num_gpus=NUM_GPUS,\n",
|
||||
" num_epochs=NUM_EPOCHS,\n",
|
||||
" batch_size=BATCH_SIZE,\n",
|
||||
" lr=LEARNING_RATE,\n",
|
||||
" warmup_proportion=WARMUP_PROPORTION)\n",
|
||||
"print(\"Training time : {:.3f} hrs\".format(t.interval / 3600))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Predict on Test Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"5024it [00:54, 101.88it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Prediction time : 0.015 hrs\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with Timer() as t:\n",
|
||||
" predictions_chinese = classifier_chinese.predict(token_ids=test_token_ids_chinese,\n",
|
||||
" input_mask=test_input_mask_chinese,\n",
|
||||
" token_type_ids=test_token_type_ids_chinese,\n",
|
||||
" batch_size=BATCH_SIZE)\n",
|
||||
"print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Evaluate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
"contradiction 0.81 0.84 0.82 1670\n",
|
||||
" entailment 0.84 0.68 0.76 1670\n",
|
||||
" neutral 0.70 0.80 0.74 1670\n",
|
||||
"\n",
|
||||
" accuracy 0.77 5010\n",
|
||||
" macro avg 0.78 0.77 0.77 5010\n",
|
||||
" weighted avg 0.78 0.77 0.77 5010\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"predictions_chinese = label_encoder_chinese.inverse_transform(predictions_chinese)\n",
|
||||
"print(classification_report(test_df_chinese[LABEL_COL], predictions_chinese))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Language Inference on Hindi\n",
|
||||
"For Hindi and all other languages except Chinese, we use the `bert-base-multilingual-cased` model. \n",
|
||||
"The preprocesing, model training, and prediction steps are the same as on Chinese data, except for the underlying tokenizer and BERT model used"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Tokenize and Preprocess"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 392702/392702 [03:48<00:00, 1719.84it/s]\n",
|
||||
"100%|██████████| 5010/5010 [00:02<00:00, 1916.46it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer_multi = Tokenizer(LANGUAGE_MULTI, cache_dir=CACHE_DIR)\n",
|
||||
"\n",
|
||||
"train_tokens_hindi = tokenizer_multi.tokenize(train_df_hindi[TEXT_COL])\n",
|
||||
"test_tokens_hindi= tokenizer_multi.tokenize(test_df_hindi[TEXT_COL])\n",
|
||||
"\n",
|
||||
"train_token_ids_hindi, train_input_mask_hindi, train_token_type_ids_hindi = \\\n",
|
||||
" tokenizer_multi.preprocess_classification_tokens(train_tokens_hindi, max_len=MAX_SEQ_LENGTH)\n",
|
||||
"test_token_ids_hindi, test_input_mask_hindi, test_token_type_ids_hindi = \\\n",
|
||||
" tokenizer_multi.preprocess_classification_tokens(test_tokens_hindi, max_len=MAX_SEQ_LENGTH)\n",
|
||||
"\n",
|
||||
"label_encoder_hindi = LabelEncoder()\n",
|
||||
"train_labels_hindi = label_encoder_hindi.fit_transform(train_df_hindi[LABEL_COL])\n",
|
||||
"num_labels_hindi = len(np.unique(train_labels_hindi))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create and Train Classifier"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/2; batch:1->1228/12271; loss:1.091754\n",
|
||||
"epoch:1/2; batch:1229->2456/12271; loss:0.992931\n",
|
||||
"epoch:1/2; batch:2457->3684/12271; loss:1.045146\n",
|
||||
"epoch:1/2; batch:3685->4912/12271; loss:0.799912\n",
|
||||
"epoch:1/2; batch:4913->6140/12271; loss:0.815425\n",
|
||||
"epoch:1/2; batch:6141->7368/12271; loss:0.564856\n",
|
||||
"epoch:1/2; batch:7369->8596/12271; loss:0.726981\n",
|
||||
"epoch:1/2; batch:8597->9824/12271; loss:0.764087\n",
|
||||
"epoch:1/2; batch:9825->11052/12271; loss:0.964115\n",
|
||||
"epoch:1/2; batch:11053->12271/12271; loss:0.502252\n",
|
||||
"epoch:2/2; batch:1->1228/12271; loss:0.601600\n",
|
||||
"epoch:2/2; batch:1229->2456/12271; loss:0.695099\n",
|
||||
"epoch:2/2; batch:2457->3684/12271; loss:0.419610\n",
|
||||
"epoch:2/2; batch:3685->4912/12271; loss:0.603106\n",
|
||||
"epoch:2/2; batch:4913->6140/12271; loss:0.705180\n",
|
||||
"epoch:2/2; batch:6141->7368/12271; loss:0.493404\n",
|
||||
"epoch:2/2; batch:7369->8596/12271; loss:0.864921\n",
|
||||
"epoch:2/2; batch:8597->9824/12271; loss:0.518601\n",
|
||||
"epoch:2/2; batch:9825->11052/12271; loss:0.395920\n",
|
||||
"epoch:2/2; batch:11053->12271/12271; loss:0.685858\n",
|
||||
"Training time : 9.520 hrs\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"classifier_multi = BERTSequenceClassifier(language=LANGUAGE_MULTI,\n",
|
||||
" num_labels=num_labels_hindi,\n",
|
||||
" cache_dir=CACHE_DIR)\n",
|
||||
"with Timer() as t:\n",
|
||||
" classifier_multi.fit(token_ids=train_token_ids_hindi,\n",
|
||||
" input_mask=train_input_mask_hindi,\n",
|
||||
" token_type_ids=train_token_type_ids_hindi,\n",
|
||||
" labels=train_labels_hindi,\n",
|
||||
" num_gpus=NUM_GPUS,\n",
|
||||
" num_epochs=NUM_EPOCHS,\n",
|
||||
" batch_size=BATCH_SIZE,\n",
|
||||
" lr=LEARNING_RATE,\n",
|
||||
" warmup_proportion=WARMUP_PROPORTION)\n",
|
||||
"print(\"Training time : {:.3f} hrs\".format(t.interval / 3600))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Predict and Evaluate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"5024it [01:02, 87.10it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Prediction time : 0.017 hrs\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
"contradiction 0.69 0.72 0.70 1670\n",
|
||||
" entailment 0.74 0.51 0.60 1670\n",
|
||||
" neutral 0.58 0.74 0.65 1670\n",
|
||||
"\n",
|
||||
" accuracy 0.65 5010\n",
|
||||
" macro avg 0.67 0.65 0.65 5010\n",
|
||||
" weighted avg 0.67 0.65 0.65 5010\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with Timer() as t:\n",
|
||||
" predictions_hindi = classifier_multi.predict(token_ids=test_token_ids_hindi,\n",
|
||||
" input_mask=test_input_mask_hindi,\n",
|
||||
" token_type_ids=test_token_type_ids_hindi,\n",
|
||||
" batch_size=BATCH_SIZE)\n",
|
||||
"print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))\n",
|
||||
"predictions_hindi= label_encoder_hindi.inverse_transform(predictions_hindi)\n",
|
||||
"print(classification_report(test_df_hindi[LABEL_COL], predictions_hindi))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -1,8 +1,25 @@
|
|||
# Named Entity Recognition (NER)
|
||||
|
||||
This folder contains examples and best practices, written in Jupyter notebooks, for building Named Entity Recognition models. The models can be used in a wide variety of applications, such as information extraction and filtering. It also plays an important role in other
|
||||
NLP tasks like question answering and text summarization.
|
||||
|
||||
## What is Named Entity Recognition (NER)
|
||||
|
||||
Named Entity Recognition (NER) is the task of detecting and classifying
|
||||
real-world objects mentioned in text. Common named entities include person
|
||||
names, locations, organizations, etc. The state-of-the art NER methods include
|
||||
combining Long Short-Term Memory neural network with Conditional Random Field
|
||||
(LSTM-CRF) and pretrained language models like BERT. NER can be used for
|
||||
information extraction and filtering. It also plays an important role in other
|
||||
NLP tasks like question answering and text summarization.
|
||||
names, locations, organizations, etc. The [state-of-the art](https://paperswithcode.com/task/named-entity-recognition-ner) NER methods include combining Long Short-Term Memory neural network with Conditional Random Field
|
||||
(LSTM-CRF) and pretrained language models like BERT.
|
||||
|
||||
The figure below illustrates how BERT can be fine tuned for NER tasks. The input data is a list of tokens representing a sentence. In the training data, each token has an entity label. After fine tuning, the model predicts an entity label for each token in a given testing sentence.
|
||||
|
||||
<p align="center">
|
||||
<img src="https://nlpbp.blob.core.windows.net/images/bert_architecture.png" alt=" Fine-tuned BERT for NER tasks"/>
|
||||
</p>
|
||||
|
||||
## Summary
|
||||
|
||||
The following summarizes each notebook for NER. Each notebook provides more details and guiding in principles on building state of the art models.
|
||||
|
||||
|Notebook|Runs Local|Description|
|
||||
|---|---|---|
|
||||
|[Bert](ner_wikigold_bert.ipynb)| Yes| Fine-tune a [pretrained BERT model](https://github.com/huggingface/pytorch-pretrained-BERT) using the [wikigold dataset](https://www.aclweb.org/anthology/W09-3302) for token classification.|
|
||||
|
|
|
@ -194,6 +194,7 @@
|
|||
"sys.path.append(\"../../\")\n",
|
||||
"import json\n",
|
||||
"from urllib.request import urlretrieve\n",
|
||||
"import scrapbook as sb\n",
|
||||
"\n",
|
||||
"#import utils\n",
|
||||
"from utils_nlp.common.timer import Timer\n",
|
||||
|
@ -211,6 +212,38 @@
|
|||
"print(\"Azure ML SDK Version:\", aml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PROJECT_FOLDER = \"./bidaf-question-answering\"\n",
|
||||
"SQUAD_FOLDER = \"./squad\"\n",
|
||||
"BIDAF_CONFIG_PATH = \".\"\n",
|
||||
"LOGS_FOLDER = '.'\n",
|
||||
"NUM_EPOCHS = 25\n",
|
||||
"PIP_PACKAGES = [\n",
|
||||
" \"allennlp==0.8.4\",\n",
|
||||
" \"azureml-sdk==1.0.48\",\n",
|
||||
" \"https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz\",\n",
|
||||
" ]\n",
|
||||
"CONDA_PACKAGES = [\"jsonnet\", \"cmake\", \"regex\", \"pytorch\", \"torchvision\"]\n",
|
||||
"config_path = (\n",
|
||||
" \"./.azureml\"\n",
|
||||
") # Path to the directory containing config.json with azureml credentials\n",
|
||||
"\n",
|
||||
"# Azure resources\n",
|
||||
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
|
||||
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
|
||||
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
|
||||
"workspace_region = \"YOUR_WORKSPACE_REGION\" #Possible values eastus, eastus2 and so on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
@ -240,14 +273,12 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.\n",
|
||||
"\n",
|
||||
"**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook"
|
||||
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. Then enter the configuration variables into the cell above."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -275,16 +306,17 @@
|
|||
],
|
||||
"source": [
|
||||
"ws = azureml_utils.get_or_create_workspace(\n",
|
||||
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
|
||||
" resource_group=\"<RESOURCE_GROUP>\",\n",
|
||||
" workspace_name=\"<WORKSPACE_NAME>\",\n",
|
||||
" workspace_region=\"<WORKSPACE_REGION>\",\n",
|
||||
" config_path=config_path,\n",
|
||||
" subscription_id=subscription_id,\n",
|
||||
" resource_group=resource_group,\n",
|
||||
" workspace_name=workspace_name,\n",
|
||||
" workspace_region=workspace_region,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -313,20 +345,19 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Make a folder for the project\n",
|
||||
"project_folder = \"./bidaf-question-answering\"\n",
|
||||
"os.makedirs(project_folder, exist_ok=True)\n",
|
||||
"os.makedirs(PROJECT_FOLDER, exist_ok=True)\n",
|
||||
"\n",
|
||||
"# Set up an experiment\n",
|
||||
"experiment_name = \"bidaf-question-answering\"\n",
|
||||
"experiment_name = \"NLP-QA-BiDAF-deepdive\"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"# Add logging to our experiment\n",
|
||||
"run = experiment.start_logging()"
|
||||
"run = experiment.start_logging(snapshot_directory=PROJECT_FOLDER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -347,7 +378,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -355,7 +386,7 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found existing compute target.\n",
|
||||
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-17T17:18:24.507000+00:00', 'errors': None, 'creationTime': '2019-07-09T16:20:30.625908+00:00', 'modifiedTime': '2019-07-09T16:20:46.601973+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
|
||||
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-23T16:18:34.392000+00:00', 'errors': None, 'creationTime': '2019-07-09T16:20:30.625908+00:00', 'modifiedTime': '2019-07-09T16:20:46.601973+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -404,31 +435,31 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"('squad/squad_dev.json', <http.client.HTTPMessage at 0x2640d393320>)"
|
||||
"('./squad/squad_dev.json', <http.client.HTTPMessage at 0x2646892de10>)"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"os.makedirs(\"squad\", exist_ok=True) # make squad folder locally\n",
|
||||
"os.makedirs(SQUAD_FOLDER, exist_ok=True) # make squad folder locally\n",
|
||||
"\n",
|
||||
"urlretrieve(\n",
|
||||
" \"https://allennlp.s3.amazonaws.com/datasets/squad/squad-train-v1.1.json\",\n",
|
||||
" filename=\"squad/squad_train.json\",\n",
|
||||
" filename=SQUAD_FOLDER+\"/squad_train.json\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"urlretrieve(\n",
|
||||
" \"https://allennlp.s3.amazonaws.com/datasets/squad/squad-dev-v1.1.json\",\n",
|
||||
" filename=\"squad/squad_dev.json\",\n",
|
||||
" filename=SQUAD_FOLDER+\"/squad_dev.json\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
@ -441,22 +472,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'squad\\\\bidaf_config.json'"
|
||||
"'./squad\\\\bidaf_config.json'"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"shutil.copy('bidaf_config.json', \"squad\")"
|
||||
"shutil.copy(BIDAF_CONFIG_PATH+'/bidaf_config.json', SQUAD_FOLDER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -468,7 +499,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -488,10 +519,10 @@
|
|||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"$AZUREML_DATAREFERENCE_64cd400292b5405d9deea6ee03786597"
|
||||
"$AZUREML_DATAREFERENCE_09a567b57ea546b697d8d7ce1bcf2d86"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -503,7 +534,7 @@
|
|||
"\n",
|
||||
"# Upload files in squad data folder to the datastore\n",
|
||||
"ds.upload(\n",
|
||||
" src_dir=\"./squad\", target_path=\"squad_data\", overwrite=True, show_progress=True\n",
|
||||
" src_dir=SQUAD_FOLDER, target_path=\"squad_data\", overwrite=True, show_progress=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
@ -530,19 +561,19 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Writing ./bidaf-question-answering/train.py\n"
|
||||
"Overwriting ./bidaf-question-answering/train.py\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%writefile $project_folder/train.py\n",
|
||||
"%%writefile $PROJECT_FOLDER/train.py\n",
|
||||
"import torch\n",
|
||||
"import argparse\n",
|
||||
"import os\n",
|
||||
|
@ -588,7 +619,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -597,26 +628,22 @@
|
|||
"'bidafenv.yml'"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"myenv = CondaDependencies.create(\n",
|
||||
" conda_packages=[\"jsonnet\", \"cmake\", \"regex\", \"pytorch\", \"torchvision\"],\n",
|
||||
" pip_packages=[\n",
|
||||
" \"allennlp==0.8.4\",\n",
|
||||
" \"azureml-sdk==1.0.48\",\n",
|
||||
" \"https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz\",\n",
|
||||
" ],\n",
|
||||
" conda_packages= CONDA_PACKAGES,\n",
|
||||
" pip_packages= PIP_PACKAGES,\n",
|
||||
" python_version=\"3.6.8\",\n",
|
||||
")\n",
|
||||
"myenv.add_channel(\"conda-forge\")\n",
|
||||
"myenv.add_channel(\"pytorch\")\n",
|
||||
"\n",
|
||||
"conda_env_file_name = \"bidafenv.yml\"\n",
|
||||
"myenv.save_to_file(project_folder, conda_env_file_name)"
|
||||
"myenv.save_to_file(PROJECT_FOLDER, conda_env_file_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -628,11 +655,11 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"overrides = {\"trainer\":{'num_epochs': 25}}\n",
|
||||
"overrides = {\"trainer\":{'num_epochs': NUM_EPOCHS}}\n",
|
||||
"overrides = json.dumps(overrides)"
|
||||
]
|
||||
},
|
||||
|
@ -645,7 +672,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -667,7 +694,7 @@
|
|||
"}\n",
|
||||
"\n",
|
||||
"estimator = PyTorch(\n",
|
||||
" source_directory=project_folder,\n",
|
||||
" source_directory=PROJECT_FOLDER,\n",
|
||||
" script_params=script_params,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" entry_script=\"train.py\",\n",
|
||||
|
@ -692,7 +719,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -700,7 +727,7 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"Run(Experiment: bidaf-question-answering,\n",
|
||||
"Id: bidaf-question-answering_1563384448_9b24b038,\n",
|
||||
"Id: bidaf-question-answering_1563899344_bce3c688,\n",
|
||||
"Type: azureml.scriptrun,\n",
|
||||
"Status: Starting)\n"
|
||||
]
|
||||
|
@ -713,13 +740,13 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "673919a19d314f628760e4c896992a34",
|
||||
"model_id": "3da61f9cf1a84f91ae23925843b584d7",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
|
@ -737,7 +764,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -756,7 +783,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -779,11 +806,11 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run.download_files(prefix=\"./logs/\", output_directory=\".\")"
|
||||
"run.download_files(prefix=\"./logs\", output_directory=LOGS_FOLDER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -802,24 +829,43 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.6152317880794702,
|
||||
"encoder": "json",
|
||||
"name": "validation_EM",
|
||||
"version": 1
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"scrapbook": {
|
||||
"data": true,
|
||||
"display": false,
|
||||
"name": "validation_EM"
|
||||
}
|
||||
},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.6674550614947966"
|
||||
"0.6152317880794702"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with open(\"./logs/metrics.json\") as f:\n",
|
||||
"with open(LOGS_FOLDER+\"/logs/metrics.json\") as f:\n",
|
||||
" metrics = json.load(f)\n",
|
||||
"\n",
|
||||
"sb.glue(\"validation_EM\", metrics[\"best_validation_em\"])\n",
|
||||
"metrics[\"best_validation_em\"]"
|
||||
]
|
||||
},
|
||||
|
@ -839,19 +885,19 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WARNING - _jsonnet not loaded, treating ./logs/config.json as json\n"
|
||||
"WARNING - _jsonnet not loaded, treating ./logs\\config.json as json\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = Predictor.from_path('./logs/')"
|
||||
"model = Predictor.from_path(LOGS_FOLDER+\"/logs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -863,7 +909,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -884,7 +930,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -893,7 +939,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -913,6 +959,7 @@
|
|||
}
|
||||
],
|
||||
"metadata": {
|
||||
"celltoolbar": "Tags",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
|
@ -928,7 +975,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -0,0 +1,103 @@
|
|||
{
|
||||
"dataset_reader": {
|
||||
"type": "squad",
|
||||
"token_indexers": {
|
||||
"tokens": {
|
||||
"type": "single_id",
|
||||
"lowercase_tokens": true
|
||||
},
|
||||
"token_characters": {
|
||||
"type": "characters",
|
||||
"character_tokenizer": {
|
||||
"byte_encoding": "utf-8",
|
||||
"start_tokens": [259],
|
||||
"end_tokens": [260]
|
||||
},
|
||||
"min_padding_length": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
"train_data_path": "https://allennlp.s3.amazonaws.com/datasets/squad/squad-train-v1.1.json",
|
||||
"validation_data_path": "https://allennlp.s3.amazonaws.com/datasets/squad/squad-dev-v1.1.json",
|
||||
"evaluate_on_test": true,
|
||||
"model": {
|
||||
"type": "bidaf",
|
||||
"text_field_embedder": {
|
||||
"token_embedders": {
|
||||
"tokens": {
|
||||
"type": "embedding",
|
||||
"pretrained_file": "https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.100d.txt.gz",
|
||||
"embedding_dim": 100,
|
||||
"trainable": false
|
||||
},
|
||||
"token_characters": {
|
||||
"type": "character_encoding",
|
||||
"embedding": {
|
||||
"num_embeddings": 262,
|
||||
"embedding_dim": 16
|
||||
},
|
||||
"encoder": {
|
||||
"type": "cnn",
|
||||
"embedding_dim": 16,
|
||||
"num_filters": 100,
|
||||
"ngram_filter_sizes": [5]
|
||||
},
|
||||
"dropout": 0.2
|
||||
}
|
||||
}
|
||||
},
|
||||
"num_highway_layers": 2,
|
||||
"phrase_layer": {
|
||||
"type": "lstm",
|
||||
"bidirectional": true,
|
||||
"input_size": 200,
|
||||
"hidden_size": 100,
|
||||
"num_layers": 1
|
||||
},
|
||||
"similarity_function": {
|
||||
"type": "linear",
|
||||
"combination": "x,y,x*y",
|
||||
"tensor_1_dim": 200,
|
||||
"tensor_2_dim": 200
|
||||
},
|
||||
"modeling_layer": {
|
||||
"type": "lstm",
|
||||
"bidirectional": true,
|
||||
"input_size": 800,
|
||||
"hidden_size": 100,
|
||||
"num_layers": 2,
|
||||
"dropout": 0.2
|
||||
},
|
||||
"span_end_encoder": {
|
||||
"type": "lstm",
|
||||
"bidirectional": true,
|
||||
"input_size": 1400,
|
||||
"hidden_size": 100,
|
||||
"num_layers": 1
|
||||
},
|
||||
"dropout": 0.2
|
||||
},
|
||||
"iterator": {
|
||||
"type": "bucket",
|
||||
"sorting_keys": [["passage", "num_tokens"], ["question", "num_tokens"]],
|
||||
"batch_size": 40
|
||||
},
|
||||
|
||||
"trainer": {
|
||||
"num_epochs": 20,
|
||||
"grad_norm": 5.0,
|
||||
"patience": 10,
|
||||
"validation_metric": "+em",
|
||||
"cuda_device": 0,
|
||||
"learning_rate_scheduler": {
|
||||
"type": "reduce_on_plateau",
|
||||
"factor": 0.5,
|
||||
"mode": "max",
|
||||
"patience": 2
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "adam",
|
||||
"betas": [0.9, 0.9]
|
||||
}
|
||||
}
|
||||
}
|
|
@ -135,9 +135,11 @@
|
|||
"import math\n",
|
||||
"import json \n",
|
||||
"import pandas as pd\n",
|
||||
"import papermill as pm\n",
|
||||
"#package for flattening json in pandas df\n",
|
||||
"from pandas.io.json import json_normalize\n",
|
||||
"import shutil\n",
|
||||
"import scrapbook as sb\n",
|
||||
"# Check core SDK version number\n",
|
||||
"import azureml.core\n",
|
||||
"from azureml.core import Datastore\n",
|
||||
|
@ -153,6 +155,35 @@
|
|||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Model configuration\n",
|
||||
"AZUREML_CONFIG_PATH = \"./.azureml\"\n",
|
||||
"DATA_FOLDER = './squad'\n",
|
||||
"PROJECT_FOLDER = './pytorch-transformers'\n",
|
||||
"EXPERIMENT_NAME = 'NLP-QA-BERT-deepdive'\n",
|
||||
"BERT_MODEL = 'bert-large-uncased'\n",
|
||||
"TARGET_GRADIENT_STEPS = 16\n",
|
||||
"INIT_GRADIENT_STEPS = 2\n",
|
||||
"MAX_SEQ_LENGTH = 384\n",
|
||||
"NUM_TRAIN_EPOCHS = 2.0\n",
|
||||
"NODE_COUNT = 2\n",
|
||||
"TRAIN_SCRIPT_PATH = 'bert_run_squad_azureml.py'\n",
|
||||
"MAX_TOTAL_RUNS = 8\n",
|
||||
"MAX_CONCURRENT_RUNS = 4\n",
|
||||
"BERT_UTIL_PATH = '../../utils_nlp/azureml/azureml_bert_util.py'\n",
|
||||
"EVALUATE_SQAD_PATH = '../../utils_nlp/eval/evaluate_squad.py'\n",
|
||||
"AZUREML_VERBOSE = False"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
@ -169,43 +200,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Performing interactive authentication. Please follow the instructions on the terminal.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WARNING - Note, we have launched a browser for you to login. For old experience with device code, use \"az login --use-device-code\"\n",
|
||||
"WARNING - You have logged in. Now let us find all the subscriptions to which you have access...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Interactive authentication successfully completed.\n",
|
||||
"Workspace name: MAIDAIPBERT-eastus\n",
|
||||
"Azure region: eastus\n",
|
||||
"Subscription id: 15ae9cb6-95c1-483d-a0e3-b1a1a3b06324\n",
|
||||
"Resource group: nlprg\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if os.path.exists(AZUREML_CONFIG_PATH):\n",
|
||||
" ws = azureml_utils.get_or_create_workspace(config_path=AZUREML_CONFIG_PATH)\n",
|
||||
"else:\n",
|
||||
" ws = azureml_utils.get_or_create_workspace(\n",
|
||||
" config_path=AZUREML_CONFIG_PATH,\n",
|
||||
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
|
||||
" resource_group=\"<RESOURCE_GROUP>\",\n",
|
||||
" workspace_name=\"<WORKSPACE_NAME>\",\n",
|
||||
" workspace_region=\"<WORKSPACE_REGION>\"\n",
|
||||
" workspace_region=\"<WORKSPACE_REGION>\",\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"if AZUREML_VERBOSE:\n",
|
||||
" print('Workspace name: ' + ws.name, \n",
|
||||
" 'Azure region: ' + ws.location, \n",
|
||||
" 'Subscription id: ' + ws.subscription_id, \n",
|
||||
|
@ -221,7 +231,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -255,11 +265,11 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_folder = './squad'"
|
||||
"data_folder = DATA_FOLDER"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -275,7 +285,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -288,6 +298,7 @@
|
|||
],
|
||||
"source": [
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"if AZUREML_VERBOSE:\n",
|
||||
" print(ds.datastore_type, ds.account_name, ds.container_name, ds.as_mount())"
|
||||
]
|
||||
},
|
||||
|
@ -309,16 +320,16 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"('./squad\\\\dev-v1.1.json', <http.client.HTTPMessage at 0x1e1a85ef198>)"
|
||||
"('./squad\\\\dev-v1.1.json', <http.client.HTTPMessage at 0x1569b645f28>)"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -339,7 +350,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -392,7 +403,7 @@
|
|||
"0 [{'answers': [{'answer_start': 515, 'text': 'S... "
|
||||
]
|
||||
},
|
||||
"execution_count": 57,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -412,7 +423,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -499,7 +510,7 @@
|
|||
"4 5733be284776f4190066117e What sits on top of the Main Building at Notre... "
|
||||
]
|
||||
},
|
||||
"execution_count": 58,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -525,32 +536,34 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 10,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WARNING - Target already exists. Skipping upload for ./squad\\dev-v1.1.json\n",
|
||||
"WARNING - Target already exists. Skipping upload for ./squad\\train-v1.1.json\n"
|
||||
"Uploading an estimated of 2 files\n",
|
||||
"Target already exists. Skipping upload for squad\\dev-v1.1.json\n",
|
||||
"Target already exists. Skipping upload for squad\\train-v1.1.json\n",
|
||||
"Uploaded 0 files\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"$AZUREML_DATAREFERENCE_5a4cead96ec140b8b5884e917df16e3a"
|
||||
"$AZUREML_DATAREFERENCE_972d18f476b34d26a1ffd6a11b473114"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ds.upload(src_dir='./squad', target_path='./squad')"
|
||||
"ds.upload(src_dir='./squad', target_path='./squad', show_progress=AZUREML_VERBOSE)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -595,7 +608,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -603,7 +616,7 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found existing compute target.\n",
|
||||
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-12T22:32:24.801000+00:00', 'errors': None, 'creationTime': '2019-07-12T19:59:45.933132+00:00', 'modifiedTime': '2019-07-12T20:00:01.793458+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}\n"
|
||||
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-22T22:38:04.496000+00:00', 'errors': None, 'creationTime': '2019-07-12T19:59:45.933132+00:00', 'modifiedTime': '2019-07-12T20:00:01.793458+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -638,11 +651,11 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"project_folder = './pytorch-transformers'"
|
||||
"project_folder = PROJECT_FOLDER"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -654,7 +667,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -694,49 +707,29 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's retrieve and copy the training script [bert_run_squad_azureml.py](.\\bert_run_squad_azureml.py), evaluation script for SQuAD v1.1 [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/498c8026d92a8bcf0286e2d216d092d444d02d76/squad/evaluate-v1.1.py) and the helper utility script for Horovod [azureml_bert_util.py](https://github.com/microsoft/AzureML-BERT/blob/master/PyTorch/azureml_bert_util.py) into our project directory."
|
||||
"Let's retrieve and copy the training script [bert_run_squad_azureml.py](.\\bert_run_squad_azureml.py), evaluation script for SQuAD v1.1 [evaluate-v1.1.py](../../utils_nlp/eval/evaluate_squad.py) and the helper utility script for Horovod [azureml_bert_util.py](../../utils_nlp/azureml/azureml_bert_util.py) into our project directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"('./pytorch-pretrained-BERT\\\\evaluate_squad.py',\n",
|
||||
" <http.client.HTTPMessage at 0x25103433c88>)"
|
||||
"'./pytorch-transformers\\\\bert_run_squad_azureml.py'"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"urllib.request.urlretrieve('https://raw.githubusercontent.com/allenai/bi-att-flow/498c8026d92a8bcf0286e2d216d092d444d02d76/squad/evaluate-v1.1.py', filename= project_folder + '\\\\evaluate_squad.py')\n",
|
||||
"urllib.request.urlretrieve('https://raw.githubusercontent.com/microsoft/AzureML-BERT/master/finetune/PyTorch/azureml_bert_util.py', filename= project_folder + '\\\\azureml_bert_util.py')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 61,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'./pytorch-pretrained-BERT\\\\bert_run_squad_azureml.py'"
|
||||
]
|
||||
},
|
||||
"execution_count": 61,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"shutil.copy('bert_run_squad_azureml.py', project_folder)"
|
||||
"shutil.copy(EVALUATE_SQAD_PATH, project_folder)\n",
|
||||
"shutil.copy(BERT_UTIL_PATH, project_folder)\n",
|
||||
"shutil.copy(TRAIN_SCRIPT_PATH, project_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -784,26 +777,26 @@
|
|||
"estimator = PyTorch(source_directory=project_folder,\n",
|
||||
" compute_target=gpu_compute_target,\n",
|
||||
" script_params = {\n",
|
||||
" '--bert_model':'bert-large-uncased',\n",
|
||||
" '--bert_model':BERT_MODEL,\n",
|
||||
" '--do_train' : '',\n",
|
||||
" '--do_predict': '',\n",
|
||||
" '--train_file': ds.path('squad/train-v1.1.json').as_mount(),\n",
|
||||
" '--predict_file': ds.path('squad/dev-v1.1.json').as_mount(),\n",
|
||||
" '--max_seq_length': 384,\n",
|
||||
" '--max_seq_length': MAX_SEQ_LENGTH,\n",
|
||||
" '--train_batch_size': 8,\n",
|
||||
" '--learning_rate': 6.8e-5,\n",
|
||||
" '--num_train_epochs': 2.0,\n",
|
||||
" '--num_train_epochs': NUM_TRAIN_EPOCHS,\n",
|
||||
" '--doc_stride': 128,\n",
|
||||
" '--seed': 32,\n",
|
||||
" '--init_gradient_accumulation_steps':2,\n",
|
||||
" '--target_gradient_accumulation_steps':16,\n",
|
||||
" '--init_gradient_accumulation_steps':INIT_GRADIENT_STEPS,\n",
|
||||
" '--target_gradient_accumulation_steps':TARGET_GRADIENT_STEPS,\n",
|
||||
" '--accumulation_warmup_proportion':0.25,\n",
|
||||
" '--output_dir': './outputs',\n",
|
||||
" '--loss_scale':256,\n",
|
||||
" },\n",
|
||||
" custom_docker_image='azuremlsamples/bert:torch-1.0.0-apex-cuda9',\n",
|
||||
" entry_script='bert_run_squad_azureml.py',\n",
|
||||
" node_count=2,\n",
|
||||
" node_count=NODE_COUNT,\n",
|
||||
" distributed_training=mpiConfig,\n",
|
||||
" framework_version='1.1',\n",
|
||||
" use_gpu=True)\n",
|
||||
|
@ -814,7 +807,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Note: You can try with `--bert_model':'bert-base-uncased`to run a smaller bert model faster.**"
|
||||
"**Note: You can try with `--bert_model:'bert-base-uncased'`to run a smaller bert model faster.**"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -831,7 +824,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"experiment_name = 'BERT-SQuAD'\n",
|
||||
"experiment_name = EXPERIMENT_NAME\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
|
@ -874,6 +867,15 @@
|
|||
"RunDetails(run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"_ = run.wait_for_completion(show_output=AZUREML_VERBOSE) # Block until complete"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
@ -936,7 +938,15 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"### 3.1 Start a hyperparameter sweep\n",
|
||||
"First, we will define the hyperparameter space to sweep over. In this example we will use random sampling to try different configuration sets of hyperparameter to minimize our primary metric, the f1 score (`f1`). For simplicity, we tune the BERT base model with `--bert_model':'bert-base-uncased` and `node_count=1`."
|
||||
"First, we will define the hyperparameter space to sweep over. In this example we will use random sampling to try different configuration sets of hyperparameter to minimize our primary metric, the f1 score (`f1`). For simplicity, we tune the BERT base model with `--bert_model':'bert-base-uncased` and `node_count=1`.\n",
|
||||
"\n",
|
||||
"We can also try with `BayesianParameterSampling` with suggested `max_total_runs=20`.\n",
|
||||
"```Python\n",
|
||||
"param_sampling = BayesianParameterSampling( {\n",
|
||||
" 'learning_rate': uniform(5e-5, 9e-5),\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -953,8 +963,8 @@
|
|||
" hyperparameter_sampling=param_sampling, \n",
|
||||
" primary_metric_name='f1',\n",
|
||||
" primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\n",
|
||||
" max_total_runs=8,\n",
|
||||
" max_concurrent_runs=4)"
|
||||
" max_total_runs=MAX_TOTAL_RUNS,\n",
|
||||
" max_concurrent_runs=MAX_CONCURRENT_RUNS)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -971,7 +981,8 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# start the HyperDrive run\n",
|
||||
"hyperdrive_run = experiment.submit(hyperdrive_config)"
|
||||
"hyperdrive_run = experiment.submit(hyperdrive_config)\n",
|
||||
"RunDetails(hyperdrive_run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -990,7 +1001,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RunDetails(hyperdrive_run).show()"
|
||||
"_ = hyperdrive_run.wait_for_completion(show_output=AZUREML_VERBOSE) # Block until complete"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1017,7 +1028,7 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"### 3.3 Find and register the best model\n",
|
||||
"Once all the runs complete, we can find the run that produced the model with the highest F1 score. The F1 score with default learning rate is **86.18** in [Submit and Monitor your run](#2.6-Submit-and-Monitor-your-run) . The best F1 score is **87.01** after tuning with `learning rate=0.000090`."
|
||||
"Once all the runs complete, we can find the run that produced the model with the highest F1 score. The F1 score with default learning rate is **86.18** in [Submit and Monitor your run](#2.6-Submit-and-Monitor-your-run) . The best F1 score is **87.01** after tuning with `learning rate=0.000090` with random sampling. With Bayesian sampling, the best F1 score is **86.87** after tuning with `learning rate=0.0000896`."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1042,10 +1053,20 @@
|
|||
"source": [
|
||||
"best_run = hyperdrive_run.get_best_run_by_primary_metric()\n",
|
||||
"best_run_metrics = best_run.get_metrics()\n",
|
||||
"print(best_run)\n",
|
||||
"print('Best Run is:\\n F1 score: %.2f \\n Learning rate: %f' % (best_run_metrics['f1'], best_run_metrics['lr']))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Persist properties of the run so we can access the logged metrics later\n",
|
||||
"sb.glue(\"f1\", best_run_metrics['f1'])\n",
|
||||
"sb.glue(\"learning_rate\", best_run_metrics['lr'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
@ -1063,6 +1084,7 @@
|
|||
"name": "minxia"
|
||||
}
|
||||
],
|
||||
"celltoolbar": "Tags",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
|
|
|
@ -49,6 +49,7 @@
|
|||
"sys.path.append(\"../../\")\n",
|
||||
"import json\n",
|
||||
"import urllib\n",
|
||||
"import scrapbook as sb\n",
|
||||
"\n",
|
||||
"#import utils\n",
|
||||
"from utils_nlp.common.timer import Timer\n",
|
||||
|
@ -63,7 +64,11 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"CPU_CORES = 1\n",
|
||||
|
@ -72,7 +77,18 @@
|
|||
"DEPLOYMENT_CONDA_PACKAGES = ['jsonnet','cmake','regex','pytorch','torchvision']\n",
|
||||
"DEPLOYMENT_PIP_PACKAGES = ['allennlp==0.8.4','azureml-sdk==1.0.48']\n",
|
||||
"CONTAINER_TAGS = {'area': \"nlp\", 'type': \"question-answering BiDAF\"}\n",
|
||||
"MODEL_TAGS = {\"bidaf\": \"demo\"}"
|
||||
"MODEL_TAGS = {\"bidaf\": \"demo\"}\n",
|
||||
"config_path = (\n",
|
||||
" \"./.azureml\"\n",
|
||||
") # Path to the directory containing config.json with azureml credentials\n",
|
||||
"\n",
|
||||
"webservice_name = \"aci-bidaf-service\" #name for webservice; must be unique within your workspace\n",
|
||||
"\n",
|
||||
"# Azure resources\n",
|
||||
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
|
||||
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
|
||||
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
|
||||
"workspace_region = \"YOUR_WORKSPACE_REGION\" #Possible values eastus, eastus2 and so on."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -93,9 +109,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML Workspace. This will create a config.json file containing the values needed below to create a workspace.\n",
|
||||
"\n",
|
||||
"**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook"
|
||||
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML Workspace. Enter the configuration values in the parameter cell above."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -130,10 +144,11 @@
|
|||
],
|
||||
"source": [
|
||||
"ws = azureml_utils.get_or_create_workspace(\n",
|
||||
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
|
||||
" resource_group=\"<RESOURCE_GROUP>\",\n",
|
||||
" workspace_name=\"<WORKSPACE_NAME>\",\n",
|
||||
" workspace_region=\"<WORKSPACE_REGION>\"\n",
|
||||
" config_path=config_path,\n",
|
||||
" subscription_id=subscription_id,\n",
|
||||
" resource_group=resource_group,\n",
|
||||
" workspace_name=workspace_name,\n",
|
||||
" workspace_region=workspace_region,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
@ -459,7 +474,7 @@
|
|||
"source": [
|
||||
"# deploy image as web service\n",
|
||||
"aci_service = Webservice.deploy_from_image(workspace = ws, \n",
|
||||
" name = 'bidaf-aci-service-1',\n",
|
||||
" name = webservice_name,\n",
|
||||
" image = image,\n",
|
||||
" deployment_config = aci_config)\n",
|
||||
"\n",
|
||||
|
@ -593,6 +608,7 @@
|
|||
"result = json.loads(score)\n",
|
||||
"try:\n",
|
||||
" output = result[\"result\"]\n",
|
||||
" sb.glue(\"answer\", output)\n",
|
||||
" print(\"Answer:\", output)\n",
|
||||
"except:\n",
|
||||
" print(result[\"error\"])"
|
||||
|
@ -677,10 +693,23 @@
|
|||
}
|
||||
],
|
||||
"metadata": {
|
||||
"celltoolbar": "Tags",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -21,7 +21,8 @@ The following summarizes each notebook for Sentence Similarity. Each notebook pr
|
|||
|Notebook|Runs Local|Description|
|
||||
|---|---|---|
|
||||
|[Creating a Baseline model](baseline_deep_dive.ipynb)| Yes| A baseline model is a basic solution that serves as a point of reference for comparing other models to. The baseline model's performance gives us an indication of how much better our models can perform relative to a naive approach.|
|
||||
|Senteval |[Local](senteval_local.ipynb), [AzureML](senteval_azureml.ipynb)|SentEval is a widely used benchmarking tool for evaluating general-purpose sentence embeddings. Running SentEval locally is easy, but not necessarily efficient depending on the model specs. We provide an example on how to do this efficiently in Azure Machine Learning Service. |
|
||||
|[BERT Sentence Encoder](bert_encoder.ipynb)|Yes|In this notebook, we show how to extract features from pretrained BERT as sentence embeddings.|
|
||||
|[BERT with SentEval](bert_senteval.ipynb)|No|In this notebook, we show how to use SentEval to compare the performance of BERT sequence encodings with various pooling strategies on a sentence similarity task. We leverage AzureML resources such as Datastore and AmlCompute to autoscale our compute cluster and run the experiments in parallel.|
|
||||
|Gensen | [Local](gensen_local.ipynb), [AzureML](gensen_aml_deep_dive.ipynb)|This notebook serves as an introduction to an end-to-end NLP solution for sentence similarity building one of the State of the Art models, GenSen. We provide two notebooks. One, which runs on the AzureML platform. We show the advantages of AzureML when training large NLP models with GPU in this notebook. The other example walks through using a GPU enabled VM to train and score Gensen.|
|
||||
|[Automated Machine Learning(AutoML) with Deployment on Azure Container Instance](automl_local_deployment_aci.ipynb)| Yes |This notebook shows users how to use AutoML on local machine and deploy the model as a webservice to Azure Container Instance(ACI) to get a sentence similarity score.
|
||||
|[Google Universal Sentence Encoder with Azure Machine Learning Pipeline, AutoML with Deployment on Azure Kubernetes Service](automl_with_pipelines_deployment_aks.ipynb)| No | This notebook shows a user how to use AzureML pipelines and deploy the pipeline output model as a webservice to Azure Kubernetes Service which can be used as an end point to get sentence similarity scores.|
|
||||
|
|
|
@ -125,6 +125,7 @@
|
|||
"from scipy.spatial import distance\n",
|
||||
"from sklearn.externals import joblib\n",
|
||||
"import json\n",
|
||||
"import scrapbook as sb\n",
|
||||
"\n",
|
||||
"# Import utils\n",
|
||||
"from utils_nlp.azureml import azureml_utils\n",
|
||||
|
@ -162,12 +163,36 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"BASE_DATA_PATH = \"../../data\"\n",
|
||||
"CPU_CORES = 1\n",
|
||||
"MEMORY_GB = 8"
|
||||
"MEMORY_GB = 8\n",
|
||||
"\n",
|
||||
"# Define the settings for AutoML\n",
|
||||
"automl_task = \"regression\"\n",
|
||||
"automl_iteration_timeout = 15\n",
|
||||
"automl_iterations = 50\n",
|
||||
"automl_metric = \"spearman_correlation\"\n",
|
||||
"automl_preprocess = True\n",
|
||||
"automl_model_blacklist = ['XGBoostRegressor']\n",
|
||||
"\n",
|
||||
"config_path = (\n",
|
||||
" \"./.azureml\"\n",
|
||||
") # Path to the directory containing config.json with azureml credentials\n",
|
||||
"\n",
|
||||
"webservice_name = \"aci-automl-service\" #name for webservice; must be unique within your workspace\n",
|
||||
"\n",
|
||||
"# Azure resources\n",
|
||||
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
|
||||
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
|
||||
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
|
||||
"workspace_region = \"YOUR_WORKSPACE_REGION\" #Possible values eastus, eastus2 and so on."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -176,16 +201,15 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define the settings for AutoML\n",
|
||||
"automl_settings = {\n",
|
||||
" \"task\": \"regression\", # type of task: classification, regression or forecasting\n",
|
||||
" \"task\": automl_task, # type of task: classification, regression or forecasting\n",
|
||||
" \"debug_log\": \"automated_ml_errors.log\",\n",
|
||||
" \"path\": \"./automated-ml-regression\",\n",
|
||||
" \"iteration_timeout_minutes\": 15, # How long each iteration can take before moving on\n",
|
||||
" \"iterations\": 50, # Number of algorithm options to try\n",
|
||||
" \"primary_metric\": \"spearman_correlation\", # Metric to optimize\n",
|
||||
" \"preprocess\": True, # Whether dataset preprocessing should be applied\n",
|
||||
" \"blacklist_models\": ['XGBoostRegressor'] #exclude this model due to installation issues\n",
|
||||
" \"iteration_timeout_minutes\": automl_iteration_timeout, # How long each iteration can take before moving on\n",
|
||||
" \"iterations\": automl_iterations, # Number of algorithm options to try\n",
|
||||
" \"primary_metric\": automl_metric, # Metric to optimize\n",
|
||||
" \"preprocess\": automl_preprocess, # Whether dataset preprocessing should be applied\n",
|
||||
" \"blacklist_models\": automl_model_blacklist #exclude this model due to installation issues\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
|
@ -429,9 +453,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.\n",
|
||||
"\n",
|
||||
"**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook"
|
||||
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. Enter the configuration values in the cell above."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -464,10 +486,11 @@
|
|||
],
|
||||
"source": [
|
||||
"ws = azureml_utils.get_or_create_workspace(\n",
|
||||
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
|
||||
" resource_group=\"<RESOURCE_GROUP>\",\n",
|
||||
" workspace_name=\"<WORKSPACE_NAME>\",\n",
|
||||
" workspace_region=\"<WORKSPACE_REGION>\",\n",
|
||||
" config_path=config_path,\n",
|
||||
" subscription_id=subscription_id,\n",
|
||||
" resource_group=resource_group,\n",
|
||||
" workspace_name=workspace_name,\n",
|
||||
" workspace_region=workspace_region,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
@ -570,7 +593,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"experiment = Experiment(ws, \"automated-ml-regression\")\n",
|
||||
"experiment = Experiment(ws, \"NLP-SS-automl\")\n",
|
||||
"local_run = experiment.submit(automated_ml_config, show_output=True)"
|
||||
]
|
||||
},
|
||||
|
@ -762,7 +785,6 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"experiment = Experiment(ws, \"automated-ml-regression\")\n",
|
||||
"ml_run = AutoMLRun(experiment=experiment, run_id=local_run.id)"
|
||||
]
|
||||
},
|
||||
|
@ -980,7 +1002,7 @@
|
|||
"source": [
|
||||
"# deploy image as web service\n",
|
||||
"aci_service = Webservice.deploy_from_image(\n",
|
||||
" workspace=ws, name=\"aci-automl-service-8\", image=image, deployment_config=aci_config\n",
|
||||
" workspace=ws, name=webservice_name, image=image, deployment_config=aci_config\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aci_service.wait_for_deployment(show_output=True)\n",
|
||||
|
@ -1141,6 +1163,26 @@
|
|||
" print(result[\"error\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get Pearson Correlation\n",
|
||||
"pearson = pearsonr(output, test_y)[0]\n",
|
||||
"print(pearson)\n",
|
||||
"\n",
|
||||
"sb.glue(\"pearson_correlation\", pearson)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The goal of this notebook is to demonstrate how to use AutoML locally and then deploy the model to Azure Container Instance quickly. The model utilizes the built-in capabilities of AutoML to embed our sentences. The model performance on its own, without tweaking, is not very strong with this particular dataset. For a more advanced model, see [AutoML with Pipelines Deployment AKS](automl_with_pipelines_deployment_aks.ipynb) for much stronger performance on the same task. This notebook utilizes AzureML Pipelines to explicitly embed our sentences using the Google Universal Sentence Encoder (USE) model. For our dataset, the Google USE embeddings result in superior model performance."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
@ -1155,7 +1197,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1173,7 +1215,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1191,7 +1233,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1205,16 +1247,17 @@
|
|||
"source": [
|
||||
"As mentioned above, Azure Container Instances tend to be used to develop and test deployments. They are typically configured with CPUs, which usually suffice when the number of requests per second is not too high. When working with several instances, we can configure them further by specifically allocating CPU resources to each of them.\n",
|
||||
"\n",
|
||||
"For production requirements, i.e. when > 100 requests per second are expected, we recommend deploying models to Azure Kubernetes Service (AKS). It is a convenient infrastructure as it manages hosted Kubernetes environments, and makes it easy to deploy and manage containerized applications without container orchestration expertise. It also supports deployments with CPU clusters and deployments with GPU clusters.For more examples on deployment follow [MachineLearningNotebooks](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/deployment) github repository.\n",
|
||||
"\n",
|
||||
"For production requirements, i.e. when > 100 requests per second are expected, we recommend deploying models to Azure Kubernetes Service (AKS). It is a convenient infrastructure as it manages hosted Kubernetes environments, and makes it easy to deploy and manage containerized applications without container orchestration expertise. It also supports deployments with CPU clusters and deployments with GPU clusters.\n",
|
||||
"\n",
|
||||
"To see an example with Azure Kubernetes Service example, go to [this notebook](automl_with_pipelines_deployment_aks.ipynb)\n",
|
||||
"## Next Steps\n",
|
||||
"\n",
|
||||
"For more examples on deployment follow [MachineLearningNotebooks](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/deployment) github repository."
|
||||
"Check out [AutoML with Pipelines Deployment AKS](automl_with_pipelines_deployment_aks.ipynb) to see how to construct a AzureML Pipeline with an embedding step (using Google Universal Sentence Encoder model) and an AutoMLStep, increasing our Pearson correlation score. Also, this notebooks demonstrates deployment using AKS versus ACI."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"celltoolbar": "Tags",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This notebook demonstrates how to use [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-service/\n",
|
||||
"This notebook builds off of the [AutoML Local Deployment ACI](automl_local_deployment_aci.ipynb) notebook and demonstrates how to use [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-service/\n",
|
||||
") pipelines and Automated Machine Learning ([AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml\n",
|
||||
")) to streamline the creation of a machine learning workflow for predicting sentence similarity. The pipeline contains two steps: \n",
|
||||
"1. PythonScriptStep: embeds sentences using a popular sentence embedding model, Google Universal Sentence Encoder\n",
|
||||
|
@ -228,7 +228,11 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_settings = {\n",
|
||||
|
@ -239,7 +243,17 @@
|
|||
" \"preprocess\": True, # Whether dataset preprocessing should be applied\n",
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
" \"blacklist_models\": ['XGBoostRegressor'] #this model is blacklisted due to installation issues\n",
|
||||
"}"
|
||||
"}\n",
|
||||
"\n",
|
||||
"config_path = (\n",
|
||||
" \"./.azureml\"\n",
|
||||
") # Path to the directory containing config.json with azureml credentials\n",
|
||||
"\n",
|
||||
"# Azure resources\n",
|
||||
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
|
||||
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
|
||||
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
|
||||
"workspace_region = \"YOUR_WORKSPACE_REGION\" #Possible values eastus, eastus2 and so on."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -494,9 +508,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.\n",
|
||||
"\n",
|
||||
"**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook"
|
||||
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. Enter the configuration values in the cell above."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -529,10 +541,11 @@
|
|||
],
|
||||
"source": [
|
||||
"ws = azureml_utils.get_or_create_workspace(\n",
|
||||
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
|
||||
" resource_group=\"<RESOURCE_GROUP>\",\n",
|
||||
" workspace_name=\"<WORKSPACE_NAME>\",\n",
|
||||
" workspace_region=\"<WORKSPACE_REGION>\",\n",
|
||||
" config_path=config_path,\n",
|
||||
" subscription_id=subscription_id,\n",
|
||||
" resource_group=resource_group,\n",
|
||||
" workspace_name=workspace_name,\n",
|
||||
" workspace_region=workspace_region,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
@ -569,7 +582,7 @@
|
|||
"os.makedirs(project_folder, exist_ok=True)\n",
|
||||
"\n",
|
||||
"# Set up an experiment\n",
|
||||
"experiment_name = \"automl-sentence-similarity\"\n",
|
||||
"experiment_name = \"NLP-SS-googleUSE\"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"# Add logging to our experiment\n",
|
||||
|
@ -587,7 +600,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To use AzureML Pipelines we need to link a compute target as they can not be run locally (see [compute options](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#supported-compute-targets) for explanation of the different options). We will use an AmlCompute target in this example."
|
||||
"To use AzureML Pipelines we need to link a compute target as they can not be run locally. The different options include AmlCompute, Azure Databricks, Remote VMs, etc. All [compute options](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#supported-compute-targets) can be found in this table with details about whether the given options work with automated ML, pipelines, and GPU. For the following example, we will use an AmlCompute target because it supports Azure Pipelines and GPU. "
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -738,7 +751,9 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"format": "row"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
|
@ -758,9 +773,8 @@
|
|||
"conda_run_config.environment.docker.enabled = True\n",
|
||||
"conda_run_config.environment.docker.base_image = aml.core.runconfig.DEFAULT_CPU_IMAGE\n",
|
||||
"\n",
|
||||
"# Use conda_dependencies.yml to create a conda environment in the Docker image for execution\n",
|
||||
"# Specify our own conda dependencies for the execution environment\n",
|
||||
"conda_run_config.environment.python.user_managed_dependencies = False\n",
|
||||
"\n",
|
||||
"conda_run_config.environment.python.conda_dependencies = CondaDependencies.create(\n",
|
||||
" pip_packages=[\n",
|
||||
" \"azureml-sdk[automl]==1.0.48\",\n",
|
||||
|
@ -961,7 +975,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This step defines the `PythonScriptStep`. We give the step a name, tell the step which python script to run (embed.py) and what directory that script is located in (source_directory). Note that the hash_paths parameter will be deprecated but currently is needed to check for any updates to the embed.py file.\n",
|
||||
"This step defines the `PythonScriptStep`. We give the step a name, tell the step which python script to run (embed.py) and what directory that script is located in (source_directory). \n",
|
||||
"\n",
|
||||
"We also link the compute target and run configuration that we made previously. Our input is the `DataReference` object (input_data) where our raw sentence data was uploaded and our ouput is the `PipelineData` object (embedded_data) where the embedded data produced by this step will be stored. These are also passed in as arguments so that we have access to the correct data paths."
|
||||
]
|
||||
|
@ -1011,7 +1025,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Define the get_data.py file and get_data() function that the `AutoMLStep` will execute to collect data. Note that we can directly access the path of the intermediate data (called embedded_data) through `os.environ['AZUREML_DATAREFERENCE_embedded_data']`. This is necessary because the AutoMLStep does not accept additional parameters like the PythonScriptStep does with `arguments`."
|
||||
"Define the get_data.py file and get_data() function that the `AutoMLStep` will execute to collect data. When AutoML is used with a remote compute, the data can not be passed directly as parameters. Rather, a get_data function must be defined to access the data (see [this resource](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-auto-train-remote) for further details). Note that we can directly access the path of the intermediate data (called embedded_data) through `os.environ['AZUREML_DATAREFERENCE_embedded_data']`. This is necessary because the AutoMLStep does not accept additional parameters like the PythonScriptStep does with `arguments`."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1104,7 +1118,7 @@
|
|||
" run_configuration=conda_run_config,\n",
|
||||
" data_script=project_folder\n",
|
||||
" + \"/get_data.py\", # local path to script with get_data() function\n",
|
||||
" **automl_settings #where the autoML main settings are defined\n",
|
||||
" **automl_settings #where the AutoML main settings are defined\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
@ -1119,7 +1133,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally, we create `PipelineData` objects for the model data (our outputs) and then create the `AutoMLStep`. The `AutoMLStep` requires a `AutoMLConfig` object and we pass our intermediate data (embedded_data) in as the inputs. Again, note that the hash_paths parameter will be deprecated but currently is needed to check for any updates to the get_data.py file."
|
||||
"Finally, we create `PipelineData` objects for the model data (our outputs) and then create the `AutoMLStep`. The `AutoMLStep` requires a `AutoMLConfig` object and we pass our intermediate data (embedded_data) in as the inputs. "
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1528,7 +1542,6 @@
|
|||
" scores: list of target variables\n",
|
||||
" \"\"\"\n",
|
||||
" google_USE_emb1, google_USE_emb2 = google_encoder(dataset)\n",
|
||||
" n_google = google_USE_emb1.shape[1] # length of the embeddings\n",
|
||||
" return np.concatenate((google_USE_emb1, google_USE_emb2), axis=1)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
|
@ -1751,7 +1764,8 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We are now ready to deploy our web service. We will deploy from the Docker image. It contains our AutoML model as well as the Google Universal Sentence Encoder model and the conda environment needed for the scoring script to work properly. The parameters to pass to the Webservice.deploy_from_image() command are similar to those used for the deployment on ACI. The only major difference is the compute target (aks_target), i.e. the CPU cluster we just spun up.\n",
|
||||
"We are now ready to deploy our web service. We will deploy from the Docker image. It contains our AutoML model as well as the Google Universal Sentence Encoder model and the conda environment needed for the scoring script to work properly. The parameters to pass to the Webservice.deploy_from_image() command are similar to those used for deployment on Azure Container Instance ([ACI](https://azure.microsoft.com/en-us/services/container-instances/\n",
|
||||
")). The only major difference is the compute target (aks_target), i.e. the CPU cluster we just spun up.\n",
|
||||
"\n",
|
||||
"**Note:** This deployment takes a few minutes to complete."
|
||||
]
|
||||
|
@ -1947,11 +1961,13 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"## Conclusion\n",
|
||||
"\n",
|
||||
"This notebook demonstrated how to use AzureML Pipelines and AutoML to streamline the creation of a machine learning workflow for predicting sentence similarity. After creating the pipeline, the notebook demonstrated the deployment of our sentence similarity model using AKS. The model results reported in this notebook (using Google USE embeddings) are much stronger than the results from using AutoML with its built-in embedding capabilities (as in [AutoML Local Deployment ACI](automl_local_deployment_aci.ipynb)). "
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
|
|
@ -0,0 +1,253 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Sentence Similarity with Pretrained BERT\n",
|
||||
"In this notebook, we use pretrained [BERT](https://arxiv.org/abs/1810.04805) as a sentence encoder to measure sentence similarity. We use a [feature extractor](../../utils_nlp/bert/extract_features.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of Google's [BERT](https://github.com/google-research/bert). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 00 Global Settings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"import torch\n",
|
||||
"import itertools\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import scrapbook as sb\n",
|
||||
"from collections import OrderedDict\n",
|
||||
"\n",
|
||||
"sys.path.append(\"../../\")\n",
|
||||
"from utils_nlp.models.bert.common import Language, Tokenizer\n",
|
||||
"from utils_nlp.models.bert.sequence_encoding import BERTSentenceEncoder, PoolingStrategy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# device config\n",
|
||||
"NUM_GPUS = 0\n",
|
||||
"\n",
|
||||
"# model config\n",
|
||||
"LANGUAGE = Language.ENGLISH\n",
|
||||
"TO_LOWER = True\n",
|
||||
"MAX_SEQ_LENGTH = 128\n",
|
||||
"LAYER_INDEX = -2\n",
|
||||
"POOLING_STRATEGY = PoolingStrategy.MEAN\n",
|
||||
"\n",
|
||||
"# path config\n",
|
||||
"CACHE_DIR = \"./temp\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if not os.path.exists(CACHE_DIR):\n",
|
||||
" os.makedirs(CACHE_DIR, exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 01 Define the Sentence Encoder with Pretrained BERT"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The `BERTSentenceEncoder` defaults to Pretrained BERT."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 407873900/407873900 [00:15<00:00, 26602678.27B/s]\n",
|
||||
"100%|██████████| 231508/231508 [00:00<00:00, 905295.88B/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"se = BERTSentenceEncoder(\n",
|
||||
" language=LANGUAGE,\n",
|
||||
" num_gpus=NUM_GPUS,\n",
|
||||
" cache_dir=CACHE_DIR,\n",
|
||||
" to_lower=TO_LOWER,\n",
|
||||
" max_len=MAX_SEQ_LENGTH,\n",
|
||||
" layer_index=LAYER_INDEX,\n",
|
||||
" pooling_strategy=POOLING_STRATEGY,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 02 Compute the Sentence Encodings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The `encode` method of the sentence encoder accepts a list of text to encode, as well as the layers we want to extract the embeddings from and the pooling strategy we want to use. The embedding size is 768. We can also return just the values column as a list of numpy arrays by setting the `as_numpy` parameter to True."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 2/2 [00:00<00:00, 2917.78it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>text_index</th>\n",
|
||||
" <th>layer_index</th>\n",
|
||||
" <th>values</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>-2</td>\n",
|
||||
" <td>[0.038080588, 0.0926698, 0.0366186, -0.1218368...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>-2</td>\n",
|
||||
" <td>[0.084241375, 0.099506006, -0.38437817, 0.2164...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" text_index layer_index values\n",
|
||||
"0 0 -2 [0.038080588, 0.0926698, 0.0366186, -0.1218368...\n",
|
||||
"1 1 -2 [0.084241375, 0.099506006, -0.38437817, 0.2164..."
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result = se.encode(\n",
|
||||
" [\"Coffee is good\", \"The moose is across the street\"],\n",
|
||||
" as_numpy=False\n",
|
||||
")\n",
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 768,
|
||||
"encoder": "json",
|
||||
"name": "result",
|
||||
"version": 1
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"scrapbook": {
|
||||
"data": true,
|
||||
"display": false,
|
||||
"name": "result"
|
||||
}
|
||||
},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# for testing\n",
|
||||
"size_emb = len(result[\"values\"].iloc[0])\n",
|
||||
"sb.glue(\"size_emb\", size_emb)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"celltoolbar": "Tags",
|
||||
"kernelspec": {
|
||||
"display_name": "Python (nlp_gpu)",
|
||||
"language": "python",
|
||||
"name": "nlp_gpu"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -2,11 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "75caf421-c00a-4d6d-8a3d-47ebe7493af5"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
|
@ -16,11 +12,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "0738bb22-14af-45ca-9ad7-e0c068f280cf"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# GenSen with Pytorch\n",
|
||||
"In this tutorial, you will train a GenSen model for the sentence similarity task. We use the [SNLI](https://nlp.stanford.edu/projects/snli/) dataset in this example. For a more detailed walkthrough about data processing jump to [SNLI Data Prep](../01-prep-data/snli.ipynb). A quickstart version of this notebook can be found [here](../00-quick-start/)\n",
|
||||
|
@ -59,21 +51,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "e91468d4-7bb8-469b-95a6-4e6f4dfcdf55"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 0. Global Settings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 11,
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "a6e277ee-edbb-44a5-81d4-93565d2f3a83"
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
|
@ -92,46 +80,50 @@
|
|||
"\n",
|
||||
"import os\n",
|
||||
"import papermill as pm\n",
|
||||
"import scrapbook as sb\n",
|
||||
"\n",
|
||||
"from utils_nlp.dataset.preprocess import to_lowercase, to_nltk_tokens\n",
|
||||
"from utils_nlp.dataset import snli, preprocess\n",
|
||||
"from scenarios.sentence_similarity.gensen_wrapper import GenSenClassifier\n",
|
||||
"from utils_nlp.models.pretrained_embeddings.glove import download_and_extract\n",
|
||||
"from utils_nlp.dataset import Split\n",
|
||||
"from scenarios.sentence_similarity.gensen_wrapper import GenSenClassifier\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(\"System version: {}\".format(sys.version))\n",
|
||||
"BASE_DATA_PATH = '../../data'"
|
||||
"print(\"System version: {}\".format(sys.version))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"max_epoch = None\n",
|
||||
"config_filepath = 'gensen_config.json'\n",
|
||||
"base_data_path = '../../data'\n",
|
||||
"nrows = None"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "aee768e5-f317-4dfb-807c-cb4f5f0c0204"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1. Data Preparation and inspection"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "4c570c1b-0e4e-41e9-aa27-5ab1ce8c13a1"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The [SNLI](https://nlp.stanford.edu/projects/snli/) corpus (version 1.0) is a collection of 570k human-written English sentence pairs manually labeled for balanced classification with the labels entailment, contradiction, and neutral, supporting the task of natural language inference (NLI), also known as recognizing textual entailment (RTE). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "99c241e1-2f23-4fb3-9d3c-8f479c6b0030"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 1.1 Load the dataset\n",
|
||||
"\n",
|
||||
|
@ -152,10 +144,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "5952e06d-1dae-462d-8fce-66eb7ef536dd"
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
|
@ -337,38 +329,34 @@
|
|||
"4 2267923837.jpg#2r1e entailment NaN NaN NaN NaN "
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n",
|
||||
"dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\")\n",
|
||||
"test = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"test\")\n",
|
||||
"train = snli.load_pandas_df(base_data_path, file_split=Split.TRAIN, nrows=nrows)\n",
|
||||
"dev = snli.load_pandas_df(base_data_path, file_split=Split.DEV, nrows=nrows)\n",
|
||||
"test = snli.load_pandas_df(base_data_path, file_split=Split.TEST, nrows=nrows)\n",
|
||||
"\n",
|
||||
"train.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "6d5f7565-1f84-4489-8d06-dabd6bd99190"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 1.2 Tokenize\n",
|
||||
"\n",
|
||||
"We have loaded the dataset into pandas.DataFrame, we now convert sentences to tokens. We also clean the data before tokenizing. This includes dropping unneccessary columns and renaming the relevant columns as score, sentence_1, and sentence_2. Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK] (https://www.nltk.org/) library for tokenization."
|
||||
"We have loaded the dataset into pandas.DataFrame, we now convert sentences to tokens. We also clean the data before tokenizing. This includes dropping unneccessary columns and renaming the relevant columns as score, sentence_1, and sentence_2."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "e6160617-03f0-4809-9360-8b040dc4395f"
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
|
@ -385,12 +373,19 @@
|
|||
"test = clean_and_tokenize(test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK] (https://www.nltk.org/) library for tokenization."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "4912b609-8141-4212-a6ad-814d73f724ed"
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
|
@ -497,7 +492,7 @@
|
|||
"4 [two, kids, at, a, ballgame, wash, their, hand... "
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -508,11 +503,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "59494d88-c7c9-4efc-a191-f16d6ac2ac40"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 2. Model application, performance and analysis of the results\n",
|
||||
"The model has been implemented as a GenSen class with the specifics hidden inside the fit() method, so that no explicit call is needed. The algorithm operates in three different steps:\n",
|
||||
|
@ -538,8 +529,12 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"execution_count": 14,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
|
@ -550,69 +545,47 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"pretrained_embedding_path = download_and_extract(BASE_DATA_PATH)"
|
||||
"pretrained_embedding_path = download_and_extract(base_data_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "ab565124-43de-4862-b286-2b5db3a868fe"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.1 Initialize Model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 15,
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "641a9c74-974c-4aac-8c16-3b44d686f0f3"
|
||||
},
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The autoreload extension is already loaded. To reload it, use:\n",
|
||||
" %reload_ext autoreload\n"
|
||||
]
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
],
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"\n",
|
||||
"config_filepath = 'gensen_config.json'\n",
|
||||
"clf = GenSenClassifier(config_file = config_filepath, \n",
|
||||
" pretrained_embedding_path = pretrained_embedding_path,\n",
|
||||
" learning_rate = 0.0001, \n",
|
||||
" cache_dir=BASE_DATA_PATH)"
|
||||
" cache_dir=base_data_path,\n",
|
||||
" max_epoch=max_epoch)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "5f87d13c-d04f-4d38-820e-fb82082153c4"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.2 Train Model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"nbpresent": {
|
||||
"id": "6ea45671-c7a5-4fe8-a450-8b54161f26c5"
|
||||
},
|
||||
"scrolled": false
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -621,7 +594,7 @@
|
|||
"text": [
|
||||
"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/modules/rnn.py:46: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.8 and num_layers=1\n",
|
||||
" \"num_layers={}\".format(dropout, num_layers))\n",
|
||||
"../../scenarios/sentence_similarity/gensen_train.py:428: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\n",
|
||||
"../../scenarios/sentence_similarity/gensen_train.py:431: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\n",
|
||||
" torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)\n",
|
||||
"../../utils_nlp/models/gensen/utils.py:364: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
|
||||
" Variable(torch.LongTensor(sorted_src_lens), volatile=True)\n",
|
||||
|
@ -629,13 +602,13 @@
|
|||
" warnings.warn(\"nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\")\n",
|
||||
"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/functional.py:1320: UserWarning: nn.functional.tanh is deprecated. Use torch.tanh instead.\n",
|
||||
" warnings.warn(\"nn.functional.tanh is deprecated. Use torch.tanh instead.\")\n",
|
||||
"../../scenarios/sentence_similarity/gensen_train.py:520: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\n",
|
||||
"../../scenarios/sentence_similarity/gensen_train.py:523: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\n",
|
||||
" torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)\n",
|
||||
"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/horovod/torch/__init__.py:163: UserWarning: optimizer.step(synchronize=True) called after optimizer.synchronize(). This can cause training slowdown. You may want to consider using optimizer.step(synchronize=False) if you use optimizer.synchronize() in your code.\n",
|
||||
" warnings.warn(\"optimizer.step(synchronize=True) called after \"\n",
|
||||
"../../scenarios/sentence_similarity/gensen_train.py:241: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
|
||||
"../../scenarios/sentence_similarity/gensen_train.py:243: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
|
||||
" f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n",
|
||||
"../../scenarios/sentence_similarity/gensen_train.py:260: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
|
||||
"../../scenarios/sentence_similarity/gensen_train.py:262: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
|
||||
" f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n"
|
||||
]
|
||||
},
|
||||
|
@ -643,8 +616,8 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 29min 21s, sys: 8min 11s, total: 37min 32s\n",
|
||||
"Wall time: 37min 29s\n"
|
||||
"CPU times: user 1h 19min 28s, sys: 22min 1s, total: 1h 41min 30s\n",
|
||||
"Wall time: 1h 41min 22s\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -657,13 +630,19 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.3 Predict"
|
||||
"### 2.3 Predict\n",
|
||||
"\n",
|
||||
"In the predict method we perform Pearson's Correlation computation [\\[2\\]](#References) on the outputs of the model. The predictions of the model can be further improved by hyperparameter tuning which we walk through in the other example [here](gensen_aml_deep_dive.ipynb). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"execution_count": 16,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
|
@ -671,20 +650,23 @@
|
|||
"text": [
|
||||
"******** Similarity Score for sentences **************\n",
|
||||
" 0 1\n",
|
||||
"0 1.000000 0.936469\n",
|
||||
"1 0.936469 1.000000\n"
|
||||
"0 1.000000 0.966793\n",
|
||||
"1 0.966793 1.000000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sentences = [\n",
|
||||
" 'the quick brown fox jumped over the lazy dog',\n",
|
||||
" 'bright sunshiny day tomorrow.'\n",
|
||||
" 'The sky is blue and beautiful',\n",
|
||||
" 'Love this blue and beautiful sky!'\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"results = clf.predict(sentences)\n",
|
||||
"print(\"******** Similarity Score for sentences **************\")\n",
|
||||
"print(results)"
|
||||
"print(results)\n",
|
||||
"\n",
|
||||
"# Record results with scrapbook for tests\n",
|
||||
"sb.glue(\"results\", results.to_dict())"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -694,11 +676,15 @@
|
|||
"## References\n",
|
||||
"\n",
|
||||
"1. Subramanian, Sandeep and Trischler, Adam and Bengio, Yoshua and Pal, Christopher J, [*Learning general purpose distributed sentence representations via large scale multi-task learning*](https://arxiv.org/abs/1804.00079), ICLR, 2018.\n",
|
||||
"3. Semantic textual similarity. url: http://nlpprogress.com/english/semantic_textual_similarity.html"
|
||||
"2. Pearson's Correlation Coefficient. url: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient\n",
|
||||
"3. Semantic textual similarity. url: http://nlpprogress.com/english/semantic_textual_similarity.html\n",
|
||||
"4. Minh-Thang Luong, Quoc V Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. [*Multi-task sequence to sequence learning*](https://arxiv.org/abs/1511.06114), 2015.\n",
|
||||
"5. Bryan McCann, James Bradbury, Caiming Xiong, and Richard Socher. [*Learned in translation: Contextualized word vectors](https://arxiv.org/abs/1708.00107), 2017. "
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"celltoolbar": "Tags",
|
||||
"kernelspec": {
|
||||
"display_name": "Python (nlp_gpu)",
|
||||
"language": "python",
|
||||
|
@ -715,6 +701,15 @@
|
|||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.8"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"source": []
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -134,10 +134,12 @@ def evaluate(
|
|||
save_dir,
|
||||
starting_time,
|
||||
model_state,
|
||||
max_epoch,
|
||||
):
|
||||
""" Function to validate the model.
|
||||
|
||||
Args:
|
||||
max_epoch(int): Limit training to specified number of epochs.
|
||||
model_state(dict): Saved model weights.
|
||||
config(dict): Config object.
|
||||
train_iterator(BufferedDataIterator): BufferedDataIterator object.
|
||||
|
@ -197,7 +199,7 @@ def evaluate(
|
|||
)
|
||||
if (monitor_epoch - min_val_loss_epoch) > config["training"][
|
||||
"stop_patience"
|
||||
]:
|
||||
] or (max_epoch is not None and monitor_epoch >= max_epoch):
|
||||
logging.info("Saving model ...")
|
||||
# Save the name with validation loss.
|
||||
torch.save(
|
||||
|
@ -269,10 +271,11 @@ def evaluate_nli(nli_iterator, model, batch_size, n_gpus):
|
|||
logging.info("******************************************************")
|
||||
|
||||
|
||||
def train(config, data_folder, learning_rate=0.0001):
|
||||
def train(config, data_folder, learning_rate=0.0001, max_epoch=None):
|
||||
""" Train the Gensen model.
|
||||
|
||||
Args:
|
||||
max_epoch(int): Limit training to specified number of epochs.
|
||||
config(dict): Loaded json file as a python object.
|
||||
data_folder(str): Path to the folder containing the data.
|
||||
learning_rate(float): Learning rate for the model.
|
||||
|
@ -562,7 +565,7 @@ def train(config, data_folder, learning_rate=0.0001):
|
|||
)
|
||||
|
||||
logging.info(
|
||||
"Average time per mininbatch : %.5f"
|
||||
"Average time per minibatch : %.5f"
|
||||
% (np.mean(mbatch_times))
|
||||
)
|
||||
mlflow.log_metric(
|
||||
|
@ -588,8 +591,11 @@ def train(config, data_folder, learning_rate=0.0001):
|
|||
save_dir=save_dir,
|
||||
starting_time=start,
|
||||
model_state=model_state,
|
||||
max_epoch=max_epoch,
|
||||
)
|
||||
if training_complete:
|
||||
mlflow.log_metric("min_val_loss", float(min_val_loss))
|
||||
mlflow.log_metric("learning_rate", learning_rate)
|
||||
break
|
||||
|
||||
logging.info("Evaluating on NLI")
|
||||
|
@ -621,11 +627,18 @@ if __name__ == "__main__":
|
|||
parser.add_argument(
|
||||
"--learning_rate", type=float, default=0.0001, help="learning rate"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_epoch",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Limit training to specified number of epochs.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
data_path = args.data_folder
|
||||
lr = args.learning_rate
|
||||
|
||||
config_file_path = args.config
|
||||
max_epoch = args.max_epoch
|
||||
config_obj = read_config(config_file_path)
|
||||
train(config_obj, data_path, lr)
|
||||
train(config_obj, data_path, lr, max_epoch)
|
||||
|
|
|
@ -3,11 +3,11 @@
|
|||
import json
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from scenarios.sentence_similarity.gensen_train import train
|
||||
from utils_nlp.models.gensen.create_gensen_model import create_multiseq2seq_model
|
||||
from utils_nlp.eval.classification import compute_correlation_coefficients
|
||||
from utils_nlp.models.gensen.create_gensen_model import (
|
||||
create_multiseq2seq_model,
|
||||
)
|
||||
from utils_nlp.models.gensen.gensen import GenSenSingle
|
||||
from utils_nlp.models.gensen.preprocess_utils import gensen_preprocess
|
||||
|
||||
|
@ -30,12 +30,14 @@ class GenSenClassifier:
|
|||
pretrained_embedding_path,
|
||||
learning_rate=0.0001,
|
||||
cache_dir=".",
|
||||
max_epoch=None,
|
||||
):
|
||||
self.learning_rate = learning_rate
|
||||
self.config_file = config_file
|
||||
self.cache_dir = cache_dir
|
||||
self.pretrained_embedding_path = pretrained_embedding_path
|
||||
self.model_name = "gensen_multiseq2seq"
|
||||
self.max_epoch = max_epoch
|
||||
|
||||
self._validate_params()
|
||||
|
||||
|
@ -118,6 +120,7 @@ class GenSenClassifier:
|
|||
data_folder=os.path.abspath(self.cache_dir),
|
||||
config=self.config,
|
||||
learning_rate=self.learning_rate,
|
||||
max_epoch=self.max_epoch,
|
||||
)
|
||||
|
||||
self._create_multiseq2seq_model()
|
||||
|
@ -132,13 +135,13 @@ class GenSenClassifier:
|
|||
sentences(list) : List of sentences.
|
||||
|
||||
Returns
|
||||
array: A pairwise cosine similarity for the sentences provided based on their gensen
|
||||
vector representations.
|
||||
pd.Dataframe: A pairwise cosine similarity for the sentences provided based on their
|
||||
gensen vector representations.
|
||||
|
||||
"""
|
||||
|
||||
# self.cache_dir = os.path.join(self.cache_dir, "clean/snli_1.0")
|
||||
self._create_multiseq2seq_model()
|
||||
# self._create_multiseq2seq_model()
|
||||
|
||||
gensen_model = GenSenSingle(
|
||||
model_folder=os.path.join(
|
||||
|
@ -149,7 +152,7 @@ class GenSenClassifier:
|
|||
)
|
||||
|
||||
reps_h, reps_h_t = gensen_model.get_representation(
|
||||
sentences, pool="last", return_numpy=True
|
||||
sentences, pool="last", return_numpy=True, tokenize=True
|
||||
)
|
||||
|
||||
return pd.DataFrame(np.corrcoef(reps_h_t))
|
||||
return compute_correlation_coefficients(reps_h_t)
|
||||
|
|
|
@ -1,3 +1,13 @@
|
|||
# Text Classification
|
||||
|
||||
Text classification is a supervised learning method of learning and predicting the category or the class of a document given its text content. The state-of-the-art methods are based on neural networks of different architectures as well as pretrained language models or word embeddings. Text classification is a core task in natural language Processing and has numerous applications such as sentiment analysis, document indexing in digital libraries, hate speech detection, and general-purpose categorization in medical, academic, legal, and many other domains.
|
||||
|
||||
|
||||
## Summary
|
||||
|
||||
The following summarizes each notebook for Text Classification. Each notebook provides more details and guiding in principles on building state of the art models.
|
||||
|
||||
|Notebook|Runs Local|Description|
|
||||
|---|---|---|
|
||||
|[BERT for TC with MNLI](tc_mnli_bert.ipynb)| Yes| A notebook which walks through fine-tuning and evaluating a pretrained BERT model on a subset of the MultiNLI dataset|
|
||||
|[BERT for TC on AzureML](tc_bert_azureml.ipynb) | No |A notebook which walks through fine-tuning and evaluating pretrained BERT model on a distributed setup with AzureML. |
|
||||
|
|
|
@ -0,0 +1,903 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"*Copyright (c) Microsoft Corporation. All rights reserved.*\n",
|
||||
"\n",
|
||||
"*Licensed under the MIT License.*\n",
|
||||
"\n",
|
||||
"# Text Classification of MultiNLI Sentences using BERT with Azure ML Pipelines"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 0. Introduction\n",
|
||||
"\n",
|
||||
"In this notebook, we fine-tune and evaluate a pretrained [BERT](https://arxiv.org/abs/1810.04805) model on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset using [AzureML](https://azure.microsoft.com/en-us/services/machine-learning-service/) Pipelines.\n",
|
||||
"\n",
|
||||
"We use a [distributed sequence classifier](../../utils_nlp/bert/sequence_classification_distributed.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of Google's [BERT](https://github.com/google-research/bert).\n",
|
||||
"\n",
|
||||
"The notebooks acts as a template to,\n",
|
||||
"1. Process a massive dataset in parallel by dividing the dataset into chunks using [DASK](https://dask.org/) .\n",
|
||||
"2. Perform distributed training on AzureML compute on these processed chunks.\n",
|
||||
"\n",
|
||||
"We create an [AzureML Pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines) for the two steps mentioned above. With this pipeline, the notebook can be scheduled regularly to fine tune BERT with new data and get a model which can be further deployed on [Azure Container Instance](https://docs.microsoft.com/en-us/azure/container-service/).\n",
|
||||
"\n",
|
||||
"AzureML Pipeline define reusable machine learning workflows that can be used as a template for your machine learning scenarios. Pipelines allow you to optimize your workflow and spend time on machine learning rather than infrastructure. If you are new to the concept of pipelines, [this would be a good place to get started](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n",
|
||||
"Azure ML SDK Version: 1.0.48\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.append(\"../../\")\n",
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"import random\n",
|
||||
"import shutil\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"from utils_nlp.azureml import azureml_utils\n",
|
||||
"from utils_nlp.dataset.multinli import get_generator\n",
|
||||
"\n",
|
||||
"from sklearn.preprocessing import LabelEncoder\n",
|
||||
"import azureml.core\n",
|
||||
"from azureml.core import Datastore, Experiment, get_run\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\n",
|
||||
"from azureml.core.runconfig import RunConfiguration\n",
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.exceptions import ComputeTargetException\n",
|
||||
"from azureml.data.data_reference import DataReference\n",
|
||||
"from azureml.pipeline.steps import PythonScriptStep\n",
|
||||
"from azureml.pipeline.core import Pipeline, PipelineData\n",
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"from azureml.train.dnn import PyTorch\n",
|
||||
"from azureml.core.runconfig import MpiConfiguration\n",
|
||||
"from azureml.pipeline.steps import EstimatorStep\n",
|
||||
"\n",
|
||||
"print(\"System version: {}\".format(sys.version))\n",
|
||||
"print(\"Azure ML SDK Version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's define a few variables before we get started, these variables define the folder where the data would reside, the batch size and the number of epochs we are training for. \n",
|
||||
"We also define the variables for AzureML workspace, which you can use to create a new workspace. You can ignore these variables if you have `config.json` in `.azureml` directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"LABEL_COL = \"genre\"\n",
|
||||
"DATA_FOLDER = \"../../data/temp\"\n",
|
||||
"TRAIN_FOLDER = \"../../data/temp/train\"\n",
|
||||
"TEST_FOLDER = \"../../data/temp/test\"\n",
|
||||
"ENCODED_LABEL_COL = \"label\"\n",
|
||||
"NUM_PARTITIONS = None\n",
|
||||
"LABELS = ['telephone', 'government', 'travel', 'slate', 'fiction']\n",
|
||||
"PROJECT_FOLDER = \"../../\"\n",
|
||||
"NODE_COUNT = 4\n",
|
||||
"\n",
|
||||
"config_path = (\n",
|
||||
" \"./.azureml\"\n",
|
||||
") # Path to the directory containing config.json with azureml credentials\n",
|
||||
"\n",
|
||||
"# Azure resources\n",
|
||||
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
|
||||
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
|
||||
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
|
||||
"workspace_region = \"YOUR_WORKSPACE_REGION\" #Possible values eastus, eastus2 and so on.\n",
|
||||
"cluster_name = \"pipelines-tc-12\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this example we will use AzureML pipelines to execute training pipelines. Each preprocessing step is included as a step in the pipeline. For a more detailed walkthrough of what pipelines are with a getting started guidelines check this [notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb). We start by doing some AzureML related setup below."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 0.1 Create a workspace\n",
|
||||
"\n",
|
||||
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.\n",
|
||||
"\n",
|
||||
"**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ws = azureml_utils.get_or_create_workspace(\n",
|
||||
" config_path=config_path,\n",
|
||||
" subscription_id=subscription_id,\n",
|
||||
" resource_group=resource_group,\n",
|
||||
" workspace_name=workspace_name,\n",
|
||||
" workspace_region=workspace_region,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 0.2 Create a compute target\n",
|
||||
"We create and attach a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training the model. Here we use the AzureML-managed compute target ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) as our remote training compute resource. Our cluster autoscales from 0 to 8 `STANDARD_NC12` GPU nodes.\n",
|
||||
"\n",
|
||||
"Creating and configuring the AmlCompute cluster takes approximately 5 minutes the first time around. Once a cluster with the given configuration is created, it does not need to be created again.\n",
|
||||
"\n",
|
||||
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Read more about the default limits and how to request more quota [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found existing compute target.\n",
|
||||
"{'currentNodeCount': 2, 'targetNodeCount': 2, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 2, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-31T22:29:42.732000+00:00', 'errors': None, 'creationTime': '2019-07-25T04:16:20.598768+00:00', 'modifiedTime': '2019-07-25T04:16:36.486727+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 2, 'maxNodeCount': 10, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
|
||||
" print(\"Found existing compute target.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_NC12\", max_nodes=8\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
" compute_target.wait_for_completion(show_output=True)\n",
|
||||
"\n",
|
||||
"# use get_status() to get a detailed status for the current AmlCompute.\n",
|
||||
"print(compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1. Preprocessing\n",
|
||||
"\n",
|
||||
"The pipeline is defined by a series of steps, the first being a PythonScriptStep which utilizes [DASK](https://dask.org/) to load dataframes in partitions allowing us to load and preprocess different sets of data in parallel."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 1.1 Read Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_batches = get_generator(DATA_FOLDER, \"train\", num_batches=NUM_PARTITIONS, batch_size=10e6)\n",
|
||||
"test_batches = get_generator(DATA_FOLDER, \"dev_matched\", num_batches=NUM_PARTITIONS, batch_size=10e6)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 1.2 Preprocess and Tokenize\n",
|
||||
"\n",
|
||||
"In the classification task, we use the first sentence only as the text input, and the corresponding genre as the label. Select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are.\n",
|
||||
"\n",
|
||||
"Once filtered, we encode the labels. To do this, fit a label encoder with the known labels in a MNLI dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"os.makedirs(TRAIN_FOLDER, exist_ok=True)\n",
|
||||
"os.makedirs(TEST_FOLDER, exist_ok=True)\n",
|
||||
"\n",
|
||||
"labels = LABELS\n",
|
||||
"label_encoder = LabelEncoder()\n",
|
||||
"label_encoder.fit(labels)\n",
|
||||
"\n",
|
||||
"num_train_partitions = 0\n",
|
||||
"for batch in train_batches:\n",
|
||||
" batch = batch[batch[\"gold_label\"]==\"neutral\"]\n",
|
||||
" batch[ENCODED_LABEL_COL] = label_encoder.transform(batch[LABEL_COL])\n",
|
||||
" batch.to_csv(TRAIN_FOLDER+\"/batch{}.csv\".format(str(num_train_partitions)))\n",
|
||||
" num_train_partitions += 1\n",
|
||||
" \n",
|
||||
"num_test_partitions = 0\n",
|
||||
"for batch in test_batches:\n",
|
||||
" batch = batch[batch[\"gold_label\"]==\"neutral\"]\n",
|
||||
" batch[ENCODED_LABEL_COL] = label_encoder.transform(batch[LABEL_COL])\n",
|
||||
" batch.to_csv(TEST_FOLDER+\"/batch{}.csv\".format(str(num_test_partitions)))\n",
|
||||
" num_test_partitions += 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Once we have the partitions of data ready they are uploaded to the datastore."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"ds.upload(src_dir=TRAIN_FOLDER, target_path=\"mnli_data/train\", overwrite=True, show_progress=False)\n",
|
||||
"ds.upload(src_dir=TEST_FOLDER, target_path=\"mnli_data/test\", overwrite=True, show_progress=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"shutil.rmtree(TRAIN_FOLDER)\n",
|
||||
"shutil.rmtree(TEST_FOLDER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can now parallely operate on each batch to tokenize the data and preprocess the tokens. To do this, we create a PythonScript step below."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Writing preprocess.py\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%writefile preprocess.py\n",
|
||||
"# Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"# Licensed under the MIT License.\n",
|
||||
"import argparse\n",
|
||||
"import logging\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"from utils_nlp.models.bert.common import Language, Tokenizer\n",
|
||||
"\n",
|
||||
"LABEL_COL = \"genre\"\n",
|
||||
"TEXT_COL = \"sentence1\"\n",
|
||||
"LANGUAGE = Language.ENGLISH\n",
|
||||
"TO_LOWER = True\n",
|
||||
"MAX_LEN = 150\n",
|
||||
"\n",
|
||||
"logger = logging.getLogger(__name__)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def tokenize(df):\n",
|
||||
" \"\"\"Tokenize the text documents and convert them to lists of tokens using the BERT tokenizer.\n",
|
||||
" Args:\n",
|
||||
" df(pd.Dataframe): Dataframe with training or test samples\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
"\n",
|
||||
" list: List of lists of tokens for train set.\n",
|
||||
"\n",
|
||||
" \"\"\"\n",
|
||||
" tokenizer = Tokenizer(\n",
|
||||
" LANGUAGE, to_lower=TO_LOWER)\n",
|
||||
" tokens = tokenizer.tokenize(list(df[TEXT_COL]))\n",
|
||||
"\n",
|
||||
" return tokens\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def preprocess(tokens):\n",
|
||||
" \"\"\" Preprocess method that does the following,\n",
|
||||
" Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\n",
|
||||
" Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n",
|
||||
" Pad or truncate the token lists to the specified max length\n",
|
||||
" Return mask lists that indicate paddings' positions\n",
|
||||
" Return token type id lists that indicate which sentence the tokens belong to (not needed\n",
|
||||
" for one-sequence classification)\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" tokens(pd.Dataframe): Dataframe with tokens for train set.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" list: List of lists of tokens for train or test set with special tokens added.\n",
|
||||
" list: Input mask.\n",
|
||||
" \"\"\"\n",
|
||||
" tokenizer = Tokenizer(\n",
|
||||
" LANGUAGE, to_lower=TO_LOWER)\n",
|
||||
" tokens, mask, _ = tokenizer.preprocess_classification_tokens(\n",
|
||||
" tokens, MAX_LEN\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return tokens, mask\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"parser = argparse.ArgumentParser()\n",
|
||||
"parser.add_argument(\"--input_data\", type=str, help=\"input data\")\n",
|
||||
"parser.add_argument(\"--output_data\", type=str, help=\"Path to the output file.\")\n",
|
||||
"\n",
|
||||
"args = parser.parse_args()\n",
|
||||
"input_data = args.input_data\n",
|
||||
"output_data = args.output_data\n",
|
||||
"output_dir = os.path.dirname(os.path.abspath(output_data))\n",
|
||||
"\n",
|
||||
"if output_dir is not None:\n",
|
||||
" os.makedirs(output_dir, exist_ok=True)\n",
|
||||
" logger.info(\"%s created\" % output_dir)\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(args.input_data)\n",
|
||||
"tokens_array = tokenize(df)\n",
|
||||
"tokens_array, mask_array = preprocess(tokens_array)\n",
|
||||
"\n",
|
||||
"df['tokens'] = tokens_array\n",
|
||||
"df['mask'] = mask_array\n",
|
||||
"\n",
|
||||
"# Filter columns\n",
|
||||
"cols = ['tokens', 'mask', 'label']\n",
|
||||
"df = df[cols]\n",
|
||||
"df.to_csv(output_data, header=False, index=False)\n",
|
||||
"logger.info(\"Completed\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'../../utils_nlp/models/bert/preprocess.py'"
|
||||
]
|
||||
},
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"preprocess_file = os.path.join(PROJECT_FOLDER,'utils_nlp/models/bert/preprocess.py')\n",
|
||||
"shutil.move('preprocess.py',preprocess_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Create a conda environment for the steps below."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"conda_dependencies = CondaDependencies.create(\n",
|
||||
" conda_packages=[\n",
|
||||
" \"numpy\",\n",
|
||||
" \"scikit-learn\",\n",
|
||||
" \"pandas\",\n",
|
||||
" ],\n",
|
||||
" pip_packages=[\"azureml-sdk==1.0.43.*\", \n",
|
||||
" \"torch==1.1\", \n",
|
||||
" \"tqdm==4.31.1\",\n",
|
||||
" \"pytorch-pretrained-bert>=0.6\"],\n",
|
||||
" python_version=\"3.6.8\",\n",
|
||||
")\n",
|
||||
"run_config = RunConfiguration(conda_dependencies=conda_dependencies)\n",
|
||||
"run_config.environment.docker.enabled = True"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Then create the list of steps that use the preprocess.py created above. We use the output of these steps as input to training in the next section."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"processed_train_files = []\n",
|
||||
"processed_test_files = []\n",
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"\n",
|
||||
"for i in range(num_train_partitions):\n",
|
||||
" input_data = DataReference(datastore=ds, \n",
|
||||
" data_reference_name='train_batch_{}'.format(str(i)), \n",
|
||||
" path_on_datastore='mnli_data/train/batch{}.csv'.format(str(i)),\n",
|
||||
" overwrite=False)\n",
|
||||
"\n",
|
||||
" output_data = PipelineData(name=\"train{}\".format(str(i)), datastore=ds,\n",
|
||||
" output_path_on_compute='mnli_data/processed_train/batch{}.csv'.format(str(i)))\n",
|
||||
"\n",
|
||||
" step = PythonScriptStep(\n",
|
||||
" name='preprocess_step_train_{}'.format(str(i)),\n",
|
||||
" arguments=[\"--input_data\", input_data, \"--output_data\", output_data],\n",
|
||||
" script_name= 'utils_nlp/models/bert/preprocess.py',\n",
|
||||
" inputs=[input_data],\n",
|
||||
" outputs=[output_data],\n",
|
||||
" source_directory=PROJECT_FOLDER,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" runconfig=run_config,\n",
|
||||
" allow_reuse=False,\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" processed_train_files.append(output_data) \n",
|
||||
" \n",
|
||||
"for i in range(num_test_partitions):\n",
|
||||
" input_data = DataReference(datastore=ds, \n",
|
||||
" data_reference_name='test_batch_{}'.format(str(i)), \n",
|
||||
" path_on_datastore='mnli_data/test/batch{}.csv'.format(str(i)),\n",
|
||||
" overwrite=False)\n",
|
||||
" \n",
|
||||
" output_data = PipelineData(name=\"test{}\".format(str(i)), datastore=ds,\n",
|
||||
" output_path_on_compute='mnli_data/processed_test/batch{}.csv'.format(str(i)))\n",
|
||||
" \n",
|
||||
" step = PythonScriptStep(\n",
|
||||
" name='preprocess_step_test_{}'.format(str(i)),\n",
|
||||
" arguments=[\"--input_data\", input_data, \"--output_data\", output_data],\n",
|
||||
" script_name= 'utils_nlp/models/bert/preprocess.py',\n",
|
||||
" inputs=[input_data],\n",
|
||||
" outputs=[output_data],\n",
|
||||
" source_directory=PROJECT_FOLDER,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" runconfig=run_config,\n",
|
||||
" allow_reuse=False,\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" processed_test_files.append(output_data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 2. Train and Score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Once the data is processed and available on datastore, we train the classifier using the training examples. This involves fine-tuning the BERT Transformer and learning a linear classification layer on top of that. After training is complete we score the performance of the model on the test dataset\n",
|
||||
"\n",
|
||||
"The training is distributed and is done AzureML's capability to support distributed using MPI with horovod. \n",
|
||||
"\n",
|
||||
"**Please note** that training requires a GPU enabled cluster in AzureML Compute. We suggest using NC12. If you would like to change the GPU configuration, please changes `NUM_GPUS` variable accordingly.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.1 Setup training script"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Writing train.py\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%writefile train.py\n",
|
||||
"# Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"# Licensed under the MIT License.\n",
|
||||
"\n",
|
||||
"import argparse\n",
|
||||
"import json\n",
|
||||
"import logging\n",
|
||||
"import os\n",
|
||||
"import torch\n",
|
||||
"from sklearn.metrics import classification_report\n",
|
||||
"\n",
|
||||
"from utils_nlp.models.bert.common import Language\n",
|
||||
"from utils_nlp.models.bert.sequence_classification_distributed import (\n",
|
||||
" BERTSequenceDistClassifier,\n",
|
||||
")\n",
|
||||
"from utils_nlp.common.timer import Timer\n",
|
||||
"\n",
|
||||
"BATCH_SIZE = 32\n",
|
||||
"NUM_GPUS = 2\n",
|
||||
"NUM_EPOCHS = 1\n",
|
||||
"LABELS = [\"telephone\", \"government\", \"travel\", \"slate\", \"fiction\"]\n",
|
||||
"OUTPUT_DIR = \"./outputs/\"\n",
|
||||
"\n",
|
||||
"logger = logging.getLogger(__name__)\n",
|
||||
"\n",
|
||||
"parser = argparse.ArgumentParser()\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--train_files\",\n",
|
||||
" nargs=\"+\",\n",
|
||||
" default=[],\n",
|
||||
" help=\"List of file paths to all the files in train dataset.\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"parser.add_argument(\n",
|
||||
" \"--test_files\",\n",
|
||||
" nargs=\"+\",\n",
|
||||
" default=[],\n",
|
||||
" help=\"List of file paths to all the files in test dataset.\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"args = parser.parse_args()\n",
|
||||
"train_files = [file.strip() for file in args.train_files]\n",
|
||||
"test_files = [file.strip() for file in args.test_files]\n",
|
||||
"\n",
|
||||
"# Handle square brackets from train list\n",
|
||||
"train_files[0] = train_files[0][1:]\n",
|
||||
"train_files[len(train_files) - 1] = train_files[len(train_files) - 1][:-1]\n",
|
||||
"\n",
|
||||
"# Handle square brackets from test list\n",
|
||||
"test_files[0] = test_files[0][1:]\n",
|
||||
"test_files[len(test_files) - 1] = test_files[len(test_files) - 1][:-1]\n",
|
||||
"\n",
|
||||
"# Train\n",
|
||||
"classifier = BERTSequenceDistClassifier(\n",
|
||||
" language=Language.ENGLISH, num_labels=len(LABELS)\n",
|
||||
")\n",
|
||||
"with Timer() as t:\n",
|
||||
" classifier.fit(\n",
|
||||
" train_files,\n",
|
||||
" num_gpus=NUM_GPUS,\n",
|
||||
" num_epochs=NUM_EPOCHS,\n",
|
||||
" batch_size=BATCH_SIZE,\n",
|
||||
" verbose=True,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"# Predict\n",
|
||||
"preds, labels_test = classifier.predict(\n",
|
||||
" test_files, num_gpus=NUM_GPUS, batch_size=BATCH_SIZE\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"results = classification_report(\n",
|
||||
" labels_test, preds, target_names=LABELS, output_dict=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Write out results.\n",
|
||||
"result_file = os.path.join(OUTPUT_DIR, \"results.json\")\n",
|
||||
"with open(result_file, \"w+\") as fp:\n",
|
||||
" json.dump(results, fp)\n",
|
||||
"\n",
|
||||
"# Save model\n",
|
||||
"model_file = os.path.join(OUTPUT_DIR, \"model.pt\")\n",
|
||||
"torch.save(classifier.model.state_dict(), model_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'../../utils_nlp/models/bert/train.py'"
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_file = os.path.join(PROJECT_FOLDER,'utils_nlp/models/bert/train.py')\n",
|
||||
"shutil.move('train.py',train_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.2 Create a Pytorch Estimator\n",
|
||||
"\n",
|
||||
"We create a Pytorch Estimator using AzureML SDK and additonally define an EstimatorStep to run it on AzureML pipelines.\n",
|
||||
"\n",
|
||||
"The Azure ML SDK's PyTorch Estimator allows us to submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch).\n",
|
||||
"\n",
|
||||
"This Estimator specifies that the training script will run on 4 nodes, with 2 worker per node. In order to execute a distributed run using GPU, we must define `use_gpu` and `distributed_backend` to use MPI/Horovod. PyTorch, Horovod, and other necessary dependencies are installed automatically."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WARNING - framework_version is not specified, defaulting to version 1.1.\n",
|
||||
"WARNING - 'process_count_per_node' parameter will be deprecated. Please use it as part of 'distributed_training' parameter.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"estimator = PyTorch(source_directory=PROJECT_FOLDER,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" entry_script='utils_nlp/models/bert/train.py',\n",
|
||||
" node_count= NODE_COUNT,\n",
|
||||
" distributed_training=MpiConfiguration(),\n",
|
||||
" process_count_per_node=2,\n",
|
||||
" use_gpu=True,\n",
|
||||
" conda_packages=['scikit-learn=0.20.3', 'numpy>=1.16.0', 'pandas'],\n",
|
||||
" pip_packages=[\"tqdm==4.31.1\",\"pytorch-pretrained-bert>=0.6\"]\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"inputs = processed_train_files + processed_test_files\n",
|
||||
"\n",
|
||||
"est_step = EstimatorStep(name=\"Estimator-Train\", \n",
|
||||
" estimator=estimator, \n",
|
||||
" estimator_entry_script_arguments=[\n",
|
||||
" '--train_files', str(processed_train_files),\n",
|
||||
" '--test_files', str(processed_test_files)],\n",
|
||||
" inputs = inputs,\n",
|
||||
" runconfig_pipeline_params=None, \n",
|
||||
" compute_target=compute_target)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.3 Submit the pipeline\n",
|
||||
"\n",
|
||||
"The model is fine tuned on AML Compute and takes **45 minutes** to train. The total time to run the pipeline will be around **1h 30 minutes** if you use the default value `max_epoch=1`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline = Pipeline(workspace=ws, steps=[est_step])\n",
|
||||
"experiment = Experiment(ws, 'NLP-TC-BERT-distributed')\n",
|
||||
"pipeline_run = experiment.submit(pipeline)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "48df85f533834264a8a8b65a57d60d59",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"RunDetails(pipeline_run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#If you would like to cancel the job for any reasons uncomment the code below.\n",
|
||||
"#pipeline_run.cancel()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#wait for the run to complete before continuing in the notebook\n",
|
||||
"pipeline_run.wait_for_completion()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.4 Download and analyze results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading file outputs/results.json to ./outputs\\results.json...\n",
|
||||
"Downloading file outputs/model.pt to ./outputs\\model.pt...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"step_run = pipeline_run.find_step_run(\"Estimator-Train\")[0]\n",
|
||||
"file_names = ['outputs/results.json', 'outputs/model.pt']\n",
|
||||
"azureml_utils.get_output_files(step_run, './outputs', file_names=file_names)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" f1-score precision recall support\n",
|
||||
"telephone 0.920217 0.897281 0.944356 629.0\n",
|
||||
"government 0.967905 0.979487 0.956594 599.0\n",
|
||||
"travel 0.856683 0.900169 0.817204 651.0\n",
|
||||
"slate 0.991093 0.991896 0.990291 618.0\n",
|
||||
"fiction 0.936434 0.906907 0.967949 624.0\n",
|
||||
"micro avg 0.933996 0.933996 0.933996 3121.0\n",
|
||||
"macro avg 0.934466 0.935148 0.935279 3121.0\n",
|
||||
"weighted avg 0.933394 0.934321 0.933996 3121.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with open('outputs/results.json', 'r') as handle:\n",
|
||||
" parsed = json.load(handle)\n",
|
||||
" print(pd.DataFrame.from_dict(parsed).transpose())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"From the above chart we can notice the performance of the model trained on a distributed setup in AzureML Compute. From our comparison to fine tuning the same model on MNLI dataset on a `STANDARD_NC12` machine [here](tc_mnli_bert.ipynb) we notice a gain of 20% in the model training time with no drop in performance for AzureML Compute. We present the comparison of weight avg of the metrics along with the training time below,\n",
|
||||
"\n",
|
||||
"| Training Setup | F1-Score | Precision | Recall | Training Time |\n",
|
||||
"| --- | --- | --- | --- | --- |\n",
|
||||
"|Standard NC12 | 0.93 |0.93 |0.93 | 58 min |\n",
|
||||
"|AzureML Compute*|0.934| 0.934 | 0.934| 46 min |\n",
|
||||
"\n",
|
||||
"* AzureML Compute - The setup used 4 nodes with `STANDARD_NC12` machines.\n",
|
||||
"\n",
|
||||
"We also observe common tradeoffs associated with distributed training. We make use of [Horovod](https://github.com/horovod/horovod), a distributed training tool for many popular deep learning frameworks that enables parallelization of work across the nodes in the cluster. Distributed training decreases the time it takes for the model to converge in theory, but the model may also take more time in communicating with each node. Note that the communication time will eventually become negligible when training on larger and larger datasets, but being aware of this tradeoff is helpful for choosing the node configuration when training on smaller datasets. We expect the gains of using AzureML to increase with increased dataset size."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally clean up any intermediate files we created."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"os.remove(train_file)\n",
|
||||
"os.remove(preprocess_file)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"celltoolbar": "Tags",
|
||||
"kernelspec": {
|
||||
"display_name": "Python nlp_cpu",
|
||||
"language": "python",
|
||||
"name": "ame"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -13,25 +13,28 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.append(\"../../\")\n",
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import scrapbook as sb\n",
|
||||
"from sklearn.metrics import classification_report\n",
|
||||
"from sklearn.preprocessing import LabelEncoder\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"\n",
|
||||
"from utils_nlp.dataset.multinli import load_pandas_df\n",
|
||||
"from utils_nlp.eval.classification import eval_classification\n",
|
||||
"from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier\n",
|
||||
"from utils_nlp.models.bert.common import Language, Tokenizer\n",
|
||||
"from utils_nlp.common.timer import Timer\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import numpy as np"
|
||||
"from utils_nlp.common.timer import Timer"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -46,8 +49,12 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"execution_count": 17,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DATA_FOLDER = \"../../../temp\"\n",
|
||||
|
@ -56,7 +63,8 @@
|
|||
"TO_LOWER = True\n",
|
||||
"MAX_LEN = 150\n",
|
||||
"BATCH_SIZE = 32\n",
|
||||
"NUM_GPUS = 2\n",
|
||||
"BATCH_SIZE_PRED = 512\n",
|
||||
"NUM_GPUS = 1\n",
|
||||
"NUM_EPOCHS = 1\n",
|
||||
"TRAIN_SIZE = 0.6\n",
|
||||
"LABEL_COL = \"genre\"\n",
|
||||
|
@ -256,15 +264,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 78540/78540 [00:26<00:00, 2991.68it/s]\n",
|
||||
"100%|██████████| 52360/52360 [00:17<00:00, 2981.71it/s]\n"
|
||||
"100%|██████████| 78540/78540 [00:26<00:00, 2968.10it/s]\n",
|
||||
"100%|██████████| 52360/52360 [00:17<00:00, 2960.85it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -291,7 +299,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -313,7 +321,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -332,7 +340,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 11,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
|
@ -341,24 +349,162 @@
|
|||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"t_total value of -1 results in schedule not being applied\n"
|
||||
"t_total value of -1 results in schedule not being applied\n",
|
||||
"Iteration: 0%| | 1/2455 [00:00<35:04, 1.17it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/1; batch:1->246/2454; loss:1.584357\n",
|
||||
"epoch:1/1; batch:247->492/2454; loss:0.110689\n",
|
||||
"epoch:1/1; batch:493->738/2454; loss:0.208907\n",
|
||||
"epoch:1/1; batch:739->984/2454; loss:0.423804\n",
|
||||
"epoch:1/1; batch:985->1230/2454; loss:0.035525\n",
|
||||
"epoch:1/1; batch:1231->1476/2454; loss:0.189890\n",
|
||||
"epoch:1/1; batch:1477->1722/2454; loss:0.216201\n",
|
||||
"epoch:1/1; batch:1723->1968/2454; loss:0.245825\n",
|
||||
"epoch:1/1; batch:1969->2214/2454; loss:0.138958\n",
|
||||
"epoch:1/1; batch:2215->2454/2454; loss:0.066018\n",
|
||||
"[Training time: 0.963 hrs]\n"
|
||||
"epoch:1/1; batch:1->246/2455; average training loss:1.610151\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 10%|█ | 247/2455 [02:21<21:02, 1.75it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/1; batch:247->492/2455; average training loss:0.376939\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 20%|██ | 493/2455 [04:42<18:42, 1.75it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/1; batch:493->738/2455; average training loss:0.305378\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 30%|███ | 739/2455 [07:03<16:22, 1.75it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/1; batch:739->984/2455; average training loss:0.279816\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 40%|████ | 985/2455 [09:24<13:59, 1.75it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/1; batch:985->1230/2455; average training loss:0.262505\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 50%|█████ | 1231/2455 [11:44<11:38, 1.75it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/1; batch:1231->1476/2455; average training loss:0.250177\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 60%|██████ | 1477/2455 [14:05<09:17, 1.75it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/1; batch:1477->1722/2455; average training loss:0.241982\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 70%|███████ | 1723/2455 [16:25<06:57, 1.75it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/1; batch:1723->1968/2455; average training loss:0.232584\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 80%|████████ | 1969/2455 [18:46<04:37, 1.75it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/1; batch:1969->2214/2455; average training loss:0.226051\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 90%|█████████ | 2215/2455 [21:06<02:16, 1.75it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch:1/1; batch:2215->2455/2455; average training loss:0.221012\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Iteration: 100%|██████████| 2455/2455 [23:23<00:00, 2.09it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[Training time: 0.390 hrs]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -386,21 +532,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"52384it [11:51, 88.76it/s] \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"preds = classifier.predict(\n",
|
||||
" token_ids=tokens_test, input_mask=mask_test, num_gpus=NUM_GPUS, batch_size=BATCH_SIZE\n",
|
||||
")"
|
||||
"preds = classifier.predict(token_ids=tokens_test, \n",
|
||||
" input_mask=mask_test, \n",
|
||||
" num_gpus=NUM_GPUS, \n",
|
||||
" batch_size=BATCH_SIZE_PRED)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -413,45 +552,159 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" fiction 0.88 0.96 0.91 10275\n",
|
||||
" government 0.94 0.94 0.94 10292\n",
|
||||
" slate 0.91 0.80 0.85 10277\n",
|
||||
" telephone 0.99 1.00 0.99 11205\n",
|
||||
" travel 0.95 0.97 0.96 10311\n",
|
||||
"\n",
|
||||
" accuracy 0.93 52360\n",
|
||||
" macro avg 0.93 0.93 0.93 52360\n",
|
||||
"weighted avg 0.93 0.93 0.93 52360\n",
|
||||
"\n"
|
||||
"{\n",
|
||||
" \"accuracy\": 0.9343964858670741,\n",
|
||||
" \"fiction\": {\n",
|
||||
" \"f1-score\": 0.9240671732081498,\n",
|
||||
" \"precision\": 0.9190412013862148,\n",
|
||||
" \"recall\": 0.9291484184914842,\n",
|
||||
" \"support\": 10275\n",
|
||||
" },\n",
|
||||
" \"government\": {\n",
|
||||
" \"f1-score\": 0.943645744627561,\n",
|
||||
" \"precision\": 0.9739427153345053,\n",
|
||||
" \"recall\": 0.9151768363777691,\n",
|
||||
" \"support\": 10292\n",
|
||||
" },\n",
|
||||
" \"macro avg\": {\n",
|
||||
" \"f1-score\": 0.9329061626350004,\n",
|
||||
" \"precision\": 0.9340480538608924,\n",
|
||||
" \"recall\": 0.9332503791830062,\n",
|
||||
" \"support\": 52360\n",
|
||||
" },\n",
|
||||
" \"slate\": {\n",
|
||||
" \"f1-score\": 0.8626293944091614,\n",
|
||||
" \"precision\": 0.8873456790123457,\n",
|
||||
" \"recall\": 0.8392527002043398,\n",
|
||||
" \"support\": 10277\n",
|
||||
" },\n",
|
||||
" \"telephone\": {\n",
|
||||
" \"f1-score\": 0.9943437402574267,\n",
|
||||
" \"precision\": 0.9924431009957326,\n",
|
||||
" \"recall\": 0.9962516733601071,\n",
|
||||
" \"support\": 11205\n",
|
||||
" },\n",
|
||||
" \"travel\": {\n",
|
||||
" \"f1-score\": 0.9398447606727038,\n",
|
||||
" \"precision\": 0.897467572575664,\n",
|
||||
" \"recall\": 0.9864222674813307,\n",
|
||||
" \"support\": 10311\n",
|
||||
" },\n",
|
||||
" \"weighted avg\": {\n",
|
||||
" \"f1-score\": 0.9340029685187979,\n",
|
||||
" \"precision\": 0.9350712643460813,\n",
|
||||
" \"recall\": 0.9343964858670741,\n",
|
||||
" \"support\": 52360\n",
|
||||
" }\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(classification_report(labels_test, preds, target_names=label_encoder.classes_))"
|
||||
"report = classification_report(labels_test, preds, target_names=label_encoder.classes_, output_dict=True) \n",
|
||||
"print(json.dumps(report, indent=4, sort_keys=True))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.9343964858670741,
|
||||
"encoder": "json",
|
||||
"name": "accuracy",
|
||||
"version": 1
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"scrapbook": {
|
||||
"data": true,
|
||||
"display": false,
|
||||
"name": "accuracy"
|
||||
}
|
||||
},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.9340480538608924,
|
||||
"encoder": "json",
|
||||
"name": "precision",
|
||||
"version": 1
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"scrapbook": {
|
||||
"data": true,
|
||||
"display": false,
|
||||
"name": "precision"
|
||||
}
|
||||
},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.9332503791830062,
|
||||
"encoder": "json",
|
||||
"name": "recall",
|
||||
"version": 1
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"scrapbook": {
|
||||
"data": true,
|
||||
"display": false,
|
||||
"name": "recall"
|
||||
}
|
||||
},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.9329061626350004,
|
||||
"encoder": "json",
|
||||
"name": "f1",
|
||||
"version": 1
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"scrapbook": {
|
||||
"data": true,
|
||||
"display": false,
|
||||
"name": "f1"
|
||||
}
|
||||
},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# for testing\n",
|
||||
"sb.glue(\"accuracy\", report[\"accuracy\"])\n",
|
||||
"sb.glue(\"precision\", report[\"macro avg\"][\"precision\"])\n",
|
||||
"sb.glue(\"recall\", report[\"macro avg\"][\"recall\"])\n",
|
||||
"sb.glue(\"f1\", report[\"macro avg\"][\"f1-score\"])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"celltoolbar": "Tags",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python (nlp_gpu)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "nlp_gpu"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
|
9
setup.py
9
setup.py
|
@ -11,9 +11,6 @@ from os.path import basename, dirname, join, splitext
|
|||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
VERSION = __import__("__init__").VERSION
|
||||
|
||||
|
||||
def read(*names, **kwargs):
|
||||
with io.open(
|
||||
join(dirname(__file__), *names),
|
||||
|
@ -24,7 +21,6 @@ def read(*names, **kwargs):
|
|||
|
||||
setup(
|
||||
name="utils_nlp",
|
||||
version=VERSION,
|
||||
license="MIT License",
|
||||
description="NLP Utility functions that are used for best practices in building state-of-the-art NLP methods and scenarios. Developed by Microsoft AI CAT",
|
||||
long_description="%s\n%s"
|
||||
|
@ -73,8 +69,9 @@ setup(
|
|||
"Word Embedding",
|
||||
],
|
||||
python_requires=">=3.6",
|
||||
install_requires=[],
|
||||
install_requires=['setuptools_scm>=3.2.0',],
|
||||
dependency_links=[],
|
||||
extras_require={},
|
||||
setup_requires=[],
|
||||
use_scm_version=True,
|
||||
setup_requires=['setuptools_scm'],
|
||||
)
|
||||
|
|
|
@ -1,33 +1,45 @@
|
|||
# Tests
|
||||
|
||||
This project uses unit, smoke and integration tests with Python files and notebooks. For more information, see a [quick introduction to unit, smoke and integration tests](https://miguelgfierro.com/blog/2018/a-beginners-guide-to-python-testing/). To manually execute the unit tests in the different environments, first **make sure you are in the correct environment as described in the [SETUP.md](/SETUP.md)**.
|
||||
This project uses unit, smoke and integration tests with Python files and notebooks.
|
||||
|
||||
Tests are automatically run as part of a DevOps pipeline. The pipelines are defined in .yml files in tests/ci with filenames that align with pipeline names.
|
||||
* In the unit tests we just make sure the notebook runs.
|
||||
* In the smoke tests, we run them with a small dataset or a small number of epochs to make sure that, apart from running, they provide reasonable metrics.
|
||||
* In the integration tests we use a bigger dataset for more epochs and we test that the metrics are what we expect.
|
||||
|
||||
For more information, see a [quick introduction to unit, smoke and integration tests](https://miguelgfierro.com/blog/2018/a-beginners-guide-to-python-testing/). To manually execute the unit tests in the different environments, first **make sure you are in the correct environment as described in the [SETUP.md](../SETUP.md)**.
|
||||
|
||||
Tests are automatically run as part of a DevOps pipeline. The pipelines are defined in the `.yml` files in [tests/ci](./ci) with filenames that align with pipeline names.
|
||||
|
||||
## Test execution
|
||||
|
||||
Click on the following menus to see more details on how to execute the unit, smoke and integration tests:
|
||||
**Click on the following menus** to see more details on how to execute the unit, smoke and integration tests:
|
||||
|
||||
<details>
|
||||
<summary><strong><em>Unit tests</em></strong></summary>
|
||||
|
||||
Unit tests ensure that each class or function behaves as it should. Every time a developer makes a pull request to staging or master branch, a battery of unit tests is executed.
|
||||
|
||||
**Note that the next instructions execute the tests from the root folder.**
|
||||
|
||||
For executing the Python unit tests for the utilities:
|
||||
|
||||
pytest tests/unit -m "not notebooks and not gpu"
|
||||
pytest tests/unit -m "not notebooks and not gpu and not azureml"
|
||||
|
||||
For executing the Python unit tests for the notebooks:
|
||||
|
||||
pytest tests/unit -m "notebooks and not gpu"
|
||||
pytest tests/unit -m "notebooks and not gpu and not azureml"
|
||||
|
||||
For executing the Python GPU unit tests for the utilities:
|
||||
|
||||
pytest tests/unit -m "not notebooks and gpu"
|
||||
pytest tests/unit -m "not notebooks and gpu and not azureml"
|
||||
|
||||
For executing the Python GPU unit tests for the notebooks:
|
||||
|
||||
pytest tests/unit -m "notebooks and gpu"
|
||||
pytest tests/unit -m "notebooks and gpu and not azureml"
|
||||
|
||||
For executing the AzureML unit tests:
|
||||
|
||||
pytest tests/unit -m "azureml"
|
||||
|
||||
</details>
|
||||
|
||||
|
@ -37,13 +49,19 @@ For executing the Python GPU unit tests for the notebooks:
|
|||
|
||||
Smoke tests make sure that the system works and are executed just before the integration tests every night.
|
||||
|
||||
**Note that the next instructions execute the tests from the root folder.**
|
||||
|
||||
For executing the Python smoke tests:
|
||||
|
||||
pytest tests/smoke -m "smoke and not gpu"
|
||||
pytest --durations=0 tests/smoke -m "smoke and not gpu and not azureml"
|
||||
|
||||
For executing the Python GPU smoke tests:
|
||||
|
||||
pytest tests/smoke -m "smoke and gpu"
|
||||
pytest --durations=0 tests/smoke -m "smoke and gpu and not azureml"
|
||||
|
||||
For executing the AzureML smoke tests:
|
||||
|
||||
pytest --durations=0 tests/smoke -m "azureml"
|
||||
|
||||
</details>
|
||||
|
||||
|
@ -52,13 +70,19 @@ For executing the Python GPU smoke tests:
|
|||
|
||||
Integration tests make sure that the program results are acceptable
|
||||
|
||||
**Note that the next instructions execute the tests from the root folder.**
|
||||
|
||||
For executing the Python integration tests:
|
||||
|
||||
pytest tests/integration -m "integration and not gpu"
|
||||
pytest --durations=0 tests/integration -m "integration and not gpu and not azureml"
|
||||
|
||||
For executing the Python GPU integration tests:
|
||||
|
||||
pytest tests/integration -m "integration and gpu"
|
||||
pytest --durations=0 tests/integration -m "integration and gpu and not azureml"
|
||||
|
||||
For executing the AzureML integration tests:
|
||||
|
||||
pytest --durations=0 tests/smoke -m "azureml"
|
||||
|
||||
</details>
|
||||
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
|
||||
# More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers
|
||||
# Implementing the scheduler from the dashboard
|
||||
# Uncomment in case it wants to be done from using the yml
|
||||
#schedules:
|
||||
#- cron: "56 22 * * *"
|
||||
# displayName: Daily computation of nightly builds
|
||||
# branches:
|
||||
# include:
|
||||
# - master
|
||||
# always: true
|
||||
|
||||
|
||||
# no PR builds
|
||||
pr: none
|
||||
|
||||
# no CI trigger
|
||||
trigger: none
|
||||
|
||||
jobs:
|
||||
- job: nightly
|
||||
displayName : 'Nightly tests'
|
||||
timeoutInMinutes: 180 # how long to run the job before automatically cancelling
|
||||
pool:
|
||||
name: nlpagentpool
|
||||
|
||||
steps:
|
||||
- bash: |
|
||||
echo "##vso[task.prependpath]/data/anaconda/bin"
|
||||
conda env list
|
||||
displayName: 'Add Conda to PATH'
|
||||
|
||||
# Conda creation can take around 10min
|
||||
- bash: |
|
||||
python tools/generate_conda_file.py
|
||||
conda env create -n integration_cpu -f nlp_cpu.yaml
|
||||
displayName: 'Creating Conda Environment with dependencies'
|
||||
|
||||
- bash: |
|
||||
source activate integration_cpu
|
||||
pytest --durations=0 tests/smoke -m "smoke and not gpu and not azureml" --junitxml=junit/test-smoke-test.xml
|
||||
displayName: 'Run smoke tests'
|
||||
|
||||
- bash: |
|
||||
source activate integration_cpu
|
||||
pytest --durations=0 tests/integration -m "integration and not gpu and not azureml" --junitxml=junit/test-integration-test.xml
|
||||
displayName: 'Run integration tests'
|
||||
|
||||
- bash: |
|
||||
echo Remove Conda Environment
|
||||
conda remove -n integration_cpu --all -q --force -y
|
||||
echo Done Cleanup
|
||||
displayName: 'Cleanup Task'
|
||||
condition: always()
|
||||
|
||||
- task: PublishTestResults@2
|
||||
inputs:
|
||||
testResultsFiles: '**/test-*-test.xml'
|
||||
testRunTitle: 'Test results for PyTest'
|
|
@ -43,7 +43,7 @@ jobs:
|
|||
|
||||
- bash: |
|
||||
source activate nlp_cpu
|
||||
pytest --durations=0 tests/unit -m "not notebooks and not gpu" --junitxml=junit/test-unitttest.xml
|
||||
pytest --durations=0 tests/unit -m "not notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml
|
||||
displayName: 'Run Unit tests'
|
||||
|
||||
# Uncomment if needed
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
|
||||
# More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers
|
||||
# Implementing the scheduler from the dashboard
|
||||
# Uncomment in case it wants to be done from using the yml
|
||||
#schedules:
|
||||
#- cron: "56 11 * * *"
|
||||
# displayName: Daily computation of nightly builds
|
||||
# branches:
|
||||
# include:
|
||||
# - master
|
||||
# always: true
|
||||
|
||||
|
||||
# no PR builds
|
||||
pr: none
|
||||
|
||||
# no CI trigger
|
||||
trigger: none
|
||||
|
||||
jobs:
|
||||
- job: nightly
|
||||
displayName : 'Nightly tests'
|
||||
timeoutInMinutes: 180 # how long to run the job before automatically cancelling
|
||||
pool:
|
||||
name: nlpagentpool
|
||||
|
||||
steps:
|
||||
- bash: |
|
||||
echo "##vso[task.prependpath]/data/anaconda/bin"
|
||||
conda env list
|
||||
displayName: 'Add Conda to PATH'
|
||||
|
||||
# Conda creation can take around 10min
|
||||
- bash: |
|
||||
python tools/generate_conda_file.py --gpu
|
||||
conda env create -n integration_gpu -f nlp_gpu.yaml
|
||||
displayName: 'Creating Conda Environment with dependencies'
|
||||
|
||||
- bash: |
|
||||
source activate integration_gpu
|
||||
pytest --durations=0 tests/smoke -m "smoke and gpu and not azureml" --junitxml=junit/test-smoke-test.xml
|
||||
displayName: 'Run smoke tests'
|
||||
|
||||
- bash: |
|
||||
source activate integration_gpu
|
||||
pytest --durations=0 tests/integration -m "integration and gpu and not azureml" --junitxml=junit/test-integration-test.xml
|
||||
displayName: 'Run integration tests'
|
||||
|
||||
- bash: |
|
||||
echo Remove Conda Environment
|
||||
conda remove -n integration_gpu --all -q --force -y
|
||||
echo Done Cleanup
|
||||
displayName: 'Cleanup Task'
|
||||
condition: always()
|
||||
|
||||
- task: PublishTestResults@2
|
||||
inputs:
|
||||
testResultsFiles: '**/test-*-test.xml'
|
||||
testRunTitle: 'Test results for PyTest'
|
|
@ -32,7 +32,7 @@ jobs:
|
|||
|
||||
- bash: |
|
||||
source activate nlp_gpu
|
||||
pytest --durations=0 tests/unit -m "not notebooks and gpu" --junitxml=junit/test-unitttest.xml
|
||||
pytest --durations=0 tests/unit -m "not notebooks and gpu and not azureml" --junitxml=junit/test-unitttest.xml
|
||||
displayName: 'Run Unit tests'
|
||||
|
||||
# Uncomment if needed
|
||||
|
|
|
@ -32,7 +32,7 @@ jobs:
|
|||
|
||||
- bash: |
|
||||
source activate nlp_cpu
|
||||
pytest --durations=0 tests/unit -m "notebooks and not gpu" --junitxml=junit/test-unitttest.xml
|
||||
pytest --durations=0 tests/unit -m "notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml
|
||||
displayName: 'Run Unit tests'
|
||||
|
||||
# Uncomment if needed
|
||||
|
|
|
@ -32,7 +32,7 @@ jobs:
|
|||
|
||||
- bash: |
|
||||
source activate nlp_gpu
|
||||
pytest --durations=0 tests/unit -m "notebooks and gpu" --junitxml=junit/test-unitttest.xml
|
||||
pytest --durations=0 tests/unit -m "notebooks and gpu and not azureml" --junitxml=junit/test-unitttest.xml
|
||||
displayName: 'Run Unit tests'
|
||||
|
||||
# Uncomment if needed
|
||||
|
|
|
@ -1,11 +1,17 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers
|
||||
schedules:
|
||||
- cron: "56 22 * * *"
|
||||
displayName: Daily track of metrics
|
||||
branches:
|
||||
include:
|
||||
- master
|
||||
always: true
|
||||
# Implementing the scheduler from the dashboard
|
||||
# Uncomment in case it wants to be done from using the yml
|
||||
# schedules:
|
||||
# - cron: "56 22 * * *"
|
||||
# displayName: Daily track of metrics
|
||||
# branches:
|
||||
# include:
|
||||
# - master
|
||||
# always: true
|
||||
|
||||
|
||||
# no PR builds
|
||||
pr: none
|
||||
|
|
|
@ -17,6 +17,9 @@ from tests.notebooks_common import path_notebooks
|
|||
|
||||
from utils_nlp.models.bert.common import Language
|
||||
from utils_nlp.models.bert.common import Tokenizer as BERTTokenizer
|
||||
from utils_nlp.azureml import azureml_utils
|
||||
from azureml.core.webservice import Webservice
|
||||
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
|
@ -25,11 +28,55 @@ def notebooks():
|
|||
|
||||
# Path for the notebooks
|
||||
paths = {
|
||||
"embedding_trainer": os.path.join(
|
||||
folder_notebooks, "embeddings", "embedding_trainer.ipynb"
|
||||
),
|
||||
"similarity_embeddings_baseline": os.path.join(
|
||||
folder_notebooks, "sentence_similarity", "baseline_deep_dive.ipynb"
|
||||
),
|
||||
"embedding_trainer": os.path.join(
|
||||
folder_notebooks, "embeddings", "embedding_trainer.ipynb"
|
||||
"bert_encoder": os.path.join(
|
||||
folder_notebooks, "sentence_similarity", "bert_encoder.ipynb"
|
||||
),
|
||||
"gensen_local": os.path.join(
|
||||
folder_notebooks, "sentence_similarity", "gensen_local.ipynb"
|
||||
),
|
||||
"gensen_azureml": os.path.join(
|
||||
folder_notebooks, "sentence_similarity", "gensen_aml_deep_dive.ipynb"
|
||||
),
|
||||
"similarity_automl_local": os.path.join(
|
||||
folder_notebooks,
|
||||
"sentence_similarity",
|
||||
"automl_local_deployment_aci.ipynb",
|
||||
),
|
||||
"automl_with_pipelines_deployment_aks": os.path.join(
|
||||
folder_notebooks,
|
||||
"sentence_similarity",
|
||||
"automl_with_pipelines_deployment_aks.ipynb",
|
||||
),
|
||||
"bert_qa_trainer": os.path.join(
|
||||
folder_notebooks,
|
||||
"question_answering",
|
||||
"pretrained-BERT-SQuAD-deep-dive-aml.ipynb",
|
||||
),
|
||||
"bidaf_deep_dive": os.path.join(
|
||||
folder_notebooks, "question_answering", "bidaf_aml_deep_dive.ipynb"
|
||||
),
|
||||
"bidaf_quickstart": os.path.join(
|
||||
folder_notebooks,
|
||||
"question_answering",
|
||||
"question_answering_system_bidaf_quickstart.ipynb",
|
||||
),
|
||||
"entailment_multinli_bert": os.path.join(
|
||||
folder_notebooks, "entailment", "entailment_multinli_bert.ipynb"
|
||||
),
|
||||
"tc_bert_azureml": os.path.join(
|
||||
folder_notebooks, "text_classification", "tc_bert_azureml.ipynb"
|
||||
),
|
||||
"tc_mnli_bert": os.path.join(
|
||||
folder_notebooks, "text_classification", "tc_mnli_bert.ipynb"
|
||||
),
|
||||
"deep_and_unified_understanding": os.path.join(
|
||||
folder_notebooks, "interpret_NLP_models", "understand_models.ipynb"
|
||||
),
|
||||
}
|
||||
return paths
|
||||
|
@ -52,22 +99,10 @@ def ner_test_data():
|
|||
false_pos = [1, 2]
|
||||
for p in false_pos:
|
||||
TRAILING_TOKEN_MASK[0][p] = False
|
||||
INPUT_LABEL_IDS = [
|
||||
[3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
||||
]
|
||||
INPUT_LABEL_IDS = [[3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
|
||||
return {
|
||||
"INPUT_TEXT": [
|
||||
[
|
||||
"Johnathan",
|
||||
"is",
|
||||
"studying",
|
||||
"in",
|
||||
"the",
|
||||
"University",
|
||||
"of",
|
||||
"Michigan",
|
||||
".",
|
||||
]
|
||||
["Johnathan", "is", "studying", "in", "the", "University", "of", "Michigan", "."]
|
||||
],
|
||||
"INPUT_TEXT_SINGLE": [
|
||||
"Johnathan",
|
||||
|
@ -80,23 +115,9 @@ def ner_test_data():
|
|||
"Michigan",
|
||||
".",
|
||||
],
|
||||
"INPUT_LABELS": [
|
||||
["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "O"]
|
||||
],
|
||||
"INPUT_LABELS_SINGLE": [
|
||||
"I-PER",
|
||||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"I-ORG",
|
||||
"I-ORG",
|
||||
"I-ORG",
|
||||
"O",
|
||||
],
|
||||
"INPUT_LABELS_WRONG": [
|
||||
["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]
|
||||
],
|
||||
"INPUT_LABELS": [["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "O"]],
|
||||
"INPUT_LABELS_SINGLE": ["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "O"],
|
||||
"INPUT_LABELS_WRONG": [["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]],
|
||||
"INPUT_TOKEN_IDS": [
|
||||
[
|
||||
1287,
|
||||
|
@ -123,26 +144,12 @@ def ner_test_data():
|
|||
],
|
||||
"INPUT_LABEL_IDS": INPUT_LABEL_IDS,
|
||||
"INPUT_MASK": [[1] * 11 + [0] * 9],
|
||||
"PREDICTED_LABELS": [
|
||||
[3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
||||
],
|
||||
"PREDICTED_LABELS": [[3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
|
||||
"TRAILING_TOKEN_MASK": TRAILING_TOKEN_MASK,
|
||||
"UNIQUE_LABELS": UNIQUE_LABELS,
|
||||
"LABEL_MAP": LABEL_MAP,
|
||||
"EXPECTED_TOKENS_NO_PADDING": [
|
||||
[
|
||||
"I-PER",
|
||||
"X",
|
||||
"X",
|
||||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"I-ORG",
|
||||
"I-ORG",
|
||||
"I-ORG",
|
||||
"O",
|
||||
]
|
||||
["I-PER", "X", "X", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "O"]
|
||||
],
|
||||
"EXPECTED_TOKENS_NO_PADDING_NO_TRAILING": [
|
||||
["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "O"]
|
||||
|
@ -152,6 +159,62 @@ def ner_test_data():
|
|||
}
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption("--subscription_id", help="Azure Subscription Id to create resources in")
|
||||
parser.addoption("--resource_group", help="Name of the resource group")
|
||||
parser.addoption("--workspace_name", help="Name of Azure ML Workspace")
|
||||
parser.addoption("--workspace_region", help="Azure region to create the workspace in")
|
||||
parser.addoption("--cluster_name", help="Name of the AzureML Cluster.")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def subscription_id(request):
|
||||
return request.config.getoption("--subscription_id")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def resource_group(request):
|
||||
return request.config.getoption("--resource_group")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def workspace_name(request):
|
||||
return request.config.getoption("--workspace_name")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def workspace_region(request):
|
||||
return request.config.getoption("--workspace_region")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def cluster_name(request):
|
||||
return request.config.getoption("--cluster_name")
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def bert_english_tokenizer():
|
||||
return BERTTokenizer(language=Language.ENGLISHCASED, to_lower=False)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def teardown_service(
|
||||
subscription_id, resource_group, workspace_name, workspace_region
|
||||
):
|
||||
|
||||
yield
|
||||
|
||||
# connect to workspace
|
||||
ws = azureml_utils.get_or_create_workspace(
|
||||
config_path="tests/ci",
|
||||
subscription_id=subscription_id,
|
||||
resource_group=resource_group,
|
||||
workspace_name=workspace_name,
|
||||
workspace_region=workspace_region,
|
||||
)
|
||||
|
||||
# connect to aci_service
|
||||
aci_service = Webservice(workspace=ws, name="aci-test-service")
|
||||
|
||||
# delete aci_service
|
||||
aci_service.delete()
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
||||
@pytest.mark.gpu
|
||||
@pytest.mark.integration
|
||||
def test_machine_is_gpu_machine():
|
||||
assert torch.cuda.is_available() is True
|
|
@ -0,0 +1,22 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pytest
|
||||
import papermill as pm
|
||||
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
|
||||
|
||||
|
||||
@pytest.mark.gpu
|
||||
@pytest.mark.integration
|
||||
def test_entailment_multinli_bert(notebooks):
|
||||
notebook_path = notebooks["entailment_multinli_bert"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
parameters={
|
||||
"TRAIN_DATA_USED_PERCENT": 0.001,
|
||||
"DEV_DATA_USED_PERCENT": 0.01,
|
||||
"NUM_EPOCHS": 1,
|
||||
},
|
||||
kernel_name=KERNEL_NAME,
|
||||
)
|
|
@ -0,0 +1,28 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
import papermill as pm
|
||||
import scrapbook as sb
|
||||
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
|
||||
|
||||
|
||||
@pytest.mark.gpu
|
||||
@pytest.mark.integration
|
||||
def test_deep_and_unified_understanding(notebooks):
|
||||
notebook_path = notebooks["deep_and_unified_understanding"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
kernel_name=KERNEL_NAME)
|
||||
|
||||
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
|
||||
sigma_numbers = [0.00317593, 0.00172284, 0.00634005, 0.00164305, 0.00317159]
|
||||
sigma_bert = [0.1735696 , 0.14028822, 0.14590865, 0.2263149 , 0.20640415,
|
||||
0.21249843, 0.18685372, 0.14112663, 0.25824168, 0.22399105,
|
||||
0.2393731 , 0.12868434, 0.27386534, 0.35876372]
|
||||
|
||||
np.testing.assert_array_almost_equal(result["sigma_numbers"], sigma_numbers, decimal=4)
|
||||
np.testing.assert_array_almost_equal(result["sigma_bert"], sigma_bert, decimal=1)
|
||||
|
|
@ -0,0 +1,86 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pytest
|
||||
import papermill as pm
|
||||
import scrapbook as sb
|
||||
from tests.notebooks_common import OUTPUT_NOTEBOOK
|
||||
|
||||
ABS_TOL = 0.2
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.azureml
|
||||
def test_bidaf_deep_dive(notebooks,
|
||||
subscription_id,
|
||||
resource_group,
|
||||
workspace_name,
|
||||
workspace_region):
|
||||
notebook_path = notebooks["bidaf_deep_dive"]
|
||||
pm.execute_notebook(notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
parameters = {'NUM_EPOCHS':2,
|
||||
'config_path': "tests/ci",
|
||||
'PROJECT_FOLDER': "scenarios/question_answering/bidaf-question-answering",
|
||||
'SQUAD_FOLDER': "scenarios/question_answering/squad",
|
||||
'LOGS_FOLDER': "scenarios/question_answering/",
|
||||
'BIDAF_CONFIG_PATH': "scenarios/question_answering/",
|
||||
'subscription_id': subscription_id,
|
||||
'resource_group': resource_group,
|
||||
'workspace_name': workspace_name,
|
||||
'workspace_region': workspace_region})
|
||||
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["validation_EM"]
|
||||
assert result == pytest.approx(0.5, abs=ABS_TOL)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("teardown_service")
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.azureml
|
||||
def test_bidaf_quickstart(notebooks,
|
||||
subscription_id,
|
||||
resource_group,
|
||||
workspace_name,
|
||||
workspace_region):
|
||||
notebook_path = notebooks["bidaf_quickstart"]
|
||||
pm.execute_notebook(notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
parameters = {'config_path': "tests/ci",
|
||||
'subscription_id': subscription_id,
|
||||
'resource_group': resource_group,
|
||||
'workspace_name': workspace_name,
|
||||
'workspace_region': workspace_region,
|
||||
'webservice_name': "aci-test-service"})
|
||||
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["answer"]
|
||||
assert result == "Bi-Directional Attention Flow"
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.azureml
|
||||
@pytest.mark.gpu
|
||||
def test_bert_qa_runs(notebooks):
|
||||
notebook_path = notebooks["bert_qa_trainer"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
parameters=dict(
|
||||
AZUREML_CONFIG_PATH="./tests/integration/.azureml",
|
||||
DATA_FOLDER='./tests/integration/squad',
|
||||
PROJECT_FOLDER='./tests/integration/pytorch-transformers',
|
||||
EXPERIMENT_NAME='NLP-QA-BERT-deepdive',
|
||||
BERT_UTIL_PATH='./utils_nlp/azureml/azureml_bert_util.py',
|
||||
EVALUATE_SQAD_PATH = './utils_nlp/eval/evaluate_squad.py',
|
||||
TRAIN_SCRIPT_PATH="./scenarios/question_answering/bert_run_squad_azureml.py",
|
||||
BERT_MODEL="bert-base-uncased",
|
||||
NUM_TRAIN_EPOCHS=1.0,
|
||||
NODE_COUNT=1,
|
||||
MAX_TOTAL_RUNS=1,
|
||||
MAX_CONCURRENT_RUNS=1,
|
||||
TARGET_GRADIENT_STEPS=1,
|
||||
INIT_GRADIENT_STEPS=1,
|
||||
),
|
||||
)
|
||||
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
|
||||
assert result["f1"] > 70
|
||||
assert result["learning_rate"] >= 5e-5
|
||||
assert result["learning_rate"] <= 9e-5
|
||||
|
|
@ -1,15 +1,14 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import sys
|
||||
import pytest
|
||||
import papermill as pm
|
||||
import scrapbook as sb
|
||||
|
||||
from tests.notebooks_common import OUTPUT_NOTEBOOK
|
||||
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
|
||||
|
||||
|
||||
ABS_TOL = 0.2
|
||||
ABS_TOL_PEARSONS = 0.05
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
|
@ -34,11 +33,109 @@ def baseline_results():
|
|||
}
|
||||
|
||||
|
||||
@pytest.mark.notebooks
|
||||
@pytest.mark.gpu
|
||||
@pytest.mark.integration
|
||||
def test_gensen_local(notebooks):
|
||||
notebook_path = notebooks["gensen_local"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
kernel_name=KERNEL_NAME,
|
||||
parameters=dict(
|
||||
max_epoch=1,
|
||||
config_filepath="scenarios/sentence_similarity/gensen_config.json",
|
||||
base_data_path="data",
|
||||
),
|
||||
)
|
||||
|
||||
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["results"]
|
||||
expected = {"0": {"0": 1, "1": 0.95}, "1": {"0": 0.95, "1": 1}}
|
||||
|
||||
for key, value in expected.items():
|
||||
for k, v in value.items():
|
||||
assert results[key][k] == pytest.approx(v, abs=ABS_TOL_PEARSONS)
|
||||
|
||||
|
||||
@pytest.mark.gpu
|
||||
@pytest.mark.integration
|
||||
def test_bert_encoder(notebooks, tmp):
|
||||
notebook_path = notebooks["bert_encoder"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
kernel_name=KERNEL_NAME,
|
||||
parameters=dict(NUM_GPUS=1,
|
||||
MAX_SEQ_LENGTH=128,
|
||||
CACHE_DIR=tmp),
|
||||
)
|
||||
size_emb = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["size_emb"]
|
||||
assert size_emb == 768
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.azureml
|
||||
def test_similarity_embeddings_baseline_runs(notebooks, baseline_results):
|
||||
notebook_path = notebooks["similarity_embeddings_baseline"]
|
||||
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK)
|
||||
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
|
||||
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["results"]
|
||||
for key, value in baseline_results.items():
|
||||
assert results[key] == pytest.approx(value, abs=ABS_TOL)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("teardown_service")
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.azureml
|
||||
def test_automl_local_runs(notebooks,
|
||||
subscription_id,
|
||||
resource_group,
|
||||
workspace_name,
|
||||
workspace_region):
|
||||
notebook_path = notebooks["similarity_automl_local"]
|
||||
|
||||
pm.execute_notebook(notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
parameters = {'automl_iterations': 2,
|
||||
'automl_iteration_timeout':7,
|
||||
'config_path': "tests/ci",
|
||||
'webservice_name': "aci-test-service",
|
||||
'subscription_id': subscription_id,
|
||||
'resource_group': resource_group,
|
||||
'workspace_name': workspace_name,
|
||||
'workspace_region': workspace_region})
|
||||
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["pearson_correlation"]
|
||||
assert result == pytest.approx(0.5, abs=ABS_TOL)
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.azureml
|
||||
def test_similarity_gensen_azureml_runs(notebooks):
|
||||
notebook_path = notebooks["gensen_azureml"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
parameters=dict(
|
||||
CACHE_DIR="./tests/integration/temp",
|
||||
AZUREML_CONFIG_PATH="./tests/integration/.azureml",
|
||||
UTIL_NLP_PATH="./utils_nlp",
|
||||
MAX_EPOCH=1,
|
||||
TRAIN_SCRIPT="./scenarios/sentence_similarity/gensen_train.py",
|
||||
CONFIG_PATH="./scenarios/sentence_similarity/gensen_config.json",
|
||||
MAX_TOTAL_RUNS=1,
|
||||
MAX_CONCURRENT_RUNS=1,
|
||||
),
|
||||
)
|
||||
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
|
||||
assert result["min_val_loss"] > 5
|
||||
assert result["learning_rate"] >= 0.0001
|
||||
assert result["learning_rate"] <= 0.001
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.azureml
|
||||
@pytest.mark.skip(reason="can't run programmatically, AKS cluster takes ~20 minutes to create and there is no blocking call in the notebook to tell that the cluster creation is in progress")
|
||||
def test_automl_with_pipelines_deployment_aks(notebooks):
|
||||
notebook_path = notebooks["automl_with_pipelines_deployment_aks"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK)
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
import pytest
|
||||
import papermill as pm
|
||||
import scrapbook as sb
|
||||
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
|
||||
|
||||
|
||||
ABS_TOL = 0.1
|
||||
|
||||
|
||||
@pytest.mark.gpu
|
||||
@pytest.mark.integration
|
||||
def test_tc_mnli_bert(notebooks, tmp):
|
||||
notebook_path = notebooks["tc_mnli_bert"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
kernel_name=KERNEL_NAME,
|
||||
parameters=dict(NUM_GPUS=1,
|
||||
DATA_FOLDER=tmp,
|
||||
BERT_CACHE_DIR=tmp,
|
||||
BATCH_SIZE=32,
|
||||
BATCH_SIZE_PRED=512,
|
||||
NUM_EPOCHS=1
|
||||
)
|
||||
)
|
||||
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
|
||||
assert pytest.approx(result["accuracy"], 0.93, abs=ABS_TOL)
|
||||
assert pytest.approx(result["precision"], 0.93, abs=ABS_TOL)
|
||||
assert pytest.approx(result["recall"], 0.93, abs=ABS_TOL)
|
||||
assert pytest.approx(result["f1"], 0.93, abs=ABS_TOL)
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.azureml
|
||||
@pytest.mark.gpu
|
||||
def test_tc_bert_azureml(
|
||||
notebooks, subscription_id, resource_group, workspace_name, workspace_region, cluster_name, tmp
|
||||
):
|
||||
notebook_path = notebooks["tc_bert_azureml"]
|
||||
|
||||
train_folder = os.path.join(tmp, "train")
|
||||
test_folder = os.path.join(tmp, "test")
|
||||
|
||||
parameters = {
|
||||
"config_path": "tests/ci",
|
||||
"subscription_id": subscription_id,
|
||||
"resource_group": resource_group,
|
||||
"workspace_name": workspace_name,
|
||||
"workspace_region": workspace_region,
|
||||
"cluster_name": cluster_name,
|
||||
"DATA_FOLDER": tmp,
|
||||
"TRAIN_FOLDER": train_folder,
|
||||
"TEST_FOLDER": test_folder,
|
||||
"PROJECT_FOLDER": "./",
|
||||
"NUM_PARTITIONS": 1,
|
||||
"NODE_COUNT": 1,
|
||||
}
|
||||
|
||||
pm.execute_notebook(
|
||||
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=parameters
|
||||
)
|
||||
|
||||
with open("outputs/results.json", "r") as handle:
|
||||
result_dict = json.load(handle)
|
||||
assert result_dict["weighted avg"]["f1-score"] == pytest.approx(0.85, abs=ABS_TOL)
|
||||
|
||||
if os.path.exists("outputs"):
|
||||
shutil.rmtree("outputs")
|
|
@ -0,0 +1,31 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import pytest
|
||||
|
||||
from utils_nlp.dataset import msrpc
|
||||
from utils_nlp.dataset import xnli
|
||||
|
||||
|
||||
@pytest.mark.smoke
|
||||
def test_msrpc_download(tmp_path):
|
||||
filepath = msrpc.download_msrpc(tmp_path)
|
||||
statinfo = os.stat(filepath)
|
||||
assert statinfo.st_size == 1359872
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Can't test it programmatically, needs input")
|
||||
@pytest.mark.smoke
|
||||
def test_msrpc_load_df(tmp_path):
|
||||
df_train = msrpc.load_pandas_df(
|
||||
local_cache_path=tmp_path, dataset_type="train"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.smoke
|
||||
def test_xnli(tmp_path):
|
||||
df_train = xnli.load_pandas_df(
|
||||
local_cache_path=tmp_path, file_split="train"
|
||||
)
|
||||
assert df_train.shape == (392702, 2)
|
|
@ -0,0 +1,12 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
||||
@pytest.mark.smoke
|
||||
@pytest.mark.gpu
|
||||
def test_machine_is_gpu_machine():
|
||||
assert torch.cuda.is_available() is True
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import pytest
|
||||
|
||||
from utils_nlp.dataset import msrpc
|
||||
|
||||
|
||||
@pytest.mark.smoke
|
||||
def test_download_msrpc(tmp_path):
|
||||
filepath = msrpc.download_msrpc(tmp_path)
|
||||
statinfo = os.stat(filepath)
|
||||
assert statinfo.st_size == 1359872
|
|
@ -5,51 +5,14 @@ import os
|
|||
import pytest
|
||||
|
||||
from utils_nlp.dataset.url_utils import maybe_download
|
||||
from utils_nlp.dataset.msrpc import load_pandas_df
|
||||
import utils_nlp.dataset.wikigold as wg
|
||||
import utils_nlp.dataset.xnli as xnli
|
||||
from utils_nlp.dataset import msrpc
|
||||
from utils_nlp.dataset import wikigold
|
||||
from utils_nlp.dataset import xnli
|
||||
from utils_nlp.dataset import snli
|
||||
from utils_nlp.dataset import Split
|
||||
from utils_nlp.dataset.ner_utils import preprocess_conll
|
||||
|
||||
|
||||
def test_maybe_download():
|
||||
# ToDo: Change this url when repo goes public.
|
||||
file_url = (
|
||||
"https://raw.githubusercontent.com/Microsoft/Recommenders/"
|
||||
"master/LICENSE"
|
||||
)
|
||||
filepath = "license.txt"
|
||||
assert not os.path.exists(filepath)
|
||||
filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
|
||||
assert os.path.exists(filepath)
|
||||
os.remove(filepath)
|
||||
with pytest.raises(IOError):
|
||||
filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
|
||||
|
||||
|
||||
def test_load_pandas_df_msrpc():
|
||||
with pytest.raises(Exception):
|
||||
load_pandas_df(dataset_type="Dummy")
|
||||
|
||||
|
||||
def test_wikigold(tmp_path):
|
||||
wg_sentence_count = 1841
|
||||
wg_test_percentage = 0.5
|
||||
wg_test_sentence_count = round(wg_sentence_count * wg_test_percentage)
|
||||
wg_train_sentence_count = wg_sentence_count - wg_test_sentence_count
|
||||
|
||||
downloaded_file = os.path.join(tmp_path, "wikigold.conll.txt")
|
||||
assert not os.path.exists(downloaded_file)
|
||||
|
||||
train_df, test_df = wg.load_train_test_dfs(
|
||||
tmp_path, test_percentage=wg_test_percentage
|
||||
)
|
||||
|
||||
assert os.path.exists(downloaded_file)
|
||||
|
||||
assert train_df.shape == (wg_train_sentence_count, 2)
|
||||
assert test_df.shape == (wg_test_sentence_count, 2)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ner_utils_test_data(scope="module"):
|
||||
return {
|
||||
|
@ -115,6 +78,45 @@ def ner_utils_test_data(scope="module"):
|
|||
}
|
||||
|
||||
|
||||
def test_maybe_download():
|
||||
# ToDo: Change this url when repo goes public.
|
||||
file_url = (
|
||||
"https://raw.githubusercontent.com/Microsoft/Recommenders/"
|
||||
"master/LICENSE"
|
||||
)
|
||||
filepath = "license.txt"
|
||||
assert not os.path.exists(filepath)
|
||||
filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
|
||||
assert os.path.exists(filepath)
|
||||
os.remove(filepath)
|
||||
with pytest.raises(IOError):
|
||||
filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
|
||||
|
||||
|
||||
def test_msrpc():
|
||||
with pytest.raises(Exception):
|
||||
msrpc.load_pandas_df(dataset_type="Dummy")
|
||||
|
||||
|
||||
def test_wikigold(tmp_path):
|
||||
wg_sentence_count = 1841
|
||||
wg_test_percentage = 0.5
|
||||
wg_test_sentence_count = round(wg_sentence_count * wg_test_percentage)
|
||||
wg_train_sentence_count = wg_sentence_count - wg_test_sentence_count
|
||||
|
||||
downloaded_file = os.path.join(tmp_path, "wikigold.conll.txt")
|
||||
assert not os.path.exists(downloaded_file)
|
||||
|
||||
train_df, test_df = wikigold.load_train_test_dfs(
|
||||
tmp_path, test_percentage=wg_test_percentage
|
||||
)
|
||||
|
||||
assert os.path.exists(downloaded_file)
|
||||
|
||||
assert train_df.shape == (wg_train_sentence_count, 2)
|
||||
assert test_df.shape == (wg_test_sentence_count, 2)
|
||||
|
||||
|
||||
def test_ner_utils(ner_utils_test_data):
|
||||
output = preprocess_conll(ner_utils_test_data["input"])
|
||||
assert output == ner_utils_test_data["expected_output"]
|
||||
|
@ -123,5 +125,21 @@ def test_ner_utils(ner_utils_test_data):
|
|||
def test_xnli(tmp_path):
|
||||
# only test for the dev df as the train dataset takes several
|
||||
# minutes to download
|
||||
dev_df = xnli.load_pandas_df(local_cache_path=tmp_path)
|
||||
dev_df = xnli.load_pandas_df(local_cache_path=tmp_path, file_split="dev")
|
||||
assert dev_df.shape == (2490, 2)
|
||||
|
||||
|
||||
def test_snli(tmp_path):
|
||||
df_train = snli.load_pandas_df(
|
||||
local_cache_path=tmp_path, file_split=Split.TRAIN
|
||||
)
|
||||
assert df_train.shape == (550152, 14)
|
||||
df_test = snli.load_pandas_df(
|
||||
local_cache_path=tmp_path, file_split=Split.TEST
|
||||
)
|
||||
assert df_test.shape == (10000, 14)
|
||||
df_dev = snli.load_pandas_df(
|
||||
local_cache_path=tmp_path, file_split=Split.DEV
|
||||
)
|
||||
assert df_dev.shape == (10000, 14)
|
||||
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import numpy as np
|
||||
|
||||
from utils_nlp.eval.classification import compute_correlation_coefficients
|
||||
|
||||
|
||||
def test_compute():
|
||||
x = np.random.rand(2, 100)
|
||||
df = compute_correlation_coefficients(x)
|
||||
assert df.shape == (2, 2)
|
||||
|
||||
y = np.random.rand(2, 100)
|
||||
df = compute_correlation_coefficients(x, y)
|
||||
assert df.shape == (4, 4)
|
|
@ -0,0 +1,62 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from utils_nlp.models.gensen.preprocess_utils import gensen_preprocess
|
||||
from utils_nlp.models.gensen.utils import DataIterator
|
||||
|
||||
|
||||
def test_gensen_preprocess(tmp_path):
|
||||
data = [
|
||||
[
|
||||
"neutral",
|
||||
"it is a lovely day",
|
||||
"the weather is great outside.",
|
||||
["it", "is", "lovely", "day"],
|
||||
["the", "weather", "is", "great", "outside"],
|
||||
]
|
||||
]
|
||||
|
||||
df = pd.DataFrame(data)
|
||||
df.columns = [
|
||||
"score",
|
||||
"sentence1",
|
||||
"sentence2",
|
||||
"sentence1_tokens",
|
||||
"sentence2_tokens",
|
||||
]
|
||||
|
||||
expected_files = [
|
||||
"snli_1.0_test.txt.lab",
|
||||
"snli_1.0_test.txt.s1.tok",
|
||||
"snli_1.0_dev.txt.clean.noblank",
|
||||
"snli_1.0_train.txt.s1.tok",
|
||||
"snli_1.0_train.txt.lab",
|
||||
"snli_1.0_dev.txt.s1.tok",
|
||||
"snli_1.0_dev.txt.s2.tok",
|
||||
"snli_1.0_test.txt.s2.tok",
|
||||
"snli_1.0_train.txt.clean",
|
||||
"snli_1.0_train.txt.s2.tok",
|
||||
"snli_1.0_test.txt.clean.noblank",
|
||||
"snli_1.0_test.txt.clean",
|
||||
"snli_1.0_train.txt.clean.noblank",
|
||||
"snli_1.0_dev.txt.lab",
|
||||
"snli_1.0_dev.txt.clean",
|
||||
]
|
||||
path = gensen_preprocess(df, df, df, tmp_path)
|
||||
assert os.path.isdir(path) is True
|
||||
assert set(os.listdir(path)) == set(expected_files)
|
||||
|
||||
|
||||
def test_data_iterator():
|
||||
sentences = ["it is a lovely day", "the weather is great outside.", ]
|
||||
expected_vocab = ["it", "is", "a", "lovely", "day", "the", "weather", "is", "great", "outside."]
|
||||
|
||||
vocab_size = 10
|
||||
di = DataIterator()
|
||||
word2id, id2word = di.construct_vocab(sentences, vocab_size)
|
||||
assert set(expected_vocab).issubset(word2id.keys())
|
||||
assert set(expected_vocab).issubset(id2word.values())
|
|
@ -0,0 +1,25 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import pytest
|
||||
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
|
||||
import papermill as pm
|
||||
from utils_nlp.models.bert.common import Language
|
||||
|
||||
|
||||
@pytest.mark.notebooks
|
||||
def test_bert_encoder(notebooks):
|
||||
notebook_path = notebooks["bert_encoder"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
kernel_name=KERNEL_NAME,
|
||||
parameters=dict(
|
||||
NUM_GPUS=0,
|
||||
LANGUAGE=Language.ENGLISH,
|
||||
TO_LOWER=True,
|
||||
MAX_SEQ_LENGTH=128,
|
||||
CACHE_DIR="./temp",
|
||||
),
|
||||
)
|
|
@ -0,0 +1,26 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import pytest
|
||||
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
|
||||
import papermill as pm
|
||||
from utils_nlp.models.bert.common import Language
|
||||
|
||||
|
||||
@pytest.mark.notebooks
|
||||
@pytest.mark.gpu
|
||||
def test_bert_encoder(notebooks):
|
||||
notebook_path = notebooks["bert_encoder"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
kernel_name=KERNEL_NAME,
|
||||
parameters=dict(
|
||||
NUM_GPUS=1,
|
||||
LANGUAGE=Language.ENGLISH,
|
||||
TO_LOWER=True,
|
||||
MAX_SEQ_LENGTH=128,
|
||||
CACHE_DIR="./temp",
|
||||
),
|
||||
)
|
|
@ -13,6 +13,8 @@
|
|||
|
||||
import argparse
|
||||
import textwrap
|
||||
from sys import platform
|
||||
|
||||
|
||||
HELP_MSG = """
|
||||
To create the conda environment:
|
||||
|
@ -50,6 +52,7 @@ CONDA_GPU = {
|
|||
"numba": "numba>=0.38.1",
|
||||
"pytorch": "pytorch>=1.0.0",
|
||||
"tensorflow": "tensorflow-gpu==1.12.0",
|
||||
"cudatoolkit": "cudatoolkit==9.2",
|
||||
}
|
||||
|
||||
PIP_BASE = {
|
||||
|
@ -69,6 +72,7 @@ PIP_BASE = {
|
|||
"ipywebrtc": "ipywebrtc==0.4.3",
|
||||
"pre-commit": "pre-commit>=1.14.4",
|
||||
"scikit-learn": "scikit-learn>=0.19.0,<=0.20.3",
|
||||
"sklearn-crfsuite": "sklearn-crfsuite>=0.3.6",
|
||||
"spacy": "spacy>=2.1.4",
|
||||
"spacy-models": (
|
||||
"https://github.com/explosion/spacy-models/releases/download/"
|
||||
|
@ -80,7 +84,18 @@ PIP_BASE = {
|
|||
"seqeval": "seqeval>=0.0.12",
|
||||
}
|
||||
|
||||
PIP_GPU = {"horovod": "horovod>=0.16.1"}
|
||||
PIP_GPU = {}
|
||||
|
||||
PIP_DARWIN = {}
|
||||
PIP_DARWIN_GPU = {"horovod": "horovod>=0.16.1"}
|
||||
|
||||
PIP_LINUX = {}
|
||||
PIP_LINUX_GPU = {"horovod": "horovod>=0.16.1"}
|
||||
|
||||
PIP_WIN32 = {}
|
||||
PIP_WIN32_GPU = {}
|
||||
|
||||
CONDA_WIN32 = {"pytorch": "pytorch==1.0.0", "cudatoolkit": "cuda90"}
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
|
@ -111,6 +126,23 @@ if __name__ == "__main__":
|
|||
# update conda and pip packages based on flags provided
|
||||
conda_packages = CONDA_BASE
|
||||
pip_packages = PIP_BASE
|
||||
|
||||
# check for os platform support
|
||||
if platform == "darwin":
|
||||
pip_packages.update(PIP_DARWIN)
|
||||
PIP_GPU.update(PIP_DARWIN_GPU)
|
||||
elif platform.startswith("linux"):
|
||||
pip_packages.update(PIP_LINUX)
|
||||
PIP_GPU.update(PIP_LINUX_GPU)
|
||||
elif platform == "win32":
|
||||
conda_packages.update(CONDA_WIN32)
|
||||
pip_packages.update(PIP_WIN32)
|
||||
PIP_GPU.update(PIP_WIN32_GPU)
|
||||
else:
|
||||
raise Exception(
|
||||
"Unsupported platform, must be Windows, Linux, or macOS"
|
||||
)
|
||||
|
||||
if args.gpu:
|
||||
conda_packages.update(CONDA_GPU)
|
||||
pip_packages.update(PIP_GPU)
|
||||
|
|
|
@ -0,0 +1,107 @@
|
|||
# NLP Utilities
|
||||
|
||||
This module (utils_nlp) contains functions to simplify common tasks used when developing and evaluating NLP systems. A short description of the sub-modules is provided below. For more details about what functions are available and how to use them, please review the doc-strings provided with the code.
|
||||
|
||||
## Sub-Modules
|
||||
|
||||
### [AzureML](azureml)
|
||||
|
||||
The AzureML submodule contains utilities to connect to a workspace, train, tune and operationalize NLP systems at scale using AzureML.
|
||||
|
||||
```python
|
||||
from utils_nlp.azureml.azureml_utils import get_or_create_workspace
|
||||
|
||||
###Note: you do not need to fill in these values if you have a config.json in the same folder as this notebook
|
||||
ws = get_or_create_workspace(
|
||||
config_path=config_path,
|
||||
subscription_id=subscription_id,
|
||||
resource_group=resource_group,
|
||||
workspace_name=workspace_name,
|
||||
workspace_region=workspace_region,
|
||||
)
|
||||
```
|
||||
|
||||
### [Common](common)
|
||||
|
||||
This submodule contains high-level utilities for defining constants used in most algorithms as well as helper functions for managing aspects of different frameworks like pytorch.
|
||||
|
||||
### [Dataset](dataset)
|
||||
Dataset includes helper functions for interacting with different datasets and formatting them appropriately as well as utilities for splitting data for training / testing.
|
||||
|
||||
#### Data Loading
|
||||
There are dataloaders for several datasets. For example, the snli module will allow you to load a dataframe in pandas from the SNLI dataset, with the option to set the number of rows to load in order to test algorithms and evaluate performance benchmarks. Information on the datasets used in the repo can be found [here](https://github.com/microsoft/nlp/tree/staging/utils_nlp/dataset#datasets).
|
||||
|
||||
Most datasets may be split into `train`, `dev`, and `test`.
|
||||
|
||||
```python
|
||||
from utils_nlp.dataset.snli import load_pandas_df
|
||||
|
||||
df = load_pandas_df(DATA_FOLDER, file_split ="train", nrows = 1000)
|
||||
```
|
||||
|
||||
### [Evaluation (Eval)](eval)
|
||||
The evaluation (eval) submodule includes functionality for computing eturns common classification evaluation metrics like accuracy, precision, recall, and f1 scores for classification scenarios, normalizing and finding f1_scores for different datasets like SQuAD, as well as logging the means and other coefficients for datasets like senteval.
|
||||
|
||||
### [Models](models)
|
||||
The models submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new natural language processing systems. A description of which algorithms are used in each scenario can be found on [this table](../README.md#content)
|
||||
|
||||
This includes:
|
||||
* BERT
|
||||
* GenSen
|
||||
* Pretrained embeddings (Word2Vec,
|
||||
fastText,
|
||||
GloVe)
|
||||
* Pytorch's conditional Gated Recurrent Unit (GRU)
|
||||
|
||||
### [Interpreter](interpreter)
|
||||
The interpreter submodule contains implementations to explain hidden states of models. It is a code implementation of the paper [Towards a Deep and Unified Understanding of Deep Neural Models in NLP](http://proceedings.mlr.press/v97/guan19a/guan19a.pdf).
|
||||
|
||||
|
||||
### [Semantic Versioning](versioning)
|
||||
|
||||
This library is configured to use
|
||||
[setuptools_scm](https://github.com/pypa/setuptools_scm/), following the
|
||||
instructions there, to automatically get package version from git commit histories.
|
||||
|
||||
> NOTE: **There shouldn't be any references to manually coded versions**.
|
||||
|
||||
Verify what git tag to use by running:
|
||||
|
||||
```bash
|
||||
python setup.py --version
|
||||
```
|
||||
It should look something like `0.1.0.dev4+gdfedba7.d20190209`
|
||||
|
||||
Using the information above the master branch, after a merge commit, can be _**Tagged**_ with the above semantic version `0.1.0` (ignoring the `dev4+gdfedba7.d20190209`)
|
||||
|
||||
For example:
|
||||
|
||||
git tag v0.1.0
|
||||
|
||||
Now verify the semantic version for the package:
|
||||
|
||||
python setup.py --version
|
||||
|
||||
|
||||
All new merged commit on master must have a
|
||||
[Semantic Versioning](https://semver.org/) release version with an
|
||||
accompanying tag. TL;DR:
|
||||
* `major.minor.patch`
|
||||
* Patch is for bugfix
|
||||
* Minor is for new features
|
||||
* Major is for backwards-incompatible changes
|
||||
* tags should be of the form `v0.1.2`
|
||||
|
||||
Installing this library into another clean git repository with a tag version, you should get a nice version like `0.2.1`.
|
||||
|
||||
However, if you inspect the `__version__` in this repo,
|
||||
you'll get a nice **'dirty'** version number like `'0.2.1.dev0+g850a76d.d20180908'`.
|
||||
|
||||
This is useful for debugging, building sphinx docs in dev and so on.
|
||||
|
||||
You should never have to specify a version manually except just tagging your commit from the tag calculation generated by running
|
||||
|
||||
python setup.py --version
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
from setuptools_scm import get_version
|
||||
|
||||
# Determine semantic versioning automatically
|
||||
# from git commits
|
||||
__version__ = get_version()
|
|
@ -0,0 +1,133 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from horovod.torch.mpi_ops import allreduce, allreduce_async_, synchronize
|
||||
from horovod.torch.compression import Compression
|
||||
import horovod.torch as hvd
|
||||
import torch
|
||||
import time
|
||||
|
||||
from collections import OrderedDict
|
||||
try:
|
||||
from apex_C import flatten
|
||||
from apex_C import unflatten
|
||||
except ImportError:
|
||||
try:
|
||||
_ = warned_flatten
|
||||
except NameError:
|
||||
print("Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten.")
|
||||
warned_flatten = True
|
||||
from torch._utils import _flatten_dense_tensors as flatten
|
||||
from torch._utils import _unflatten_dense_tensors as unflatten
|
||||
|
||||
|
||||
def warmup_linear(x, warmup=0.002):
|
||||
if x < warmup:
|
||||
return x/warmup
|
||||
return 1.0 - x
|
||||
|
||||
|
||||
def adjust_gradient_accumulation_steps(x, initial_steps, target_steps, warmup):
|
||||
return min(max(int(x/warmup*target_steps), initial_steps), target_steps)
|
||||
|
||||
|
||||
class DistributedCommunicator:
|
||||
def __init__(self, accumulation_step=1):
|
||||
hvd.init()
|
||||
self.local_rank = hvd.local_rank()
|
||||
self.world_size = hvd.size()
|
||||
self.rank = hvd.rank()
|
||||
self.n_gpu = torch.cuda.device_count()
|
||||
self.node_count = self.world_size // self.n_gpu
|
||||
self.accumulation_step = accumulation_step
|
||||
self.count_down = accumulation_step - 1
|
||||
self._multi_node = self.node_count > 1
|
||||
if not self._multi_node:
|
||||
# use PyTorch build-in NCCL backend for single node training
|
||||
torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:6000',
|
||||
world_size=self.n_gpu, rank=self.local_rank)
|
||||
|
||||
|
||||
def register_model(self, model, fp16):
|
||||
# broadcast model parameters
|
||||
if self.node_count > 1:
|
||||
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
|
||||
else:
|
||||
for param in model.parameters():
|
||||
torch.distributed.broadcast_multigpu([param], 0)
|
||||
|
||||
# register hook for reduce when backpropagate
|
||||
self._parameter_names = {v: k for k, v in sorted(model.named_parameters())}
|
||||
self._handles = {}
|
||||
self._requires_update = set()
|
||||
self._grad_accs = []
|
||||
self._grad = []
|
||||
self._compression = hvd.Compression.fp16 if fp16 else hvd.Compression.none
|
||||
for p in model.parameters():
|
||||
if p.requires_grad:
|
||||
p.grad = p.data.new(p.size()).zero_()
|
||||
self._requires_update.add(p)
|
||||
p_tmp = p.expand_as(p)
|
||||
grad_acc = p_tmp.grad_fn.next_functions[0][0]
|
||||
grad_acc.register_hook(self._make_hook(p))
|
||||
self._grad_accs.append(grad_acc)
|
||||
|
||||
|
||||
def _allreduce_tensor(self, p):
|
||||
assert p not in self._handles
|
||||
assert not p.grad.requires_grad
|
||||
tensor = p.grad
|
||||
name = self._parameter_names.get(p)
|
||||
if self._multi_node:
|
||||
tensor_compressed, ctx = self._compression.compress(tensor)
|
||||
handle = allreduce_async_(tensor_compressed, average=True, name=name)
|
||||
self._handles[p] = (handle, ctx)
|
||||
else:
|
||||
self._handles[p] = tensor
|
||||
|
||||
|
||||
def _make_hook(self, p):
|
||||
def hook(*ignore):
|
||||
if self.count_down == 0:
|
||||
self._allreduce_tensor(p)
|
||||
return hook
|
||||
|
||||
|
||||
def synchronize(self):
|
||||
synced = False
|
||||
if self.count_down == 0:
|
||||
missing_p = self._requires_update - set(self._handles.keys())
|
||||
for p in missing_p:
|
||||
self._allreduce_tensor(p)
|
||||
|
||||
if self._multi_node:
|
||||
for p, value in self._handles.items():
|
||||
handle, ctx = value
|
||||
output = synchronize(handle)
|
||||
p.grad.set_(self._compression.decompress(output, ctx) / self.accumulation_step)
|
||||
else:
|
||||
buckets = OrderedDict()
|
||||
for tensor in self._handles.values():
|
||||
tp = tensor.type()
|
||||
if tp not in buckets:
|
||||
buckets[tp] = []
|
||||
buckets[tp].append(tensor)
|
||||
for tp in buckets:
|
||||
bucket = buckets[tp]
|
||||
coalesced = flatten(bucket) / self.world_size / self.accumulation_step
|
||||
torch.distributed.all_reduce_multigpu([coalesced])
|
||||
for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
|
||||
buf.copy_(synced)
|
||||
self._handles.clear()
|
||||
synced = True
|
||||
self.count_down = self.accumulation_step
|
||||
|
||||
self.count_down -= 1
|
||||
return synced
|
||||
|
||||
def set_accumulation_step(self, accumulation_step):
|
||||
self.accumulation_step = accumulation_step
|
||||
self.count_down = self.accumulation_step - 1
|
||||
|
||||
# Original source:
|
||||
# https://github.com/microsoft/AzureML-BERT/blob/dec79be13befdd51fa72c05419cf9288d32eb263/finetune/PyTorch/azureml_bert_util.py
|
|
@ -2,8 +2,27 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
|
||||
from azureml.core.authentication import AzureCliAuthentication
|
||||
from azureml.core.authentication import InteractiveLoginAuthentication
|
||||
from azureml.core.authentication import AuthenticationException
|
||||
from azureml.core import Workspace
|
||||
from azureml.exceptions import WorkspaceException
|
||||
from azureml.core.compute import ComputeTarget, AmlCompute
|
||||
from azureml.core.compute_target import ComputeTargetException
|
||||
|
||||
def get_auth():
|
||||
"""
|
||||
Method to get the correct Azure ML Authentication type
|
||||
|
||||
Always start with CLI Authentication and if it fails, fall back
|
||||
to interactive login
|
||||
"""
|
||||
try:
|
||||
auth_type = AzureCliAuthentication()
|
||||
auth_type.get_authentication_header()
|
||||
except AuthenticationException:
|
||||
auth_type = InteractiveLoginAuthentication()
|
||||
return auth_type
|
||||
|
||||
|
||||
def get_or_create_workspace(
|
||||
|
@ -13,92 +32,114 @@ def get_or_create_workspace(
|
|||
workspace_name=None,
|
||||
workspace_region=None,
|
||||
):
|
||||
"""Get or create AzureML Workspace this will save the config to the path specified for later use
|
||||
"""
|
||||
Method to get or create workspace.
|
||||
|
||||
Args:
|
||||
config_path (str): optional directory to look for / store config.json file (defaults to current directory)
|
||||
subscription_id (str): subscription id
|
||||
resource_group (str): resource group
|
||||
workspace_name (str): workspace name
|
||||
workspace_region (str): region
|
||||
config_path: optional directory to look for / store config.json file (defaults to current directory)
|
||||
subscription_id: Azure subscription id
|
||||
resource_group: Azure resource group to create workspace and related resources
|
||||
workspace_name: name of azure ml workspace
|
||||
workspace_region: region for workspace
|
||||
|
||||
Returns:
|
||||
Workspace
|
||||
obj: AzureML workspace if one exists already with the name otherwise creates a new one.
|
||||
"""
|
||||
|
||||
# use environment variables if needed
|
||||
if subscription_id is None:
|
||||
subscription_id = os.getenv("SUBSCRIPTION_ID")
|
||||
if resource_group is None:
|
||||
resource_group = os.getenv("RESOURCE_GROUP")
|
||||
if workspace_name is None:
|
||||
workspace_name = os.getenv("WORKSPACE_NAME")
|
||||
if workspace_region is None:
|
||||
workspace_region = os.getenv("WORKSPACE_REGION")
|
||||
|
||||
# define fallback options in order to try
|
||||
options = [
|
||||
(
|
||||
Workspace,
|
||||
dict(
|
||||
subscription_id=subscription_id,
|
||||
resource_group=resource_group,
|
||||
workspace_name=workspace_name,
|
||||
),
|
||||
),
|
||||
(Workspace.from_config, dict(path=config_path)),
|
||||
(
|
||||
Workspace.create,
|
||||
dict(
|
||||
subscription_id=subscription_id,
|
||||
resource_group=resource_group,
|
||||
name=workspace_name,
|
||||
location=workspace_region,
|
||||
create_resource_group=True,
|
||||
exist_ok=True,
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
for function, kwargs in options:
|
||||
try:
|
||||
ws = function(**kwargs)
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
# get existing azure ml workspace
|
||||
if config_path is not None:
|
||||
ws = Workspace.from_config(config_path, auth=get_auth())
|
||||
else:
|
||||
raise ValueError(
|
||||
"Failed to get or create AzureML Workspace with the configuration information provided"
|
||||
ws = Workspace.get(
|
||||
name=workspace_name,
|
||||
subscription_id=subscription_id,
|
||||
resource_group=resource_group,
|
||||
auth=get_auth(),
|
||||
)
|
||||
|
||||
except WorkspaceException:
|
||||
# this call might take a minute or two.
|
||||
print("Creating new workspace")
|
||||
ws = Workspace.create(
|
||||
name=workspace_name,
|
||||
subscription_id=subscription_id,
|
||||
resource_group=resource_group,
|
||||
create_resource_group=True,
|
||||
location=workspace_region,
|
||||
auth=get_auth(),
|
||||
)
|
||||
|
||||
ws.write_config(path=config_path)
|
||||
return ws
|
||||
|
||||
def log_metrics_scalar(value, run, name="", description=None):
|
||||
"""Log scalar metric to the AzureML run
|
||||
def get_or_create_amlcompute(
|
||||
workspace,
|
||||
compute_name,
|
||||
vm_size="",
|
||||
min_nodes=0,
|
||||
max_nodes=None,
|
||||
idle_seconds_before_scaledown=None,
|
||||
verbose=False,
|
||||
):
|
||||
"""Get or create AmlCompute as the compute target. If a cluster of the same name is found, attach it and rescale
|
||||
accordingly. Otherwise, create a new cluster.
|
||||
|
||||
Args:
|
||||
value : numerical or string value to log
|
||||
run : AzureML Run object
|
||||
name : name of metric
|
||||
description : description of metric
|
||||
workspace (Workspace): workspace
|
||||
compute_name (str): name
|
||||
vm_size (str, optional): vm size
|
||||
min_nodes (int, optional): minimum number of nodes in cluster
|
||||
max_nodes (None, optional): maximum number of nodes in cluster
|
||||
idle_seconds_before_scaledown (None, optional): how long to wait before the cluster autoscales down
|
||||
verbose (bool, optional): if true, print logs
|
||||
Returns:
|
||||
Compute target
|
||||
"""
|
||||
run.log(name, value, description)
|
||||
try:
|
||||
if verbose:
|
||||
print("Found compute target: {}".format(compute_name))
|
||||
|
||||
def log_metrics_table(df, run, name="", description=None, as_scalar=False):
|
||||
"""Log data from pd.DataFrame to the AzureML run
|
||||
compute_target = ComputeTarget(workspace=workspace, name=compute_name)
|
||||
if len(compute_target.list_nodes()) < max_nodes:
|
||||
if verbose:
|
||||
print("Rescaling to {} nodes".format(max_nodes))
|
||||
compute_target.update(max_nodes=max_nodes)
|
||||
compute_target.wait_for_completion(show_output=verbose)
|
||||
|
||||
except ComputeTargetException:
|
||||
if verbose:
|
||||
print("Creating new compute target: {}".format(compute_name))
|
||||
|
||||
compute_config = AmlCompute.provisioning_configuration(
|
||||
vm_size=vm_size,
|
||||
min_nodes=min_nodes,
|
||||
max_nodes=max_nodes,
|
||||
idle_seconds_before_scaledown=idle_seconds_before_scaledown,
|
||||
)
|
||||
compute_target = ComputeTarget.create(ws, compute_name, compute_config)
|
||||
compute_target.wait_for_completion(show_output=verbose)
|
||||
|
||||
return compute_target
|
||||
|
||||
def get_output_files(run, output_path, file_names=None):
|
||||
"""
|
||||
Method to get the output files from an AzureML output directory.
|
||||
|
||||
Args:
|
||||
df : pd.DataFrame containing metrics to log
|
||||
run : AzureML Run object
|
||||
name : name of metric
|
||||
description : description of metric
|
||||
as_scalar : when True, logs each cell of the table as a scalar metric; defaults to False
|
||||
"""
|
||||
if as_scalar:
|
||||
for rn in df.index:
|
||||
for cn in df.columns:
|
||||
log_metrics_scalar(df.loc[rn, cn], run, name="{0}::{1}".format(rn, cn), description=description)
|
||||
file_names(list): Names of the files to download.
|
||||
run(azureml.core.run.Run): Run object of the run.
|
||||
output_path(str): Path to download the output files.
|
||||
|
||||
else:
|
||||
run.log_table(name, df.to_dict(), description)
|
||||
Returns: None
|
||||
|
||||
"""
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
|
||||
if file_names is None:
|
||||
file_names = run.get_file_names()
|
||||
|
||||
for f in file_names:
|
||||
dest = os.path.join(output_path, f.split("/")[-1])
|
||||
print("Downloading file {} to {}...".format(f, dest))
|
||||
run.download_file(f, dest)
|
||||
|
|
|
@ -36,6 +36,12 @@ Alexis Conneau, Guillaume Lample, Ruty Rinott, Holger Schwenk, Ves Stoyanov. 201
|
|||
Original source: https://www.nyu.edu/projects/bowman/xnli/
|
||||
The dataset is preprocessed to remove unused columns.
|
||||
|
||||
### The SQuAD dataset
|
||||
>This dataset is provided under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode).
|
||||
Redistributing the datasets "train-v1.1.json" and "dev-v1.1.json" with attribution:
|
||||
Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. SQuAD: 100,000+ Questions for Machine Comprehension of Text. Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing (EMNLP).
|
||||
Original source: https://github.com/rajpurkar/SQuAD-explorer
|
||||
|
||||
|
||||
### The STSbenchmark dataset
|
||||
>Redistributing the dataset "Stsbenchmark.tar.gz" with attribution:
|
||||
|
|
|
@ -8,7 +8,7 @@ nltk.download("punkt", quiet=True)
|
|||
nltk.download("stopwords", quiet=True)
|
||||
|
||||
|
||||
class Split(Enum):
|
||||
TRAIN = "train"
|
||||
DEV = "dev"
|
||||
TEST = "test"
|
||||
class Split(str, Enum):
|
||||
TRAIN : str = "train"
|
||||
DEV : str = "dev"
|
||||
TEST : str = "test"
|
|
@ -118,15 +118,19 @@ class DaskJSONLoader:
|
|||
else:
|
||||
yield sample_part
|
||||
|
||||
def get_sequential_batches(self, batch_size):
|
||||
def get_sequential_batches(self, batch_size, num_batches=None):
|
||||
"""Creates a sequential generator.
|
||||
Batches returned are pandas dataframes of length=batch_size.
|
||||
Note: Final batch might be of smaller size.
|
||||
|
||||
Args:
|
||||
num_batches: Number of batches to generate.
|
||||
batch_size (int): Batch size.
|
||||
"""
|
||||
for i in range(self.df.npartitions):
|
||||
|
||||
if num_batches is None:
|
||||
num_batches = self.df.npartitions
|
||||
for i in range(num_batches):
|
||||
part = self.df.partitions[i].compute()
|
||||
for j in range(0, part.shape[0], batch_size):
|
||||
yield part.iloc[j: j + batch_size, :]
|
||||
|
|
|
@ -50,9 +50,8 @@ def get_generator(
|
|||
local_cache_path=".",
|
||||
file_split="train",
|
||||
block_size=10e6,
|
||||
random_seed=None,
|
||||
num_batches=1000,
|
||||
batch_size=1000,
|
||||
batch_size=10e6,
|
||||
num_batches=None,
|
||||
):
|
||||
""" Downloads and extracts the dataset files and then returns a random batch generator that
|
||||
yields pandas dataframes.
|
||||
|
@ -81,10 +80,8 @@ def get_generator(
|
|||
|
||||
loader = DaskJSONLoader(
|
||||
os.path.join(local_cache_path, DATA_FILES[file_split]),
|
||||
block_size=block_size,
|
||||
random_seed=random_seed,
|
||||
)
|
||||
block_size=block_size,)
|
||||
|
||||
return loader.get_random_batches(
|
||||
num_batches=num_batches, batch_size=batch_size
|
||||
return loader.get_sequential_batches(
|
||||
batch_size=int(batch_size), num_batches=num_batches
|
||||
)
|
||||
|
|
|
@ -74,14 +74,14 @@ def _maybe_download_and_extract(zip_path, file_split, file_type):
|
|||
os.makedirs(dir_path)
|
||||
|
||||
# format csv filename
|
||||
file_name = "{0}_{1}.{2}".format(SNLI_FILE_PREFIX, file_split, file_type)
|
||||
file_name = "{0}_{1}.{2}".format(SNLI_FILE_PREFIX, file_split.value, file_type)
|
||||
extract_path = os.path.join(dir_path, file_name)
|
||||
|
||||
if not os.path.exists(extract_path):
|
||||
download_snli(zip_path)
|
||||
dpath = download_snli(zip_path)
|
||||
extract_snli(
|
||||
zip_path,
|
||||
source_path=os.path.join(SNLI_DIRNAME, file_name),
|
||||
source_path=SNLI_DIRNAME + "/" + file_name,
|
||||
dest_path=extract_path,
|
||||
)
|
||||
|
||||
|
@ -156,8 +156,10 @@ def clean_cols(df):
|
|||
def clean_rows(df, label_col=LABEL_COL):
|
||||
"""Drop badly formatted rows from the input dataframe
|
||||
|
||||
Args: df (pd.DataFrame): Input dataframe label_col (str): Name of label column. Defaults to
|
||||
the standardized column name that is set after running the clean_col method.
|
||||
Args:
|
||||
df (pd.DataFrame): Input dataframe
|
||||
label_col (str): Name of label column.
|
||||
Defaults to the standardized column name that is set after running the clean_col method.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame
|
||||
|
@ -167,6 +169,11 @@ def clean_rows(df, label_col=LABEL_COL):
|
|||
|
||||
return snli_df
|
||||
|
||||
def clean_df(df, label_col=LABEL_COL):
|
||||
df = clean_cols(df)
|
||||
df = clean_rows(df, label_col)
|
||||
|
||||
return df
|
||||
|
||||
def load_azureml_df(
|
||||
local_cache_path=None, file_split=Split.TRAIN, file_type="txt"
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""XNLI dataset utils
|
||||
https://www.nyu.edu/projects/bowman/xnli/
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
|
||||
|
@ -16,9 +12,11 @@ URL_XNLI = "https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip"
|
|||
URL_XNLI_MT = "https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip"
|
||||
|
||||
|
||||
def load_pandas_df(local_cache_path="./", file_split="dev", language="zh"):
|
||||
def load_pandas_df(local_cache_path=".", file_split="dev", language="zh"):
|
||||
"""Downloads and extracts the dataset files.
|
||||
|
||||
Utilities information can be found `on this link <https://www.nyu.edu/projects/bowman/xnli/>`_.
|
||||
|
||||
Args:
|
||||
local_cache_path (str, optional): Path to store the data.
|
||||
Defaults to "./".
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
# SentEval data and .pyc files
|
||||
|
||||
|
||||
|
||||
# python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# log files
|
||||
*.log
|
||||
*.txt
|
||||
|
||||
# data files
|
||||
data/senteval_data*
|
|
@ -0,0 +1,30 @@
|
|||
BSD License
|
||||
|
||||
For SentEval software
|
||||
|
||||
Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name Facebook nor the names of its contributors may be used to
|
||||
endorse or promote products derived from this software without specific
|
||||
prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,244 @@
|
|||
# SentEval: evaluation toolkit for sentence embeddings
|
||||
|
||||
SentEval is a library for evaluating the quality of sentence embeddings. We assess their generalization power by using them as features on a broad and diverse set of "transfer" tasks. **SentEval currently includes 17 downstream tasks**. We also include a suite of **10 probing tasks** which evaluate what linguistic properties are encoded in sentence embeddings. Our goal is to ease the study and the development of general-purpose fixed-size sentence representations.
|
||||
|
||||
|
||||
**(04/22) SentEval new tasks: Added probing tasks for evaluating what linguistic properties are encoded in sentence embeddings**
|
||||
|
||||
**(10/04) SentEval example scripts for three sentence encoders: [SkipThought-LN](https://github.com/ryankiros/layer-norm#skip-thoughts)/[GenSen](https://github.com/Maluuba/gensen)/[Google-USE](https://tfhub.dev/google/universal-sentence-encoder/1)**
|
||||
|
||||
## Dependencies
|
||||
|
||||
This code is written in python. The dependencies are:
|
||||
|
||||
* Python 2/3 with [NumPy](http://www.numpy.org/)/[SciPy](http://www.scipy.org/)
|
||||
* [Pytorch](http://pytorch.org/)>=0.4
|
||||
* [scikit-learn](http://scikit-learn.org/stable/index.html)>=0.18.0
|
||||
|
||||
## Transfer tasks
|
||||
|
||||
### Downstream tasks
|
||||
SentEval allows you to evaluate your sentence embeddings as features for the following *downstream* tasks:
|
||||
|
||||
| Task | Type | #train | #test | needs_train | set_classifier |
|
||||
|---------- |------------------------------ |-----------:|----------:|:-----------:|:----------:|
|
||||
| [MR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | movie review | 11k | 11k | 1 | 1 |
|
||||
| [CR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | product review | 4k | 4k | 1 | 1 |
|
||||
| [SUBJ](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | subjectivity status | 10k | 10k | 1 | 1 |
|
||||
| [MPQA](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | opinion-polarity | 11k | 11k | 1 | 1 |
|
||||
| [SST](https://nlp.stanford.edu/sentiment/index.html) | binary sentiment analysis | 67k | 1.8k | 1 | 1 |
|
||||
| **[SST](https://nlp.stanford.edu/sentiment/index.html)** | **fine-grained sentiment analysis** | 8.5k | 2.2k | 1 | 1 |
|
||||
| [TREC](http://cogcomp.cs.illinois.edu/Data/QA/QC/) | question-type classification | 6k | 0.5k | 1 | 1 |
|
||||
| [SICK-E](http://clic.cimec.unitn.it/composes/sick.html) | natural language inference | 4.5k | 4.9k | 1 | 1 |
|
||||
| [SNLI](https://nlp.stanford.edu/projects/snli/) | natural language inference | 550k | 9.8k | 1 | 1 |
|
||||
| [MRPC](https://aclweb.org/aclwiki/Paraphrase_Identification_(State_of_the_art)) | paraphrase detection | 4.1k | 1.7k | 1 | 1 |
|
||||
| [STS 2012](https://www.cs.york.ac.uk/semeval-2012/task6/) | semantic textual similarity | N/A | 3.1k | 0 | 0 |
|
||||
| [STS 2013](http://ixa2.si.ehu.es/sts/) | semantic textual similarity | N/A | 1.5k | 0 | 0 |
|
||||
| [STS 2014](http://alt.qcri.org/semeval2014/task10/) | semantic textual similarity | N/A | 3.7k | 0 | 0 |
|
||||
| [STS 2015](http://alt.qcri.org/semeval2015/task2/) | semantic textual similarity | N/A | 8.5k | 0 | 0 |
|
||||
| [STS 2016](http://alt.qcri.org/semeval2016/task1/) | semantic textual similarity | N/A | 9.2k | 0 | 0 |
|
||||
| [STS B](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#Results) | semantic textual similarity | 5.7k | 1.4k | 1 | 0 |
|
||||
| [SICK-R](http://clic.cimec.unitn.it/composes/sick.html) | semantic textual similarity | 4.5k | 4.9k | 1 | 0 |
|
||||
| [COCO](http://mscoco.org/) | image-caption retrieval | 567k | 5*1k | 1 | 0 |
|
||||
|
||||
where **needs_train** means a model with parameters is learned on top of the sentence embeddings, and **set_classifier** means you can define the parameters of the classifier in the case of a classification task (see below).
|
||||
|
||||
Note: COCO comes with ResNet-101 2048d image embeddings. [More details on the tasks.](https://arxiv.org/pdf/1705.02364.pdf)
|
||||
|
||||
### Probing tasks
|
||||
SentEval also includes a series of [*probing* tasks](https://github.com/facebookresearch/SentEval/tree/master/data/probing) to evaluate what linguistic properties are encoded in your sentence embeddings:
|
||||
|
||||
| Task | Type | #train | #test | needs_train | set_classifier |
|
||||
|---------- |------------------------------ |-----------:|----------:|:-----------:|:----------:|
|
||||
| [SentLen](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Length prediction | 100k | 10k | 1 | 1 |
|
||||
| [WC](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Word Content analysis | 100k | 10k | 1 | 1 |
|
||||
| [TreeDepth](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Tree depth prediction | 100k | 10k | 1 | 1 |
|
||||
| [TopConst](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Top Constituents prediction | 100k | 10k | 1 | 1 |
|
||||
| [BShift](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Word order analysis | 100k | 10k | 1 | 1 |
|
||||
| [Tense](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Verb tense prediction | 100k | 10k | 1 | 1 |
|
||||
| [SubjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Subject number prediction | 100k | 10k | 1 | 1 |
|
||||
| [ObjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Object number prediction | 100k | 10k | 1 | 1 |
|
||||
| [SOMO](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Semantic odd man out | 100k | 10k | 1 | 1 |
|
||||
| [CoordInv](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Coordination Inversion | 100k | 10k | 1 | 1 |
|
||||
|
||||
## Download datasets
|
||||
To get all the transfer tasks datasets, run (in data/downstream/):
|
||||
```bash
|
||||
./get_transfer_data.bash
|
||||
```
|
||||
This will automatically download and preprocess the downstream datasets, and store them in data/downstream (warning: for MacOS users, you may have to use p7zip instead of unzip). The probing tasks are already in data/probing by default.
|
||||
|
||||
## How to use SentEval: examples
|
||||
|
||||
### examples/bow.py
|
||||
|
||||
In examples/bow.py, we evaluate the quality of the average of word embeddings.
|
||||
|
||||
To download state-of-the-art fastText embeddings:
|
||||
|
||||
```bash
|
||||
curl -Lo glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
|
||||
curl -Lo crawl-300d-2M.vec.zip https://s3-us-west-1.amazonaws.com/fasttext-vectors/crawl-300d-2M.vec.zip
|
||||
```
|
||||
|
||||
To reproduce the results for bag-of-vectors, run (in examples/):
|
||||
```bash
|
||||
python bow.py
|
||||
```
|
||||
|
||||
As required by SentEval, this script implements two functions: **prepare** (optional) and **batcher** (required) that turn text sentences into sentence embeddings. Then SentEval takes care of the evaluation on the transfer tasks using the embeddings as features.
|
||||
|
||||
### examples/infersent.py
|
||||
|
||||
To get the **[InferSent](https://www.github.com/facebookresearch/InferSent)** model and reproduce our results, download our best models and run infersent.py (in examples/):
|
||||
```bash
|
||||
curl -Lo examples/infersent1.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent1.pkl
|
||||
curl -Lo examples/infersent2.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent2.pkl
|
||||
```
|
||||
|
||||
### examples/skipthought.py - examples/gensen.py - examples/googleuse.py
|
||||
|
||||
We also provide example scripts for three other encoders:
|
||||
|
||||
* [SkipThought with Layer-Normalization](https://github.com/ryankiros/layer-norm#skip-thoughts) in Theano
|
||||
* [GenSen encoder](https://github.com/Maluuba/gensen) in Pytorch
|
||||
* [Google encoder](https://tfhub.dev/google/universal-sentence-encoder/1) in TensorFlow
|
||||
|
||||
Note that for SkipThought and GenSen, following the steps of the associated githubs is necessary.
|
||||
The Google encoder script should work as-is.
|
||||
|
||||
## How to use SentEval
|
||||
|
||||
To evaluate your sentence embeddings, SentEval requires that you implement two functions:
|
||||
|
||||
1. **prepare** (sees the whole dataset of each task and can thus construct the word vocabulary, the dictionary of word vectors etc)
|
||||
2. **batcher** (transforms a batch of text sentences into sentence embeddings)
|
||||
|
||||
|
||||
### 1.) prepare(params, samples) (optional)
|
||||
|
||||
*batcher* only sees one batch at a time while the *samples* argument of *prepare* contains all the sentences of a task.
|
||||
|
||||
```
|
||||
prepare(params, samples)
|
||||
```
|
||||
* *params*: senteval parameters.
|
||||
* *samples*: list of all sentences from the tranfer task.
|
||||
* *output*: No output. Arguments stored in "params" can further be used by *batcher*.
|
||||
|
||||
*Example*: in bow.py, prepare is is used to build the vocabulary of words and construct the "params.word_vect* dictionary of word vectors.
|
||||
|
||||
|
||||
### 2.) batcher(params, batch)
|
||||
```
|
||||
batcher(params, batch)
|
||||
```
|
||||
* *params*: senteval parameters.
|
||||
* *batch*: numpy array of text sentences (of size params.batch_size)
|
||||
* *output*: numpy array of sentence embeddings (of size params.batch_size)
|
||||
|
||||
*Example*: in bow.py, batcher is used to compute the mean of the word vectors for each sentence in the batch using params.word_vec. Use your own encoder in that function to encode sentences.
|
||||
|
||||
### 3.) evaluation on transfer tasks
|
||||
|
||||
After having implemented the batch and prepare function for your own sentence encoder,
|
||||
|
||||
1) to perform the actual evaluation, first import senteval and set its parameters:
|
||||
```python
|
||||
import senteval
|
||||
params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
|
||||
```
|
||||
|
||||
2) (optional) set the parameters of the classifier (when applicable):
|
||||
```python
|
||||
params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
|
||||
'tenacity': 5, 'epoch_size': 4}
|
||||
```
|
||||
You can choose **nhid=0** (Logistic Regression) or **nhid>0** (MLP) and define the parameters for training.
|
||||
|
||||
3) Create an instance of the class SE:
|
||||
```python
|
||||
se = senteval.engine.SE(params, batcher, prepare)
|
||||
```
|
||||
|
||||
4) define the set of transfer tasks and run the evaluation:
|
||||
```python
|
||||
transfer_tasks = ['MR', 'SICKEntailment', 'STS14', 'STSBenchmark']
|
||||
results = se.eval(transfer_tasks)
|
||||
```
|
||||
The current list of available tasks is:
|
||||
```python
|
||||
['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SNLI',
|
||||
'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'ImageCaptionRetrieval',
|
||||
'STS12', 'STS13', 'STS14', 'STS15', 'STS16',
|
||||
'Length', 'WordContent', 'Depth', 'TopConstituents','BigramShift', 'Tense',
|
||||
'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion']
|
||||
```
|
||||
|
||||
## SentEval parameters
|
||||
Global parameters of SentEval:
|
||||
```bash
|
||||
# senteval parameters
|
||||
task_path # path to SentEval datasets (required)
|
||||
seed # seed
|
||||
usepytorch # use cuda-pytorch (else scikit-learn) where possible
|
||||
kfold # k-fold validation for MR/CR/SUB/MPQA.
|
||||
```
|
||||
|
||||
Parameters of the classifier:
|
||||
```bash
|
||||
nhid: # number of hidden units (0: Logistic Regression, >0: MLP); Default nonlinearity: Tanh
|
||||
optim: # optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..)
|
||||
tenacity: # how many times dev acc does not increase before training stops
|
||||
epoch_size: # each epoch corresponds to epoch_size pass on the train set
|
||||
max_epoch: # max number of epoches
|
||||
dropout: # dropout for MLP
|
||||
```
|
||||
|
||||
Note that to get a proxy of the results while **dramatically reducing computation time**,
|
||||
we suggest the **prototyping config**:
|
||||
```python
|
||||
params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
|
||||
params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
|
||||
'tenacity': 3, 'epoch_size': 2}
|
||||
```
|
||||
which will results in a 5 times speedup for classification tasks.
|
||||
|
||||
To produce results that are **comparable to the literature**, use the **default config**:
|
||||
```python
|
||||
params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
|
||||
params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
|
||||
'tenacity': 5, 'epoch_size': 4}
|
||||
```
|
||||
which takes longer but will produce better and comparable results.
|
||||
|
||||
For probing tasks, we used an MLP with a Sigmoid nonlinearity and and tuned the nhid (in [50, 100, 200]) and dropout (in [0.0, 0.1, 0.2]) on the dev set.
|
||||
|
||||
## References
|
||||
|
||||
Please considering citing [[1]](https://arxiv.org/abs/1803.05449) if using this code for evaluating sentence embedding methods.
|
||||
|
||||
### SentEval: An Evaluation Toolkit for Universal Sentence Representations
|
||||
|
||||
[1] A. Conneau, D. Kiela, [*SentEval: An Evaluation Toolkit for Universal Sentence Representations*](https://arxiv.org/abs/1803.05449)
|
||||
|
||||
```
|
||||
@article{conneau2018senteval,
|
||||
title={SentEval: An Evaluation Toolkit for Universal Sentence Representations},
|
||||
author={Conneau, Alexis and Kiela, Douwe},
|
||||
journal={arXiv preprint arXiv:1803.05449},
|
||||
year={2018}
|
||||
}
|
||||
```
|
||||
|
||||
Contact: [aconneau@fb.com](mailto:aconneau@fb.com), [dkiela@fb.com](mailto:dkiela@fb.com)
|
||||
|
||||
### Related work
|
||||
* [J. R Kiros, Y. Zhu, R. Salakhutdinov, R. S. Zemel, A. Torralba, R. Urtasun, S. Fidler - SkipThought Vectors, NIPS 2015](https://arxiv.org/abs/1506.06726)
|
||||
* [S. Arora, Y. Liang, T. Ma - A Simple but Tough-to-Beat Baseline for Sentence Embeddings, ICLR 2017](https://openreview.net/pdf?id=SyK00v5xx)
|
||||
* [Y. Adi, E. Kermany, Y. Belinkov, O. Lavi, Y. Goldberg - Fine-grained analysis of sentence embeddings using auxiliary prediction tasks, ICLR 2017](https://arxiv.org/abs/1608.04207)
|
||||
* [A. Conneau, D. Kiela, L. Barrault, H. Schwenk, A. Bordes - Supervised Learning of Universal Sentence Representations from Natural Language Inference Data, EMNLP 2017](https://arxiv.org/abs/1705.02364)
|
||||
* [S. Subramanian, A. Trischler, Y. Bengio, C. J Pal - Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning, ICLR 2018](https://arxiv.org/abs/1804.00079)
|
||||
* [A. Nie, E. D. Bennett, N. D. Goodman - DisSent: Sentence Representation Learning from Explicit Discourse Relations, 2018](https://arxiv.org/abs/1710.04334)
|
||||
* [D. Cer, Y. Yang, S. Kong, N. Hua, N. Limtiaco, R. St. John, N. Constant, M. Guajardo-Cespedes, S. Yuan, C. Tar, Y. Sung, B. Strope, R. Kurzweil - Universal Sentence Encoder, 2018](https://arxiv.org/abs/1803.11175)
|
||||
* [A. Conneau, G. Kruszewski, G. Lample, L. Barrault, M. Baroni - What you can cram into a single vector: Probing sentence embeddings for linguistic properties, ACL 2018](https://arxiv.org/abs/1805.01070)
|
|
@ -0,0 +1,10 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
from senteval.engine import SE
|
|
@ -0,0 +1,92 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
'''
|
||||
Binary classifier and corresponding datasets : MR, CR, SUBJ, MPQA
|
||||
'''
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import io
|
||||
import os
|
||||
import numpy as np
|
||||
import logging
|
||||
|
||||
from senteval.tools.validation import InnerKFoldClassifier
|
||||
|
||||
|
||||
class BinaryClassifierEval(object):
|
||||
def __init__(self, pos, neg, seed=1111):
|
||||
self.seed = seed
|
||||
self.samples, self.labels = pos + neg, [1] * len(pos) + [0] * len(neg)
|
||||
self.n_samples = len(self.samples)
|
||||
|
||||
def do_prepare(self, params, prepare):
|
||||
# prepare is given the whole text
|
||||
return prepare(params, self.samples)
|
||||
# prepare puts everything it outputs in "params" : params.word2id etc
|
||||
# Those output will be further used by "batcher".
|
||||
|
||||
def loadFile(self, fpath):
|
||||
with io.open(fpath, 'r', encoding='latin-1') as f:
|
||||
return [line.split() for line in f.read().splitlines()]
|
||||
|
||||
def run(self, params, batcher):
|
||||
enc_input = []
|
||||
# Sort to reduce padding
|
||||
sorted_corpus = sorted(zip(self.samples, self.labels),
|
||||
key=lambda z: (len(z[0]), z[1]))
|
||||
sorted_samples = [x for (x, y) in sorted_corpus]
|
||||
sorted_labels = [y for (x, y) in sorted_corpus]
|
||||
logging.info('Generating sentence embeddings')
|
||||
for ii in range(0, self.n_samples, params.batch_size):
|
||||
batch = sorted_samples[ii:ii + params.batch_size]
|
||||
embeddings = batcher(params, batch)
|
||||
enc_input.append(embeddings)
|
||||
enc_input = np.vstack(enc_input)
|
||||
logging.info('Generated sentence embeddings')
|
||||
|
||||
config = {'nclasses': 2, 'seed': self.seed,
|
||||
'usepytorch': params.usepytorch,
|
||||
'classifier': params.classifier,
|
||||
'nhid': params.nhid, 'kfold': params.kfold}
|
||||
clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config)
|
||||
devacc, testacc = clf.run()
|
||||
logging.debug('Dev acc : {0} Test acc : {1}\n'.format(devacc, testacc))
|
||||
return {'devacc': devacc, 'acc': testacc, 'ndev': self.n_samples,
|
||||
'ntest': self.n_samples}
|
||||
|
||||
|
||||
class CREval(BinaryClassifierEval):
|
||||
def __init__(self, task_path, seed=1111):
|
||||
logging.debug('***** Transfer task : CR *****\n\n')
|
||||
pos = self.loadFile(os.path.join(task_path, 'custrev.pos'))
|
||||
neg = self.loadFile(os.path.join(task_path, 'custrev.neg'))
|
||||
super(self.__class__, self).__init__(pos, neg, seed)
|
||||
|
||||
|
||||
class MREval(BinaryClassifierEval):
|
||||
def __init__(self, task_path, seed=1111):
|
||||
logging.debug('***** Transfer task : MR *****\n\n')
|
||||
pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos'))
|
||||
neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg'))
|
||||
super(self.__class__, self).__init__(pos, neg, seed)
|
||||
|
||||
|
||||
class SUBJEval(BinaryClassifierEval):
|
||||
def __init__(self, task_path, seed=1111):
|
||||
logging.debug('***** Transfer task : SUBJ *****\n\n')
|
||||
obj = self.loadFile(os.path.join(task_path, 'subj.objective'))
|
||||
subj = self.loadFile(os.path.join(task_path, 'subj.subjective'))
|
||||
super(self.__class__, self).__init__(obj, subj, seed)
|
||||
|
||||
|
||||
class MPQAEval(BinaryClassifierEval):
|
||||
def __init__(self, task_path, seed=1111):
|
||||
logging.debug('***** Transfer task : MPQA *****\n\n')
|
||||
pos = self.loadFile(os.path.join(task_path, 'mpqa.pos'))
|
||||
neg = self.loadFile(os.path.join(task_path, 'mpqa.neg'))
|
||||
super(self.__class__, self).__init__(pos, neg, seed)
|
|
@ -0,0 +1,123 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
'''
|
||||
|
||||
Generic sentence evaluation scripts wrapper
|
||||
|
||||
'''
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from senteval import utils
|
||||
from senteval.binary import CREval, MREval, MPQAEval, SUBJEval
|
||||
from senteval.snli import SNLIEval
|
||||
from senteval.trec import TRECEval
|
||||
from senteval.sick import SICKRelatednessEval, SICKEntailmentEval
|
||||
from senteval.mrpc import MRPCEval
|
||||
from senteval.sts import STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBenchmarkEval
|
||||
from senteval.sst import SSTEval
|
||||
from senteval.rank import ImageCaptionRetrievalEval
|
||||
from senteval.probing import *
|
||||
|
||||
class SE(object):
    """SentEval engine: dispatches a task name to its evaluator and runs it.

    Wraps user-supplied `batcher` (sentences -> embeddings) and optional
    `prepare` (vocabulary building, etc.) callbacks and exposes `eval(name)`
    over all supported downstream and probing tasks.
    """

    def __init__(self, params, batcher, prepare=None):
        """Normalize params, fill defaults, and store the user callbacks.

        Args:
            params: dict-like configuration; must at least provide `task_path`
                before `eval` is called.
            batcher: callable (params, batch_of_token_lists) -> embeddings.
            prepare: optional callable (params, samples) run before encoding.
        """
        # parameters — wrap in dotdict so keys are attribute-accessible
        params = utils.dotdict(params)
        params.usepytorch = True if 'usepytorch' not in params else params.usepytorch
        params.seed = 1111 if 'seed' not in params else params.seed

        params.batch_size = 128 if 'batch_size' not in params else params.batch_size
        params.nhid = 0 if 'nhid' not in params else params.nhid
        params.kfold = 5 if 'kfold' not in params else params.kfold

        if 'classifier' not in params or not params['classifier']:
            params.classifier = {'nhid': 0}

        assert 'nhid' in params.classifier, 'Set number of hidden units in classifier config!!'

        self.params = params

        # batcher and prepare; default prepare is a no-op
        self.batcher = batcher
        self.prepare = prepare if prepare else lambda x, y: None

        self.list_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
                           'SICKRelatedness', 'SICKEntailment', 'STSBenchmark',
                           'SNLI', 'ImageCaptionRetrieval', 'STS12', 'STS13',
                           'STS14', 'STS15', 'STS16',
                           'Length', 'WordContent', 'Depth', 'TopConstituents',
                           'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
                           'OddManOut', 'CoordinationInversion']

    def eval(self, name):
        """Evaluate on task `name`; accepts a single task name or a list.

        Returns the task's result dict, or a dict of {task: result} when a
        list is given.  Raises AssertionError on unknown task names.
        """
        if isinstance(name, list):
            self.results = {x: self.eval(x) for x in name}
            return self.results

        tpath = self.params.task_path
        assert name in self.list_tasks, str(name) + ' not in ' + str(self.list_tasks)

        # Original SentEval tasks
        if name == 'CR':
            self.evaluation = CREval(tpath + '/downstream/CR', seed=self.params.seed)
        elif name == 'MR':
            self.evaluation = MREval(tpath + '/downstream/MR', seed=self.params.seed)
        elif name == 'MPQA':
            self.evaluation = MPQAEval(tpath + '/downstream/MPQA', seed=self.params.seed)
        elif name == 'SUBJ':
            self.evaluation = SUBJEval(tpath + '/downstream/SUBJ', seed=self.params.seed)
        elif name == 'SST2':
            self.evaluation = SSTEval(tpath + '/downstream/SST/binary', nclasses=2, seed=self.params.seed)
        elif name == 'SST5':
            self.evaluation = SSTEval(tpath + '/downstream/SST/fine', nclasses=5, seed=self.params.seed)
        elif name == 'TREC':
            self.evaluation = TRECEval(tpath + '/downstream/TREC', seed=self.params.seed)
        elif name == 'MRPC':
            self.evaluation = MRPCEval(tpath + '/downstream/MRPC', seed=self.params.seed)
        elif name == 'SICKRelatedness':
            self.evaluation = SICKRelatednessEval(tpath + '/downstream/SICK', seed=self.params.seed)
        elif name == 'STSBenchmark':
            self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed)
        elif name == 'SICKEntailment':
            self.evaluation = SICKEntailmentEval(tpath + '/downstream/SICK', seed=self.params.seed)
        elif name == 'SNLI':
            self.evaluation = SNLIEval(tpath + '/downstream/SNLI', seed=self.params.seed)
        elif name in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']:
            # Fix: look the evaluator class up explicitly instead of building
            # it with eval() on a constructed string — same behavior, no eval.
            sts_evals = {'STS12': STS12Eval, 'STS13': STS13Eval,
                         'STS14': STS14Eval, 'STS15': STS15Eval,
                         'STS16': STS16Eval}
            fpath = name + '-en-test'
            self.evaluation = sts_evals[name](tpath + '/downstream/STS/' + fpath, seed=self.params.seed)
        elif name == 'ImageCaptionRetrieval':
            self.evaluation = ImageCaptionRetrievalEval(tpath + '/downstream/COCO', seed=self.params.seed)

        # Probing Tasks
        elif name == 'Length':
            self.evaluation = LengthEval(tpath + '/probing', seed=self.params.seed)
        elif name == 'WordContent':
            self.evaluation = WordContentEval(tpath + '/probing', seed=self.params.seed)
        elif name == 'Depth':
            self.evaluation = DepthEval(tpath + '/probing', seed=self.params.seed)
        elif name == 'TopConstituents':
            self.evaluation = TopConstituentsEval(tpath + '/probing', seed=self.params.seed)
        elif name == 'BigramShift':
            self.evaluation = BigramShiftEval(tpath + '/probing', seed=self.params.seed)
        elif name == 'Tense':
            self.evaluation = TenseEval(tpath + '/probing', seed=self.params.seed)
        elif name == 'SubjNumber':
            self.evaluation = SubjNumberEval(tpath + '/probing', seed=self.params.seed)
        elif name == 'ObjNumber':
            self.evaluation = ObjNumberEval(tpath + '/probing', seed=self.params.seed)
        elif name == 'OddManOut':
            self.evaluation = OddManOutEval(tpath + '/probing', seed=self.params.seed)
        elif name == 'CoordinationInversion':
            self.evaluation = CoordinationInversionEval(tpath + '/probing', seed=self.params.seed)

        self.params.current_task = name
        self.evaluation.do_prepare(self.params, self.prepare)

        self.results = self.evaluation.run(self.params, self.batcher)

        return self.results
|
|
@ -0,0 +1,104 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
'''
|
||||
MRPC : Microsoft Research Paraphrase (detection) Corpus
|
||||
'''
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import os
|
||||
import logging
|
||||
import numpy as np
|
||||
import io
|
||||
|
||||
from senteval.tools.validation import KFoldClassifier
|
||||
|
||||
from sklearn.metrics import f1_score
|
||||
|
||||
|
||||
class MRPCEval(object):
    """SentEval evaluator for MRPC (Microsoft Research Paraphrase Corpus).

    Binary paraphrase detection over sentence pairs; scored with k-fold
    cross-validation on train and accuracy/F1 on the held-out test split.
    """

    def __init__(self, task_path, seed=1111):
        logging.info('***** Transfer task : MRPC *****\n\n')
        self.seed = seed
        # Each split is a dict with keys 'X_A', 'X_B' (token lists) and 'y'.
        train = self.loadFile(os.path.join(task_path,
                              'msr_paraphrase_train.txt'))
        test = self.loadFile(os.path.join(task_path,
                             'msr_paraphrase_test.txt'))
        self.mrpc_data = {'train': train, 'test': test}

    def do_prepare(self, params, prepare):
        # TODO : Should we separate samples in "train, test"?
        # Hand every sentence (both sides of both splits) to the user-supplied
        # `prepare` hook, e.g. for vocabulary building.
        samples = self.mrpc_data['train']['X_A'] + \
                  self.mrpc_data['train']['X_B'] + \
                  self.mrpc_data['test']['X_A'] + self.mrpc_data['test']['X_B']
        return prepare(params, samples)

    def loadFile(self, fpath):
        """Parse one MRPC TSV file into {'X_A': [...], 'X_B': [...], 'y': [...]}."""
        mrpc_data = {'X_A': [], 'X_B': [], 'y': []}
        with io.open(fpath, 'r', encoding='utf-8') as f:
            for line in f:
                text = line.strip().split('\t')
                # TSV columns: 0 = label, 3 = sentence A, 4 = sentence B.
                mrpc_data['X_A'].append(text[3].split())
                mrpc_data['X_B'].append(text[4].split())
                mrpc_data['y'].append(text[0])

        # The first row is the TSV header — drop it from every column.
        mrpc_data['X_A'] = mrpc_data['X_A'][1:]
        mrpc_data['X_B'] = mrpc_data['X_B'][1:]
        mrpc_data['y'] = [int(s) for s in mrpc_data['y'][1:]]
        return mrpc_data

    def run(self, params, batcher):
        """Encode both sentence sides, build pair features, train/evaluate."""
        mrpc_embed = {'train': {}, 'test': {}}

        for key in self.mrpc_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            text_data = {}
            sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'],
                                       self.mrpc_data[key]['X_B'],
                                       self.mrpc_data[key]['y']),
                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))

            text_data['A'] = [x for (x, y, z) in sorted_corpus]
            text_data['B'] = [y for (x, y, z) in sorted_corpus]
            text_data['y'] = [z for (x, y, z) in sorted_corpus]

            # Encode each side batch-by-batch, then stack into one matrix.
            for txt_type in ['A', 'B']:
                mrpc_embed[key][txt_type] = []
                for ii in range(0, len(text_data['y']), params.batch_size):
                    batch = text_data[txt_type][ii:ii + params.batch_size]
                    embeddings = batcher(params, batch)
                    mrpc_embed[key][txt_type].append(embeddings)
                mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type])
            mrpc_embed[key]['y'] = np.array(text_data['y'])
            logging.info('Computed {0} embeddings'.format(key))

        # Train — pair features: |a - b| concatenated with a * b.
        trainA = mrpc_embed['train']['A']
        trainB = mrpc_embed['train']['B']
        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
        trainY = mrpc_embed['train']['y']

        # Test
        testA = mrpc_embed['test']['A']
        testB = mrpc_embed['test']['B']
        testF = np.c_[np.abs(testA - testB), testA * testB]
        testY = mrpc_embed['test']['y']

        config = {'nclasses': 2, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'classifier': params.classifier,
                  'nhid': params.nhid, 'kfold': params.kfold}
        clf = KFoldClassifier(train={'X': trainF, 'y': trainY},
                              test={'X': testF, 'y': testY}, config=config)

        devacc, testacc, yhat = clf.run()
        testf1 = round(100*f1_score(testY, yhat), 2)
        logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'
                      .format(devacc, testacc, testf1))
        return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
                'ndev': len(trainA), 'ntest': len(testA)}
|
|
@ -0,0 +1,171 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
'''
|
||||
probing tasks
|
||||
'''
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import os
|
||||
import io
|
||||
import copy
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
from senteval.tools.validation import SplitClassifier
|
||||
|
||||
|
||||
class PROBINGEval(object):
    """Generic evaluator for the SentEval probing tasks.

    Loads one TSV file with rows `split<TAB>label<TAB>sentence`, encodes
    sentences with the user-supplied batcher, and trains a SplitClassifier
    on fixed train/dev/test splits.
    """

    def __init__(self, task, task_path, seed=1111):
        """Store config and load the task file.

        Args:
            task: task name (used for logging and the WordContent special case).
            task_path: path to the task's TSV data file.
            seed: RNG seed forwarded to the classifier.
        """
        self.seed = seed
        self.task = task
        logging.debug('***** (Probing) Transfer task : %s classification *****', self.task.upper())
        self.task_data = {'train': {'X': [], 'y': []},
                          'dev': {'X': [], 'y': []},
                          'test': {'X': [], 'y': []}}
        self.loadFile(task_path)
        logging.info('Loaded %s train - %s dev - %s test for %s' %
                     (len(self.task_data['train']['y']), len(self.task_data['dev']['y']),
                      len(self.task_data['test']['y']), self.task))

    def do_prepare(self, params, prepare):
        # Hand all sentences from all splits to the user `prepare` hook.
        samples = self.task_data['train']['X'] + self.task_data['dev']['X'] + \
                  self.task_data['test']['X']
        return prepare(params, samples)

    def loadFile(self, fpath):
        """Parse the TSV file into self.task_data and build the label map."""
        # First column encodes the split: tr/va/te.
        self.tok2split = {'tr': 'train', 'va': 'dev', 'te': 'test'}
        with io.open(fpath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.rstrip().split('\t')
                self.task_data[self.tok2split[line[0]]]['X'].append(line[-1].split())
                self.task_data[self.tok2split[line[0]]]['y'].append(line[1])

        # Label ids are assigned from the sorted set of *train* labels.
        labels = sorted(np.unique(self.task_data['train']['y']))
        self.tok2label = dict(zip(labels, range(len(labels))))
        self.nclasses = len(self.tok2label)

        # Map string labels to integer ids in-place for every split.
        for split in self.task_data:
            for i, y in enumerate(self.task_data[split]['y']):
                self.task_data[split]['y'][i] = self.tok2label[y]

    def run(self, params, batcher):
        """Encode all splits and train/evaluate the classifier."""
        task_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size
        logging.info('Computing embeddings for train/dev/test')
        for key in self.task_data:
            # Sort to reduce padding
            sorted_data = sorted(zip(self.task_data[key]['X'],
                                     self.task_data[key]['y']),
                                 key=lambda z: (len(z[0]), z[1]))
            self.task_data[key]['X'], self.task_data[key]['y'] = map(list, zip(*sorted_data))

            task_embed[key]['X'] = []
            for ii in range(0, len(self.task_data[key]['y']), bsize):
                batch = self.task_data[key]['X'][ii:ii + bsize]
                embeddings = batcher(params, batch)
                task_embed[key]['X'].append(embeddings)
            task_embed[key]['X'] = np.vstack(task_embed[key]['X'])
            task_embed[key]['y'] = np.array(self.task_data[key]['y'])
        logging.info('Computed embeddings')

        config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                             'usepytorch': params.usepytorch,
                             'classifier': params.classifier}

        if self.task == "WordContent" and params.classifier['nhid'] > 0:
            # WordContent is evaluated with a linear classifier regardless of
            # the configured MLP size; deepcopy so params.classifier is not
            # mutated for other tasks.
            config_classifier = copy.deepcopy(config_classifier)
            config_classifier['classifier']['nhid'] = 0
            # (removed a leftover debug `print` of params.classifier['nhid'])

        clf = SplitClassifier(X={'train': task_embed['train']['X'],
                                 'valid': task_embed['dev']['X'],
                                 'test': task_embed['test']['X']},
                              y={'train': task_embed['train']['y'],
                                 'valid': task_embed['dev']['y'],
                                 'test': task_embed['test']['y']},
                              config=config_classifier)

        devacc, testacc = clf.run()
        logging.debug('\nDev acc : %.1f Test acc : %.1f for %s classification\n' % (devacc, testacc, self.task.upper()))

        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(task_embed['dev']['X']),
                'ntest': len(task_embed['test']['X'])}
|
||||
|
||||
"""
|
||||
Surface Information
|
||||
"""
|
||||
class LengthEval(PROBINGEval):
    """Surface probing task: predict the sentence-length bin."""

    def __init__(self, task_path, seed=1111):
        # Labels are length bins.
        PROBINGEval.__init__(self, 'Length',
                             os.path.join(task_path, 'sentence_length.txt'), seed)
|
||||
|
||||
class WordContentEval(PROBINGEval):
    """Surface probing task: identify which of 200 target words a sentence contains."""

    def __init__(self, task_path, seed=1111):
        # Labels are the 200 target words.
        PROBINGEval.__init__(self, 'WordContent',
                             os.path.join(task_path, 'word_content.txt'), seed)
|
||||
|
||||
"""
|
||||
Latent Structural Information
|
||||
"""
|
||||
class DepthEval(PROBINGEval):
    """Structural probing task: predict the parse-tree depth bin."""

    def __init__(self, task_path, seed=1111):
        # Labels are depth bins.
        PROBINGEval.__init__(self, 'Depth',
                             os.path.join(task_path, 'tree_depth.txt'), seed)
|
||||
|
||||
class TopConstituentsEval(PROBINGEval):
    """Structural probing task: predict the top-constituent sequence (20 classes)."""

    def __init__(self, task_path, seed=1111):
        # Labels look like 'PP_NP_VP_.' etc.
        PROBINGEval.__init__(self, 'TopConstituents',
                             os.path.join(task_path, 'top_constituents.txt'), seed)
|
||||
|
||||
class BigramShiftEval(PROBINGEval):
    """Structural probing task: detect whether two adjacent words were swapped."""

    def __init__(self, task_path, seed=1111):
        # Binary labels: 0 or 1.
        PROBINGEval.__init__(self, 'BigramShift',
                             os.path.join(task_path, 'bigram_shift.txt'), seed)
|
||||
|
||||
# TODO: Voice?
|
||||
|
||||
"""
|
||||
Latent Semantic Information
|
||||
"""
|
||||
|
||||
class TenseEval(PROBINGEval):
    """Semantic probing task: classify the main verb's tense."""

    def __init__(self, task_path, seed=1111):
        # Labels: 'PRES' / 'PAST'.
        PROBINGEval.__init__(self, 'Tense',
                             os.path.join(task_path, 'past_present.txt'), seed)
|
||||
|
||||
class SubjNumberEval(PROBINGEval):
    """Semantic probing task: number (singular/plural) of the subject."""

    def __init__(self, task_path, seed=1111):
        # Labels: 'NN' / 'NNS'.
        PROBINGEval.__init__(self, 'SubjNumber',
                             os.path.join(task_path, 'subj_number.txt'), seed)
|
||||
|
||||
class ObjNumberEval(PROBINGEval):
    """Semantic probing task: number (singular/plural) of the object."""

    def __init__(self, task_path, seed=1111):
        # Labels: 'NN' / 'NNS'.
        PROBINGEval.__init__(self, 'ObjNumber',
                             os.path.join(task_path, 'obj_number.txt'), seed)
|
||||
|
||||
class OddManOutEval(PROBINGEval):
    """Semantic probing task: detect a sentence with one replaced word."""

    def __init__(self, task_path, seed=1111):
        # Labels: 'O' (original) / 'C' (changed).
        PROBINGEval.__init__(self, 'OddManOut',
                             os.path.join(task_path, 'odd_man_out.txt'), seed)
|
||||
|
||||
class CoordinationInversionEval(PROBINGEval):
    """Semantic probing task: detect inverted coordinated clauses."""

    def __init__(self, task_path, seed=1111):
        # Labels: 'O' (original) / 'I' (inverted).
        PROBINGEval.__init__(self, 'CoordinationInversion',
                             os.path.join(task_path, 'coordination_inversion.txt'), seed)
|
|
@ -0,0 +1,108 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
'''
|
||||
Image-Caption Retrieval with COCO dataset
|
||||
'''
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
import cPickle as pickle
|
||||
except ImportError:
|
||||
import pickle
|
||||
|
||||
from senteval.tools.ranking import ImageSentenceRankingPytorch
|
||||
|
||||
|
||||
class ImageCaptionRetrievalEval(object):
    """Image-caption retrieval on COCO: rank captions for images and vice versa.

    Expects pre-extracted image features plus 5 captions per image in
    train/valid/test pickle files; trains a ranking model and reports
    recall@{1,5,10} and median rank in both directions.
    """

    def __init__(self, task_path, seed=1111):
        logging.debug('***** Transfer task: Image Caption Retrieval *****\n\n')

        # Get captions and image features
        self.seed = seed
        train, dev, test = self.loadFile(task_path)
        self.coco_data = {'train': train, 'dev': dev, 'test': test}

    def do_prepare(self, params, prepare):
        # Hand every caption from all splits to the user `prepare` hook.
        samples = self.coco_data['train']['sent'] + \
                  self.coco_data['dev']['sent'] + \
                  self.coco_data['test']['sent']
        prepare(params, samples)

    def loadFile(self, fpath):
        """Load train/valid/test pickles into {'sent', 'imgfeat'} dicts."""
        coco = {}

        for split in ['train', 'valid', 'test']:
            list_sent = []
            list_img_feat = []
            # Pickles were written with Python 2; latin1 encoding is needed
            # when unpickling under Python 3.
            if sys.version_info < (3, 0):
                with open(os.path.join(fpath, split + '.pkl')) as f:
                    cocodata = pickle.load(f)
            else:
                with open(os.path.join(fpath, split + '.pkl'), 'rb') as f:
                    cocodata = pickle.load(f, encoding='latin1')

            # Keep exactly the first 5 captions per image so sentences and
            # image features stay aligned 5-to-1.
            for imgkey in range(len(cocodata['features'])):
                assert len(cocodata['image_to_caption_ids'][imgkey]) >= 5, \
                       cocodata['image_to_caption_ids'][imgkey]
                for captkey in cocodata['image_to_caption_ids'][imgkey][0:5]:
                    sent = cocodata['captions'][captkey]['cleaned_caption']
                    sent += ' .'  # add punctuation to end of sentence in COCO
                    # NOTE(review): under Python 3, .encode('utf-8').split()
                    # produces a list of *bytes* tokens — confirm downstream
                    # batchers handle bytes.
                    list_sent.append(sent.encode('utf-8').split())
                    list_img_feat.append(cocodata['features'][imgkey])
            assert len(list_sent) == len(list_img_feat) and \
                len(list_sent) % 5 == 0
            list_img_feat = np.array(list_img_feat).astype('float32')
            coco[split] = {'sent': list_sent, 'imgfeat': list_img_feat}
        return coco['train'], coco['valid'], coco['test']

    def run(self, params, batcher):
        """Encode captions, train the ranking model, report retrieval scores."""
        coco_embed = {'train': {'sentfeat': [], 'imgfeat': []},
                      'dev': {'sentfeat': [], 'imgfeat': []},
                      'test': {'sentfeat': [], 'imgfeat': []}}

        for key in self.coco_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            # NOTE(review): np.sort over an object array of token lists sorts
            # lexicographically, not by length, and sorts sentences
            # independently of their image features — verify this is intended.
            self.coco_data[key]['sent'] = np.array(self.coco_data[key]['sent'])
            self.coco_data[key]['sent'], idx_sort = np.sort(self.coco_data[key]['sent']), np.argsort(self.coco_data[key]['sent'])
            # idx_unsort restores the original caption order after encoding.
            idx_unsort = np.argsort(idx_sort)

            # NOTE(review): coco_embed[key]['X'] is initialized but never
            # used; 'sentfeat' is the list actually appended to below.
            coco_embed[key]['X'] = []
            nsent = len(self.coco_data[key]['sent'])
            for ii in range(0, nsent, params.batch_size):
                batch = self.coco_data[key]['sent'][ii:ii + params.batch_size]
                embeddings = batcher(params, batch)
                coco_embed[key]['sentfeat'].append(embeddings)
            coco_embed[key]['sentfeat'] = np.vstack(coco_embed[key]['sentfeat'])[idx_unsort]
            coco_embed[key]['imgfeat'] = np.array(self.coco_data[key]['imgfeat'])
            logging.info('Computed {0} embeddings'.format(key))

        config = {'seed': self.seed, 'projdim': 1000, 'margin': 0.2}
        clf = ImageSentenceRankingPytorch(train=coco_embed['train'],
                                          valid=coco_embed['dev'],
                                          test=coco_embed['test'],
                                          config=config)

        bestdevscore, r1_i2t, r5_i2t, r10_i2t, medr_i2t, \
            r1_t2i, r5_t2i, r10_t2i, medr_t2i = clf.run()

        logging.debug("\nTest scores | Image to text: \
            {0}, {1}, {2}, {3}".format(r1_i2t, r5_i2t, r10_i2t, medr_i2t))
        logging.debug("Test scores | Text to image: \
            {0}, {1}, {2}, {3}\n".format(r1_t2i, r5_t2i, r10_t2i, medr_t2i))

        return {'devacc': bestdevscore,
                'acc': [(r1_i2t, r5_i2t, r10_i2t, medr_i2t),
                        (r1_t2i, r5_t2i, r10_t2i, medr_t2i)],
                'ndev': len(coco_embed['dev']['sentfeat']),
                'ntest': len(coco_embed['test']['sentfeat'])}
|
|
@ -0,0 +1,217 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
'''
|
||||
SICK Relatedness and Entailment
|
||||
'''
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import os
|
||||
import io
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from scipy.stats import pearsonr, spearmanr
|
||||
|
||||
from senteval.tools.relatedness import RelatednessPytorch
|
||||
from senteval.tools.validation import SplitClassifier
|
||||
|
||||
|
||||
class SICKRelatednessEval(object):
    """SICK-Relatedness: regress a 1-5 semantic relatedness score for sentence pairs.

    Trains a Pytorch relatedness model on pair features and reports
    Pearson/Spearman correlation and MSE against gold scores.
    """

    def __init__(self, task_path, seed=1111):
        logging.debug('***** Transfer task : SICK-Relatedness*****\n\n')
        self.seed = seed
        train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))
        dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))
        test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))
        self.sick_data = {'train': train, 'dev': dev, 'test': test}

    def do_prepare(self, params, prepare):
        # Hand every sentence (both sides, all splits) to the `prepare` hook.
        samples = self.sick_data['train']['X_A'] + \
                  self.sick_data['train']['X_B'] + \
                  self.sick_data['dev']['X_A'] + \
                  self.sick_data['dev']['X_B'] + \
                  self.sick_data['test']['X_A'] + self.sick_data['test']['X_B']
        return prepare(params, samples)

    def loadFile(self, fpath):
        """Parse a SICK TSV file; column 3 is the float relatedness score."""
        skipFirstLine = True
        sick_data = {'X_A': [], 'X_B': [], 'y': []}
        with io.open(fpath, 'r', encoding='utf-8') as f:
            for line in f:
                if skipFirstLine:
                    # Skip the TSV header row.
                    skipFirstLine = False
                else:
                    text = line.strip().split('\t')
                    sick_data['X_A'].append(text[1].split())
                    sick_data['X_B'].append(text[2].split())
                    sick_data['y'].append(text[3])

        sick_data['y'] = [float(s) for s in sick_data['y']]
        return sick_data

    def run(self, params, batcher):
        """Encode pairs, train the relatedness model, report correlations."""
        sick_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size

        for key in self.sick_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],
                                       self.sick_data[key]['X_B'],
                                       self.sick_data[key]['y']),
                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))

            self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus]
            self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus]
            self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus]

            # Encode each side batch-by-batch, then stack into one matrix.
            for txt_type in ['X_A', 'X_B']:
                sick_embed[key][txt_type] = []
                for ii in range(0, len(self.sick_data[key]['y']), bsize):
                    batch = self.sick_data[key][txt_type][ii:ii + bsize]
                    embeddings = batcher(params, batch)
                    sick_embed[key][txt_type].append(embeddings)
                sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])
            sick_embed[key]['y'] = np.array(self.sick_data[key]['y'])
            logging.info('Computed {0} embeddings'.format(key))

        # Train — pair features: |a - b| concatenated with a * b.
        trainA = sick_embed['train']['X_A']
        trainB = sick_embed['train']['X_B']
        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
        trainY = self.encode_labels(self.sick_data['train']['y'])

        # Dev
        devA = sick_embed['dev']['X_A']
        devB = sick_embed['dev']['X_B']
        devF = np.c_[np.abs(devA - devB), devA * devB]
        devY = self.encode_labels(self.sick_data['dev']['y'])

        # Test
        testA = sick_embed['test']['X_A']
        testB = sick_embed['test']['X_B']
        testF = np.c_[np.abs(testA - testB), testA * testB]
        testY = self.encode_labels(self.sick_data['test']['y'])

        config = {'seed': self.seed, 'nclasses': 5}
        clf = RelatednessPytorch(train={'X': trainF, 'y': trainY},
                                 valid={'X': devF, 'y': devY},
                                 test={'X': testF, 'y': testY},
                                 devscores=self.sick_data['dev']['y'],
                                 config=config)

        devpr, yhat = clf.run()

        pr = pearsonr(yhat, self.sick_data['test']['y'])[0]
        sr = spearmanr(yhat, self.sick_data['test']['y'])[0]
        # x != x is the NaN test: degenerate correlations are reported as 0.
        pr = 0 if pr != pr else pr
        sr = 0 if sr != sr else sr
        se = mean_squared_error(yhat, self.sick_data['test']['y'])
        logging.debug('Dev : Pearson {0}'.format(devpr))
        logging.debug('Test : Pearson {0} Spearman {1} MSE {2} \
                       for SICK Relatedness\n'.format(pr, sr, se))

        return {'devpearson': devpr, 'pearson': pr, 'spearman': sr, 'mse': se,
                'yhat': yhat, 'ndev': len(devA), 'ntest': len(testA)}

    def encode_labels(self, labels, nclass=5):
        """
        Label encoding from Tree LSTM paper (Tai, Socher, Manning)

        Turns each real-valued score y into a sparse probability vector over
        the nclass integer scores, splitting mass between floor(y) and
        floor(y)+1 proportionally to the fractional part.
        """
        Y = np.zeros((len(labels), nclass)).astype('float32')
        for j, y in enumerate(labels):
            for i in range(nclass):
                if i+1 == np.floor(y) + 1:
                    Y[j, i] = y - np.floor(y)
                if i+1 == np.floor(y):
                    Y[j, i] = np.floor(y) - y + 1
        return Y
|
||||
|
||||
|
||||
class SICKEntailmentEval(SICKRelatednessEval):
    """SICK-Entailment: 3-way classification (contradiction/neutral/entailment).

    Reuses the SICK-Relatedness file layout but reads the categorical label
    (column 4) and trains a SplitClassifier instead of a regressor.
    """

    def __init__(self, task_path, seed=1111):
        logging.debug('***** Transfer task : SICK-Entailment*****\n\n')
        self.seed = seed
        train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))
        dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))
        test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))
        self.sick_data = {'train': train, 'dev': dev, 'test': test}

    def loadFile(self, fpath):
        """Parse a SICK TSV file; column 4 is the entailment label string."""
        label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2}
        skipFirstLine = True
        sick_data = {'X_A': [], 'X_B': [], 'y': []}
        with io.open(fpath, 'r', encoding='utf-8') as f:
            for line in f:
                if skipFirstLine:
                    # Skip the TSV header row.
                    skipFirstLine = False
                else:
                    text = line.strip().split('\t')
                    sick_data['X_A'].append(text[1].split())
                    sick_data['X_B'].append(text[2].split())
                    sick_data['y'].append(text[4])
        sick_data['y'] = [label2id[s] for s in sick_data['y']]
        return sick_data

    def run(self, params, batcher):
        """Encode pairs, train the 3-way classifier, report accuracies."""
        sick_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size

        for key in self.sick_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],
                                       self.sick_data[key]['X_B'],
                                       self.sick_data[key]['y']),
                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))

            self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus]
            self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus]
            self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus]

            # Encode each side batch-by-batch, then stack into one matrix.
            for txt_type in ['X_A', 'X_B']:
                sick_embed[key][txt_type] = []
                for ii in range(0, len(self.sick_data[key]['y']), bsize):
                    batch = self.sick_data[key][txt_type][ii:ii + bsize]
                    embeddings = batcher(params, batch)
                    sick_embed[key][txt_type].append(embeddings)
                sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])
            logging.info('Computed {0} embeddings'.format(key))

        # Train — pair features: |a - b| concatenated with a * b.
        trainA = sick_embed['train']['X_A']
        trainB = sick_embed['train']['X_B']
        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
        trainY = np.array(self.sick_data['train']['y'])

        # Dev
        devA = sick_embed['dev']['X_A']
        devB = sick_embed['dev']['X_B']
        devF = np.c_[np.abs(devA - devB), devA * devB]
        devY = np.array(self.sick_data['dev']['y'])

        # Test
        testA = sick_embed['test']['X_A']
        testB = sick_embed['test']['X_B']
        testF = np.c_[np.abs(testA - testB), testA * testB]
        testY = np.array(self.sick_data['test']['y'])

        config = {'nclasses': 3, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'classifier': params.classifier,
                  'nhid': params.nhid}
        clf = SplitClassifier(X={'train': trainF, 'valid': devF, 'test': testF},
                              y={'train': trainY, 'valid': devY, 'test': testY},
                              config=config)

        devacc, testacc = clf.run()
        logging.debug('\nDev acc : {0} Test acc : {1} for \
                       SICK entailment\n'.format(devacc, testacc))
        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(devA), 'ntest': len(testA)}
|
|
@ -0,0 +1,113 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
'''
|
||||
SNLI - Entailment
|
||||
'''
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import codecs
|
||||
import os
|
||||
import io
|
||||
import copy
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
from senteval.tools.validation import SplitClassifier
|
||||
|
||||
|
||||
class SNLIEval(object):
|
||||
def __init__(self, taskpath, seed=1111):
|
||||
logging.debug('***** Transfer task : SNLI Entailment*****\n\n')
|
||||
self.seed = seed
|
||||
train1 = self.loadFile(os.path.join(taskpath, 's1.train'))
|
||||
train2 = self.loadFile(os.path.join(taskpath, 's2.train'))
|
||||
|
||||
trainlabels = io.open(os.path.join(taskpath, 'labels.train'),
|
||||
encoding='utf-8').read().splitlines()
|
||||
|
||||
valid1 = self.loadFile(os.path.join(taskpath, 's1.dev'))
|
||||
valid2 = self.loadFile(os.path.join(taskpath, 's2.dev'))
|
||||
validlabels = io.open(os.path.join(taskpath, 'labels.dev'),
|
||||
encoding='utf-8').read().splitlines()
|
||||
|
||||
test1 = self.loadFile(os.path.join(taskpath, 's1.test'))
|
||||
test2 = self.loadFile(os.path.join(taskpath, 's2.test'))
|
||||
testlabels = io.open(os.path.join(taskpath, 'labels.test'),
|
||||
encoding='utf-8').read().splitlines()
|
||||
|
||||
# sort data (by s2 first) to reduce padding
|
||||
sorted_train = sorted(zip(train2, train1, trainlabels),
|
||||
key=lambda z: (len(z[0]), len(z[1]), z[2]))
|
||||
train2, train1, trainlabels = map(list, zip(*sorted_train))
|
||||
|
||||
sorted_valid = sorted(zip(valid2, valid1, validlabels),
|
||||
key=lambda z: (len(z[0]), len(z[1]), z[2]))
|
||||
valid2, valid1, validlabels = map(list, zip(*sorted_valid))
|
||||
|
||||
sorted_test = sorted(zip(test2, test1, testlabels),
|
||||
key=lambda z: (len(z[0]), len(z[1]), z[2]))
|
||||
test2, test1, testlabels = map(list, zip(*sorted_test))
|
||||
|
||||
self.samples = train1 + train2 + valid1 + valid2 + test1 + test2
|
||||
self.data = {'train': (train1, train2, trainlabels),
|
||||
'valid': (valid1, valid2, validlabels),
|
||||
'test': (test1, test2, testlabels)
|
||||
}
|
||||
|
||||
def do_prepare(self, params, prepare):
|
||||
return prepare(params, self.samples)
|
||||
|
||||
def loadFile(self, fpath):
|
||||
with codecs.open(fpath, 'rb', 'latin-1') as f:
|
||||
return [line.split() for line in
|
||||
f.read().splitlines()]
|
||||
|
||||
    def run(self, params, batcher):
        """Encode all SNLI splits with *batcher*, then train and evaluate
        a SplitClassifier on the pair features.

        Args:
            params: SentEval params object (batch_size, usepytorch, nhid,
                classifier config).
            batcher: callable (params, batch) -> sentence embeddings.

        Returns:
            dict with dev/test accuracy and split sizes.
        """
        self.X, self.y = {}, {}
        # label string -> integer class id
        dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []

            input1, input2, mylabels = self.data[key]
            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)
                    # pair feature vector: [u, v, u*v, |u - v|]
                    enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                                np.abs(enc1 - enc2))))
                # NOTE(review): ii already advances by batch_size, so this
                # condition reduces to ii % 20000 == 0 — presumably a
                # progress heartbeat; confirm intended frequency.
                if (ii*params.batch_size) % (20000*params.batch_size) == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" %
                                 (100 * ii / n_labels))
            self.X[key] = np.vstack(enc_input)
            self.y[key] = [dico_label[y] for y in mylabels]

        config = {'nclasses': 3, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'cudaEfficient': True,
                  'nhid': params.nhid, 'noreg': True}

        # shorter schedule than the caller's default classifier config
        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(self.X, self.y, config)
        devacc, testacc = clf.run()
        logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'
                      .format(devacc, testacc))
        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(self.data['valid'][0]),
                'ntest': len(self.data['test'][0])}
@ -0,0 +1,96 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
'''
|
||||
SST - binary classification
|
||||
'''
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import os
|
||||
import io
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
from senteval.tools.validation import SplitClassifier
|
||||
|
||||
|
||||
class SSTEval(object):
    """SST sentiment classification transfer task (binary or fine-grained)."""

    def __init__(self, task_path, nclasses=2, seed=1111):
        """Load the SST train/dev/test splits.

        Args:
            task_path: directory with sentiment-{train,dev,test} files.
            nclasses: 2 (binary) or 5 (fine-grained).
            seed: RNG seed forwarded to the downstream classifier.
        """
        self.seed = seed

        # binary of fine-grained
        assert nclasses in [2, 5]
        self.nclasses = nclasses
        self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained'
        logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name)

        train = self.loadFile(os.path.join(task_path, 'sentiment-train'))
        dev = self.loadFile(os.path.join(task_path, 'sentiment-dev'))
        test = self.loadFile(os.path.join(task_path, 'sentiment-test'))
        self.sst_data = {'train': train, 'dev': dev, 'test': test}

    def do_prepare(self, params, prepare):
        # vocabulary/preparation runs over the sentences of all splits
        samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \
                  self.sst_data['test']['X']
        return prepare(params, samples)

    def loadFile(self, fpath):
        """Parse one split file into {'X': token lists, 'y': int labels}.

        Binary files are "sentence<TAB>label"; fine-grained files are
        "label<SPACE>sentence" (as the two parsing branches below show).
        """
        sst_data = {'X': [], 'y': []}
        with io.open(fpath, 'r', encoding='utf-8') as f:
            for line in f:
                if self.nclasses == 2:
                    sample = line.strip().split('\t')
                    sst_data['y'].append(int(sample[1]))
                    sst_data['X'].append(sample[0].split())
                elif self.nclasses == 5:
                    sample = line.strip().split(' ', 1)
                    sst_data['y'].append(int(sample[0]))
                    sst_data['X'].append(sample[1].split())
        # sanity check: labels must cover 0..nclasses-1
        assert max(sst_data['y']) == self.nclasses - 1
        return sst_data

    def run(self, params, batcher):
        """Embed every split with *batcher* and train a SplitClassifier.

        Returns a dict with dev/test accuracy and split sizes.
        """
        sst_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size

        for key in self.sst_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            sorted_data = sorted(zip(self.sst_data[key]['X'],
                                     self.sst_data[key]['y']),
                                 key=lambda z: (len(z[0]), z[1]))
            self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data))

            sst_embed[key]['X'] = []
            for ii in range(0, len(self.sst_data[key]['y']), bsize):
                batch = self.sst_data[key]['X'][ii:ii + bsize]
                embeddings = batcher(params, batch)
                sst_embed[key]['X'].append(embeddings)
            sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
            sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
            logging.info('Computed {0} embeddings'.format(key))

        config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                             'usepytorch': params.usepytorch,
                             'classifier': params.classifier}

        clf = SplitClassifier(X={'train': sst_embed['train']['X'],
                                 'valid': sst_embed['dev']['X'],
                                 'test': sst_embed['test']['X']},
                              y={'train': sst_embed['train']['y'],
                                 'valid': sst_embed['dev']['y'],
                                 'test': sst_embed['test']['y']},
                              config=config_classifier)

        devacc, testacc = clf.run()
        logging.debug('\nDev acc : {0} Test acc : {1} for \
            SST {2} classification\n'.format(devacc, testacc, self.task_name))

        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(sst_embed['dev']['X']),
                'ntest': len(sst_embed['test']['X'])}
|
@ -0,0 +1,171 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
'''
|
||||
STS-{2012,2013,2014,2015,2016} (unsupervised) and
|
||||
STS-benchmark (supervised) tasks
|
||||
'''
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import os
|
||||
import io
|
||||
import numpy as np
|
||||
import logging
|
||||
|
||||
from scipy.stats import spearmanr, pearsonr
|
||||
|
||||
from senteval.utils import cosine
|
||||
from senteval.sick import SICKRelatednessEval
|
||||
|
||||
|
||||
class STSEval(object):
    """Base class for the unsupervised STS tasks.

    Subclasses set ``self.datasets`` (subtask names) and ``self.seed``
    before calling :meth:`loadFile`.
    """

    def loadFile(self, fpath):
        """Load STS.input.*/STS.gs.* pairs for every subtask in self.datasets.

        Pairs whose gold-score line is empty are dropped; each subtask is
        length-sorted to reduce padding in the batcher.
        """
        self.data = {}
        self.samples = []

        for dataset in self.datasets:
            sent1, sent2 = zip(*[l.split("\t") for l in
                                 io.open(fpath + '/STS.input.%s.txt' % dataset,
                                         encoding='utf8').read().splitlines()])
            raw_scores = np.array([x for x in
                                   io.open(fpath + '/STS.gs.%s.txt' % dataset,
                                           encoding='utf8')
                                   .read().splitlines()])
            # boolean mask over pairs that actually have a gold score
            not_empty_idx = raw_scores != ''

            gs_scores = [float(x) for x in raw_scores[not_empty_idx]]
            sent1 = np.array([s.split() for s in sent1])[not_empty_idx]
            sent2 = np.array([s.split() for s in sent2])[not_empty_idx]
            # sort data by length to minimize padding in batcher
            sorted_data = sorted(zip(sent1, sent2, gs_scores),
                                 key=lambda z: (len(z[0]), len(z[1]), z[2]))
            sent1, sent2, gs_scores = map(list, zip(*sorted_data))

            self.data[dataset] = (sent1, sent2, gs_scores)
            self.samples += sent1 + sent2

    def do_prepare(self, params, prepare):
        """Install the similarity function and run the *prepare* hook."""
        if 'similarity' in params:
            self.similarity = params.similarity
        else:  # Default similarity is cosine
            self.similarity = lambda s1, s2: np.nan_to_num(cosine(np.nan_to_num(s1), np.nan_to_num(s2)))
        return prepare(params, self.samples)

    def run(self, params, batcher):
        """Score every subtask with the similarity of batcher embeddings.

        Returns per-subtask Pearson/Spearman results plus an 'all' entry
        with (weighted) averages across subtasks.
        """
        results = {}
        for dataset in self.datasets:
            sys_scores = []
            input1, input2, gs_scores = self.data[dataset]
            for ii in range(0, len(gs_scores), params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                # we assume get_batch already throws out the faulty ones
                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)

                    for kk in range(enc2.shape[0]):
                        sys_score = self.similarity(enc1[kk], enc2[kk])
                        sys_scores.append(sys_score)

            # scipy returns (statistic, p-value) tuples; [0] is used below
            results[dataset] = {'pearson': pearsonr(sys_scores, gs_scores),
                                'spearman': spearmanr(sys_scores, gs_scores),
                                'nsamples': len(sys_scores)}
            logging.debug('%s : pearson = %.4f, spearman = %.4f' %
                          (dataset, results[dataset]['pearson'][0],
                           results[dataset]['spearman'][0]))

        # aggregate over subtasks, weighted by subtask size
        weights = [results[dset]['nsamples'] for dset in results.keys()]
        list_prs = np.array([results[dset]['pearson'][0] for
                             dset in results.keys()])
        list_spr = np.array([results[dset]['spearman'][0] for
                             dset in results.keys()])

        avg_pearson = np.average(list_prs)
        avg_spearman = np.average(list_spr)
        wavg_pearson = np.average(list_prs, weights=weights)
        wavg_spearman = np.average(list_spr, weights=weights)

        results['all'] = {'pearson': {'mean': avg_pearson,
                                      'wmean': wavg_pearson},
                          'spearman': {'mean': avg_spearman,
                                       'wmean': wavg_spearman}}
        logging.debug('ALL (weighted average) : Pearson = %.4f, \
            Spearman = %.4f' % (wavg_pearson, wavg_spearman))
        logging.debug('ALL (average) : Pearson = %.4f, \
            Spearman = %.4f\n' % (avg_pearson, avg_spearman))

        return results
class STS12Eval(STSEval):
    """SemEval-2012 STS task (five subtasks)."""

    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : STS12 *****\n\n')
        self.datasets = [
            'MSRpar',
            'MSRvid',
            'SMTeuroparl',
            'surprise.OnWN',
            'surprise.SMTnews',
        ]
        self.seed = seed
        self.loadFile(taskpath)
class STS13Eval(STSEval):
    """SemEval-2013 STS task."""

    # STS13 here does not contain the "SMT" subtask due to LICENSE issue
    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : STS13 (-SMT) *****\n\n')
        self.datasets = ['FNWN', 'headlines', 'OnWN']
        self.seed = seed
        self.loadFile(taskpath)
class STS14Eval(STSEval):
    """SemEval-2014 STS task (six subtasks)."""

    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : STS14 *****\n\n')
        self.datasets = [
            'deft-forum',
            'deft-news',
            'headlines',
            'images',
            'OnWN',
            'tweet-news',
        ]
        self.seed = seed
        self.loadFile(taskpath)
class STS15Eval(STSEval):
    """SemEval-2015 STS task (five subtasks)."""

    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : STS15 *****\n\n')
        self.datasets = [
            'answers-forums',
            'answers-students',
            'belief',
            'headlines',
            'images',
        ]
        self.seed = seed
        self.loadFile(taskpath)
class STS16Eval(STSEval):
    """SemEval-2016 STS task (five subtasks)."""

    def __init__(self, taskpath, seed=1111):
        logging.debug('***** Transfer task : STS16 *****\n\n')
        self.datasets = [
            'answer-answer',
            'headlines',
            'plagiarism',
            'postediting',
            'question-question',
        ]
        self.seed = seed
        self.loadFile(taskpath)
class STSBenchmarkEval(SICKRelatednessEval):
    """STS-Benchmark (supervised), evaluated via the SICK-Relatedness pipeline."""

    def __init__(self, task_path, seed=1111):
        logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n')
        self.seed = seed
        splits = {}
        for split_name, filename in (('train', 'sts-train.csv'),
                                     ('dev', 'sts-dev.csv'),
                                     ('test', 'sts-test.csv')):
            splits[split_name] = self.loadFile(os.path.join(task_path, filename))
        self.sick_data = splits

    def loadFile(self, fpath):
        """Parse a tab-separated STS-B csv (col 4: score, 5: sent A, 6: sent B)."""
        parsed = {'X_A': [], 'X_B': [], 'y': []}
        with io.open(fpath, 'r', encoding='utf-8') as fin:
            for raw_line in fin:
                fields = raw_line.strip().split('\t')
                parsed['X_A'].append(fields[5].split())
                parsed['X_B'].append(fields[6].split())
                parsed['y'].append(fields[4])

        parsed['y'] = [float(score) for score in parsed['y']]
        return parsed
|
@ -0,0 +1,202 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
"""
|
||||
Pytorch Classifier class in the style of scikit-learn
|
||||
Classifiers include Logistic Regression and MLP
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import copy
|
||||
from senteval import utils
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class PyTorchClassifier(object):
    """Base PyTorch classifier in the style of scikit-learn.

    Subclasses (e.g. ``MLP``) must set ``self.model``, ``self.loss_fn``,
    ``self.optimizer``, ``self.max_epoch``, ``self.epoch_size`` and
    ``self.tenacity`` before :meth:`fit` is called.
    """

    def __init__(self, inputdim, nclasses, l2reg=0., batch_size=64, seed=1111,
                 cudaEfficient=False):
        """Store hyper-parameters and seed the numpy/torch RNGs.

        Args:
            inputdim: dimensionality of the input features.
            nclasses: number of output classes.
            l2reg: L2 weight-decay coefficient.
            batch_size: mini-batch size for training and scoring.
            seed: RNG seed.
            cudaEfficient: keep the full dataset on CPU and move only
                mini-batches to the GPU (for datasets too big for GPU RAM).
        """
        # fix seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.inputdim = inputdim
        self.nclasses = nclasses
        self.l2reg = l2reg
        self.batch_size = batch_size
        self.cudaEfficient = cudaEfficient

    def prepare_split(self, X, y, validation_data=None, validation_split=None):
        """Split (X, y) into train/dev tensors.

        Either *validation_data* (explicit dev set) or *validation_split*
        (fraction of X held out at random) must be given.
        """
        # Preparing validation data
        assert validation_split or validation_data
        if validation_data is not None:
            trainX, trainy = X, y
            devX, devy = validation_data
        else:
            permutation = np.random.permutation(len(X))
            trainidx = permutation[int(validation_split * len(X)):]
            devidx = permutation[0:int(validation_split * len(X))]
            trainX, trainy = X[trainidx], y[trainidx]
            devX, devy = X[devidx], y[devidx]

        # cudaEfficient keeps the whole split on CPU; batches are moved to
        # the GPU one at a time in trainepoch()/score().
        device = torch.device('cpu') if self.cudaEfficient else torch.device('cuda')

        trainX = torch.from_numpy(trainX).to(device, dtype=torch.float32)
        trainy = torch.from_numpy(trainy).to(device, dtype=torch.int64)
        devX = torch.from_numpy(devX).to(device, dtype=torch.float32)
        devy = torch.from_numpy(devy).to(device, dtype=torch.int64)

        return trainX, trainy, devX, devy

    def fit(self, X, y, validation_data=None, validation_split=None,
            early_stop=True):
        """Train until max_epoch or until dev accuracy stops improving.

        Returns the best dev accuracy; ``self.model`` is restored to the
        best-scoring snapshot.
        """
        self.nepoch = 0
        bestaccuracy = -1
        stop_train = False
        early_stop_count = 0

        # Preparing validation data
        trainX, trainy, devX, devy = self.prepare_split(X, y, validation_data,
                                                        validation_split)

        # Training
        while not stop_train and self.nepoch <= self.max_epoch:
            self.trainepoch(trainX, trainy, epoch_size=self.epoch_size)
            accuracy = self.score(devX, devy)
            if accuracy > bestaccuracy:
                bestaccuracy = accuracy
                bestmodel = copy.deepcopy(self.model)
            elif early_stop:
                # stop after `tenacity` consecutive non-improving epochs
                if early_stop_count >= self.tenacity:
                    stop_train = True
                early_stop_count += 1
        self.model = bestmodel
        return bestaccuracy

    def trainepoch(self, X, y, epoch_size=1):
        """Run *epoch_size* passes of mini-batch SGD over (X, y)."""
        self.model.train()
        for _ in range(self.nepoch, self.nepoch + epoch_size):
            permutation = np.random.permutation(len(X))
            all_costs = []
            for i in range(0, len(X), self.batch_size):
                # forward
                idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().to(X.device)

                Xbatch = X[idx]
                ybatch = y[idx]

                if self.cudaEfficient:
                    Xbatch = Xbatch.cuda()
                    ybatch = ybatch.cuda()
                output = self.model(Xbatch)
                # loss
                loss = self.loss_fn(output, ybatch)
                all_costs.append(loss.data.item())
                # backward
                self.optimizer.zero_grad()
                loss.backward()
                # Update parameters
                self.optimizer.step()
        self.nepoch += epoch_size

    def score(self, devX, devy):
        """Return classification accuracy of the current model on (devX, devy)."""
        self.model.eval()
        correct = 0
        if not isinstance(devX, torch.cuda.FloatTensor) or self.cudaEfficient:
            devX = torch.FloatTensor(devX).cuda()
            devy = torch.LongTensor(devy).cuda()
        with torch.no_grad():
            for i in range(0, len(devX), self.batch_size):
                Xbatch = devX[i:i + self.batch_size]
                ybatch = devy[i:i + self.batch_size]
                if self.cudaEfficient:
                    Xbatch = Xbatch.cuda()
                    ybatch = ybatch.cuda()
                output = self.model(Xbatch)
                pred = output.data.max(1)[1]
                correct += pred.long().eq(ybatch.data.long()).sum().item()
            accuracy = 1.0 * correct / len(devX)
        return accuracy

    def predict(self, devX):
        """Return predicted class indices for devX."""
        self.model.eval()
        if not isinstance(devX, torch.cuda.FloatTensor):
            devX = torch.FloatTensor(devX).cuda()
        yhat = np.array([])
        with torch.no_grad():
            for i in range(0, len(devX), self.batch_size):
                Xbatch = devX[i:i + self.batch_size]
                output = self.model(Xbatch)
                yhat = np.append(yhat,
                                 output.data.max(1)[1].cpu().numpy())
        yhat = np.vstack(yhat)
        return yhat

    def predict_proba(self, devX):
        """Return class probabilities for devX, shape (n_samples, nclasses).

        BUG FIX vs the original:
        - softmax must be applied to the torch tensor (with an explicit
          dim) BEFORE converting to numpy; the original passed a numpy
          array to F.softmax, which raises at runtime.
        - per-batch results are collected in a list and stacked once;
          the original called np.concatenate(probas, vals, axis=0) with a
          wrong signature and tested `if not probas:` on an ndarray.
        """
        self.model.eval()
        probas = []
        with torch.no_grad():
            for i in range(0, len(devX), self.batch_size):
                Xbatch = devX[i:i + self.batch_size]
                vals = F.softmax(self.model(Xbatch), dim=-1).data.cpu().numpy()
                probas.append(vals)
        return np.vstack(probas)
|
||||
|
||||
"""
|
||||
MLP with Pytorch (nhid=0 --> Logistic Regression)
|
||||
"""
|
||||
|
||||
class MLP(PyTorchClassifier):
    """MLP classifier (nhid=0 --> Logistic Regression).

    PARAMETERS (all optional keys of *params*):
    - nhid: number of hidden units (0: Logistic Regression)
    - optim: optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..)
    - tenacity: how many times dev acc does not increase before stopping
    - epoch_size: each epoch corresponds to epoch_size pass on the train set
    - max_epoch: max number of epoches
    - dropout: dropout for MLP
    - batch_size: mini-batch size
    """

    def __init__(self, params, inputdim, nclasses, l2reg=0., batch_size=64,
                 seed=1111, cudaEfficient=False):
        # Name the class explicitly: the original `super(self.__class__, ...)`
        # recurses infinitely if MLP is ever subclassed.
        super(MLP, self).__init__(inputdim, nclasses, l2reg,
                                  batch_size, seed, cudaEfficient)

        self.nhid = 0 if "nhid" not in params else params["nhid"]
        self.optim = "adam" if "optim" not in params else params["optim"]
        self.tenacity = 5 if "tenacity" not in params else params["tenacity"]
        self.epoch_size = 4 if "epoch_size" not in params else params["epoch_size"]
        self.max_epoch = 200 if "max_epoch" not in params else params["max_epoch"]
        self.dropout = 0. if "dropout" not in params else params["dropout"]
        self.batch_size = 64 if "batch_size" not in params else params["batch_size"]

        # BUG FIX: use the defaulted self.nhid rather than params["nhid"],
        # which raised KeyError whenever "nhid" was omitted even though a
        # default of 0 was computed above.
        if self.nhid == 0:
            # plain logistic regression
            self.model = nn.Sequential(
                nn.Linear(self.inputdim, self.nclasses),
            ).cuda()
        else:
            # one sigmoid hidden layer with dropout
            self.model = nn.Sequential(
                nn.Linear(self.inputdim, self.nhid),
                nn.Dropout(p=self.dropout),
                nn.Sigmoid(),
                nn.Linear(self.nhid, self.nclasses),
            ).cuda()

        self.loss_fn = nn.CrossEntropyLoss().cuda()
        # sum (not average) per-sample losses, matching the original setup
        self.loss_fn.size_average = False

        optim_fn, optim_params = utils.get_optimizer(self.optim)
        self.optimizer = optim_fn(self.model.parameters(), **optim_params)
        self.optimizer.param_groups[0]['weight_decay'] = self.l2reg
|
@ -0,0 +1,359 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
"""
|
||||
Image Annotation/Search for COCO with Pytorch
|
||||
"""
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import logging
|
||||
import copy
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.autograd import Variable
|
||||
import torch.optim as optim
|
||||
|
||||
|
||||
class COCOProjNet(nn.Module):
    """Projects image and sentence features into a shared, L2-normalised space."""

    def __init__(self, config):
        super(COCOProjNet, self).__init__()
        self.imgdim = config['imgdim']
        self.sentdim = config['sentdim']
        self.projdim = config['projdim']
        self.imgproj = nn.Sequential(
            nn.Linear(self.imgdim, self.projdim),
        )
        self.sentproj = nn.Sequential(
            nn.Linear(self.sentdim, self.projdim),
        )

    @staticmethod
    def _unit_rows(t):
        """Rescale every row of *t* to unit L2 norm."""
        row_norm = torch.sqrt(torch.pow(t, 2).sum(1, keepdim=True))
        return t / row_norm.expand_as(t)

    def forward(self, img, sent, imgc, sentc):
        # imgc : (bsize, ncontrast, imgdim)
        # sentc : (bsize, ncontrast, sentdim)
        # img : (bsize, imgdim)
        # sent : (bsize, sentdim)
        flat_img = img.unsqueeze(1).expand_as(imgc).contiguous().view(-1, self.imgdim)
        flat_imgc = imgc.view(-1, self.imgdim)
        flat_sent = sent.unsqueeze(1).expand_as(sentc).contiguous().view(-1, self.sentdim)
        flat_sentc = sentc.view(-1, self.sentdim)

        # (bsize*ncontrast, projdim), rows unit-normalised
        imgproj = self._unit_rows(self.imgproj(flat_img))
        imgcproj = self._unit_rows(self.imgproj(flat_imgc))
        sentproj = self._unit_rows(self.sentproj(flat_sent))
        sentcproj = self._unit_rows(self.sentproj(flat_sentc))

        # cosine similarities, each of shape (bsize*ncontrast,)
        anchor1 = (imgproj * sentproj).sum(1)
        anchor2 = (sentproj * imgproj).sum(1)
        img_sentc = (imgproj * sentcproj).sum(1)
        sent_imgc = (sentproj * imgcproj).sum(1)

        return anchor1, anchor2, img_sentc, sent_imgc

    def proj_sentence(self, sent):
        """Project sentences; returns (bsize, projdim) unit-norm rows."""
        return self._unit_rows(self.sentproj(sent))

    def proj_image(self, img):
        """Project images; returns (bsize, projdim) unit-norm rows."""
        return self._unit_rows(self.imgproj(img))
|
||||
class PairwiseRankingLoss(nn.Module):
    """
    Pairwise ranking loss
    """

    def __init__(self, margin):
        super(PairwiseRankingLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor1, anchor2, img_sentc, sent_imgc):
        # Hinge penalty whenever a contrastive pair scores within
        # `margin` of its anchor, summed over both retrieval directions.
        sentence_cost = torch.clamp(self.margin - anchor1 + img_sentc,
                                    min=0.0).sum()
        image_cost = torch.clamp(self.margin - anchor2 + sent_imgc,
                                 min=0.0).sum()
        return sentence_cost + image_cost
|
||||
class ImageSentenceRankingPytorch(object):
|
||||
# Image Sentence Ranking on COCO with Pytorch
|
||||
def __init__(self, train, valid, test, config):
|
||||
# fix seed
|
||||
self.seed = config['seed']
|
||||
np.random.seed(self.seed)
|
||||
torch.manual_seed(self.seed)
|
||||
torch.cuda.manual_seed(self.seed)
|
||||
|
||||
self.train = train
|
||||
self.valid = valid
|
||||
self.test = test
|
||||
|
||||
self.imgdim = len(train['imgfeat'][0])
|
||||
self.sentdim = len(train['sentfeat'][0])
|
||||
self.projdim = config['projdim']
|
||||
self.margin = config['margin']
|
||||
|
||||
self.batch_size = 128
|
||||
self.ncontrast = 30
|
||||
self.maxepoch = 20
|
||||
self.early_stop = True
|
||||
|
||||
config_model = {'imgdim': self.imgdim,'sentdim': self.sentdim,
|
||||
'projdim': self.projdim}
|
||||
self.model = COCOProjNet(config_model).cuda()
|
||||
|
||||
self.loss_fn = PairwiseRankingLoss(margin=self.margin).cuda()
|
||||
|
||||
self.optimizer = optim.Adam(self.model.parameters())
|
||||
|
||||
def prepare_data(self, trainTxt, trainImg, devTxt, devImg,
|
||||
testTxt, testImg):
|
||||
trainTxt = torch.FloatTensor(trainTxt)
|
||||
trainImg = torch.FloatTensor(trainImg)
|
||||
devTxt = torch.FloatTensor(devTxt).cuda()
|
||||
devImg = torch.FloatTensor(devImg).cuda()
|
||||
testTxt = torch.FloatTensor(testTxt).cuda()
|
||||
testImg = torch.FloatTensor(testImg).cuda()
|
||||
|
||||
return trainTxt, trainImg, devTxt, devImg, testTxt, testImg
|
||||
|
||||
def run(self):
|
||||
self.nepoch = 0
|
||||
bestdevscore = -1
|
||||
early_stop_count = 0
|
||||
stop_train = False
|
||||
|
||||
# Preparing data
|
||||
logging.info('prepare data')
|
||||
trainTxt, trainImg, devTxt, devImg, testTxt, testImg = \
|
||||
self.prepare_data(self.train['sentfeat'], self.train['imgfeat'],
|
||||
self.valid['sentfeat'], self.valid['imgfeat'],
|
||||
self.test['sentfeat'], self.test['imgfeat'])
|
||||
|
||||
# Training
|
||||
while not stop_train and self.nepoch <= self.maxepoch:
|
||||
logging.info('start epoch')
|
||||
self.trainepoch(trainTxt, trainImg, devTxt, devImg, nepoches=1)
|
||||
logging.info('Epoch {0} finished'.format(self.nepoch))
|
||||
|
||||
results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
|
||||
't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
|
||||
'dev': bestdevscore}
|
||||
score = 0
|
||||
for i in range(5):
|
||||
devTxt_i = devTxt[i*5000:(i+1)*5000]
|
||||
devImg_i = devImg[i*5000:(i+1)*5000]
|
||||
# Compute dev ranks img2txt
|
||||
r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg_i,
|
||||
devTxt_i)
|
||||
results['i2t']['r1'] += r1_i2t / 5
|
||||
results['i2t']['r5'] += r5_i2t / 5
|
||||
results['i2t']['r10'] += r10_i2t / 5
|
||||
results['i2t']['medr'] += medr_i2t / 5
|
||||
logging.info("Image to text: {0}, {1}, {2}, {3}"
|
||||
.format(r1_i2t, r5_i2t, r10_i2t, medr_i2t))
|
||||
# Compute dev ranks txt2img
|
||||
r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg_i,
|
||||
devTxt_i)
|
||||
results['t2i']['r1'] += r1_t2i / 5
|
||||
results['t2i']['r5'] += r5_t2i / 5
|
||||
results['t2i']['r10'] += r10_t2i / 5
|
||||
results['t2i']['medr'] += medr_t2i / 5
|
||||
logging.info("Text to Image: {0}, {1}, {2}, {3}"
|
||||
.format(r1_t2i, r5_t2i, r10_t2i, medr_t2i))
|
||||
score += (r1_i2t + r5_i2t + r10_i2t +
|
||||
r1_t2i + r5_t2i + r10_t2i) / 5
|
||||
|
||||
logging.info("Dev mean Text to Image: {0}, {1}, {2}, {3}".format(
|
||||
results['t2i']['r1'], results['t2i']['r5'],
|
||||
results['t2i']['r10'], results['t2i']['medr']))
|
||||
logging.info("Dev mean Image to text: {0}, {1}, {2}, {3}".format(
|
||||
results['i2t']['r1'], results['i2t']['r5'],
|
||||
results['i2t']['r10'], results['i2t']['medr']))
|
||||
|
||||
# early stop on Pearson
|
||||
if score > bestdevscore:
|
||||
bestdevscore = score
|
||||
bestmodel = copy.deepcopy(self.model)
|
||||
elif self.early_stop:
|
||||
if early_stop_count >= 3:
|
||||
stop_train = True
|
||||
early_stop_count += 1
|
||||
self.model = bestmodel
|
||||
|
||||
# Compute test for the 5 splits
|
||||
results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
|
||||
't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
|
||||
'dev': bestdevscore}
|
||||
for i in range(5):
|
||||
testTxt_i = testTxt[i*5000:(i+1)*5000]
|
||||
testImg_i = testImg[i*5000:(i+1)*5000]
|
||||
# Compute test ranks img2txt
|
||||
r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(testImg_i, testTxt_i)
|
||||
results['i2t']['r1'] += r1_i2t / 5
|
||||
results['i2t']['r5'] += r5_i2t / 5
|
||||
results['i2t']['r10'] += r10_i2t / 5
|
||||
results['i2t']['medr'] += medr_i2t / 5
|
||||
# Compute test ranks txt2img
|
||||
r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(testImg_i, testTxt_i)
|
||||
results['t2i']['r1'] += r1_t2i / 5
|
||||
results['t2i']['r5'] += r5_t2i / 5
|
||||
results['t2i']['r10'] += r10_t2i / 5
|
||||
results['t2i']['medr'] += medr_t2i / 5
|
||||
|
||||
return bestdevscore, results['i2t']['r1'], results['i2t']['r5'], \
|
||||
results['i2t']['r10'], results['i2t']['medr'], \
|
||||
results['t2i']['r1'], results['t2i']['r5'], \
|
||||
results['t2i']['r10'], results['t2i']['medr']
|
||||
|
||||
def trainepoch(self, trainTxt, trainImg, devTxt, devImg, nepoches=1):
|
||||
self.model.train()
|
||||
for _ in range(self.nepoch, self.nepoch + nepoches):
|
||||
permutation = list(np.random.permutation(len(trainTxt)))
|
||||
all_costs = []
|
||||
for i in range(0, len(trainTxt), self.batch_size):
|
||||
# forward
|
||||
if i % (self.batch_size*500) == 0 and i > 0:
|
||||
logging.info('samples : {0}'.format(i))
|
||||
r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg,
|
||||
devTxt)
|
||||
logging.info("Image to text: {0}, {1}, {2}, {3}".format(
|
||||
r1_i2t, r5_i2t, r10_i2t, medr_i2t))
|
||||
# Compute test ranks txt2img
|
||||
r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg,
|
||||
devTxt)
|
||||
logging.info("Text to Image: {0}, {1}, {2}, {3}".format(
|
||||
r1_t2i, r5_t2i, r10_t2i, medr_t2i))
|
||||
idx = torch.LongTensor(permutation[i:i + self.batch_size])
|
||||
imgbatch = Variable(trainImg.index_select(0, idx)).cuda()
|
||||
sentbatch = Variable(trainTxt.index_select(0, idx)).cuda()
|
||||
|
||||
idximgc = np.random.choice(permutation[:i] +
|
||||
permutation[i + self.batch_size:],
|
||||
self.ncontrast*idx.size(0))
|
||||
idxsentc = np.random.choice(permutation[:i] +
|
||||
permutation[i + self.batch_size:],
|
||||
self.ncontrast*idx.size(0))
|
||||
idximgc = torch.LongTensor(idximgc)
|
||||
idxsentc = torch.LongTensor(idxsentc)
|
||||
# Get indexes for contrastive images and sentences
|
||||
imgcbatch = Variable(trainImg.index_select(0, idximgc)).view(
|
||||
-1, self.ncontrast, self.imgdim).cuda()
|
||||
sentcbatch = Variable(trainTxt.index_select(0, idxsentc)).view(
|
||||
-1, self.ncontrast, self.sentdim).cuda()
|
||||
|
||||
anchor1, anchor2, img_sentc, sent_imgc = self.model(
|
||||
imgbatch, sentbatch, imgcbatch, sentcbatch)
|
||||
# loss
|
||||
loss = self.loss_fn(anchor1, anchor2, img_sentc, sent_imgc)
|
||||
all_costs.append(loss.data.item())
|
||||
# backward
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
# Update parameters
|
||||
self.optimizer.step()
|
||||
self.nepoch += nepoches
|
||||
|
||||
def t2i(self, images, captions):
|
||||
"""
|
||||
Images: (5N, imgdim) matrix of images
|
||||
Captions: (5N, sentdim) matrix of captions
|
||||
"""
|
||||
with torch.no_grad():
|
||||
# Project images and captions
|
||||
img_embed, sent_embed = [], []
|
||||
for i in range(0, len(images), self.batch_size):
|
||||
img_embed.append(self.model.proj_image(
|
||||
Variable(images[i:i + self.batch_size])))
|
||||
sent_embed.append(self.model.proj_sentence(
|
||||
Variable(captions[i:i + self.batch_size])))
|
||||
img_embed = torch.cat(img_embed, 0).data
|
||||
sent_embed = torch.cat(sent_embed, 0).data
|
||||
|
||||
npts = int(img_embed.size(0) / 5)
|
||||
idxs = torch.cuda.LongTensor(range(0, len(img_embed), 5))
|
||||
ims = img_embed.index_select(0, idxs)
|
||||
|
||||
ranks = np.zeros(5 * npts)
|
||||
for index in range(npts):
|
||||
|
||||
# Get query captions
|
||||
queries = sent_embed[5*index: 5*index + 5]
|
||||
|
||||
# Compute scores
|
||||
scores = torch.mm(queries, ims.transpose(0, 1)).cpu().numpy()
|
||||
inds = np.zeros(scores.shape)
|
||||
for i in range(len(inds)):
|
||||
inds[i] = np.argsort(scores[i])[::-1]
|
||||
ranks[5 * index + i] = np.where(inds[i] == index)[0][0]
|
||||
|
||||
# Compute metrics
|
||||
r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
|
||||
r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
|
||||
r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
|
||||
medr = np.floor(np.median(ranks)) + 1
|
||||
return (r1, r5, r10, medr)
|
||||
|
||||
def i2t(self, images, captions):
    """
    Image-to-text retrieval evaluation (COCO-style: 5 captions per image).

    Images: (5N, imgdim) matrix of images
    Captions: (5N, sentdim) matrix of captions

    Returns (R@1, R@5, R@10, median rank) over the N image queries.
    NOTE(review): assumes captions 5i..5i+4 belong to image row 5i —
    confirm against the data loader.
    """
    with torch.no_grad():
        # Project images and captions into the joint embedding space,
        # batching to bound memory usage.
        img_embed, sent_embed = [], []
        for i in range(0, len(images), self.batch_size):
            img_embed.append(self.model.proj_image(
                Variable(images[i:i + self.batch_size])))
            sent_embed.append(self.model.proj_sentence(
                Variable(captions[i:i + self.batch_size])))
        img_embed = torch.cat(img_embed, 0).data
        sent_embed = torch.cat(sent_embed, 0).data

        # One query image per block of 5 duplicated image rows.
        npts = int(img_embed.size(0) / 5)
        index_list = []

        # ranks[i] = best (lowest) rank among the 5 correct captions
        # for image i.
        ranks = np.zeros(npts)
        for index in range(npts):
            # Get query image (first row of its block of 5).
            query_img = img_embed[5 * index]

            # Dot-product scores against every caption.
            scores = torch.mm(query_img.view(1, -1),
                              sent_embed.transpose(0, 1)).view(-1)
            scores = scores.cpu().numpy()
            inds = np.argsort(scores)[::-1]
            index_list.append(inds[0])

            # Best rank among the 5 ground-truth captions.
            rank = 1e20
            for i in range(5*index, 5*index + 5, 1):
                tmp = np.where(inds == i)[0][0]
                if tmp < rank:
                    rank = tmp
            ranks[index] = rank

        # Compute recall@K metrics and median rank (1-based).
        r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
        r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
        r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
        medr = np.floor(np.median(ranks)) + 1
        return (r1, r5, r10, medr)
@ -0,0 +1,134 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
"""
|
||||
Semantic Relatedness (supervised) with Pytorch
|
||||
"""
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import copy
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
import torch.optim as optim
|
||||
|
||||
from scipy.stats import pearsonr
|
||||
|
||||
|
||||
class RelatednessPytorch(object):
    """Supervised semantic-relatedness regressor (SICK-Relatedness, STS14).

    Trains a single linear layer + softmax over `nclasses` relatedness
    bins with an MSE loss against the gold score distribution, early
    stopping on Pearson correlation with the dev scores.

    Requires CUDA (asserted in __init__).
    """
    # Can be used for SICK-Relatedness, and STS14
    def __init__(self, train, valid, test, devscores, config):
        # fix seed for reproducibility across numpy and torch
        np.random.seed(config['seed'])
        torch.manual_seed(config['seed'])
        assert torch.cuda.is_available(), 'torch.cuda required for Relatedness'
        torch.cuda.manual_seed(config['seed'])

        self.train = train
        self.valid = valid
        self.test = test
        self.devscores = devscores

        self.inputdim = train['X'].shape[1]
        self.nclasses = config['nclasses']
        self.seed = config['seed']
        self.l2reg = 0.
        self.batch_size = 64
        self.maxepoch = 1000
        self.early_stop = True

        # Linear layer + softmax: predicts a probability distribution
        # over the relatedness classes.
        self.model = nn.Sequential(
            nn.Linear(self.inputdim, self.nclasses),
            nn.Softmax(dim=-1),
        )
        self.loss_fn = nn.MSELoss()

        if torch.cuda.is_available():
            self.model = self.model.cuda()
            self.loss_fn = self.loss_fn.cuda()

        # Sum (not average) the loss over the batch.
        self.loss_fn.size_average = False
        self.optimizer = optim.Adam(self.model.parameters(),
                                    weight_decay=self.l2reg)

    def prepare_data(self, trainX, trainy, devX, devy, testX, testy):
        """Move all numpy splits onto the GPU as float tensors."""
        trainX = torch.from_numpy(trainX).float().cuda()
        trainy = torch.from_numpy(trainy).float().cuda()
        devX = torch.from_numpy(devX).float().cuda()
        devy = torch.from_numpy(devy).float().cuda()
        testX = torch.from_numpy(testX).float().cuda()
        # BUG FIX: this was previously assigned to `testY`, so the raw
        # numpy array was returned for the test labels instead of the
        # converted CUDA tensor.
        testy = torch.from_numpy(testy).float().cuda()

        return trainX, trainy, devX, devy, testX, testy

    def run(self):
        """Train with early stopping on dev Pearson; return (best dev
        Pearson, test predictions)."""
        self.nepoch = 0
        bestpr = -1
        early_stop_count = 0
        # Class bins 1..5: expected score = probs . r
        r = np.arange(1, 6)
        stop_train = False

        # Preparing data
        trainX, trainy, devX, devy, testX, testy = self.prepare_data(
            self.train['X'], self.train['y'],
            self.valid['X'], self.valid['y'],
            self.test['X'], self.test['y'])

        # Training: 50 epochs between each dev evaluation.
        while not stop_train and self.nepoch <= self.maxepoch:
            self.trainepoch(trainX, trainy, nepoches=50)
            yhat = np.dot(self.predict_proba(devX), r)
            pr = pearsonr(yhat, self.devscores)[0]
            pr = 0 if pr != pr else pr  # if NaN bc std=0
            # early stop on Pearson
            if pr > bestpr:
                bestpr = pr
                bestmodel = copy.deepcopy(self.model)
            elif self.early_stop:
                if early_stop_count >= 3:
                    stop_train = True
                early_stop_count += 1
        self.model = bestmodel

        yhat = np.dot(self.predict_proba(testX), r)

        return bestpr, yhat

    def trainepoch(self, X, y, nepoches=1):
        """Run `nepoches` passes of minibatch SGD over (X, y)."""
        self.model.train()
        for _ in range(self.nepoch, self.nepoch + nepoches):
            permutation = np.random.permutation(len(X))
            all_costs = []
            for i in range(0, len(X), self.batch_size):
                # forward
                idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda()
                Xbatch = X[idx]
                ybatch = y[idx]
                output = self.model(Xbatch)
                # loss
                loss = self.loss_fn(output, ybatch)
                all_costs.append(loss.item())
                # backward
                self.optimizer.zero_grad()
                loss.backward()
                # Update parameters
                self.optimizer.step()
        self.nepoch += nepoches

    def predict_proba(self, devX):
        """Return class-probability predictions for devX as a numpy array."""
        self.model.eval()
        probas = []
        with torch.no_grad():
            for i in range(0, len(devX), self.batch_size):
                Xbatch = devX[i:i + self.batch_size]
                if len(probas) == 0:
                    probas = self.model(Xbatch).data.cpu().numpy()
                else:
                    probas = np.concatenate((probas, self.model(Xbatch).data.cpu().numpy()), axis=0)
        return probas
|
|
@ -0,0 +1,246 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
"""
|
||||
Validation and classification
|
||||
(train) : inner-kfold classifier
|
||||
(train, test) : kfold classifier
|
||||
(train, dev, test) : split classifier
|
||||
|
||||
"""
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import logging
|
||||
import numpy as np
|
||||
from senteval.tools.classifier import MLP
|
||||
|
||||
import sklearn
|
||||
assert(sklearn.__version__ >= "0.18.0"), \
|
||||
"need to update sklearn to version >= 0.18.0"
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
|
||||
|
||||
def get_classif_name(classifier_config, usepytorch):
    """Build a short human-readable identifier for the classifier.

    Args:
        classifier_config (dict): pytorch classifier settings ('nhid',
            optional 'optim' and 'batch_size').
        usepytorch (bool): True for the pytorch MLP, False for sklearn.

    Returns:
        str: e.g. 'sklearn-LogReg' or 'pytorch-MLP-nhid100-adam-bs64'.
    """
    if not usepytorch:
        return 'sklearn-LogReg'
    hidden = classifier_config['nhid']
    optimizer = classifier_config.get('optim', 'adam')
    batch = classifier_config.get('batch_size', 64)
    return 'pytorch-MLP-nhid%s-%s-bs%s' % (hidden, optimizer, batch)
||||
# Pytorch version
|
||||
class InnerKFoldClassifier(object):
    """
    (train) split classifier : InnerKfold.

    Outer k-fold gives the train/test splits; an inner k-fold on each
    outer training split selects the regularization strength.
    """
    def __init__(self, X, y, config):
        # X: (n, featdim) features; y: (n,) integer labels.
        self.X = X
        self.y = y
        self.featdim = X.shape[1]
        self.nclasses = config['nclasses']
        self.seed = config['seed']
        self.devresults = []
        self.testresults = []
        self.usepytorch = config['usepytorch']
        self.classifier_config = config['classifier']
        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)

        # Number of folds for both the outer and inner loops.
        self.k = 5 if 'kfold' not in config else config['kfold']

    def run(self):
        """Run nested cross-validation; return (dev accuracy, test accuracy),
        each averaged over the outer folds (percent, 2 decimals)."""
        logging.info('Training {0} with (inner) {1}-fold cross-validation'
                     .format(self.modelname, self.k))

        # Candidate regularization strengths: l2reg for the pytorch MLP,
        # inverse-regularization C for sklearn LogisticRegression.
        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-2, 4, 1)]
        skf = StratifiedKFold(n_splits=self.k, shuffle=True, random_state=1111)
        innerskf = StratifiedKFold(n_splits=self.k, shuffle=True,
                                   random_state=1111)
        count = 0
        for train_idx, test_idx in skf.split(self.X, self.y):
            count += 1
            X_train, X_test = self.X[train_idx], self.X[test_idx]
            y_train, y_test = self.y[train_idx], self.y[test_idx]
            scores = []
            for reg in regs:
                regscores = []
                # Inner folds: estimate this reg's accuracy on held-out data.
                for inner_train_idx, inner_test_idx in innerskf.split(X_train, y_train):
                    X_in_train, X_in_test = X_train[inner_train_idx], X_train[inner_test_idx]
                    y_in_train, y_in_test = y_train[inner_train_idx], y_train[inner_test_idx]
                    if self.usepytorch:
                        clf = MLP(self.classifier_config, inputdim=self.featdim,
                                  nclasses=self.nclasses, l2reg=reg,
                                  seed=self.seed)
                        clf.fit(X_in_train, y_in_train,
                                validation_data=(X_in_test, y_in_test))
                    else:
                        clf = LogisticRegression(C=reg, random_state=self.seed)
                        clf.fit(X_in_train, y_in_train)
                    regscores.append(clf.score(X_in_test, y_in_test))
                scores.append(round(100*np.mean(regscores), 2))
            # Best regularization for this outer split.
            optreg = regs[np.argmax(scores)]
            logging.info('Best param found at split {0}: l2reg = {1} \
                with score {2}'.format(count, optreg, np.max(scores)))
            self.devresults.append(np.max(scores))

            # Retrain on the full outer training split with the best reg.
            if self.usepytorch:
                clf = MLP(self.classifier_config, inputdim=self.featdim,
                          nclasses=self.nclasses, l2reg=optreg,
                          seed=self.seed)

                clf.fit(X_train, y_train, validation_split=0.05)
            else:
                clf = LogisticRegression(C=optreg, random_state=self.seed)
                clf.fit(X_train, y_train)

            self.testresults.append(round(100*clf.score(X_test, y_test), 2))

        devaccuracy = round(np.mean(self.devresults), 2)
        testaccuracy = round(np.mean(self.testresults), 2)
        return devaccuracy, testaccuracy
||||
class KFoldClassifier(object):
    """
    (train, test) split classifier : cross-validation on train.

    Cross-validates the regularization strength on the training set,
    then retrains on all of train and evaluates once on test.
    """
    def __init__(self, train, test, config):
        # train/test: dicts with 'X' (n, featdim) and 'y' (n,) arrays.
        self.train = train
        self.test = test
        self.featdim = self.train['X'].shape[1]
        self.nclasses = config['nclasses']
        self.seed = config['seed']
        self.usepytorch = config['usepytorch']
        self.classifier_config = config['classifier']
        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)

        # Number of cross-validation folds.
        self.k = 5 if 'kfold' not in config else config['kfold']

    def run(self):
        """Return (dev accuracy, test accuracy, test predictions)."""
        # cross-validation
        logging.info('Training {0} with {1}-fold cross-validation'
                     .format(self.modelname, self.k))
        # Candidate regularization strengths: l2reg for the pytorch MLP,
        # inverse-regularization C for sklearn LogisticRegression.
        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-1, 6, 1)]
        skf = StratifiedKFold(n_splits=self.k, shuffle=True,
                              random_state=self.seed)
        scores = []

        for reg in regs:
            scanscores = []
            for train_idx, test_idx in skf.split(self.train['X'],
                                                 self.train['y']):
                # Split data
                X_train, y_train = self.train['X'][train_idx], self.train['y'][train_idx]

                X_test, y_test = self.train['X'][test_idx], self.train['y'][test_idx]

                # Train classifier
                if self.usepytorch:
                    clf = MLP(self.classifier_config, inputdim=self.featdim,
                              nclasses=self.nclasses, l2reg=reg,
                              seed=self.seed)
                    clf.fit(X_train, y_train, validation_data=(X_test, y_test))
                else:
                    clf = LogisticRegression(C=reg, random_state=self.seed)
                    clf.fit(X_train, y_train)
                score = clf.score(X_test, y_test)
                scanscores.append(score)
            # Append mean score
            scores.append(round(100*np.mean(scanscores), 2))

        # evaluation
        logging.info([('reg:' + str(regs[idx]), scores[idx])
                      for idx in range(len(scores))])
        optreg = regs[np.argmax(scores)]
        devaccuracy = np.max(scores)
        logging.info('Cross-validation : best param found is reg = {0} \
            with score {1}'.format(optreg, devaccuracy))

        # Retrain on the full training set with the best reg, then score
        # once on the held-out test set.
        logging.info('Evaluating...')
        if self.usepytorch:
            clf = MLP(self.classifier_config, inputdim=self.featdim,
                      nclasses=self.nclasses, l2reg=optreg,
                      seed=self.seed)
            clf.fit(self.train['X'], self.train['y'], validation_split=0.05)
        else:
            clf = LogisticRegression(C=optreg, random_state=self.seed)
            clf.fit(self.train['X'], self.train['y'])
        yhat = clf.predict(self.test['X'])

        testaccuracy = clf.score(self.test['X'], self.test['y'])
        testaccuracy = round(100*testaccuracy, 2)

        return devaccuracy, testaccuracy, yhat
|
||||
class SplitClassifier(object):
    """
    (train, valid, test) split classifier.

    Selects the regularization strength on the validation split, then
    retrains and evaluates once on the test split.
    """
    def __init__(self, X, y, config):
        # X/y: dicts keyed by 'train', 'valid', 'test'.
        self.X = X
        self.y = y
        self.nclasses = config['nclasses']
        self.featdim = self.X['train'].shape[1]
        self.seed = config['seed']
        self.usepytorch = config['usepytorch']
        self.classifier_config = config['classifier']
        self.cudaEfficient = False if 'cudaEfficient' not in config else \
            config['cudaEfficient']
        self.modelname = get_classif_name(self.classifier_config, self.usepytorch)
        # noreg: effectively disable regularization sweeping.
        self.noreg = False if 'noreg' not in config else config['noreg']
        self.config = config

    def run(self):
        """Return (validation accuracy, test accuracy) in percent."""
        logging.info('Training {0} with standard validation..'
                     .format(self.modelname))
        # Candidate regularization strengths: l2reg for the pytorch MLP,
        # inverse-regularization C for sklearn LogisticRegression.
        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-2, 4, 1)]
        if self.noreg:
            # A single (near-)zero regularization setting.
            regs = [1e-9 if self.usepytorch else 1e9]
        scores = []
        for reg in regs:
            if self.usepytorch:
                clf = MLP(self.classifier_config, inputdim=self.featdim,
                          nclasses=self.nclasses, l2reg=reg,
                          seed=self.seed, cudaEfficient=self.cudaEfficient)

                # TODO: Find a hack for reducing nb epoches in SNLI
                clf.fit(self.X['train'], self.y['train'],
                        validation_data=(self.X['valid'], self.y['valid']))
            else:
                clf = LogisticRegression(C=reg, random_state=self.seed)
                clf.fit(self.X['train'], self.y['train'])
            scores.append(round(100*clf.score(self.X['valid'],
                                self.y['valid']), 2))
        logging.info([('reg:'+str(regs[idx]), scores[idx])
                      for idx in range(len(scores))])
        optreg = regs[np.argmax(scores)]
        devaccuracy = np.max(scores)
        logging.info('Validation : best param found is reg = {0} with score \
            {1}'.format(optreg, devaccuracy))
        # NOTE: a dead `clf = LogisticRegression(...)` used to sit here;
        # it was unconditionally overwritten in both branches below and
        # has been removed.
        logging.info('Evaluating...')
        if self.usepytorch:
            clf = MLP(self.classifier_config, inputdim=self.featdim,
                      nclasses=self.nclasses, l2reg=optreg,
                      seed=self.seed, cudaEfficient=self.cudaEfficient)

            # TODO: Find a hack for reducing nb epoches in SNLI
            clf.fit(self.X['train'], self.y['train'],
                    validation_data=(self.X['valid'], self.y['valid']))
        else:
            clf = LogisticRegression(C=optreg, random_state=self.seed)
            clf.fit(self.X['train'], self.y['train'])

        testaccuracy = clf.score(self.X['test'], self.y['test'])
        testaccuracy = round(100*testaccuracy, 2)
        return devaccuracy, testaccuracy
|
|
@ -0,0 +1,89 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
'''
|
||||
TREC question-type classification
|
||||
'''
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import os
|
||||
import io
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
from senteval.tools.validation import KFoldClassifier
|
||||
|
||||
|
||||
class TRECEval(object):
    """SentEval transfer task: TREC question-type classification
    (6 classes), evaluated with a k-fold classifier on sentence
    embeddings produced by `batcher`."""

    def __init__(self, task_path, seed=1111):
        # Loads the standard TREC train/test label files from task_path.
        logging.info('***** Transfer task : TREC *****\n\n')
        self.seed = seed
        self.train = self.loadFile(os.path.join(task_path, 'train_5500.label'))
        self.test = self.loadFile(os.path.join(task_path, 'TREC_10.label'))

    def do_prepare(self, params, prepare):
        """Give `prepare` the full corpus (train + test samples) so it can
        build vocabularies / embedding lookups."""
        samples = self.train['X'] + self.test['X']
        return prepare(params, samples)

    def loadFile(self, fpath):
        """Parse a TREC label file into {'X': token lists, 'y': class ids}.

        Each line looks like 'HUM:ind Who wrote ... ?'; the coarse label
        precedes ':' and the first whitespace-separated token after it
        (the fine label) is dropped.
        """
        trec_data = {'X': [], 'y': []}
        tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2,
                   'HUM': 3, 'LOC': 4, 'NUM': 5}
        with io.open(fpath, 'r', encoding='latin-1') as f:
            for line in f:
                target, sample = line.strip().split(':', 1)
                # Drop the fine-grained label token, keep the question words.
                sample = sample.split(' ', 1)[1].split()
                assert target in tgt2idx, target
                trec_data['X'].append(sample)
                trec_data['y'].append(tgt2idx[target])
        return trec_data

    def run(self, params, batcher):
        """Embed train/test questions with `batcher`, train a k-fold
        classifier, and return dev/test accuracies and split sizes."""
        train_embeddings, test_embeddings = [], []

        # Sort to reduce padding
        sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']),
                                     key=lambda z: (len(z[0]), z[1]))
        train_samples = [x for (x, y) in sorted_corpus_train]
        train_labels = [y for (x, y) in sorted_corpus_train]

        sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']),
                                    key=lambda z: (len(z[0]), z[1]))
        test_samples = [x for (x, y) in sorted_corpus_test]
        test_labels = [y for (x, y) in sorted_corpus_test]

        # Get train embeddings
        for ii in range(0, len(train_labels), params.batch_size):
            batch = train_samples[ii:ii + params.batch_size]
            embeddings = batcher(params, batch)
            train_embeddings.append(embeddings)
        train_embeddings = np.vstack(train_embeddings)
        logging.info('Computed train embeddings')

        # Get test embeddings
        for ii in range(0, len(test_labels), params.batch_size):
            batch = test_samples[ii:ii + params.batch_size]
            embeddings = batcher(params, batch)
            test_embeddings.append(embeddings)
        test_embeddings = np.vstack(test_embeddings)
        logging.info('Computed test embeddings')

        config_classifier = {'nclasses': 6, 'seed': self.seed,
                             'usepytorch': params.usepytorch,
                             'classifier': params.classifier,
                             'kfold': params.kfold}
        clf = KFoldClassifier({'X': train_embeddings,
                               'y': np.array(train_labels)},
                              {'X': test_embeddings,
                               'y': np.array(test_labels)},
                              config_classifier)
        devacc, testacc, _ = clf.run()
        logging.debug('\nDev acc : {0} Test acc : {1} \
            for TREC\n'.format(devacc, testacc))
        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(self.train['X']), 'ntest': len(self.test['X'])}
|
@ -0,0 +1,95 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import numpy as np
|
||||
import re
|
||||
import inspect
|
||||
from torch import optim
|
||||
|
||||
|
||||
def create_dictionary(sentences):
    """Build a frequency-sorted vocabulary over tokenized sentences.

    Args:
        sentences: iterable of token lists.

    Returns:
        (id2word, word2id): list mapping index -> word, and dict mapping
        word -> index. The special tokens <s>, </s>, <p> are forced to
        the front by assigning them huge pseudo-counts.
    """
    counts = {}
    for sentence in sentences:
        for token in sentence:
            counts[token] = counts.get(token, 0) + 1

    # Special tokens get the largest counts so they sort first.
    counts['<s>'] = 1e9 + 4
    counts['</s>'] = 1e9 + 3
    counts['<p>'] = 1e9 + 2
    # words['<UNK>'] = 1e9 + 1

    ranked = sorted(counts.items(), key=lambda kv: -kv[1])  # inverse sort
    id2word = [w for w, _ in ranked]
    word2id = {w: i for i, (w, _) in enumerate(ranked)}

    return id2word, word2id
|
||||
def cosine(u, v):
    """Cosine similarity between two 1-D vectors u and v."""
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / denominator
|
||||
class dotdict(dict):
    """Dictionary whose keys can also be accessed as attributes.

    Reading a missing key via attribute access returns None
    (dict.get semantics) instead of raising AttributeError.
    """

    def __getattr__(self, name):
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        del self[name]
|
||||
def get_optimizer(s):
    """
    Parse optimizer parameters.
    Input should be of the form:
        - "sgd,lr=0.01"
        - "adagrad,lr=0.1,lr_decay=0.05"

    Returns:
        (optim_fn, optim_params): the torch.optim class and a dict of
        float-valued keyword arguments for its constructor.

    Raises:
        Exception: unknown method name, or a parameter the optimizer's
        constructor does not accept.
    """
    if "," in s:
        # Split "method,k1=v1,k2=v2" into the method and its parameters.
        method = s[:s.find(',')]
        optim_params = {}
        for x in s[s.find(',') + 1:].split(','):
            split = x.split('=')
            assert len(split) == 2
            # Values must be plain (optionally signed) decimal numbers.
            assert re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None
            optim_params[split[0]] = float(split[1])
    else:
        method = s
        optim_params = {}

    # Dispatch table instead of a long if/elif chain.
    optim_fns = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }
    if method not in optim_fns:
        raise Exception('Unknown optimization method: "%s"' % method)
    optim_fn = optim_fns[method]
    if method == 'sgd':
        # SGD historically had no default learning rate in torch.
        assert 'lr' in optim_params

    # Check that we give good parameters to the optimizer.
    # BUG FIX: inspect.getargspec was removed in Python 3.11 and raised
    # on functions with keyword-only arguments (which modern torch
    # optimizers have); inspect.signature handles both.
    expected_args = list(inspect.signature(optim_fn.__init__).parameters)
    assert expected_args[:2] == ['self', 'params']
    if not all(k in expected_args[2:] for k in optim_params.keys()):
        raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
            str(expected_args[2:]), str(optim_params.keys())))

    return optim_fn, optim_params
@ -0,0 +1,21 @@
|
|||
# Copyright (c) 2017-present, Facebook, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
#
|
||||
|
||||
import io
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
# Read the repository README as the long description shown on package
# indexes; utf-8 is forced regardless of the locale default.
with io.open('./README.md', encoding='utf-8') as f:
    readme = f.read()

# Package metadata for SentEval; the 'examples' directory is excluded
# from the installed distribution.
setup(
    name='SentEval',
    version='0.1.0',
    url='https://github.com/facebookresearch/SentEval',
    packages=find_packages(exclude=['examples']),
    license='Attribution-NonCommercial 4.0 International',
    long_description=readme,
)
|
|
@ -8,6 +8,9 @@ from sklearn.metrics import (
|
|||
f1_score,
|
||||
)
|
||||
|
||||
from numpy import corrcoef
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def eval_classification(actual, predicted, round_decimals=4):
|
||||
"""Returns common classification evaluation metrics.
|
||||
|
@ -32,3 +35,23 @@ def eval_classification(actual, predicted, round_decimals=4):
|
|||
f1_score(actual, predicted, average=None).round(round_decimals)
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def compute_correlation_coefficients(x, y=None):
    """
    Compute Pearson product-moment correlation coefficients.

    Args:
        x: array_like
            A 1-D or 2-D array containing multiple variables and observations.
            Each row of `x` represents a variable, and each column a single
            observation of all those variables.

        y: array_like, optional
            An additional set of variables and observations. `y` has the same
            shape as `x`.

    Returns:
        pd.DataFrame : A pandas dataframe from the correlation coefficient matrix of the variables.
    """
    matrix = corrcoef(x, y)
    return pd.DataFrame(matrix)
|
|
|
@ -0,0 +1,97 @@
|
|||
""" Official evaluation script for v1.1 of the SQuAD dataset. """
|
||||
from __future__ import print_function
|
||||
from collections import Counter
|
||||
import string
|
||||
import re
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
|
||||
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace.

    Applied (in order): lowercasing, punctuation stripping, article
    removal (a/an/the), whitespace collapsing — the canonical SQuAD v1.1
    answer normalization.
    """
    punctuation = set(string.punctuation)
    text = s.lower()
    text = ''.join(ch for ch in text if ch not in punctuation)
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())
|
||||
def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted answer and one ground truth.

    Both strings are normalized first; returns 0 when the normalized
    token multisets share nothing.
    """
    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)
|
||||
def exact_match_score(prediction, ground_truth):
    """True iff prediction and ground truth are identical after
    SQuAD answer normalization."""
    return normalize_answer(prediction) == normalize_answer(ground_truth)
|
||||
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Best value of metric_fn(prediction, gt) over all ground truths.

    Raises ValueError on an empty ground_truths list (max of empty).
    """
    return max(metric_fn(prediction, gt) for gt in ground_truths)
|
||||
def evaluate(dataset, predictions):
    """Aggregate SQuAD v1.1 exact-match and F1 over every QA pair.

    Args:
        dataset: the 'data' list of a SQuAD JSON file.
        predictions: dict mapping question id -> predicted answer string.

    Returns:
        {'exact_match': pct, 'f1': pct}; unanswered questions score 0
        and emit a warning on stderr.
    """
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                qid = qa['id']
                if qid not in predictions:
                    message = 'Unanswered question ' + qid + \
                              ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                ground_truths = [answer['text'] for answer in qa['answers']]
                prediction = predictions[qid]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}
|
||||
# CLI entry point: `python evaluate-v1.1.py <dataset.json> <predictions.json>`
# prints {'exact_match': ..., 'f1': ...} as JSON on stdout.
if __name__ == '__main__':
    expected_version = '1.1'
    parser = argparse.ArgumentParser(
        description='Evaluation for SQuAD ' + expected_version)
    parser.add_argument('dataset_file', help='Dataset file')
    parser.add_argument('prediction_file', help='Prediction File')
    args = parser.parse_args()
    with open(args.dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        # A version mismatch only warns; evaluation still proceeds.
        if (dataset_json['version'] != expected_version):
            print('Evaluation expects v-' + expected_version +
                  ', but got dataset with v-' + dataset_json['version'],
                  file=sys.stderr)
        dataset = dataset_json['data']
    with open(args.prediction_file) as prediction_file:
        predictions = json.load(prediction_file)
    print(json.dumps(evaluate(dataset, predictions)))

# Original source:
# https://github.com/allenai/bi-att-flow/blob/498c8026d92a8bcf0286e2d216d092d444d02d76/squad/evaluate-v1.1.py
|
|
@ -1,43 +1,49 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import itertools
|
||||
import pandas as pd
|
||||
from collections import OrderedDict
|
||||
from copy import deepcopy
|
||||
|
||||
|
||||
class SentEvalRunner:
|
||||
def __init__(self, path_to_senteval="."):
|
||||
"""Wrapper class interfacing with the original implementation of SentEval
|
||||
class SentEvalConfig:
|
||||
|
||||
"""Object to store static properties of senteval experiments
|
||||
|
||||
Attributes:
|
||||
model_params (dict): model parameters that stay consistent across all runs
|
||||
senteval_params (dict): senteval parameters that stay consistent across all runs
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_params,
|
||||
senteval_params,
|
||||
):
|
||||
"""Summary
|
||||
|
||||
Args:
|
||||
path_to_senteval (str, optional): Path to the SentEval source code.
|
||||
model_params (dict): model parameters that stay consistent across all runs
|
||||
senteval_params (dict): senteval parameters that stay consistent across all runs
|
||||
"""
|
||||
self.path_to_senteval = path_to_senteval
|
||||
self.params_senteval = {}
|
||||
self.model_params = model_params
|
||||
self.senteval_params = senteval_params
|
||||
|
||||
def set_transfer_data_path(self, relative_path):
|
||||
"""Set the datapath that contains the datasets for the SentEval transfer tasks
|
||||
@property
|
||||
def model_params(self):
|
||||
return self._model_params
|
||||
|
||||
Args:
|
||||
relative_path (str): Relative datapath
|
||||
"""
|
||||
self.transfer_data_path = os.path.join(
|
||||
self.path_to_senteval, relative_path
|
||||
)
|
||||
self.params_senteval["task_path"] = self.transfer_data_path
|
||||
@model_params.setter
|
||||
def model_params(self, model_params):
|
||||
self._model_params = model_params
|
||||
|
||||
def set_transfer_tasks(self, task_list):
|
||||
"""Set the transfer tasks to use for evaluation
|
||||
|
||||
Args:
|
||||
task_list (list(str)): List of downstream transfer tasks
|
||||
"""
|
||||
self.transfer_tasks = task_list
|
||||
|
||||
def set_model(self, model):
|
||||
"""Set the model to evaluate"""
|
||||
self.params_senteval["model"] = model
|
||||
|
||||
def set_params(self, params):
|
||||
self.params_senteval = dict(self.params_senteval, **params)
|
||||
def append_senteval_params(self, params):
|
||||
"""Util to append any params to senteval_params after initialization"""
|
||||
self.senteval_params = dict(self.senteval_params, **params)
|
||||
|
||||
classifying_tasks = {
|
||||
"MR",
|
||||
|
@ -54,7 +60,7 @@ class SentEvalRunner:
|
|||
|
||||
if any(t in classifying_tasks for t in self.transfer_tasks):
|
||||
try:
|
||||
a = "classifier" in self.params_senteval
|
||||
a = "classifier" in self.senteval_params
|
||||
if not a:
|
||||
raise ValueError(
|
||||
"Include param['classifier'] to run task {}".format(t)
|
||||
|
@ -68,7 +74,7 @@ class SentEvalRunner:
|
|||
"tenacity",
|
||||
"epoch_size",
|
||||
)
|
||||
in self.params_senteval["classifier"].keys()
|
||||
in self.senteval_params["classifier"].keys()
|
||||
)
|
||||
if not b:
|
||||
raise ValueError(
|
||||
|
@ -78,50 +84,3 @@ class SentEvalRunner:
|
|||
)
|
||||
except ValueError as ve:
|
||||
print(ve)
|
||||
|
||||
def run(self, batcher_func, prepare_func):
|
||||
"""Run the SentEval engine on the model on the transfer tasks
|
||||
|
||||
Args:
|
||||
batcher_func (function): Function required by SentEval that transforms a batch of text sentences into
|
||||
sentence embeddings
|
||||
prepare_func (function): Function that sees the whole dataset of each task and can thus construct the word
|
||||
vocabulary, the dictionary of word vectors, etc
|
||||
|
||||
Returns:
|
||||
dict: Dictionary of results
|
||||
"""
|
||||
sys.path.insert(0, self.path_to_senteval)
|
||||
import senteval
|
||||
|
||||
se = senteval.engine.SE(
|
||||
self.params_senteval, batcher_func, prepare_func
|
||||
)
|
||||
|
||||
return se.eval(self.transfer_tasks)
|
||||
|
||||
def log_mean(self, results, selected_metrics=[], round_decimals=3):
|
||||
"""Log the means of selected metrics of the transfer tasks
|
||||
|
||||
Args:
|
||||
results (dict): Results from the SentEval evaluation engine
|
||||
selected_metrics (list(str), optional): List of metric names
|
||||
round_decimals (int, optional): Number of decimal digits to round to; defaults to 3
|
||||
|
||||
Returns:
|
||||
pd.DataFrame table of formatted results
|
||||
"""
|
||||
data = []
|
||||
for task in self.transfer_tasks:
|
||||
if "all" in results[task]:
|
||||
row = [
|
||||
results[task]["all"][metric]["mean"]
|
||||
for metric in selected_metrics
|
||||
]
|
||||
else:
|
||||
row = [results[task][metric] for metric in selected_metrics]
|
||||
data.append(row)
|
||||
table = pd.DataFrame(
|
||||
data=data, columns=selected_metrics, index=self.transfer_tasks
|
||||
)
|
||||
return table.round(round_decimals)
|
||||
|
|
|
@ -17,7 +17,7 @@ tqdm
|
|||
|
||||
## How to use
|
||||
|
||||
We provide a notebook tutorial [here](../../scenarios/interpret_NLP_models/explain_simple_model.ipynb) to help you start quickly. The important class we need to utilize is the `Interpreter` in [Interpreter.py](Interpreter.py). Given any input word embeddings and a forward function $\Phi$ that transforms the word embeddings $\bf x$ to a hidden state $\bf s$, Interpreter helps understand how much each input word contributes to the hidden state. Suppose the $\Phi$, the input $\bf x$ and the input words are defined as:
|
||||
We provide a notebook tutorial [here](../../scenarios/interpret_NLP_models/understand_models.ipynb) to help you start quickly. The important class we need to utilize is the `Interpreter` in [Interpreter.py](Interpreter.py). Given any input word embeddings and a forward function $\Phi$ that transforms the word embeddings $\bf x$ to a hidden state $\bf s$, Interpreter helps understand how much each input word contributes to the hidden state. Suppose the $\Phi$, the input $\bf x$ and the input words are defined as:
|
||||
```
|
||||
import torch
|
||||
|
||||
|
|
|
@ -6,42 +6,44 @@
|
|||
# https://github.com/huggingface/pytorch-transformers/blob/master/examples
|
||||
# /run_glue.py
|
||||
|
||||
from enum import Enum
|
||||
import csv
|
||||
import linecache
|
||||
import subprocess
|
||||
import warnings
|
||||
from collections import Iterable
|
||||
from enum import Enum
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
|
||||
from torch.utils.data import (
|
||||
DataLoader,
|
||||
Dataset,
|
||||
RandomSampler,
|
||||
SequentialSampler,
|
||||
TensorDataset,
|
||||
ConcatDataset,
|
||||
)
|
||||
from tqdm import tqdm
|
||||
|
||||
# Max supported sequence length
|
||||
BERT_MAX_LEN = 512
|
||||
|
||||
|
||||
class Language(Enum):
|
||||
class Language(str, Enum):
|
||||
"""An enumeration of the supported pretrained models and languages."""
|
||||
|
||||
ENGLISH = "bert-base-uncased"
|
||||
ENGLISHCASED = "bert-base-cased"
|
||||
ENGLISHLARGE = "bert-large-uncased"
|
||||
ENGLISHLARGECASED = "bert-large-cased"
|
||||
ENGLISHLARGEWWM = "bert-large-uncased-whole-word-masking"
|
||||
ENGLISHLARGECASEDWWM = "bert-large-cased-whole-word-masking"
|
||||
CHINESE = "bert-base-chinese"
|
||||
MULTILINGUAL = "bert-base-multilingual-cased"
|
||||
ENGLISH: str = "bert-base-uncased"
|
||||
ENGLISHCASED: str = "bert-base-cased"
|
||||
ENGLISHLARGE: str = "bert-large-uncased"
|
||||
ENGLISHLARGECASED: str = "bert-large-cased"
|
||||
ENGLISHLARGEWWM: str = "bert-large-uncased-whole-word-masking"
|
||||
ENGLISHLARGECASEDWWM: str = "bert-large-cased-whole-word-masking"
|
||||
CHINESE: str = "bert-base-chinese"
|
||||
MULTILINGUAL: str = "bert-base-multilingual-cased"
|
||||
|
||||
|
||||
class Tokenizer:
|
||||
def __init__(
|
||||
self, language=Language.ENGLISH, to_lower=False, cache_dir="."
|
||||
):
|
||||
def __init__(self, language=Language.ENGLISH, to_lower=False, cache_dir="."):
|
||||
"""Initializes the underlying pretrained BERT tokenizer.
|
||||
|
||||
Args:
|
||||
|
@ -51,7 +53,7 @@ class Tokenizer:
|
|||
Defaults to ".".
|
||||
"""
|
||||
self.tokenizer = BertTokenizer.from_pretrained(
|
||||
language.value, do_lower_case=to_lower, cache_dir=cache_dir
|
||||
language, do_lower_case=to_lower, cache_dir=cache_dir
|
||||
)
|
||||
self.language = language
|
||||
|
||||
|
@ -69,10 +71,7 @@ class Tokenizer:
|
|||
if isinstance(text[0], str):
|
||||
return [self.tokenizer.tokenize(x) for x in tqdm(text)]
|
||||
else:
|
||||
return [
|
||||
[self.tokenizer.tokenize(x) for x in sentences]
|
||||
for sentences in tqdm(text)
|
||||
]
|
||||
return [[self.tokenizer.tokenize(x) for x in sentences] for sentences in tqdm(text)]
|
||||
|
||||
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
|
||||
"""Truncates a sequence pair in place to the maximum length."""
|
||||
|
@ -121,11 +120,7 @@ class Tokenizer:
|
|||
list of token type id lists
|
||||
"""
|
||||
if max_len > BERT_MAX_LEN:
|
||||
print(
|
||||
"setting max_len to max allowed tokens: {}".format(
|
||||
BERT_MAX_LEN
|
||||
)
|
||||
)
|
||||
print("setting max_len to max allowed tokens: {}".format(BERT_MAX_LEN))
|
||||
max_len = BERT_MAX_LEN
|
||||
|
||||
if isinstance(tokens[0][0], str):
|
||||
|
@ -141,23 +136,16 @@ class Tokenizer:
|
|||
# construct token_type_ids
|
||||
# [[0, 0, 0, 0, ... 0, 1, 1, 1, ... 1], [0, 0, 0, ..., 1, 1, ]
|
||||
token_type_ids = [
|
||||
[[i] * len(sentence) for i, sentence in enumerate(example)]
|
||||
for example in tokens
|
||||
[[i] * len(sentence) for i, sentence in enumerate(example)] for example in tokens
|
||||
]
|
||||
# merge sentences
|
||||
tokens = [
|
||||
[token for sentence in example for token in sentence]
|
||||
for example in tokens
|
||||
]
|
||||
tokens = [[token for sentence in example for token in sentence] for example in tokens]
|
||||
# prefix with [0] for [CLS]
|
||||
token_type_ids = [
|
||||
[0] + [i for sentence in example for i in sentence]
|
||||
for example in token_type_ids
|
||||
[0] + [i for sentence in example for i in sentence] for example in token_type_ids
|
||||
]
|
||||
# pad sequence
|
||||
token_type_ids = [
|
||||
x + [0] * (max_len - len(x)) for x in token_type_ids
|
||||
]
|
||||
token_type_ids = [x + [0] * (max_len - len(x)) for x in token_type_ids]
|
||||
|
||||
tokens = [["[CLS]"] + x for x in tokens]
|
||||
# convert tokens to indices
|
||||
|
@ -168,13 +156,65 @@ class Tokenizer:
|
|||
input_mask = [[min(1, x) for x in y] for y in tokens]
|
||||
return tokens, input_mask, token_type_ids
|
||||
|
||||
def preprocess_encoder_tokens(self, tokens, max_len=BERT_MAX_LEN):
|
||||
"""Preprocessing of input tokens:
|
||||
- add BERT sentence markers ([CLS] and [SEP])
|
||||
- map tokens to token indices in the BERT vocabulary
|
||||
- pad and truncate sequences
|
||||
- create an input_mask
|
||||
- create token type ids, aka. segment ids
|
||||
|
||||
Args:
|
||||
tokens (list): List of token lists to preprocess.
|
||||
max_len (int, optional): Maximum number of tokens
|
||||
(documents will be truncated or padded).
|
||||
Defaults to 512.
|
||||
Returns:
|
||||
tuple: A tuple containing the following four lists
|
||||
list of preprocesssed token lists
|
||||
list of input id lists
|
||||
list of input mask lists
|
||||
list of token type id lists
|
||||
"""
|
||||
if max_len > BERT_MAX_LEN:
|
||||
print("setting max_len to max allowed tokens: {}".format(BERT_MAX_LEN))
|
||||
max_len = BERT_MAX_LEN
|
||||
|
||||
if isinstance(tokens[0][0], str):
|
||||
tokens = [x[0 : max_len - 2] + ["[SEP]"] for x in tokens]
|
||||
token_type_ids = None
|
||||
else:
|
||||
# get tokens for each sentence [[t00, t01, ...] [t10, t11,... ]]
|
||||
tokens = [
|
||||
self._truncate_seq_pair(sentence[0], sentence[1], max_len - 3)
|
||||
for sentence in tokens
|
||||
]
|
||||
|
||||
# construct token_type_ids
|
||||
# [[0, 0, 0, 0, ... 0, 1, 1, 1, ... 1], [0, 0, 0, ..., 1, 1, ]
|
||||
token_type_ids = [
|
||||
[[i] * len(sentence) for i, sentence in enumerate(example)] for example in tokens
|
||||
]
|
||||
# merge sentences
|
||||
tokens = [[token for sentence in example for token in sentence] for example in tokens]
|
||||
# prefix with [0] for [CLS]
|
||||
token_type_ids = [
|
||||
[0] + [i for sentence in example for i in sentence] for example in token_type_ids
|
||||
]
|
||||
# pad sequence
|
||||
token_type_ids = [x + [0] * (max_len - len(x)) for x in token_type_ids]
|
||||
|
||||
tokens = [["[CLS]"] + x for x in tokens]
|
||||
# convert tokens to indices
|
||||
input_ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokens]
|
||||
# pad sequence
|
||||
input_ids = [x + [0] * (max_len - len(x)) for x in input_ids]
|
||||
# create input mask
|
||||
input_mask = [[min(1, x) for x in y] for y in input_ids]
|
||||
return tokens, input_ids, input_mask, token_type_ids
|
||||
|
||||
def tokenize_ner(
|
||||
self,
|
||||
text,
|
||||
max_len=BERT_MAX_LEN,
|
||||
labels=None,
|
||||
label_map=None,
|
||||
trailing_piece_tag="X",
|
||||
self, text, max_len=BERT_MAX_LEN, labels=None, label_map=None, trailing_piece_tag="X"
|
||||
):
|
||||
"""
|
||||
Tokenize and preprocesses input word lists, involving the following steps
|
||||
|
@ -232,18 +272,12 @@ class Tokenizer:
|
|||
return isinstance(obj, Iterable) and not isinstance(obj, str)
|
||||
|
||||
if max_len > BERT_MAX_LEN:
|
||||
warnings.warn(
|
||||
"setting max_len to max allowed tokens: {}".format(
|
||||
BERT_MAX_LEN
|
||||
)
|
||||
)
|
||||
warnings.warn("setting max_len to max allowed tokens: {}".format(BERT_MAX_LEN))
|
||||
max_len = BERT_MAX_LEN
|
||||
|
||||
if not _is_iterable_but_not_string(text):
|
||||
# The input text must be an non-string Iterable
|
||||
raise ValueError(
|
||||
"Input text must be an iterable and not a string."
|
||||
)
|
||||
raise ValueError("Input text must be an iterable and not a string.")
|
||||
else:
|
||||
# If the input text is a single list of words, convert it to
|
||||
# list of lists for later iteration
|
||||
|
@ -251,9 +285,7 @@ class Tokenizer:
|
|||
text = [text]
|
||||
if labels is not None:
|
||||
if not _is_iterable_but_not_string(labels):
|
||||
raise ValueError(
|
||||
"labels must be an iterable and not a string."
|
||||
)
|
||||
raise ValueError("labels must be an iterable and not a string.")
|
||||
else:
|
||||
if not _is_iterable_but_not_string(labels[0]):
|
||||
labels = [labels]
|
||||
|
@ -316,10 +348,7 @@ class Tokenizer:
|
|||
new_labels += label_padding
|
||||
|
||||
trailing_token_mask_all.append(
|
||||
[
|
||||
True if label != trailing_piece_tag else False
|
||||
for label in new_labels
|
||||
]
|
||||
[True if label != trailing_piece_tag else False for label in new_labels]
|
||||
)
|
||||
|
||||
if label_map:
|
||||
|
@ -332,22 +361,13 @@ class Tokenizer:
|
|||
label_ids_all.append(label_ids)
|
||||
|
||||
if label_available:
|
||||
return (
|
||||
input_ids_all,
|
||||
input_mask_all,
|
||||
trailing_token_mask_all,
|
||||
label_ids_all,
|
||||
)
|
||||
return (input_ids_all, input_mask_all, trailing_token_mask_all, label_ids_all)
|
||||
else:
|
||||
return input_ids_all, input_mask_all, trailing_token_mask_all, None
|
||||
|
||||
|
||||
def create_data_loader(
|
||||
input_ids,
|
||||
input_mask,
|
||||
label_ids=None,
|
||||
sample_method="random",
|
||||
batch_size=32,
|
||||
input_ids, input_mask, label_ids=None, sample_method="random", batch_size=32
|
||||
):
|
||||
"""
|
||||
Create a dataloader for sampling and serving data batches.
|
||||
|
@ -377,9 +397,7 @@ def create_data_loader(
|
|||
|
||||
if label_ids:
|
||||
label_ids_tensor = torch.tensor(label_ids, dtype=torch.long)
|
||||
tensor_data = TensorDataset(
|
||||
input_ids_tensor, input_mask_tensor, label_ids_tensor
|
||||
)
|
||||
tensor_data = TensorDataset(input_ids_tensor, input_mask_tensor, label_ids_tensor)
|
||||
else:
|
||||
tensor_data = TensorDataset(input_ids_tensor, input_mask_tensor)
|
||||
|
||||
|
@ -389,12 +407,73 @@ def create_data_loader(
|
|||
sampler = SequentialSampler(tensor_data)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Invalid sample_method value, accepted values are: "
|
||||
"random and sequential."
|
||||
"Invalid sample_method value, accepted values are: " "random and sequential."
|
||||
)
|
||||
|
||||
dataloader = DataLoader(
|
||||
tensor_data, sampler=sampler, batch_size=batch_size
|
||||
)
|
||||
dataloader = DataLoader(tensor_data, sampler=sampler, batch_size=batch_size)
|
||||
|
||||
return dataloader
|
||||
|
||||
|
||||
class TextDataset(Dataset):
|
||||
"""
|
||||
Characterizes a dataset for PyTorch which can be used to load a file containing multiple rows
|
||||
where each row is a training example.
|
||||
"""
|
||||
|
||||
def __init__(self, filename):
|
||||
"""
|
||||
Initialization. We set the filename and number of lines in the file.
|
||||
Args:
|
||||
filename(str): Name of the file.
|
||||
"""
|
||||
self._filename = filename
|
||||
self._total_data = (
|
||||
int(subprocess.check_output("wc -l " + filename, shell=True).split()[0]) - 1
|
||||
)
|
||||
|
||||
def __len__(self):
|
||||
"""Denotes the total number of samples in the file."""
|
||||
return self._total_data
|
||||
|
||||
@staticmethod
|
||||
def _cast(row):
|
||||
return [int(x.strip()) for x in row]
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""
|
||||
Generates one sample of data. We assume that the last column is label here. We use
|
||||
linecache to load files lazily.
|
||||
|
||||
Args:
|
||||
index(int): Index of the test case.
|
||||
|
||||
Returns(list, list, int): Returns the tokens, mask and label for a single item.
|
||||
|
||||
"""
|
||||
line = linecache.getline(self._filename, index + 1)
|
||||
row = next(csv.reader([line]))
|
||||
|
||||
tokens = self._cast(row[0][1:-1].split(","))
|
||||
mask = self._cast(row[1][1:-1].split(","))
|
||||
|
||||
return (
|
||||
torch.tensor(tokens, dtype=torch.long),
|
||||
torch.tensor(mask, dtype=torch.long),
|
||||
torch.tensor(int(row[2]), dtype=torch.long),
|
||||
)
|
||||
|
||||
|
||||
def get_dataset_multiple_files(files):
|
||||
""" Get dataset from multiple files
|
||||
|
||||
Args:
|
||||
files(list): List of paths to the files.
|
||||
|
||||
Returns:
|
||||
|
||||
torch.utils.data.Dataset : A combined dataset of all files in the directory.
|
||||
|
||||
"""
|
||||
datasets = [TextDataset(x) for x in files]
|
||||
return ConcatDataset(datasets)
|
||||
|
|
|
@ -44,7 +44,7 @@ class BERTSequenceClassifier:
|
|||
|
||||
# create classifier
|
||||
self.model = BertForSequenceClassification.from_pretrained(
|
||||
language.value, cache_dir=cache_dir, num_labels=num_labels
|
||||
language, cache_dir=cache_dir, num_labels=num_labels
|
||||
)
|
||||
|
||||
def fit(
|
||||
|
|
|
@ -0,0 +1,325 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
import logging
|
||||
|
||||
import horovod.torch as hvd
|
||||
import numpy as np
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import TensorDataset
|
||||
import torch.utils.data.distributed
|
||||
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
|
||||
from pytorch_pretrained_bert.optimization import BertAdam
|
||||
from tqdm import tqdm
|
||||
|
||||
from utils_nlp.models.bert.common import Language
|
||||
|
||||
from utils_nlp.models.bert.common import get_dataset_multiple_files
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_to_device
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
hvd.init()
|
||||
torch.manual_seed(42)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
# Horovod: pin GPU to local rank.
|
||||
torch.cuda.set_device(hvd.local_rank())
|
||||
torch.cuda.manual_seed(42)
|
||||
|
||||
|
||||
class BERTSequenceDistClassifier:
|
||||
"""Distributed BERT-based sequence classifier"""
|
||||
|
||||
def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir="."):
|
||||
"""Initializes the classifier and the underlying pretrained model.
|
||||
|
||||
Args:
|
||||
language (Language, optional): The pretrained model's language.
|
||||
Defaults to Language.ENGLISH.
|
||||
num_labels (int, optional): The number of unique labels in the
|
||||
training data. Defaults to 2.
|
||||
cache_dir (str, optional): Location of BERT's cache directory.
|
||||
Defaults to ".".
|
||||
"""
|
||||
if num_labels < 2:
|
||||
raise ValueError("Number of labels should be at least 2.")
|
||||
|
||||
self.language = language
|
||||
self.num_labels = num_labels
|
||||
self.cache_dir = cache_dir
|
||||
self.kwargs = (
|
||||
{"num_workers": 1, "pin_memory": True}
|
||||
if torch.cuda.is_available()
|
||||
else {}
|
||||
)
|
||||
|
||||
# create classifier
|
||||
self.model = BertForSequenceClassification.from_pretrained(
|
||||
language.value, num_labels=num_labels
|
||||
)
|
||||
|
||||
def fit(
|
||||
self,
|
||||
token_ids,
|
||||
input_mask,
|
||||
labels,
|
||||
token_type_ids=None,
|
||||
input_files,
|
||||
num_gpus=1,
|
||||
num_epochs=1,
|
||||
batch_size=32,
|
||||
lr=2e-5,
|
||||
warmup_proportion=None,
|
||||
verbose=True,
|
||||
fp16_allreduce=False,
|
||||
):
|
||||
"""fine-tunes the bert classifier using the given training data.
|
||||
|
||||
args:
|
||||
input_files(list, required): list of paths to the training data files.
|
||||
token_ids (list): List of training token id lists.
|
||||
input_mask (list): List of input mask lists.
|
||||
labels (list): List of training labels.
|
||||
token_type_ids (list, optional): List of lists. Each sublist
|
||||
contains segment ids indicating if the token belongs to
|
||||
the first sentence(0) or second sentence(1). Only needed
|
||||
for two-sentence tasks.
|
||||
num_gpus (int, optional): the number of gpus to use.
|
||||
if none is specified, all available gpus
|
||||
will be used. defaults to none.
|
||||
num_epochs (int, optional): number of training epochs.
|
||||
defaults to 1.
|
||||
batch_size (int, optional): training batch size. defaults to 32.
|
||||
lr (float): learning rate of the adam optimizer. defaults to 2e-5.
|
||||
warmup_proportion (float, optional): proportion of training to
|
||||
perform linear learning rate warmup for. e.g., 0.1 = 10% of
|
||||
training. defaults to none.
|
||||
verbose (bool, optional): if true, shows the training progress and
|
||||
loss values. defaults to true.
|
||||
fp16_allreduce(bool, optional)L if true, use fp16 compression during allreduce
|
||||
"""
|
||||
|
||||
if input_files is not None:
|
||||
train_dataset = get_dataset_multiple_files(input_files)
|
||||
else:
|
||||
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
|
||||
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
|
||||
labels_tensor = torch.tensor(labels, dtype=torch.long)
|
||||
|
||||
if token_type_ids:
|
||||
token_type_ids_tensor = torch.tensor(
|
||||
token_type_ids, dtype=torch.long
|
||||
)
|
||||
train_dataset = TensorDataset(
|
||||
token_ids_tensor,
|
||||
input_mask_tensor,
|
||||
token_type_ids_tensor,
|
||||
labels_tensor,
|
||||
)
|
||||
else:
|
||||
train_dataset = TensorDataset(
|
||||
token_ids_tensor, input_mask_tensor, labels_tensor
|
||||
)
|
||||
|
||||
train_sampler = torch.utils.data.distributed.DistributedSampler(
|
||||
train_dataset, num_replicas=hvd.size(), rank=hvd.rank()
|
||||
)
|
||||
train_loader = torch.utils.data.DataLoader(
|
||||
train_dataset,
|
||||
batch_size=batch_size,
|
||||
sampler=train_sampler,
|
||||
**self.kwargs
|
||||
)
|
||||
|
||||
device = get_device()
|
||||
self.model.cuda()
|
||||
|
||||
hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
|
||||
# hvd.broadcast_optimizer_state(optimizer, root_rank=0)
|
||||
|
||||
# define loss function
|
||||
loss_func = nn.CrossEntropyLoss().to(device)
|
||||
|
||||
# define optimizer and model parameters
|
||||
param_optimizer = list(self.model.named_parameters())
|
||||
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [
|
||||
p
|
||||
for n, p in param_optimizer
|
||||
if not any(nd in n for nd in no_decay)
|
||||
],
|
||||
"weight_decay": 0.01,
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
p
|
||||
for n, p in param_optimizer
|
||||
if any(nd in n for nd in no_decay)
|
||||
]
|
||||
},
|
||||
]
|
||||
|
||||
num_examples = len(train_dataset)
|
||||
num_batches = int(num_examples / batch_size)
|
||||
num_train_optimization_steps = num_batches * num_epochs
|
||||
|
||||
if warmup_proportion is None:
|
||||
optimizer = BertAdam(
|
||||
optimizer_grouped_parameters, lr=lr * hvd.size()
|
||||
)
|
||||
else:
|
||||
optimizer = BertAdam(
|
||||
optimizer_grouped_parameters,
|
||||
lr=lr * hvd.size(),
|
||||
t_total=num_train_optimization_steps,
|
||||
warmup=warmup_proportion,
|
||||
)
|
||||
|
||||
# Horovod: (optional) compression algorithm.
|
||||
compression = (
|
||||
hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none
|
||||
)
|
||||
|
||||
# Horovod: wrap optimizer with DistributedOptimizer.
|
||||
optimizer = hvd.DistributedOptimizer(
|
||||
optimizer,
|
||||
named_parameters=self.model.named_parameters(),
|
||||
compression=compression,
|
||||
)
|
||||
|
||||
# Horovod: set epoch to sampler for shuffling.
|
||||
for epoch in range(num_epochs):
|
||||
self.model.train()
|
||||
train_sampler.set_epoch(epoch)
|
||||
for batch_idx, batch in enumerate(train_loader):
|
||||
|
||||
if token_type_ids:
|
||||
x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
|
||||
t.to(device) for t in batch
|
||||
)
|
||||
else:
|
||||
token_type_ids_batch = None
|
||||
x_batch, mask_batch, y_batch = tuple(
|
||||
t.to(device) for t in batch
|
||||
)
|
||||
|
||||
optimizer.zero_grad()
|
||||
|
||||
output = self.model(
|
||||
input_ids=x_batch, attention_mask=mask_batch, labels=None
|
||||
)
|
||||
|
||||
loss = loss_func(output, y_batch).mean()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
if verbose and (batch_idx % ((num_batches // 10) + 1)) == 0:
|
||||
# Horovod: use train_sampler to determine the number of examples in
|
||||
# this worker's partition.
|
||||
print(
|
||||
"Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
|
||||
epoch,
|
||||
batch_idx * len(x_batch),
|
||||
len(train_sampler),
|
||||
100.0 * batch_idx / len(train_loader),
|
||||
loss.item(),
|
||||
)
|
||||
)
|
||||
|
||||
# empty cache
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def predict(
|
||||
self,
|
||||
input_files = None,
|
||||
token_ids,
|
||||
input_mask,
|
||||
token_type_ids=None,
|
||||
input_files, num_gpus=1, batch_size=32, probabilities=False
|
||||
):
|
||||
"""Scores the given set of train files and returns the predicted classes.
|
||||
|
||||
Args:
|
||||
input_files(list, required): list of paths to the test data files.
|
||||
token_ids (list): List of training token lists.
|
||||
input_mask (list): List of input mask lists.
|
||||
token_type_ids (list, optional): List of lists. Each sublist
|
||||
contains segment ids indicating if the token belongs to
|
||||
the first sentence(0) or second sentence(1). Only needed
|
||||
for two-sentence tasks.
|
||||
num_gpus (int, optional): The number of gpus to use.
|
||||
If None is specified, all available GPUs
|
||||
will be used. Defaults to None.
|
||||
batch_size (int, optional): Scoring batch size. Defaults to 32.
|
||||
probabilities (bool, optional):
|
||||
If True, the predicted probability distribution
|
||||
is also returned. Defaults to False.
|
||||
Returns:
|
||||
1darray, dict(1darray, 1darray, ndarray): Predicted classes and target labels or
|
||||
a dictionary with classes, target labels, probabilities) if probabilities is True.
|
||||
"""
|
||||
|
||||
if input_files is not None:
|
||||
test_dataset = get_dataset_multiple_files(input_files)
|
||||
|
||||
else:
|
||||
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
|
||||
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
|
||||
|
||||
if token_type_ids:
|
||||
token_type_ids_tensor = torch.tensor(
|
||||
token_type_ids, dtype=torch.long
|
||||
)
|
||||
test_dataset = TensorDataset(
|
||||
token_ids_tensor, input_mask_tensor, token_type_ids_tensor
|
||||
)
|
||||
else:
|
||||
test_dataset = TensorDataset(token_ids_tensor, input_mask_tensor)
|
||||
|
||||
# Horovod: use DistributedSampler to partition the test data.
|
||||
test_sampler = torch.utils.data.sampler.SequentialSampler(test_dataset)
|
||||
|
||||
test_loader = torch.utils.data.DataLoader(
|
||||
test_dataset,
|
||||
batch_size=batch_size,
|
||||
sampler=test_sampler,
|
||||
**self.kwargs
|
||||
)
|
||||
|
||||
device = get_device()
|
||||
self.model = move_to_device(self.model, device, num_gpus)
|
||||
self.model.eval()
|
||||
preds = []
|
||||
labels_test = []
|
||||
|
||||
with tqdm(total=len(test_loader)) as pbar:
|
||||
for i, (tokens, mask, target) in enumerate(test_loader):
|
||||
if torch.cuda.is_available():
|
||||
tokens, mask, target = (
|
||||
tokens.cuda(),
|
||||
mask.cuda(),
|
||||
target.cuda(),
|
||||
)
|
||||
|
||||
with torch.no_grad():
|
||||
p_batch = self.model(
|
||||
input_ids=tokens, attention_mask=mask, labels=None
|
||||
)
|
||||
preds.append(p_batch.cpu())
|
||||
labels_test.append(target.cpu())
|
||||
if i % batch_size == 0:
|
||||
pbar.update(batch_size)
|
||||
|
||||
preds = np.concatenate(preds)
|
||||
labels_test = np.concatenate(labels_test)
|
||||
|
||||
if probabilities:
|
||||
return {
|
||||
"Predictions": preds.argmax(axis=1),
|
||||
"Target": labels_test,
|
||||
"classes probabilities": nn.Softmax(dim=1)(
|
||||
torch.Tensor(preds)
|
||||
).numpy(),
|
||||
}
|
||||
else:
|
||||
return preds.argmax(axis=1), labels_test
|
|
@ -0,0 +1,251 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# This script reuses code from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples
|
||||
# /extract_features.py, with necessary modifications.
|
||||
|
||||
from pytorch_pretrained_bert.modeling import BertModel
|
||||
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_to_device
|
||||
from enum import Enum
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import os
|
||||
import torch
|
||||
|
||||
from torch.utils.data import (
|
||||
DataLoader,
|
||||
RandomSampler,
|
||||
SequentialSampler,
|
||||
TensorDataset,
|
||||
)
|
||||
|
||||
from utils_nlp.models.bert.common import Language, Tokenizer
|
||||
|
||||
|
||||
class PoolingStrategy(str, Enum):
|
||||
"""Enumerate pooling strategies"""
|
||||
MAX : str = "max"
|
||||
MEAN : str = "mean"
|
||||
CLS : str = "cls"
|
||||
|
||||
|
||||
class BERTSentenceEncoder:
|
||||
"""BERT-based sentence encoder"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
bert_model=None,
|
||||
tokenizer=None,
|
||||
language=Language.ENGLISH,
|
||||
num_gpus=None,
|
||||
cache_dir=".",
|
||||
to_lower=True,
|
||||
max_len=512,
|
||||
layer_index=-1,
|
||||
pooling_strategy=PoolingStrategy.MEAN,
|
||||
):
|
||||
"""Initialize the encoder's underlying model and tokenizer
|
||||
|
||||
Args:
|
||||
bert_model: BERT model to use for encoding. Defaults to pretrained BertModel.
|
||||
tokenizer: Tokenizer to use for preprocessing. Defaults to pretrained BERT tokenizer.
|
||||
language: The pretrained model's language. Defaults to Language.ENGLISH.
|
||||
num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used.
|
||||
cache_dir: Location of BERT's cache directory. Defaults to "."
|
||||
to_lower: True to lowercase before tokenization. Defaults to False.
|
||||
max_len: Maximum number of tokens.
|
||||
layer_index: The layer from which to extract features.
|
||||
Defaults to the last layer; can also be a list of integers for experimentation.
|
||||
pooling_strategy: Pooling strategy to aggregate token embeddings into sentence embedding.
|
||||
"""
|
||||
self.model = (
|
||||
bert_model.model.bert
|
||||
if bert_model
|
||||
else BertModel.from_pretrained(language, cache_dir=cache_dir)
|
||||
)
|
||||
self.tokenizer = (
|
||||
tokenizer
|
||||
if tokenizer
|
||||
else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir)
|
||||
)
|
||||
self.num_gpus = num_gpus
|
||||
self.max_len = max_len
|
||||
self.layer_index = layer_index
|
||||
self.pooling_strategy = pooling_strategy
|
||||
|
||||
@property
|
||||
def layer_index(self):
|
||||
return self._layer_index
|
||||
|
||||
@layer_index.setter
|
||||
def layer_index(self, layer_index):
|
||||
if isinstance(layer_index, int):
|
||||
self._layer_index = [layer_index]
|
||||
else:
|
||||
self._layer_index = layer_index
|
||||
|
||||
@property
|
||||
def pooling_strategy(self):
|
||||
return self._pooling_strategy
|
||||
|
||||
@pooling_strategy.setter
|
||||
def pooling_strategy(self, pooling_strategy):
|
||||
self._pooling_strategy = pooling_strategy
|
||||
|
||||
def get_hidden_states(self, text, batch_size=32):
|
||||
"""Extract the hidden states from the pretrained model
|
||||
|
||||
Args:
|
||||
text: List of documents to extract features from.
|
||||
batch_size: Batch size, defaults to 32.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]).
|
||||
"""
|
||||
device = get_device("cpu" if self.num_gpus == 0 else "gpu")
|
||||
self.model = move_to_device(self.model, device, self.num_gpus)
|
||||
self.model.eval()
|
||||
|
||||
tokens = self.tokenizer.tokenize(text)
|
||||
|
||||
tokens, input_ids, input_mask, input_type_ids = self.tokenizer.preprocess_encoder_tokens(
|
||||
tokens, max_len=self.max_len
|
||||
)
|
||||
|
||||
input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
|
||||
input_mask = torch.tensor(input_mask, dtype=torch.long, device=device)
|
||||
input_type_ids = torch.arange(
|
||||
input_ids.size(0), dtype=torch.long, device=device
|
||||
)
|
||||
|
||||
eval_data = TensorDataset(input_ids, input_mask, input_type_ids)
|
||||
eval_dataloader = DataLoader(
|
||||
eval_data,
|
||||
sampler=SequentialSampler(eval_data),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
hidden_states = {
|
||||
"text_index": [],
|
||||
"token": [],
|
||||
"layer_index": [],
|
||||
"values": [],
|
||||
}
|
||||
for (
|
||||
input_ids_tensor,
|
||||
input_mask_tensor,
|
||||
example_indices_tensor,
|
||||
) in eval_dataloader:
|
||||
with torch.no_grad():
|
||||
all_encoder_layers, _ = self.model(
|
||||
input_ids_tensor,
|
||||
token_type_ids=None,
|
||||
attention_mask=input_mask_tensor,
|
||||
)
|
||||
self.embedding_dim = all_encoder_layers[0].size()[-1]
|
||||
|
||||
for b, example_index in enumerate(example_indices_tensor):
|
||||
for (i, token) in enumerate(tokens[example_index.item()]):
|
||||
for (j, layer_index) in enumerate(self.layer_index):
|
||||
layer_output = (
|
||||
all_encoder_layers[int(layer_index)]
|
||||
.detach()
|
||||
.cpu()
|
||||
.numpy()
|
||||
)
|
||||
layer_output = layer_output[b]
|
||||
hidden_states["text_index"].append(
|
||||
example_index.item()
|
||||
)
|
||||
hidden_states["token"].append(token)
|
||||
hidden_states["layer_index"].append(layer_index)
|
||||
hidden_states["values"].append(
|
||||
[round(x.item(), 6) for x in layer_output[i]]
|
||||
)
|
||||
|
||||
# empty cache
|
||||
del [input_ids_tensor, input_mask_tensor, example_indices_tensor]
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# empty cache
|
||||
del [input_ids, input_mask, input_type_ids]
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return pd.DataFrame.from_dict(hidden_states)
|
||||
|
||||
def pool(self, df):
|
||||
"""Pooling to aggregate token-wise embeddings to sentence embeddings
|
||||
|
||||
Args:
|
||||
df: pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float])
|
||||
pooling_strategy: The pooling strategy to use
|
||||
|
||||
Returns:
|
||||
pd.DataFrame grouped by text index and layer index
|
||||
"""
|
||||
def max_pool(x):
|
||||
values = np.array(
|
||||
[
|
||||
np.reshape(np.array(x.values[i]), self.embedding_dim)
|
||||
for i in range(x.values.shape[0])
|
||||
]
|
||||
)
|
||||
m, _ = torch.max(torch.tensor(values, dtype=torch.float), 0)
|
||||
return m.numpy()
|
||||
|
||||
def mean_pool(x):
|
||||
values = np.array(
|
||||
[
|
||||
np.reshape(np.array(x.values[i]), self.embedding_dim)
|
||||
for i in range(x.values.shape[0])
|
||||
]
|
||||
)
|
||||
return torch.mean(
|
||||
torch.tensor(values, dtype=torch.float), 0
|
||||
).numpy()
|
||||
|
||||
def cls_pool(x):
|
||||
values = np.array(
|
||||
[
|
||||
np.reshape(np.array(x.values[i]), self.embedding_dim)
|
||||
for i in range(x.values.shape[0])
|
||||
]
|
||||
)
|
||||
return values[0]
|
||||
|
||||
try:
|
||||
if self.pooling_strategy == "max":
|
||||
pool_func = max_pool
|
||||
elif self.pooling_strategy == "mean":
|
||||
pool_func = mean_pool
|
||||
elif self.pooling_strategy == "cls":
|
||||
pool_func = cls_pool
|
||||
else:
|
||||
raise ValueError("Please enter valid pooling strategy")
|
||||
except ValueError as ve:
|
||||
print(ve)
|
||||
|
||||
return df.groupby(["text_index", "layer_index"])["values"].apply(lambda x: pool_func(x)).reset_index()
|
||||
|
||||
def encode(
|
||||
self,
|
||||
text,
|
||||
batch_size=32,
|
||||
as_numpy=False
|
||||
):
|
||||
"""Computes sentence encodings
|
||||
|
||||
Args:
|
||||
text: List of documents to encode.
|
||||
batch_size: Batch size, defaults to 32.
|
||||
"""
|
||||
df = self.get_hidden_states(text, batch_size)
|
||||
pooled = self.pool(df)
|
||||
|
||||
if as_numpy:
|
||||
return np.array(pooled["values"].tolist())
|
||||
else:
|
||||
return pooled
|
||||
|
||||
|
|
@ -52,7 +52,7 @@ class BERTTokenClassifier:
|
|||
self.cache_dir = cache_dir
|
||||
|
||||
self.model = BertForTokenClassification.from_pretrained(
|
||||
language.value, cache_dir=cache_dir, num_labels=num_labels
|
||||
language, cache_dir=cache_dir, num_labels=num_labels
|
||||
)
|
||||
|
||||
def _get_optimizer(
|
||||
|
|
|
@ -11,14 +11,11 @@ import pickle
|
|||
|
||||
import numpy as np
|
||||
import torch
|
||||
from azureml.core.run import Run
|
||||
from sklearn.utils import shuffle
|
||||
from torch.autograd import Variable
|
||||
|
||||
# Change to python3+.
|
||||
# from itertools import zip
|
||||
# get the Azure ML run object
|
||||
run = Run.get_context()
|
||||
|
||||
|
||||
class DataIterator(object):
|
||||
|
@ -393,7 +390,7 @@ class NLIIterator(DataIterator):
|
|||
test(torch.Tensor): Testing dataset.
|
||||
vocab_size(int): The size of the vocabulary.
|
||||
lowercase(bool): If lowercase the dataset.
|
||||
vocab(list): The list of the vocabulary.
|
||||
vocab(Union[bytes,str): The list of the vocabulary.
|
||||
"""
|
||||
self.seed = seed
|
||||
self.train = train
|
||||
|
|
Загрузка…
Ссылка в новой задаче