From af9f6716457958100f01dc3b185caa45c5a3a053 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 4 Jun 2019 11:09:15 +0100 Subject: [PATCH 1/6] update setup --- SETUP.md | 231 +++++-------------------------------------- 1 file changed, 20 insertions(+), 211 deletions(-) diff --git a/SETUP.md b/SETUP.md index 49c049d..dbba312 100755 --- a/SETUP.md +++ b/SETUP.md @@ -1,9 +1,11 @@ # Setup guide -This document describes how to setup all the dependencies to run the notebooks in this repository in two different environments: +This document describes how to set up all the dependencies to run the notebooks in this repository. + +The recommended environment to run these notebooks is the [Azure Data Science Virtual Machine (DSVM)](https://azure.microsoft.com/en-us/services/virtual-machines/data-science-virtual-machines/). Since a considerable number of the algorithms rely on deep learning, it is recommended to use a GPU DSVM. + +For training at scale, operationalization or hyperparameter tuning, it is recommended to use [Azure ML](https://docs.microsoft.com/en-us/azure/machine-learning/service/). -* a Linux system (local or an [Azure Data Science Virtual Machine (DSVM)](https://azure.microsoft.com/en-us/services/virtual-machines/data-science-virtual-machines/)) -* [Azure Databricks](https://azure.microsoft.com/en-us/services/databricks/). 
## Table of Contents @@ -11,23 +13,17 @@ This document describes how to setup all the dependencies to run the notebooks i * [Setup guide for Local or DSVM](#setup-guide-for-local-or-dsvm) * [Setup Requirements](#setup-requirements) * [Dependencies setup](#dependencies-setup) - * [Register the conda environment in Jupyter notebook](#register-the-conda-environment-in-jupyter-notebook) - * [Troubleshooting for the DSVM](#troubleshooting-for-the-dsvm) -* [Setup guide for Azure Databricks](#setup-guide-for-azure-databricks) - * [Requirements of Azure Databricks](#requirements-of-azure-databricks) - * [Repository installation](#repository-installation) - * [Troubleshooting for Azure Databricks](#troubleshooting-for-azure-databricks) -* [Prepare Azure Databricks for Operationalization](#prepare-azure-databricks-for-operationalization) + * [Register the conda environment in the DSVM JupyterHub](#register-the-conda-environment-in-the-dsvm-jupyterhub) + ## Compute environments -Depending on the type of recommender system and the notebook that needs to be run, there are different computational requirements. +Depending on the type of NLP system and the notebook that needs to be run, there are different computational requirements. Currently, this repository supports the following environments: * Python CPU * Python GPU -* PySpark ## Setup guide for Local or DSVM ### Setup Requirements * Anaconda with Python version >= 3.6. [Miniconda](https://conda.io/miniconda.html) is the fastest way to get started. -* The Python library dependencies can be found in this [script](scripts/generate_conda_file.sh). -* Machine with Spark (optional for Python environment but mandatory for PySpark environment). ### Dependencies setup -We install the dependencies with Conda. 
As a pre-requisite, we may want to make sure that Conda is up-to-date: -```{shell} -conda update anaconda -``` - -We provide a script to [generate a conda file](scripts/generate_conda_file.sh), depending of the environment we want to use. This will create the environment using the Python version 3.6 with all the correct dependencies. +We provide a script to [generate a conda file](tools/generate_conda_file.sh), depending on the environment we want to use. This will create the environment using Python version 3.6 with all the correct dependencies. To install each environment, first we need to generate a conda yaml file and then install the environment. We can specify the environment name with the input `-n`. Click on the following menus to see more details: 
Python CPU environment -Assuming the repo is cloned as `Recommenders` in the local system, to install the Python CPU environment: +Assuming the repo is cloned as `NLP` in the system, to install the Python CPU environment: - cd Recommenders - ./scripts/generate_conda_file.sh - conda env create -n reco_bare -f conda_bare.yaml + cd NLP + ./tools/generate_conda_file.sh + conda env create -n nlp_cpu -f nlp_cpu.yaml
@@ -68,201 +58,20 @@ Assuming the repo is cloned as `Recommenders` in the local system, to install th Assuming that you have a GPU machine, to install the Python GPU environment, which by default installs the CPU environment: - cd Recommenders - ./scripts/generate_conda_file.sh --gpu - conda env create -n reco_gpu -f conda_gpu.yaml + cd NLP + ./tools/generate_conda_file.sh --gpu + conda env create -n nlp_gpu -f nlp_gpu.yaml -
-PySpark environment -To install the PySpark environment, which by default installs the CPU environment: - cd Recommenders - ./scripts/generate_conda_file.sh --pyspark - conda env create -n reco_pyspark -f conda_pyspark.yaml +### Register the conda environment in the DSVM JupyterHub -Additionally, if you want to test a particular version of spark, you may pass the --pyspark-version argument: +DSVM comes with a preinstalled JupyterHub, which is accessible through port 8000. To access it just type in your browser `https://your-vm-ip:8000`. See more details [in this tutorial](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#jupyterhub-and-jupyterlab). - ./scripts/generate_conda_file.sh --pyspark-version 2.4.0 - -**NOTE** - for this environment, we need to set the environment variables `PYSPARK_PYTHON` and `PYSPARK_DRIVER_PYTHON` to point to the conda python executable. - -To set these variables every time the environment is activated, we can follow the steps of this [guide](https://conda.io/docs/user-guide/tasks/manage-environments.html#macos-and-linux). Assuming that we have installed the environment in `/anaconda/envs/reco_pyspark`, we create the file `/anaconda/envs/reco_pyspark/etc/conda/activate.d/env_vars.sh` and add: - -```bash -#!/bin/sh -export PYSPARK_PYTHON=/anaconda/envs/reco_pyspark/bin/python -export PYSPARK_DRIVER_PYTHON=/anaconda/envs/reco_pyspark/bin/python -``` - -This will export the variables every time we do `conda activate reco_pyspark`. To unset these variables when we deactivate the environment, we create the file `/anaconda/envs/reco_pyspark/etc/conda/deactivate.d/env_vars.sh` and add: - -```bash -#!/bin/sh -unset PYSPARK_PYTHON -unset PYSPARK_DRIVER_PYTHON -``` -
- -
-All environments - -To install all three environments: - - cd Recommenders - ./scripts/generate_conda_file.sh --gpu --pyspark - conda env create -n reco_full -f conda_full.yaml - -
- -### Register the conda environment as a kernel in Jupyter - -We can register our created conda environment to appear as a kernel in the Jupyter notebooks. +When using the DSVM, we can register our created conda environment to appear as a kernel in JupyterHub. conda activate my_env_name python -m ipykernel install --user --name my_env_name --display-name "Python (my_env_name)" -### Troubleshooting for the DSVM - -* We found that there can be problems if the Spark version of the machine is not the same as the one in the conda file. You can use the option `--pyspark-version` to address this issue. -* When running Spark on a single local node it is possible to run out of disk space as temporary files are written to the user's home directory. To avoid this on a DSVM, we attached an additional disk to the DSVM and made modifications to the Spark configuration. This is done by including the following lines in the file at `/dsvm/tools/spark/current/conf/spark-env.sh`. - -```{shell} -SPARK_LOCAL_DIRS="/mnt" -SPARK_WORKER_DIR="/mnt" -SPARK_WORKER_OPTS="-Dspark.worker.cleanup.enabled=true, -Dspark.worker.cleanup.appDataTtl=3600, -Dspark.worker.cleanup.interval=300, -Dspark.storage.cleanupFilesAfterExecutorExit=true" -``` - -## Setup guide for Azure Databricks - -### Requirements of Azure Databricks - -* Runtime version 4.3 (Apache Spark 2.3.1, Scala 2.11) -* Python 3 - -### Repository installation -You can setup the repository as a library on Databricks either manually or by running an [installation script](scripts/databricks_install.sh). Both options assume you have access to a provisioned Databricks workspace and cluster and that you have appropriate permissions to install libraries. - -
-Quick install - -This option utilizes an installation script to do the setup, and it requires additional dependencies in the environment used to execute the script. - -> To run the script, following **prerequisites** are required: -> * Install [Azure Databricks CLI (command-line interface)](https://docs.azuredatabricks.net/user-guide/dev-tools/databricks-cli.html#install-the-cli) and setup CLI authentication. Please find details about how to create a token and set authentication [here](https://docs.azuredatabricks.net/user-guide/dev-tools/databricks-cli.html#set-up-authentication). Very briefly, you can install and configure your environment with the following commands. -> -> ```{shell} -> pip install databricks-cli -> databricks configure --token -> ``` -> -> * Get the target **cluster id** and **start** the cluster if its status is *TERMINATED*. -> * You can get the cluster id from the databricks CLI with: -> ```{shell} -> databricks clusters list -> ``` -> * If required, you can start the cluster with: -> ```{shell} -> databricks clusters start --cluster-id ` -> ``` -> * The script also requires the `zip` command line utility, which may not be installed. You can install it with: -> ```{shell} -> sudo apt-get update -> sudo apt-get install zip -> ``` - -Once you have confirmed the databricks cluster is *RUNNING*, install the modules within this repository with the following commands: - -```{shell} -cd Recommenders -./scripts/databricks_install.sh -``` - -
- -
-Manual setup - -To install the repo manually onto Databricks, follow the steps: - -1. Clone the Microsoft Recommenders repository to your local computer. -2. Zip the contents inside the Recommenders folder (Azure Databricks requires compressed folders to have the `.egg` suffix, so we don't use the standard `.zip`): - - ```{shell} - cd Recommenders - zip -r Recommenders.egg . - ``` -3. Once your cluster has started, go to the Databricks workspace, and select the `Home` button. -4. Your `Home` directory should appear in a panel. Right click within your directory, and select `Import`. -5. In the pop-up window, there is an option to import a library, where it says: `(To import a library, such as a jar or egg, click here)`. Select `click here`. -6. In the next screen, select the option `Upload Python Egg or PyPI` in the first menu. -7. Next, click on the box that contains the text `Drop library egg here to upload` and use the file selector to choose the `Recommenders.egg` file you just created, and select `Open`. -8. Click on the `Create library`. This will upload the egg and make it available in your workspace. -9. Finally, in the next menu, attach the library to your cluster. - -
- -### Confirm Installation - -After installation, you can now create a new notebook and import the utilities from Databricks in order to confirm that the import worked. - -```{python} -import reco_utils -``` - -### Troubleshooting Installation on Azure Databricks - -* For the [reco_utils](reco_utils) import to work on Databricks, it is important to zip the content correctly. The zip has to be performed inside the Recommenders folder, if you zip directly above the Recommenders folder, it won't work. - -## Prepare Azure Databricks for Operationalization - -This repository includes an end-to-end example notebook that uses Azure Datbaricks to estimate a recommendation model using Alternating Least Squares, writes pre-computed recommendations to Azure Cosmos DB, and then creates a real-time scoring service that retrieves the recommendations from Cosmos DB. In order to execute that [notebook](notebooks//05_operationalize/als_movie_o16n.ipynb), you must install the Recommenders repository as a library (as described above), **AND* you must also install some additional dependencies. Similar to above, you can do so either manually or via an installation [script](scripts/prepare_databricks_for_o16n.sh). - -
-Quick install - -This option utilizes an installation script to do the setup, and it requires the same dependencies as the databricks installation script (see above). - -Once you have: - -* Installed and configured the databricks CLI -* Confirmed that the appropriate cluster is *RUNNING* -* Installed the Recommenders egg as described above -* Confirmed you are in the root directory of the Recommenders repository - -you can install additional dependencies for operationalization with: - -```{shell} -scripts/prepare_databricks_for_o16n.sh -``` - -This script does all of the steps described in the *Manual setup* section below. - -
- -
-Manual setup - -You must install three packages as libraries from PyPI: - -* `azure-cli` -* `azureml-sdk[databricks]` -* `pydocumentdb` - -You can follow instructions [here](https://docs.azuredatabricks.net/user-guide/libraries.html#install-a-library-on-a-cluster) for details on how to install packages from PyPI. - -Additionally, you must install the [spark-cosmosdb connector](https://docs.databricks.com/spark/latest/data-sources/azure/cosmosdb-connector.html) on the cluster. The easiest way to manually do that is to: - -1. Download the [appropriate jar](https://search.maven.org/remotecontent?filepath=com/microsoft/azure/azure-cosmosdb-spark_2.3.0_2.11/1.2.2/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar) from MAVEN. **NOTE** This is the appropriate jar for spark versions `2.3.X`, and is the appropriate version for the recommended Azure Databricks run-time detailed above. -2. Upload and install the jar by: - 1. Log into your `Azure Databricks` workspace - 2. Select the `Clusters` button on the left. - 3. Select the cluster on which you want to import the library. - 4. Select the `Upload` and `Jar` options, and click in the box that has the text `Drop JAR here` in it. - 5. Navigate to the downloaded `.jar` file, select it, and click `Open`. - 6. Click on `Install`. - 7. Restart the cluster. - -
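Patch 1 above keeps the kernel-registration step (`conda activate my_env_name` followed by `python -m ipykernel install --user --name my_env_name --display-name "Python (my_env_name)"`). As a hedged illustration of what that command produces, the sketch below builds the kind of `kernel.json` payload Jupyter reads when launching the environment's interpreter; the helper name and the environment prefix path are hypothetical, not part of this repository.

```python
import json

# Hypothetical helper mirroring what `python -m ipykernel install --user
# --name my_env_name --display-name "Python (my_env_name)"` writes into a
# kernel.json under the user's Jupyter kernels directory. The env prefix
# below is an assumed path, not one mandated by the patches above.
def make_kernel_spec(env_prefix, env_name):
    """Build a kernel.json payload pointing Jupyter at the env's Python."""
    return {
        "argv": [
            env_prefix + "/bin/python",   # interpreter of the conda env
            "-m",
            "ipykernel_launcher",
            "-f",
            "{connection_file}",          # filled in by Jupyter at launch
        ],
        "display_name": "Python ({})".format(env_name),
        "language": "python",
    }

spec = make_kernel_spec("/anaconda/envs/nlp_gpu", "nlp_gpu")
print(json.dumps(spec, indent=2))
```

If the wrong interpreter shows up in a notebook, comparing the registered `kernel.json` (on Linux typically under `~/.local/share/jupyter/kernels/<name>/`) against this shape is a quick sanity check.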
From eaf24ac5d5a5efd7698c344f2a8194baf3d2f567 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 4 Jun 2019 11:22:35 +0100 Subject: [PATCH 2/6] conda file --- tools/generate_conda_file.py | 119 +++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 tools/generate_conda_file.py diff --git a/tools/generate_conda_file.py b/tools/generate_conda_file.py new file mode 100644 index 0000000..4923f6b --- /dev/null +++ b/tools/generate_conda_file.py @@ -0,0 +1,119 @@ +#!/usr/bin/python + +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +# This script creates yaml files to build conda environments +# For generating a conda file for running only python code: +# $ python generate_conda_file.py +# +# For generating a conda file for running python gpu: +# $ python generate_conda_file.py --gpu + + +import argparse +import textwrap +from sys import platform + + +HELP_MSG = """ +To create the conda environment: +$ conda env create -f {conda_env}.yaml + +To update the conda environment: +$ conda env update -f {conda_env}.yaml + +To register the conda environment in Jupyter: +$ conda activate {conda_env} +$ python -m ipykernel install --user --name {conda_env} --display-name "Python ({conda_env})" +""" + +CHANNELS = ["defaults", "conda-forge", "pytorch"] + +CONDA_BASE = { + "python": "python==3.6.8", + "gitpython": "gitpython>=2.1.8", + "ipykernel": "ipykernel>=4.6.1", + "jupyter": "jupyter>=1.0.0", + "matplotlib": "matplotlib>=2.2.2", + "numpy": "numpy>=1.13.3", + "pandas": "pandas>=0.23.4", + "pymongo": "pymongo>=3.6.1", + "pytest": "pytest>=3.6.4", + "pytorch": "pytorch-cpu>=1.0.0", + "scikit-learn": "scikit-learn>=0.19.1", + "scipy": "scipy>=1.0.0", + "tensorflow": "tensorflow==1.12.0", +} + +CONDA_GPU = { + "numba": "numba>=0.38.1", + "pytorch": "pytorch>=1.0.0", + "tensorflow": "tensorflow-gpu==1.12.0", +} + +PIP_BASE = { + "azureml-sdk[notebooks,tensorboard]": 
"azureml-sdk[notebooks,tensorboard]==1.0.18", + "azure-storage": "azure-storage>=0.36.0", + "black": "black>=18.6b4", + "papermill": "papermill==0.18.2", + "pydocumentdb": "pydocumentdb>=2.3.3", + "tqdm": "tqdm==4.31.1", + "pyemd": "pyemd==0.5.1", + "ipywebrtc": "ipywebrtc==0.4.3", + "pre-commit": "pre-commit>=1.14.4", +} + +PIP_GPU = {} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=textwrap.dedent( + """ + This script generates a conda file for different environments. + Plain python is the default, but flags can be used to support GPU functionality""" + ), + epilog=HELP_MSG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--name", help="specify name of conda environment") + parser.add_argument( + "--gpu", action="store_true", help="include packages for GPU support" + ) + args = parser.parse_args() + + # set name for environment and output yaml file + conda_env = "nlp_cpu" + if args.gpu: + conda_env = "nlp_gpu" + + # overwrite environment name with user input + if args.name is not None: + conda_env = args.name + + # update conda and pip packages based on flags provided + conda_packages = CONDA_BASE + pip_packages = PIP_BASE + if args.gpu: + conda_packages.update(CONDA_GPU) + pip_packages.update(PIP_GPU) + + # write out yaml file + conda_file = "{}.yaml".format(conda_env) + with open(conda_file, "w") as f: + for line in HELP_MSG.format(conda_env=conda_env).split("\n"): + f.write("# {}\n".format(line)) + f.write("name: {}\n".format(conda_env)) + f.write("channels:\n") + for channel in CHANNELS: + f.write("- {}\n".format(channel)) + f.write("dependencies:\n") + for conda_package in conda_packages.values(): + f.write("- {}\n".format(conda_package)) + f.write("- pip:\n") + for pip_package in pip_packages.values(): + f.write(" - {}\n".format(pip_package)) + + print("Generated conda file: {}".format(conda_file)) + print(HELP_MSG.format(conda_env=conda_env)) From 8c239a61cb4ebf47116916bb7f44a0cccbae13f2 
Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 4 Jun 2019 11:29:25 +0100 Subject: [PATCH 3/6] updated setup :memo: --- SETUP.md | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/SETUP.md b/SETUP.md index dbba312..5146f7d 100755 --- a/SETUP.md +++ b/SETUP.md @@ -18,40 +18,32 @@ For training at scale, operationalization or hyperparameter tuning, it is recomm ## Compute environments -Depending on the type of NLP system and the notebook that needs to be run, there are different computational requirements. - -Currently, this repository supports the following environments: - -* Python CPU -* Python GPU +Depending on the type of NLP system and the notebook that needs to be run, there are different computational requirements. Currently, this repository supports **Python CPU** and **Python GPU**. ## Setup guide for Local or DSVM -### Setup Requirements +### Requirements -* Anaconda with Python version >= 3.6. [Miniconda](https://conda.io/miniconda.html) is the fastest way to get started. -* The Python library dependencies can be found in this [script](tools/generate_conda_file.sh). +* A machine running Linux, macOS or Windows. +* Anaconda with Python version >= 3.6. + * This is pre-installed on the Azure DSVM, so that one can run the following steps directly. To set up on your local machine, [Miniconda](https://docs.conda.io/en/latest/miniconda.html) is a quick way to get started. ### Dependencies setup -We provide a script to [generate a conda file](tools/generate_conda_file.sh), depending of the environment we want to use. This will create the environment using the Python version 3.6 with all the correct dependencies. +We provide a script, [generate_conda_file.py](tools/generate_conda_file.py), to generate a conda-environment yaml file +which you can use to create the target environment using Python version 3.6 with all the correct dependencies. 
-To install each environment, first we need to generate a conda yaml file and then install the environment. We can specify the environment name with the input `-n`. +Assuming the repo is cloned as `nlp` in the system, to install **a default (Python CPU) environment**: -Click on the following menus to see more details: + cd nlp + python tools/generate_conda_file.py + conda env create -f nlp_cpu.yaml -
-Python CPU environment +You can specify the environment name as well with the flag `-n`. -Assuming the repo is cloned as `NLP` in the system, to install the Python CPU environment: - - cd NLP - ./tools/generate_conda_file.sh - conda env create -n nlp_cpu -f nlp_cpu.yaml - -
+Click on the following menu to see how to install the Python GPU environment:
Python GPU environment @@ -65,13 +57,11 @@ Assuming that you have a GPU machine, to install the Python GPU environment, whi
- ### Register the conda environment in the DSVM JupyterHub -DSVM comes with a preinstalled JupyterHub, which is accessible through port 8000. To access it just type in your browser `https://your-vm-ip:8000`. See more details [in this tutorial](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#jupyterhub-and-jupyterlab). - -When using the DSVM, we can register our created conda environment to appear as a kernel in JupyterHub. +We can register our created conda environment to appear as a kernel in the Jupyter notebooks. conda activate my_env_name python -m ipykernel install --user --name my_env_name --display-name "Python (my_env_name)" - + +If you are using the DSVM, you can [connect to JupyterHub](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#jupyterhub-and-jupyterlab) by browsing to `https://your-vm-ip:8000`. \ No newline at end of file From c06fa8e170cd5a1950be8766dfa981ff77159dd6 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 4 Jun 2019 11:29:40 +0100 Subject: [PATCH 4/6] updated setup :memo: --- SETUP.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SETUP.md b/SETUP.md index 5146f7d..fbb8c0f 100755 --- a/SETUP.md +++ b/SETUP.md @@ -50,7 +50,7 @@ Click on the following menus to see how to install the Python GPU environment: Assuming that you have a GPU machine, to install the Python GPU environment, which by default installs the CPU environment: - cd NLP + cd nlp ./tools/generate_conda_file.sh --gpu conda env create -n nlp_gpu -f nlp_gpu.yaml From 0c24b8d7ab52f5f7efb45af69b8630fee1cf8036 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 4 Jun 2019 11:37:44 +0100 Subject: [PATCH 5/6] :bug: --- SETUP.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SETUP.md b/SETUP.md index fbb8c0f..06266c0 100755 --- a/SETUP.md +++ b/SETUP.md @@ -51,7 +51,7 @@ Click on the following menus to see how to install the 
Python GPU environment: Assuming that you have a GPU machine, to install the Python GPU environment, which by default installs the CPU environment: cd nlp - ./tools/generate_conda_file.sh --gpu + python tools/generate_conda_file.py --gpu conda env create -n nlp_gpu -f nlp_gpu.yaml From 6464d246e8d671395162bd554bf4dc6ee8557461 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 4 Jun 2019 14:51:35 +0100 Subject: [PATCH 6/6] gitignore and conda file --- .gitignore | 2 ++ tools/generate_conda_file.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index fcb4c97..a7102e9 100644 --- a/.gitignore +++ b/.gitignore @@ -122,7 +122,9 @@ tools/repo_metrics/config.py *.jar *.item *.pkl +nlp_*.yaml # Data data/ sentence-similarity/data/ + diff --git a/tools/generate_conda_file.py b/tools/generate_conda_file.py index 4923f6b..14f0792 100644 --- a/tools/generate_conda_file.py +++ b/tools/generate_conda_file.py @@ -53,15 +53,20 @@ CONDA_GPU = { } PIP_BASE = { - "azureml-sdk[notebooks,tensorboard]": "azureml-sdk[notebooks,tensorboard]==1.0.18", - "azure-storage": "azure-storage>=0.36.0", + "azureml-sdk[notebooks,tensorboard]": "azureml-sdk[notebooks,tensorboard]==1.0.33", + "azureml-dataprep": "azureml-dataprep==1.1.4", "black": "black>=18.6b4", "papermill": "papermill==0.18.2", "pydocumentdb": "pydocumentdb>=2.3.3", "tqdm": "tqdm==4.31.1", "pyemd": "pyemd==0.5.1", - "ipywebrtc": "ipywebrtc==0.4.3" - "pre-commit": "pre-commit>=1.14.4" + "ipywebrtc": "ipywebrtc==0.4.3", + "pre-commit": "pre-commit>=1.14.4", + "spacy-models": "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz", + "gensim": "gensim>=3.7.0", + "nltk": "nltk>=3.4", + "pytorch-pretrained-bert": "pytorch-pretrained-bert>=0.6", + "horovod": "horovod>=0.16.1", } PIP_GPU = {}
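Taken together, the patches above give `tools/generate_conda_file.py` a simple overlay scheme: when `--gpu` is passed, entries from `CONDA_GPU` replace base entries under the same key (so `pytorch-cpu` becomes `pytorch`, and `tensorflow` becomes `tensorflow-gpu`), and the merged specs are written into the yaml's `dependencies:` section. Below is a minimal sketch of that merge-and-render logic, using a trimmed subset of the real package tables for illustration.

```python
# Trimmed-down package tables from tools/generate_conda_file.py
# (a subset only, for illustration).
CONDA_BASE = {
    "pytorch": "pytorch-cpu>=1.0.0",
    "scipy": "scipy>=1.0.0",
    "tensorflow": "tensorflow==1.12.0",
}
CONDA_GPU = {
    "numba": "numba>=0.38.1",
    "pytorch": "pytorch>=1.0.0",
    "tensorflow": "tensorflow-gpu==1.12.0",
}


def resolve_packages(gpu=False):
    """Overlay GPU specs on the base table; shared keys are replaced."""
    packages = dict(CONDA_BASE)  # copy, so the base table is not mutated
    if gpu:
        packages.update(CONDA_GPU)
    return packages


def render_yaml(env_name, packages):
    """Render the dependency section the script writes to <env_name>.yaml."""
    lines = ["name: {}".format(env_name), "dependencies:"]
    lines += ["- {}".format(spec) for spec in packages.values()]
    return "\n".join(lines)


print(render_yaml("nlp_gpu", resolve_packages(gpu=True)))
```

One deliberate difference from the script in the patches: the sketch copies the base dict before updating it, whereas the script does `conda_packages = CONDA_BASE` and then mutates it, which is harmless for a run-once CLI but would surprise anyone importing the module and calling the logic twice.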