Update with the changes in the staging branch

Author: Yijing Chen 2019-08-13 18:10:56 +00:00
Parents: 7b10bad2bd f99d34a05e
Commit: bd2b60d0bc
100 changed files: 9446 additions and 2132 deletions

.bumpversion.cfg Normal file (13 changed lines)

@@ -0,0 +1,13 @@
[bumpversion]
current_version = 1.0.0
commit = True
tag = True
message = "Bump version: {current_version} -> {new_version}"
[bumpversion:file:setup.py]
search = version='{current_version}'
replace = version='{new_version}'
[bumpversion:file:utils_nlp/__init__.py]
search = __version__ = '{current_version}'
replace = __version__ = '{new_version}'

.gitignore vendored (10 changed lines)

@@ -125,10 +125,18 @@ tools/repo_metrics/config.py
*.pkl
nlp_*.yaml
nohup.out
temp/
tmp/
# Data
data/
sentence-similarity/data/
*/question_answering/bidaf.tar.gz
*/question_answering/bidafenv.yml
*/question_answering/config.json
*/question_answering/score.py
*/question_answering/vocabulary/
*/question_answering/weights.th
# AML Config
aml_config/
.azureml/


@@ -18,7 +18,7 @@ General Public License.
--
https://github.com/huggingface/pytorch-pretrained-BERT
https://github.com/huggingface/pytorch-transformers
Apache License
Version 2.0, January 2004
@@ -427,4 +427,34 @@ https://github.com/stanfordnlp/glove
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
BSD License
For SentEval software
Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name Facebook nor the names of its contributors may be used to
endorse or promote products derived from this software without specific
prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -1,14 +1,45 @@
| Branch | Status | | Branch | Status |
| ------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/unit-test-master?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=22&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/unit-test-staging?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=21&branchName=staging) |
# NLP Best Practices
This repository contains examples and best practices for building natural language processing (NLP) systems, provided as [Jupyter notebooks](scenarios) and [utility functions](utils_nlp). The focus of the repository is on state-of-the-art methods and common scenarios that are popular among researchers and practitioners working on problems involving text and language.
![](https://nlpbp.blob.core.windows.net/images/cognitive_services.PNG)
## Overview
The goal of this repository is to build a comprehensive set of tools and examples that leverage recent advances in NLP algorithms, neural architectures, and distributed machine learning systems.
The content is based on our past and potential future engagements with customers as well as collaboration with partners, researchers, and the open source community.
We're hoping that the tools will significantly reduce the time from a business problem, or a research idea, to the full implementation of a system. In addition, the example notebooks serve as guidelines and showcase best practices and usage of the tools.
In an era of transfer learning, transformers, and deep architectures, we believe that pretrained models provide a unified solution to many real-world problems and allow handling different tasks and languages easily. We will, therefore, prioritize such models, as they achieve state-of-the-art results on several NLP benchmarks and can be used in a number of applications ranging from simple text classification to sophisticated intelligent chat bots.
> [*GLUE Leaderboard*](https://gluebenchmark.com/leaderboard)
> [*SQuAD Leaderboard*](https://rajpurkar.github.io/SQuAD-explorer/)
## Content
The following is a summary of the scenarios covered in the repository. Each scenario is demonstrated in one or more Jupyter notebook examples that make use of the core code base of models and utilities.
| Scenario | Applications | Models |
|---| ------------------------ | ------------------- |
|[Text Classification](scenarios/text_classification) |Topic Classification|BERT|
|[Named Entity Recognition](scenarios/named_entity_recognition) |Wikipedia NER |BERT|
|[Entailment](scenarios/entailment)|MultiNLI Natural Language Inference|BERT|
|[Question Answering](scenarios/question_answering) |SQuAD | BiDAF, BERT|
|[Sentence Similarity](scenarios/sentence_similarity) |STS Benchmark |Representation: TF-IDF, Word Embeddings, Doc Embeddings<br>Metrics: Cosine Similarity, Word Mover's Distance|
|[Embeddings](scenarios/embeddings)| Custom Embeddings Training|Word2Vec<br>fastText<br>GloVe|
| [Annotation](scenarios/annotation) | Text annotation | Tutorial |
## Getting Started
To get started, navigate to the [Setup Guide](SETUP.md), where you'll find instructions on how to set up your environment and dependencies.
## Contributing
This project welcomes contributions and suggestions. Before contributing, please see our [contribution guidelines](CONTRIBUTING.md).
## Build Status
| Build Type | Branch | Status | | Branch | Status |
| --- | --- | --- | --- | --- | --- |
| **Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/cpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/cpu_integration_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=50&branchName=staging) |
| **Linux GPU** | master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/gpu_integration_tests_linux?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/gpu_integration_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=51&branchName=staging) |


@@ -26,13 +26,10 @@ Depending on the type of NLP system and the notebook that needs to be run, there
### Requirements
* A machine running Linux, MacOS or Windows.
> NOTE: Windows machines are not **FULLY SUPPORTED**. Please use at your own risk.
* Miniconda or Anaconda with Python version >= 3.6.
* This is pre-installed on the Azure DSVM, so the following steps can be run directly. To set up your local machine, [Miniconda](https://docs.conda.io/en/latest/miniconda.html) is a quick way to get started.
* It is recommended to update conda to the latest version: `conda update -n base -c defaults conda`
* CUDA Toolkit >=9.2 (for GPU machines only)
* On Windows: Download and install [toolkit](https://developer.nvidia.com/cuda-toolkit)
* On Linux: *conda install cudatoolkit>=9.2*
### Dependencies Setup
@@ -45,23 +42,39 @@ Assuming the repo is cloned as `nlp` in the system, to install **a default (Pyth
cd nlp
python tools/generate_conda_file.py
conda env create -f nlp_cpu.yaml
You can also specify the environment name with the `-n` flag, e.g. `conda env create -n my_env_name -f nlp_cpu.yaml`.
Click on the following menus to see how to install the Python GPU environment:
<details>
<summary><strong><em>Python GPU environment on Linux, MacOS</em></strong></summary>
Assuming that you have a GPU machine, to install the Python GPU environment (which, by default, also includes the CPU environment):
cd nlp
python tools/generate_conda_file.py --gpu
conda env create -n nlp_gpu -f nlp_gpu.yaml
</details>
<details>
<summary><strong><em>Python GPU environment on Windows</em></strong></summary>
Assuming that you have an Azure GPU DSVM machine, here are the steps to set up the Python GPU environment:
1. Make sure you have CUDA Toolkit version 9.0 or above installed on your Windows machine. You can run the command below in your terminal to check.
nvcc --version
If you don't have CUDA Toolkit or don't have the right version, please download it from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
2. Install the GPU environment.
cd nlp
python tools/generate_conda_file.py --gpu
conda env create -n nlp_gpu -f nlp_gpu.yaml
</details>
### Register Conda Environment in DSVM JupyterHub
@@ -69,13 +82,13 @@ We can register our created conda environment to appear as a kernel in the Jupyt
conda activate my_env_name
python -m ipykernel install --user --name my_env_name --display-name "Python (my_env_name)"
If you are using the DSVM, you can [connect to JupyterHub](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#jupyterhub-and-jupyterlab) by browsing to `https://your-vm-ip:8000`. If you are prompted to enter user name and password, enter the user name and password that you use to log in to your virtual machine.
## Install this repository via PIP
A [setup.py](setup.py) file is provided in order to simplify the installation of the utilities in this repo from the main directory.
pip install -e .
It is also possible to install directly from GitHub.


@@ -1,15 +0,0 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
__title__ = "Microsoft NLP"
__version__ = "2019.08"
__author__ = "NLPDev Team at Microsoft"
__license__ = "MIT"
__copyright__ = "Copyright 2018-present Microsoft Corporation"
# Synonyms
TITLE = __title__
VERSION = __version__
AUTHOR = __author__
LICENSE = __license__
COPYRIGHT = __copyright__


@@ -1,2 +1,2 @@
[tool.black]
line-length = 100


@@ -1,19 +1,15 @@
# NLP Scenarios
This folder contains examples and best practices, written in Jupyter notebooks, for building Natural Language Processing systems for the following scenarios.
## Summary
The following is a summary of the scenarios covered in the best practice notebooks. Each scenario is demonstrated in one or more Jupyter notebook examples that make use of the core code base of models and utilities.
| Scenario | Applications | Models |
|---| ------------------------ | ------------------- |
|[Text Classification](text_classification) |Topic Classification|BERT|
|[Named Entity Recognition](named_entity_recognition) |Wikipedia NER |BERT|
|[Entailment](./entailment)|XNLI Natural Language Inference|BERT|
|[Question Answering](question_answering) |SQuAD | BiDAF|
|[Sentence Similarity](sentence_similarity) |STS Benchmark |Representation: TF-IDF, Word Embeddings, Doc Embeddings<br>Metrics: Cosine Similarity, Word Mover's Distance|
|[Embeddings](embeddings)| Custom Embeddings Training|Word2Vec<br>fastText<br>GloVe|
- [Text Classification](text_classification)
- [Named Entity Recognition](named_entity_recognition)
- [Entailment](entailment)
- [Question Answering](question_answering)
- [Sentence Similarity](sentence_similarity)
- [Embeddings](embeddings)
- [Annotation](annotation)
## Azure-enhanced notebooks
@@ -31,8 +27,8 @@ The Azure products featured in the notebooks include:
* Scaling up and out on Azure Machine Learning Compute
* Deploying a web service to both Azure Container Instance and Azure Kubernetes Service
* [Azure Kubernetes Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#aks) - You can use Azure Machine Learning service to host your model in a web service deployment on Azure Kubernetes Service (AKS). AKS is good for high-scale production deployments and provides autoscaling and fast response times.
* [Azure Container Instance](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#aci) - You can use Azure Machine Learning service to host your model in a web service deployment on Azure Container Instance (ACI). ACI is good for low-scale, CPU-based workloads.
There may be other Azure services or products used in the notebooks; introductions and/or references for those will be provided in the notebooks.


@@ -0,0 +1,140 @@
# Doccano: Text Annotation Tool
## What is Doccano?
[Doccano](https://github.com/chakki-works/doccano) is one of the best open source tools that provide text annotation features. The latest version supports annotation for text classification, sequence labeling (NER), and sequence-to-sequence tasks (machine translation, text summarization). Many other open source and commercial annotation tools are available; here is a list of some of them:
- [Brat](https://brat.nlplab.org/) (open source)
- [Anafora](https://github.com/weitechen/anafora) (open source)
- [Prodigy](https://prodi.gy/) (commercial)
- [LightTag](https://www.lighttag.io/) (commercial)
Doccano needs to be hosted somewhere so that annotators can collaborate on it. This tutorial walks through how to deploy Doccano on Azure and collaboratively annotate text data for natural language processing tasks.
## Deploy to Azure
Doccano can be deployed to Azure ([Web App for Containers](https://azure.microsoft.com/en-us/services/app-service/containers/) +
[PostgreSQL database](https://azure.microsoft.com/en-us/services/postgresql/)) by clicking on the button below:
<p align="center">
<a href="https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fchakki-works%2Fdoccano%2Fmaster%2Fazuredeploy.json"><img width=180 src="https://nlpbp.blob.core.windows.net/images/deploybutton.jpg" /></a>
</p>
You will need an existing Azure subscription so that you can create all the Azure resources needed to deploy Doccano. If you don't have one, you can get a [free Azure account](https://azure.microsoft.com/en-us/offers/ms-azr-0044p/?WT.mc_id=medium-blog-abornst) and then click the deploy button above.
You will need to specify your subscription and resource group, fill in the setting details (App Name, Secret Key, etc.), and then deploy. It takes a few minutes to create all the needed Azure resources. Below is a screenshot of the deployment.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/deploy_to_azure.jpg" />
</p>
## Tutorial
### Useful Links
#### Main Page
After the deployment you can navigate to the following URL, where **{`appname`}** is the `App Name` you chose when deploying to Azure:
_**https://{appname}.azurewebsites.net**_
For example, if your appname is "**doccano**", then the link will be
_**https://doccano.azurewebsites.net**_
We will use `doccano` as the app name for the rest of this tutorial.
#### Login Page
You can log in by clicking the `login` button at the top right of the main page, or you can navigate to the login page directly with the link
_**https://doccano.azurewebsites.net/login**_
Both will bring you to the Doccano login page, where you can log in with the Admin user name and Admin password you configured in the deployment.
#### Admin Page
By default, only the Admin user is created after the deployment. You can add more users and groups, and configure the Doccano service, by navigating to the admin page.
_**https://doccano.azurewebsites.net/admin**_
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/admin_page.JPG" />
</p>
### Create Project
The first step is to create a new project for annotation. Here we use an NER annotation task on science fiction books to give you a brief tutorial on Doccano.
After logging in with the Admin user name and Admin password, you will be taken to Doccano's main project list page, which contains no projects yet.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/project_list.jpg" />
</p>
To create your project, make sure you're on the project list page and click the `Create Project` button. For this tutorial, we name the project `sequence labeling for books`, write a short description, and choose the sequence labeling task type.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/create_project.jpg" />
</p>
### Import Data
After creating a project, we will see the "`Import Data`" page; you can also reach it by clicking the `Import Data` button in the navigation bar. We should see the following screen:
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/import_data.jpg" />
</p>
Choose JSONL and click the `Select a file` button. Select `books.json` and it will be loaded automatically. Below is the `books.json` file, which contains descriptions of science fiction books in several languages. We want to annotate entities such as people's names, book titles, dates, and so on.
```json
{"text": "The Hitchhiker's Guide to the Galaxy (sometimes referred to as HG2G, HHGTTGor H2G2) is a comedy science fiction series created by Douglas Adams. Originally a radio comedy broadcast on BBC Radio 4 in 1978, it was later adapted to other formats, including stage shows, novels, comic books, a 1981 TV series, a 1984 video game, and 2005 feature film."}
{"text": "《三体》是中国大陆作家刘慈欣于2006年5月至12月在《科幻世界》杂志上连载的一部长篇科幻小说出版后成为中国大陆最畅销的科幻长篇小说之一。2008年该书的单行本由重庆出版社出版。本书是三体系列系列原名为地球往事三部曲的第一部该系列的第二部《三体II黑暗森林》已经于2008年5月出版。2010年11月第三部《三体III死神永生》出版发行。 2011年“地球往事三部曲”在台湾陆续出版。小说的英文版获得美国科幻奇幻作家协会2014年度“星云奖”提名并荣获2015年雨果奖最佳小说奖。"}
{"text": "『銀河英雄伝説』ぎんがえいゆうでんせつは、田中芳樹によるSF小説。また、これを原作とするアニメ、漫画、コンピュータゲーム、朗読、オーディオブック等の関連作品。略称は『銀英伝』ぎんえいでん。原作は累計発行部数が1500万部を超えるベストセラー小説である。1982年から2009年6月までに複数の版で刊行され、発行部数を伸ばし続けている。"}
```
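If you need to produce such a JSONL file from your own raw texts, a minimal sketch (the file name and texts below are illustrative only) looks like this:
```python
import json

# Raw documents to annotate; replace with your own texts.
texts = [
    "The Hitchhiker's Guide to the Galaxy is a comedy science fiction series.",
    "Another document to annotate.",
]

# Doccano's JSONL import format: one JSON object with a "text" field per line.
with open("books.json", "w", encoding="utf-8") as f:
    for text in texts:
        f.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")
```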
After importing the dataset, you should be able to see it immediately.
### Define labels
Click the `Labels` button in the left bar to define your own labels. On the label editor page, you can create labels by specifying the label text, a shortcut key, a background color, and a text color.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/define_labels.jpg" />
</p>
### Annotation
Next, we are ready to annotate the texts. Click the `Annotate Data` button in the navigation bar to start annotating the documents. Select a span of text and then press the shortcut key you defined to label the entity.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/annotate.jpg" />
</p>
### Export Data
After the annotation step, we can download the annotated data. Click the `Edit data` button in the navigation bar, and then click `Export Data`. You should see the screen below:
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/export_data.jpg" />
</p>
Here we choose the JSONL format and click the button to download the data. Below is the annotated result for our tutorial project.
```json
{"id": 1, "text": "The Hitchhiker's Guide to the Galaxy (sometimes referred to as HG2G, HHGTTGor H2G2) is a comedy science fiction series created by Douglas Adams. Originally a radio comedy broadcast on BBC Radio 4 in 1978, it was later adapted to other formats, including stage shows, novels, comic books, a 1981 TV series, a 1984 video game, and 2005 feature film.", "annotations": [{"label": 2, "start_offset": 0, "end_offset": 36, "user": 1}, {"label": 2, "start_offset": 63, "end_offset": 67, "user": 1}, {"label": 2, "start_offset": 69, "end_offset": 82, "user": 1}, {"label": 5, "start_offset": 89, "end_offset": 111, "user": 1}, {"label": 1, "start_offset": 130, "end_offset": 143, "user": 1}, {"label": 5, "start_offset": 158, "end_offset": 180, "user": 1}, {"label": 6, "start_offset": 184, "end_offset": 195, "user": 1}, {"label": 3, "start_offset": 199, "end_offset": 203, "user": 1}, {"label": 5, "start_offset": 254, "end_offset": 265, "user": 1}, {"label": 5, "start_offset": 267, "end_offset": 273, "user": 1}, {"label": 5, "start_offset": 275, "end_offset": 286, "user": 1}, {"label": 3, "start_offset": 290, "end_offset": 294, "user": 1}, {"label": 5, "start_offset": 295, "end_offset": 304, "user": 1}, {"label": 3, "start_offset": 308, "end_offset": 312, "user": 1}, {"label": 5, "start_offset": 313, "end_offset": 323, "user": 1}, {"label": 3, "start_offset": 329, "end_offset": 333, "user": 1}, {"label": 5, "start_offset": 334, "end_offset": 346, "user": 1}], "meta": {}, "annotation_approver": "admin"}
{"id": 2, "text": "《三体》是中国大陆作家刘慈欣于2006年5月至12月在《科幻世界》杂志上连载的一部长篇科幻小说出版后成为中国大陆最畅销的科幻长篇小说之一。2008年该书的单行本由重庆出版社出版。本书是三体系列系列原名为地球往事三部曲的第一部该系列的第二部《三体II黑暗森林》已经于2008年5月出版。2010年11月第三部《三体III死神永生》出版发行。 2011年“地球往事三部曲”在台湾陆续出版。小说的英文版获得美国科幻奇幻作家协会2014年度“星云奖”提名并荣获2015年雨果奖最佳小说奖。", "annotations": [{"label": 2, "start_offset": 1, "end_offset": 3, "user": 1}, {"label": 4, "start_offset": 5, "end_offset": 9, "user": 1}, {"label": 1, "start_offset": 11, "end_offset": 14, "user": 1}, {"label": 3, "start_offset": 15, "end_offset": 26, "user": 1}, {"label": 2, "start_offset": 28, "end_offset": 32, "user": 1}, {"label": 5, "start_offset": 41, "end_offset": 47, "user": 1}, {"label": 4, "start_offset": 53, "end_offset": 57, "user": 1}, {"label": 5, "start_offset": 61, "end_offset": 67, "user": 1}, {"label": 3, "start_offset": 70, "end_offset": 74, "user": 1}, {"label": 6, "start_offset": 83, "end_offset": 88, "user": 1}, {"label": 2, "start_offset": 105, "end_offset": 112, "user": 1}, {"label": 2, "start_offset": 94, "end_offset": 98, "user": 1}, {"label": 2, "start_offset": 126, "end_offset": 135, "user": 1}, {"label": 3, "start_offset": 139, "end_offset": 146, "user": 1}, {"label": 3, "start_offset": 149, "end_offset": 157, "user": 1}, {"label": 2, "start_offset": 162, "end_offset": 172, "user": 1}, {"label": 3, "start_offset": 179, "end_offset": 184, "user": 1}, {"label": 2, "start_offset": 186, "end_offset": 193, "user": 1}, {"label": 4, "start_offset": 195, "end_offset": 197, "user": 1}, {"label": 5, "start_offset": 202, "end_offset": 204, "user": 1}, {"label": 6, "start_offset": 210, "end_offset": 220, "user": 1}, {"label": 3, "start_offset": 220, "end_offset": 225, "user": 1}, {"label": 6, "start_offset": 227, "end_offset": 230, "user": 1}, {"label": 3, "start_offset": 237, "end_offset": 242, "user": 1}, {"label": 6, "start_offset": 242, "end_offset": 245, "user": 1}], "meta": {}, "annotation_approver": "admin"}
{"id": 3, "text": "『銀河英雄伝説』ぎんがえいゆうでんせつは、田中芳樹によるSF小説。また、これを原作とするアニメ、漫画、コンピュータゲーム、朗読、オーディオブック等の関連作品。略称は『銀英伝』ぎんえいでん。原作は累計発行部数が1500万部を超えるベストセラー小説である。1982年から2009年6月までに複数の版で刊行され、発行部数を伸ばし続けている。", "annotations": [{"label": 2, "start_offset": 1, "end_offset": 7, "user": 1}, {"label": 1, "start_offset": 23, "end_offset": 30, "user": 1}, {"label": 5, "start_offset": 30, "end_offset": 34, "user": 1}, {"label": 2, "start_offset": 85, "end_offset": 88, "user": 1}, {"label": 5, "start_offset": 50, "end_offset": 52, "user": 1}, {"label": 5, "start_offset": 63, "end_offset": 65, "user": 1}, {"label": 3, "start_offset": 130, "end_offset": 135, "user": 1}, {"label": 3, "start_offset": 137, "end_offset": 144, "user": 1}], "meta": {}, "annotation_approver": "admin"}
```
Please note that in the exported JSONL file, the label for each entity is an entity type ID, which is inconvenient if you want to consume the annotations somewhere else. Some post-processing is needed if you want the entity type name instead of the type ID.
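As an example, here is a minimal post-processing sketch; the `LABEL_NAMES` mapping below is hypothetical and should be replaced with the IDs and names you actually defined in the label editor:
```python
import json

# Hypothetical mapping from Doccano label IDs to entity type names;
# replace with the labels you defined in the label editor.
LABEL_NAMES = {1: "PERSON", 2: "TITLE", 3: "DATE", 4: "LOCATION", 5: "GENRE", 6: "ORG"}

with open("export.jsonl", encoding="utf-8") as f_in, \
     open("export_named.jsonl", "w", encoding="utf-8") as f_out:
    for line in f_in:
        doc = json.loads(line)
        for annotation in doc.get("annotations", []):
            # Replace the numeric type ID with a human-readable name.
            annotation["label"] = LABEL_NAMES.get(annotation["label"], annotation["label"])
        f_out.write(json.dumps(doc, ensure_ascii=False) + "\n")
```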
### View Statistics
One nice feature of Doccano is its dashboard, which displays annotation progress and label distributions. Click the `Edit data` button in the navigation bar, and then click `Statistics` on the left side of the menu.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/statistic.jpg" />
</p>
Congratulations! You have just learned how to use Doccano for a sequence labeling project.


@@ -0,0 +1,12 @@
# Natural Language Inference (NLI)
Natural Language Inference (NLI) or Recognizing Textual Entailment (RTE) is the
task of classifying a pair of premise and hypothesis sentences into three
classes: contradiction, neutral, and entailment. For example,
|Premise|Hypothesis|Label|
|-------|----------|-----|
|A man inspects the uniform of a figure in some East Asian country.|The man is sleeping.|contradiction|
|An older and younger man smiling.|Two men are smiling and laughing at the cats playing on the floor.|neutral|
|A soccer game with multiple males playing.|Some men are playing a sport.|entailment|
NLI is one of many NLP tasks that require robust compositional sentence understanding, but it is simpler than other tasks such as question answering and machine translation.
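As a rough illustration (a sketch, not code from this repository), the premise and hypothesis are typically fed to a model such as BERT as a single token sequence; the whitespace splitting below merely stands in for a real WordPiece tokenizer:
```python
# A minimal sketch of how an NLI example is framed as a BERT-style
# sentence-pair input. A real pipeline would use a WordPiece tokenizer;
# whitespace splitting here only illustrates the [CLS]/[SEP] layout.
premise = "A soccer game with multiple males playing."
hypothesis = "Some men are playing a sport."

tokens = ["[CLS]"] + premise.split() + ["[SEP]"] + hypothesis.split() + ["[SEP]"]
# Token type ids mark the segment each token belongs to: 0 for the premise
# (including [CLS] and the first [SEP]), 1 for the hypothesis.
token_type_ids = [0] * (len(premise.split()) + 2) + [1] * (len(hypothesis.split()) + 1)

print(tokens)
print(token_type_ids)
# A classifier head on top of the [CLS] representation then predicts one of
# the three classes: contradiction, neutral, or entailment.
```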


@@ -0,0 +1,893 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Copyright (c) Microsoft Corporation. All rights reserved.* \n",
"\n",
"*Licensed under the MIT License.*"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Natural Language Inference on MultiNLI Dataset using BERT"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Before You Start\n",
"\n",
"The running time shown in this notebook is on a Standard_NC24s_v3 Azure Deep Learning Virtual Machine with 4 NVIDIA Tesla V100 GPUs. If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \n",
"The table below provides some reference running time on different machine configurations. \n",
"\n",
"|QUICK_RUN|Machine Configurations|Running time|\n",
"|:---------|:----------------------|:------------|\n",
"|True|4 **CPU**s, 14GB memory| ~ 15 minutes|\n",
"|True|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 5 minutes|\n",
"|False|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 10.5 hours|\n",
"|False|4 NVIDIA Tesla V100 GPUs, 64GB GPU memory| ~ 2.5 hours|\n",
"\n",
"If you run into CUDA out-of-memory error, try reducing the `BATCH_SIZE` and `MAX_SEQ_LENGTH`, but note that model performance will be compromised. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\n",
"QUICK_RUN = False\n",
"\n",
"TRAIN_DATA_USED_PERCENT = 1\n",
"DEV_DATA_USED_PERCENT = 1\n",
"NUM_EPOCHS = 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if QUICK_RUN:\n",
" TRAIN_DATA_USED_PERCENT = 0.001\n",
" DEV_DATA_USED_PERCENT = 0.01\n",
" NUM_EPOCHS = 1\n",
"\n",
"import torch\n",
"if torch.cuda.is_available():\n",
" BATCH_SIZE = 32\n",
"else:\n",
" BATCH_SIZE = 16"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"In this notebook, we demostrate using [BERT](https://arxiv.org/abs/1810.04805) to perform Natural Language Inference (NLI). We use the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset and the task is to classify sentence pairs into three classes: contradiction, entailment, and neutral. \n",
"The figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. It concatenates the tokens in each sentence pairs and separates the sentences by the [SEP] token. A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task.\n",
"<img src=\"https://nlpbp.blob.core.windows.net/images/bert_two_sentence.PNG\">"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import random\n",
"import numpy as np\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"nlp_path = os.path.abspath('../../')\n",
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"\n",
"from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier\n",
"from utils_nlp.models.bert.common import Language, Tokenizer\n",
"from utils_nlp.dataset.multinli import load_pandas_df\n",
"from utils_nlp.common.timer import Timer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configurations"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# set random seeds\n",
"RANDOM_SEED = 42\n",
"random.seed(RANDOM_SEED)\n",
"np.random.seed(RANDOM_SEED)\n",
"torch.manual_seed(RANDOM_SEED)\n",
"num_cuda_devices = torch.cuda.device_count()\n",
"if num_cuda_devices > 1:\n",
" torch.cuda.manual_seed_all(RANDOM_SEED)\n",
"\n",
"# model configurations\n",
"LANGUAGE = Language.ENGLISH\n",
"TO_LOWER = True\n",
"MAX_SEQ_LENGTH = 128\n",
"\n",
"# optimizer configurations\n",
"LEARNING_RATE= 5e-5\n",
"WARMUP_PROPORTION= 0.1\n",
"\n",
"# data configurations\n",
"TEXT_COL = \"text\"\n",
"LABEL_COL = \"gold_label\"\n",
"\n",
"CACHE_DIR = \"./temp\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data\n",
"The MultiNLI dataset comes with three subsets: train, dev_matched, dev_mismatched. The dev_matched dataset are from the same genres as the train dataset, while the dev_mismatched dataset are from genres not seen in the training dataset. \n",
"The `load_pandas_df` function downloads and extracts the zip files if they don't already exist in `local_cache_path` and returns the data subset specified by `file_split`."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_df = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"train\")\n",
"dev_df_matched = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev_matched\")\n",
"dev_df_mismatched = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev_mismatched\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"dev_df_matched = dev_df_matched.loc[dev_df_matched['gold_label'] != '-']\n",
"dev_df_mismatched = dev_df_mismatched.loc[dev_df_mismatched['gold_label'] != '-']"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training dataset size: 392702\n",
"Development (matched) dataset size: 9815\n",
"Development (mismatched) dataset size: 9832\n",
"\n",
" gold_label sentence1 \\\n",
"0 neutral Conceptually cream skimming has two basic dime... \n",
"1 entailment you know during the season and i guess at at y... \n",
"2 entailment One of our number will carry out your instruct... \n",
"3 entailment How do you know? All this is their information... \n",
"4 neutral yeah i tell you what though if you go price so... \n",
"\n",
" sentence2 \n",
"0 Product and geography are what make cream skim... \n",
"1 You lose the things to the following level if ... \n",
"2 A member of my team will execute your orders w... \n",
"3 This information belongs to them. \n",
"4 The tennis shoes have a range of prices. \n"
]
}
],
"source": [
"print(\"Training dataset size: {}\".format(train_df.shape[0]))\n",
"print(\"Development (matched) dataset size: {}\".format(dev_df_matched.shape[0]))\n",
"print(\"Development (mismatched) dataset size: {}\".format(dev_df_mismatched.shape[0]))\n",
"print()\n",
"print(train_df[['gold_label', 'sentence1', 'sentence2']].head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate the first and second sentences to form the input text."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>gold_label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(Conceptually cream skimming has two basic dim...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(you know during the season and i guess at at ...</td>\n",
" <td>entailment</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(One of our number will carry out your instruc...</td>\n",
" <td>entailment</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>(How do you know? All this is their informatio...</td>\n",
" <td>entailment</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(yeah i tell you what though if you go price s...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text gold_label\n",
"0 (Conceptually cream skimming has two basic dim... neutral\n",
"1 (you know during the season and i guess at at ... entailment\n",
"2 (One of our number will carry out your instruc... entailment\n",
"3 (How do you know? All this is their informatio... entailment\n",
"4 (yeah i tell you what though if you go price s... neutral"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df[TEXT_COL] = list(zip(train_df['sentence1'], train_df['sentence2']))\n",
"dev_df_matched[TEXT_COL] = list(zip(dev_df_matched['sentence1'], dev_df_matched['sentence2']))\n",
"dev_df_mismatched[TEXT_COL] = list(zip(dev_df_mismatched['sentence1'], dev_df_mismatched['sentence2']))\n",
"train_df[[TEXT_COL, LABEL_COL]].head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"train_df = train_df.sample(frac=TRAIN_DATA_USED_PERCENT).reset_index(drop=True)\n",
"dev_df_matched = dev_df_matched.sample(frac=DEV_DATA_USED_PERCENT).reset_index(drop=True)\n",
"dev_df_mismatched = dev_df_mismatched.sample(frac=DEV_DATA_USED_PERCENT).reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokenize and Preprocess\n",
"Before training, we tokenize the sentence texts and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and testing sets."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 392702/392702 [03:25<00:00, 1907.47it/s]\n",
"100%|██████████| 9815/9815 [00:05<00:00, 1961.13it/s]\n",
"100%|██████████| 9832/9832 [00:05<00:00, 1837.42it/s]\n"
]
}
],
"source": [
"tokenizer= Tokenizer(LANGUAGE, to_lower=TO_LOWER, cache_dir=CACHE_DIR)\n",
"\n",
"train_tokens = tokenizer.tokenize(train_df[TEXT_COL])\n",
"dev_matched_tokens = tokenizer.tokenize(dev_df_matched[TEXT_COL])\n",
"dev_mismatched_tokens = tokenizer.tokenize(dev_df_mismatched[TEXT_COL])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In addition, we perform the following preprocessing steps in the cell below:\n",
"\n",
"* Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\n",
"* Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n",
"* Pad or truncate the token lists to the specified max length\n",
"* Return mask lists that indicate paddings' positions\n",
"* Return token type id lists that indicate which sentence the tokens belong to\n",
"\n",
"*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"train_token_ids, train_input_mask, train_token_type_ids = \\\n",
" tokenizer.preprocess_classification_tokens(train_tokens, max_len=MAX_SEQ_LENGTH)\n",
"dev_matched_token_ids, dev_matched_input_mask, dev_matched_token_type_ids = \\\n",
" tokenizer.preprocess_classification_tokens(dev_matched_tokens, max_len=MAX_SEQ_LENGTH)\n",
"dev_mismatched_token_ids, dev_mismatched_input_mask, dev_mismatched_token_type_ids = \\\n",
" tokenizer.preprocess_classification_tokens(dev_mismatched_tokens, max_len=MAX_SEQ_LENGTH)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"label_encoder = LabelEncoder()\n",
"train_labels = label_encoder.fit_transform(train_df[LABEL_COL])\n",
"num_labels = len(np.unique(train_labels))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train and Predict"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Classifier"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"classifier = BERTSequenceClassifier(language=LANGUAGE,\n",
" num_labels=num_labels,\n",
" cache_dir=CACHE_DIR)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train Classifier"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 0%| | 1/12272 [00:10<35:06:53, 10.30s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:1->1228/12272; average training loss:1.199178\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 10%|█ | 1229/12272 [07:20<1:03:16, 2.91it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:1229->2456/12272; average training loss:0.783637\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 20%|██ | 2457/12272 [14:28<55:44, 2.93it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:2457->3684/12272; average training loss:0.692243\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 30%|███ | 3685/12272 [21:37<48:36, 2.94it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:3685->4912/12272; average training loss:0.653206\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 40%|████ | 4913/12272 [28:45<41:36, 2.95it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:4913->6140/12272; average training loss:0.625751\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 50%|█████ | 6141/12272 [35:54<34:44, 2.94it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:6141->7368/12272; average training loss:0.605123\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 60%|██████ | 7369/12272 [42:58<27:46, 2.94it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:7369->8596/12272; average training loss:0.590521\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 70%|███████ | 8597/12272 [50:07<20:52, 2.93it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:8597->9824/12272; average training loss:0.577829\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 80%|████████ | 9825/12272 [57:14<13:46, 2.96it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:9825->11052/12272; average training loss:0.566418\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 90%|█████████ | 11053/12272 [1:04:20<06:53, 2.95it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:11053->12272/12272; average training loss:0.556558\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 100%|██████████| 12272/12272 [1:11:21<00:00, 2.88it/s]\n",
"Iteration: 0%| | 1/12272 [00:00<1:12:29, 2.82it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:2/2; batch:1->1228/12272; average training loss:0.319802\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 10%|█ | 1229/12272 [07:09<1:02:29, 2.95it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:2/2; batch:1229->2456/12272; average training loss:0.331876\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 20%|██ | 2457/12272 [14:15<55:22, 2.95it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:2/2; batch:2457->3684/12272; average training loss:0.333463\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 30%|███ | 3685/12272 [21:21<48:41, 2.94it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:2/2; batch:3685->4912/12272; average training loss:0.331817\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 40%|████ | 4913/12272 [28:25<41:26, 2.96it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:2/2; batch:4913->6140/12272; average training loss:0.327940\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 50%|█████ | 6141/12272 [35:31<34:34, 2.96it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:2/2; batch:6141->7368/12272; average training loss:0.325802\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 60%|██████ | 7369/12272 [42:36<27:48, 2.94it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:2/2; batch:7369->8596/12272; average training loss:0.324641\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 70%|███████ | 8597/12272 [49:42<20:53, 2.93it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:2/2; batch:8597->9824/12272; average training loss:0.322036\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 80%|████████ | 9825/12272 [56:44<13:50, 2.95it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:2/2; batch:9825->11052/12272; average training loss:0.321205\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 90%|█████████ | 11053/12272 [1:03:49<06:54, 2.94it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:2/2; batch:11053->12272/12272; average training loss:0.319237\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 100%|██████████| 12272/12272 [1:10:52<00:00, 2.94it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training time : 2.374 hrs\n"
]
}
],
"source": [
"with Timer() as t:\n",
" classifier.fit(token_ids=train_token_ids,\n",
" input_mask=train_input_mask,\n",
" token_type_ids=train_token_type_ids,\n",
" labels=train_labels,\n",
" num_epochs=NUM_EPOCHS,\n",
" batch_size=BATCH_SIZE,\n",
" lr=LEARNING_RATE,\n",
" warmup_proportion=WARMUP_PROPORTION)\n",
"print(\"Training time : {:.3f} hrs\".format(t.interval / 3600))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predict on Test Data"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 100%|██████████| 307/307 [00:40<00:00, 8.15it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction time : 0.011 hrs\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"with Timer() as t:\n",
" predictions_matched = classifier.predict(token_ids=dev_matched_token_ids,\n",
" input_mask=dev_matched_input_mask,\n",
" token_type_ids=dev_matched_token_type_ids,\n",
" batch_size=BATCH_SIZE)\n",
"print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 100%|██████████| 308/308 [00:38<00:00, 8.30it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction time : 0.011 hrs\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"with Timer() as t:\n",
" predictions_mismatched = classifier.predict(token_ids=dev_mismatched_token_ids,\n",
" input_mask=dev_mismatched_input_mask,\n",
" token_type_ids=dev_mismatched_token_type_ids,\n",
" batch_size=BATCH_SIZE)\n",
"print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
"contradiction 0.848 0.865 0.857 3213\n",
" entailment 0.894 0.828 0.860 3479\n",
" neutral 0.783 0.831 0.806 3123\n",
"\n",
" micro avg 0.841 0.841 0.841 9815\n",
" macro avg 0.842 0.841 0.841 9815\n",
" weighted avg 0.844 0.841 0.842 9815\n",
"\n"
]
}
],
"source": [
"predictions_matched = label_encoder.inverse_transform(predictions_matched)\n",
"print(classification_report(dev_df_matched[LABEL_COL], predictions_matched, digits=3))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
"contradiction 0.862 0.863 0.863 3240\n",
" entailment 0.878 0.853 0.865 3463\n",
" neutral 0.791 0.815 0.803 3129\n",
"\n",
" micro avg 0.844 0.844 0.844 9832\n",
" macro avg 0.844 0.844 0.844 9832\n",
" weighted avg 0.845 0.844 0.845 9832\n",
"\n"
]
}
],
"source": [
"predictions_mismatched = label_encoder.inverse_transform(predictions_mismatched)\n",
"print(classification_report(dev_df_mismatched[LABEL_COL], predictions_mismatched, digits=3))"
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "nlp_gpu",
"language": "python",
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -1,581 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Multi-lingual Inference on XNLI Dataset using BERT"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"In this notebook, we demostrate using the [Multi-lingual BERT model](https://github.com/google-research/bert/blob/master/multilingual.md) to do language inference in Chinese and Hindi. We use the [XNLI](https://github.com/facebookresearch/XNLI) dataset and the task is to classify sentence pairs into three classes: contradiction, entailment, and neutral. \n",
"The figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. It concatenates the tokens in each sentence pairs and separates the sentences by the [SEP] token. A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task.\n",
"<img src=\"https://nlpbp.blob.core.windows.net/images/bert_two_sentence.PNG\">"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import random\n",
"import numpy as np\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"import torch\n",
"\n",
"nlp_path = os.path.abspath('../../')\n",
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"\n",
"from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier\n",
"from utils_nlp.models.bert.common import Language, Tokenizer\n",
"from utils_nlp.dataset.xnli import load_pandas_df\n",
"from utils_nlp.common.timer import Timer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configurations\n",
"Note that the running time shown in this notebook are on a Standard_NC12 Azure Deep Learning Virtual Machine with two NVIDIA Tesla K80 GPUs. If you want to run through the notebook quickly, you can change the `TRAIN_DATA_USED_PERCENT` to a small number, e.g. 0.01. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"TRAIN_DATA_USED_PERCENT = 1.0\n",
"\n",
"# set random seeds\n",
"RANDOM_SEED = 42\n",
"random.seed(RANDOM_SEED)\n",
"np.random.seed(RANDOM_SEED)\n",
"torch.manual_seed(RANDOM_SEED)\n",
"num_cuda_devices = torch.cuda.device_count()\n",
"if num_cuda_devices > 1:\n",
" torch.cuda.manual_seed_all(RANDOM_SEED)\n",
"\n",
"# model configurations\n",
"LANGUAGE_CHINESE = Language.CHINESE\n",
"LANGUAGE_MULTI = Language.MULTILINGUAL\n",
"TO_LOWER = True\n",
"MAX_SEQ_LENGTH = 128\n",
"\n",
"# training configurations\n",
"NUM_GPUS = 2\n",
"BATCH_SIZE = 32\n",
"NUM_EPOCHS = 2\n",
"\n",
"# optimizer configurations\n",
"LEARNING_RATE= 5e-5\n",
"WARMUP_PROPORTION= 0.1\n",
"\n",
"# data configurations\n",
"TEXT_COL = \"text\"\n",
"LABEL_COL = \"label\"\n",
"\n",
"CACHE_DIR = \"./temp\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data\n",
"The XNLI dataset comes in two zip files: \n",
"* XNLI-1.0.zip: dev and test datasets in 15 languages. The original English data was translated into other languages by human translators. \n",
"* XNLI-MT-1.0.zip: training dataset in 15 languages. This dataset is machine translations of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset. It also contains English translations of the dev and test datasets, but not used in this notebook. \n",
"\n",
"The `load_pandas_df` function downloads and extracts the zip files if they don't already exist in `local_cache_path` and returns the data subset specified by `file_split` and `language`."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_df_chinese = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"train\", language=\"zh\")\n",
"dev_df_chinese = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev\", language=\"zh\")\n",
"test_df_chinese = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"test\", language=\"zh\")\n",
"\n",
"train_df_hindi = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"train\", language=\"hi\")\n",
"dev_df_hindi = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev\", language=\"hi\")\n",
"test_df_hindi = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"test\", language=\"hi\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chinese training dataset size: 392702\n",
"Chinese dev dataset size: 2490\n",
"Chinese test dataset size: 5010\n",
"\n",
"Hindi training dataset size: 392702\n",
"Hindi dev dataset size: 2490\n",
"Hindi test dataset size: 5010\n",
"\n",
" text label\n",
"0 (从 概念 上 看 , 奶油 收入 有 两 个 基本 方面 产品 和 地理 ., 产品 和 ... neutral\n",
"1 (你 知道 在 这个 季节 , 我 猜 在 你 的 水平 你 把 他们 丢到 下 一个 水平... entailment\n",
"2 (我们 的 一个 号码 会 非常 详细 地 执行 你 的 指示, 我 团队 的 一个 成员 ... entailment\n",
"3 (你 怎么 知道 的 ? 所有 这些 都 是 他们 的 信息 ., 这些 信息 属于 他们 .) entailment\n",
"4 (是 啊 , 我 告诉 你 , 如果 你 去 买 一些 网球鞋 , 我 可以 看到 为什么 ... neutral\n",
" text label\n",
"0 (Conceptually क ् रीम एंजलिस में दो मूल आयाम ह... neutral\n",
"1 (आप मौसम के दौरान जानते हैं और मैं अपने स ् तर... entailment\n",
"2 (हमारे एक नंबर में से एक आपके निर ् देशों को म... entailment\n",
"3 (आप कैसे जानते हैं ? ये सब उनकी जानकारी फिर से... entailment\n",
"4 (हाँ मैं आपको बताता हूँ कि अगर आप उन टेनिस जूत... neutral\n"
]
}
],
"source": [
"print(\"Chinese training dataset size: {}\".format(train_df_chinese.shape[0]))\n",
"print(\"Chinese dev dataset size: {}\".format(dev_df_chinese.shape[0]))\n",
"print(\"Chinese test dataset size: {}\".format(test_df_chinese.shape[0]))\n",
"print()\n",
"print(\"Hindi training dataset size: {}\".format(train_df_hindi.shape[0]))\n",
"print(\"Hindi dev dataset size: {}\".format(dev_df_hindi.shape[0]))\n",
"print(\"Hindi test dataset size: {}\".format(test_df_hindi.shape[0]))\n",
"print()\n",
"print(train_df_chinese.head())\n",
"print(train_df_hindi.head())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"train_data_used_count = round(TRAIN_DATA_USED_PERCENT * train_df_chinese.shape[0])\n",
"train_df_chinese = train_df_chinese.loc[:train_data_used_count]\n",
"train_df_hindi = train_df_hindi.loc[:train_data_used_count]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Language Inference on Chinese\n",
"For Chinese dataset, we use the `bert-base-chinese` model which was pretrained on Chinese dataset only. The `bert-base-multilingual-cased` model can also be used on Chinese, but the accuracy is 3% lower."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tokenize and Preprocess\n",
"Before training, we tokenize the sentence texts and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and testing sets."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 392702/392702 [02:26<00:00, 2682.67it/s]\n",
"100%|██████████| 5010/5010 [00:01<00:00, 3122.04it/s]\n"
]
}
],
"source": [
"tokenizer_chinese = Tokenizer(LANGUAGE_CHINESE, to_lower=TO_LOWER, cache_dir=CACHE_DIR)\n",
"\n",
"train_tokens_chinese = tokenizer_chinese.tokenize(train_df_chinese[TEXT_COL])\n",
"test_tokens_chinese= tokenizer_chinese.tokenize(test_df_chinese[TEXT_COL])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In addition, we perform the following preprocessing steps in the cell below:\n",
"\n",
"* Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\n",
"* Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n",
"* Pad or truncate the token lists to the specified max length\n",
"* Return mask lists that indicate paddings' positions\n",
"* Return token type id lists that indicate which sentence the tokens belong to\n",
"\n",
"*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"train_token_ids_chinese, train_input_mask_chinese, train_token_type_ids_chinese = \\\n",
" tokenizer_chinese.preprocess_classification_tokens(train_tokens_chinese, max_len=MAX_SEQ_LENGTH)\n",
"test_token_ids_chinese, test_input_mask_chinese, test_token_type_ids_chinese = \\\n",
" tokenizer_chinese.preprocess_classification_tokens(test_tokens_chinese, max_len=MAX_SEQ_LENGTH)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"label_encoder_chinese = LabelEncoder()\n",
"train_labels_chinese = label_encoder_chinese.fit_transform(train_df_chinese[LABEL_COL])\n",
"num_labels_chinese = len(np.unique(train_labels_chinese))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Classifier"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"classifier_chinese = BERTSequenceClassifier(language=LANGUAGE_CHINESE,\n",
" num_labels=num_labels_chinese,\n",
" cache_dir=CACHE_DIR)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train Classifier"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:1->1228/12271; loss:1.194384\n",
"epoch:1/2; batch:1229->2456/12271; loss:0.863067\n",
"epoch:1/2; batch:2457->3684/12271; loss:0.781256\n",
"epoch:1/2; batch:3685->4912/12271; loss:1.067413\n",
"epoch:1/2; batch:4913->6140/12271; loss:0.599279\n",
"epoch:1/2; batch:6141->7368/12271; loss:0.471488\n",
"epoch:1/2; batch:7369->8596/12271; loss:0.572327\n",
"epoch:1/2; batch:8597->9824/12271; loss:0.689093\n",
"epoch:1/2; batch:9825->11052/12271; loss:0.651702\n",
"epoch:1/2; batch:11053->12271/12271; loss:0.431085\n",
"epoch:2/2; batch:1->1228/12271; loss:0.255859\n",
"epoch:2/2; batch:1229->2456/12271; loss:0.434052\n",
"epoch:2/2; batch:2457->3684/12271; loss:0.433569\n",
"epoch:2/2; batch:3685->4912/12271; loss:0.405915\n",
"epoch:2/2; batch:4913->6140/12271; loss:0.636128\n",
"epoch:2/2; batch:6141->7368/12271; loss:0.416685\n",
"epoch:2/2; batch:7369->8596/12271; loss:0.265789\n",
"epoch:2/2; batch:8597->9824/12271; loss:0.328964\n",
"epoch:2/2; batch:9825->11052/12271; loss:0.436310\n",
"epoch:2/2; batch:11053->12271/12271; loss:0.374193\n",
"Training time : 8.050 hrs\n"
]
}
],
"source": [
"with Timer() as t:\n",
" classifier_chinese.fit(token_ids=train_token_ids_chinese,\n",
" input_mask=train_input_mask_chinese,\n",
" token_type_ids=train_token_type_ids_chinese,\n",
" labels=train_labels_chinese,\n",
" num_gpus=NUM_GPUS,\n",
" num_epochs=NUM_EPOCHS,\n",
" batch_size=BATCH_SIZE,\n",
" lr=LEARNING_RATE,\n",
" warmup_proportion=WARMUP_PROPORTION)\n",
"print(\"Training time : {:.3f} hrs\".format(t.interval / 3600))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predict on Test Data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"5024it [00:54, 101.88it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction time : 0.015 hrs\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"with Timer() as t:\n",
" predictions_chinese = classifier_chinese.predict(token_ids=test_token_ids_chinese,\n",
" input_mask=test_input_mask_chinese,\n",
" token_type_ids=test_token_type_ids_chinese,\n",
" batch_size=BATCH_SIZE)\n",
"print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Evaluate"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
"contradiction 0.81 0.84 0.82 1670\n",
" entailment 0.84 0.68 0.76 1670\n",
" neutral 0.70 0.80 0.74 1670\n",
"\n",
" accuracy 0.77 5010\n",
" macro avg 0.78 0.77 0.77 5010\n",
" weighted avg 0.78 0.77 0.77 5010\n",
"\n"
]
}
],
"source": [
"predictions_chinese = label_encoder_chinese.inverse_transform(predictions_chinese)\n",
"print(classification_report(test_df_chinese[LABEL_COL], predictions_chinese))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Language Inference on Hindi\n",
"For Hindi and all other languages except Chinese, we use the `bert-base-multilingual-cased` model. \n",
"The preprocesing, model training, and prediction steps are the same as on Chinese data, except for the underlying tokenizer and BERT model used"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tokenize and Preprocess"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 392702/392702 [03:48<00:00, 1719.84it/s]\n",
"100%|██████████| 5010/5010 [00:02<00:00, 1916.46it/s]\n"
]
}
],
"source": [
"tokenizer_multi = Tokenizer(LANGUAGE_MULTI, cache_dir=CACHE_DIR)\n",
"\n",
"train_tokens_hindi = tokenizer_multi.tokenize(train_df_hindi[TEXT_COL])\n",
"test_tokens_hindi= tokenizer_multi.tokenize(test_df_hindi[TEXT_COL])\n",
"\n",
"train_token_ids_hindi, train_input_mask_hindi, train_token_type_ids_hindi = \\\n",
" tokenizer_multi.preprocess_classification_tokens(train_tokens_hindi, max_len=MAX_SEQ_LENGTH)\n",
"test_token_ids_hindi, test_input_mask_hindi, test_token_type_ids_hindi = \\\n",
" tokenizer_multi.preprocess_classification_tokens(test_tokens_hindi, max_len=MAX_SEQ_LENGTH)\n",
"\n",
"label_encoder_hindi = LabelEncoder()\n",
"train_labels_hindi = label_encoder_hindi.fit_transform(train_df_hindi[LABEL_COL])\n",
"num_labels_hindi = len(np.unique(train_labels_hindi))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create and Train Classifier"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/2; batch:1->1228/12271; loss:1.091754\n",
"epoch:1/2; batch:1229->2456/12271; loss:0.992931\n",
"epoch:1/2; batch:2457->3684/12271; loss:1.045146\n",
"epoch:1/2; batch:3685->4912/12271; loss:0.799912\n",
"epoch:1/2; batch:4913->6140/12271; loss:0.815425\n",
"epoch:1/2; batch:6141->7368/12271; loss:0.564856\n",
"epoch:1/2; batch:7369->8596/12271; loss:0.726981\n",
"epoch:1/2; batch:8597->9824/12271; loss:0.764087\n",
"epoch:1/2; batch:9825->11052/12271; loss:0.964115\n",
"epoch:1/2; batch:11053->12271/12271; loss:0.502252\n",
"epoch:2/2; batch:1->1228/12271; loss:0.601600\n",
"epoch:2/2; batch:1229->2456/12271; loss:0.695099\n",
"epoch:2/2; batch:2457->3684/12271; loss:0.419610\n",
"epoch:2/2; batch:3685->4912/12271; loss:0.603106\n",
"epoch:2/2; batch:4913->6140/12271; loss:0.705180\n",
"epoch:2/2; batch:6141->7368/12271; loss:0.493404\n",
"epoch:2/2; batch:7369->8596/12271; loss:0.864921\n",
"epoch:2/2; batch:8597->9824/12271; loss:0.518601\n",
"epoch:2/2; batch:9825->11052/12271; loss:0.395920\n",
"epoch:2/2; batch:11053->12271/12271; loss:0.685858\n",
"Training time : 9.520 hrs\n"
]
}
],
"source": [
"classifier_multi = BERTSequenceClassifier(language=LANGUAGE_MULTI,\n",
" num_labels=num_labels_hindi,\n",
" cache_dir=CACHE_DIR)\n",
"with Timer() as t:\n",
" classifier_multi.fit(token_ids=train_token_ids_hindi,\n",
" input_mask=train_input_mask_hindi,\n",
" token_type_ids=train_token_type_ids_hindi,\n",
" labels=train_labels_hindi,\n",
" num_gpus=NUM_GPUS,\n",
" num_epochs=NUM_EPOCHS,\n",
" batch_size=BATCH_SIZE,\n",
" lr=LEARNING_RATE,\n",
" warmup_proportion=WARMUP_PROPORTION)\n",
"print(\"Training time : {:.3f} hrs\".format(t.interval / 3600))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predict and Evaluate"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"5024it [01:02, 87.10it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction time : 0.017 hrs\n",
" precision recall f1-score support\n",
"\n",
"contradiction 0.69 0.72 0.70 1670\n",
" entailment 0.74 0.51 0.60 1670\n",
" neutral 0.58 0.74 0.65 1670\n",
"\n",
" accuracy 0.65 5010\n",
" macro avg 0.67 0.65 0.65 5010\n",
" weighted avg 0.67 0.65 0.65 5010\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"with Timer() as t:\n",
" predictions_hindi = classifier_multi.predict(token_ids=test_token_ids_hindi,\n",
" input_mask=test_input_mask_hindi,\n",
" token_type_ids=test_token_type_ids_hindi,\n",
" batch_size=BATCH_SIZE)\n",
"print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))\n",
"predictions_hindi= label_encoder_hindi.inverse_transform(predictions_hindi)\n",
"print(classification_report(test_df_hindi[LABEL_COL], predictions_hindi))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -1,8 +1,25 @@
# Named Entity Recognition (NER)
This folder contains examples and best practices, written in Jupyter notebooks, for building Named Entity Recognition models. The models can be used in a wide variety of applications, such as information extraction and filtering. It also plays an important role in other
NLP tasks like question answering and text summarization.
## What is Named Entity Recognition (NER)
Named Entity Recognition (NER) is the task of detecting and classifying
real-world objects mentioned in text. Common named entities include person
names, locations, organizations, etc. The state-of-the art NER methods include
combining Long Short-Term Memory neural network with Conditional Random Field
(LSTM-CRF) and pretrained language models like BERT. NER can be used for
information extraction and filtering. It also plays an important role in other
NLP tasks like question answering and text summarization.
names, locations, organizations, etc. The [state-of-the art](https://paperswithcode.com/task/named-entity-recognition-ner) NER methods include combining Long Short-Term Memory neural network with Conditional Random Field
(LSTM-CRF) and pretrained language models like BERT.
The figure below illustrates how BERT can be fine tuned for NER tasks. The input data is a list of tokens representing a sentence. In the training data, each token has an entity label. After fine tuning, the model predicts an entity label for each token in a given testing sentence.
<p align="center">
<img src="https://nlpbp.blob.core.windows.net/images/bert_architecture.png" alt=" Fine-tuned BERT for NER tasks"/>
</p>
## Summary
The following summarizes each notebook for NER. Each notebook provides more details and guiding in principles on building state of the art models.
|Notebook|Runs Local|Description|
|---|---|---|
|[Bert](ner_wikigold_bert.ipynb)| Yes| Fine-tune a [pretrained BERT model](https://github.com/huggingface/pytorch-pretrained-BERT) using the [wikigold dataset](https://www.aclweb.org/anthology/W09-3302) for token classification.|

Просмотреть файл

@ -194,6 +194,7 @@
"sys.path.append(\"../../\")\n",
"import json\n",
"from urllib.request import urlretrieve\n",
"import scrapbook as sb\n",
"\n",
"#import utils\n",
"from utils_nlp.common.timer import Timer\n",
@ -211,6 +212,38 @@
"print(\"Azure ML SDK Version:\", aml.core.VERSION)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"PROJECT_FOLDER = \"./bidaf-question-answering\"\n",
"SQUAD_FOLDER = \"./squad\"\n",
"BIDAF_CONFIG_PATH = \".\"\n",
"LOGS_FOLDER = '.'\n",
"NUM_EPOCHS = 25\n",
"PIP_PACKAGES = [\n",
" \"allennlp==0.8.4\",\n",
" \"azureml-sdk==1.0.48\",\n",
" \"https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz\",\n",
" ]\n",
"CONDA_PACKAGES = [\"jsonnet\", \"cmake\", \"regex\", \"pytorch\", \"torchvision\"]\n",
"config_path = (\n",
" \"./.azureml\"\n",
") # Path to the directory containing config.json with azureml credentials\n",
"\n",
"# Azure resources\n",
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
"workspace_region = \"YOUR_WORKSPACE_REGION\" #Possible values eastus, eastus2 and so on."
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -240,14 +273,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.\n",
"\n",
"**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook"
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. Then enter the configuration variables into the cell above."
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -275,16 +306,17 @@
],
"source": [
"ws = azureml_utils.get_or_create_workspace(\n",
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
" resource_group=\"<RESOURCE_GROUP>\",\n",
" workspace_name=\"<WORKSPACE_NAME>\",\n",
" workspace_region=\"<WORKSPACE_REGION>\",\n",
" config_path=config_path,\n",
" subscription_id=subscription_id,\n",
" resource_group=resource_group,\n",
" workspace_name=workspace_name,\n",
" workspace_region=workspace_region,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -313,20 +345,19 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Make a folder for the project\n",
"project_folder = \"./bidaf-question-answering\"\n",
"os.makedirs(project_folder, exist_ok=True)\n",
"os.makedirs(PROJECT_FOLDER, exist_ok=True)\n",
"\n",
"# Set up an experiment\n",
"experiment_name = \"bidaf-question-answering\"\n",
"experiment_name = \"NLP-QA-BiDAF-deepdive\"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"# Add logging to our experiment\n",
"run = experiment.start_logging()"
"run = experiment.start_logging(snapshot_directory=PROJECT_FOLDER)"
]
},
{
@ -347,7 +378,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@ -355,7 +386,7 @@
"output_type": "stream",
"text": [
"Found existing compute target.\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-17T17:18:24.507000+00:00', 'errors': None, 'creationTime': '2019-07-09T16:20:30.625908+00:00', 'modifiedTime': '2019-07-09T16:20:46.601973+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-23T16:18:34.392000+00:00', 'errors': None, 'creationTime': '2019-07-09T16:20:30.625908+00:00', 'modifiedTime': '2019-07-09T16:20:46.601973+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
]
}
],
@ -404,31 +435,31 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('squad/squad_dev.json', <http.client.HTTPMessage at 0x2640d393320>)"
"('./squad/squad_dev.json', <http.client.HTTPMessage at 0x2646892de10>)"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.makedirs(\"squad\", exist_ok=True) # make squad folder locally\n",
"os.makedirs(SQUAD_FOLDER, exist_ok=True) # make squad folder locally\n",
"\n",
"urlretrieve(\n",
" \"https://allennlp.s3.amazonaws.com/datasets/squad/squad-train-v1.1.json\",\n",
" filename=\"squad/squad_train.json\",\n",
" filename=SQUAD_FOLDER+\"/squad_train.json\",\n",
")\n",
"\n",
"urlretrieve(\n",
" \"https://allennlp.s3.amazonaws.com/datasets/squad/squad-dev-v1.1.json\",\n",
" filename=\"squad/squad_dev.json\",\n",
" filename=SQUAD_FOLDER+\"/squad_dev.json\",\n",
")"
]
},
@ -441,22 +472,22 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'squad\\\\bidaf_config.json'"
"'./squad\\\\bidaf_config.json'"
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shutil.copy('bidaf_config.json', \"squad\")"
"shutil.copy(BIDAF_CONFIG_PATH+'/bidaf_config.json', SQUAD_FOLDER)"
]
},
{
@ -468,7 +499,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@ -488,10 +519,10 @@
{
"data": {
"text/plain": [
"$AZUREML_DATAREFERENCE_64cd400292b5405d9deea6ee03786597"
"$AZUREML_DATAREFERENCE_09a567b57ea546b697d8d7ce1bcf2d86"
]
},
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -503,7 +534,7 @@
"\n",
"# Upload files in squad data folder to the datastore\n",
"ds.upload(\n",
" src_dir=\"./squad\", target_path=\"squad_data\", overwrite=True, show_progress=True\n",
" src_dir=SQUAD_FOLDER, target_path=\"squad_data\", overwrite=True, show_progress=True\n",
")"
]
},
@ -530,19 +561,19 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing ./bidaf-question-answering/train.py\n"
"Overwriting ./bidaf-question-answering/train.py\n"
]
}
],
"source": [
"%%writefile $project_folder/train.py\n",
"%%writefile $PROJECT_FOLDER/train.py\n",
"import torch\n",
"import argparse\n",
"import os\n",
@ -588,7 +619,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -597,26 +628,22 @@
"'bidafenv.yml'"
]
},
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"myenv = CondaDependencies.create(\n",
" conda_packages=[\"jsonnet\", \"cmake\", \"regex\", \"pytorch\", \"torchvision\"],\n",
" pip_packages=[\n",
" \"allennlp==0.8.4\",\n",
" \"azureml-sdk==1.0.48\",\n",
" \"https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz\",\n",
" ],\n",
" conda_packages= CONDA_PACKAGES,\n",
" pip_packages= PIP_PACKAGES,\n",
" python_version=\"3.6.8\",\n",
")\n",
"myenv.add_channel(\"conda-forge\")\n",
"myenv.add_channel(\"pytorch\")\n",
"\n",
"conda_env_file_name = \"bidafenv.yml\"\n",
"myenv.save_to_file(project_folder, conda_env_file_name)"
"myenv.save_to_file(PROJECT_FOLDER, conda_env_file_name)"
]
},
{
@ -628,11 +655,11 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"overrides = {\"trainer\":{'num_epochs': 25}}\n",
"overrides = {\"trainer\":{'num_epochs': NUM_EPOCHS}}\n",
"overrides = json.dumps(overrides)"
]
},
@ -645,7 +672,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"outputs": [
{
@ -667,7 +694,7 @@
"}\n",
"\n",
"estimator = PyTorch(\n",
" source_directory=project_folder,\n",
" source_directory=PROJECT_FOLDER,\n",
" script_params=script_params,\n",
" compute_target=compute_target,\n",
" entry_script=\"train.py\",\n",
@ -692,7 +719,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@ -700,7 +727,7 @@
"output_type": "stream",
"text": [
"Run(Experiment: bidaf-question-answering,\n",
"Id: bidaf-question-answering_1563384448_9b24b038,\n",
"Id: bidaf-question-answering_1563899344_bce3c688,\n",
"Type: azureml.scriptrun,\n",
"Status: Starting)\n"
]
@ -713,13 +740,13 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "673919a19d314f628760e4c896992a34",
"model_id": "3da61f9cf1a84f91ae23925843b584d7",
"version_major": 2,
"version_minor": 0
},
@ -737,7 +764,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@ -756,7 +783,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@ -779,11 +806,11 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"run.download_files(prefix=\"./logs/\", output_directory=\".\")"
"run.download_files(prefix=\"./logs\", output_directory=LOGS_FOLDER)"
]
},
{
@ -802,24 +829,43 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.6152317880794702,
"encoder": "json",
"name": "validation_EM",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "validation_EM"
}
},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"0.6674550614947966"
"0.6152317880794702"
]
},
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with open(\"./logs/metrics.json\") as f:\n",
"with open(LOGS_FOLDER+\"/logs/metrics.json\") as f:\n",
" metrics = json.load(f)\n",
"\n",
"sb.glue(\"validation_EM\", metrics[\"best_validation_em\"])\n",
"metrics[\"best_validation_em\"]"
]
},
@ -839,19 +885,19 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING - _jsonnet not loaded, treating ./logs/config.json as json\n"
"WARNING - _jsonnet not loaded, treating ./logs\\config.json as json\n"
]
}
],
"source": [
"model = Predictor.from_path('./logs/')"
"model = Predictor.from_path(LOGS_FOLDER+\"/logs\")"
]
},
{
@ -863,7 +909,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@ -884,7 +930,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@ -893,7 +939,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 23,
"metadata": {},
"outputs": [
{
@ -913,6 +959,7 @@
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
@ -928,7 +975,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.6.8"
}
},
"nbformat": 4,

Просмотреть файл

@ -0,0 +1,103 @@
{
"dataset_reader": {
"type": "squad",
"token_indexers": {
"tokens": {
"type": "single_id",
"lowercase_tokens": true
},
"token_characters": {
"type": "characters",
"character_tokenizer": {
"byte_encoding": "utf-8",
"start_tokens": [259],
"end_tokens": [260]
},
"min_padding_length": 5
}
}
},
"train_data_path": "https://allennlp.s3.amazonaws.com/datasets/squad/squad-train-v1.1.json",
"validation_data_path": "https://allennlp.s3.amazonaws.com/datasets/squad/squad-dev-v1.1.json",
"evaluate_on_test": true,
"model": {
"type": "bidaf",
"text_field_embedder": {
"token_embedders": {
"tokens": {
"type": "embedding",
"pretrained_file": "https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.100d.txt.gz",
"embedding_dim": 100,
"trainable": false
},
"token_characters": {
"type": "character_encoding",
"embedding": {
"num_embeddings": 262,
"embedding_dim": 16
},
"encoder": {
"type": "cnn",
"embedding_dim": 16,
"num_filters": 100,
"ngram_filter_sizes": [5]
},
"dropout": 0.2
}
}
},
"num_highway_layers": 2,
"phrase_layer": {
"type": "lstm",
"bidirectional": true,
"input_size": 200,
"hidden_size": 100,
"num_layers": 1
},
"similarity_function": {
"type": "linear",
"combination": "x,y,x*y",
"tensor_1_dim": 200,
"tensor_2_dim": 200
},
"modeling_layer": {
"type": "lstm",
"bidirectional": true,
"input_size": 800,
"hidden_size": 100,
"num_layers": 2,
"dropout": 0.2
},
"span_end_encoder": {
"type": "lstm",
"bidirectional": true,
"input_size": 1400,
"hidden_size": 100,
"num_layers": 1
},
"dropout": 0.2
},
"iterator": {
"type": "bucket",
"sorting_keys": [["passage", "num_tokens"], ["question", "num_tokens"]],
"batch_size": 40
},
"trainer": {
"num_epochs": 20,
"grad_norm": 5.0,
"patience": 10,
"validation_metric": "+em",
"cuda_device": 0,
"learning_rate_scheduler": {
"type": "reduce_on_plateau",
"factor": 0.5,
"mode": "max",
"patience": 2
},
"optimizer": {
"type": "adam",
"betas": [0.9, 0.9]
}
}
}

Просмотреть файл

@ -135,9 +135,11 @@
"import math\n",
"import json \n",
"import pandas as pd\n",
"import papermill as pm\n",
"#package for flattening json in pandas df\n",
"from pandas.io.json import json_normalize\n",
"import shutil\n",
"import scrapbook as sb\n",
"# Check core SDK version number\n",
"import azureml.core\n",
"from azureml.core import Datastore\n",
@ -153,6 +155,35 @@
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Model configuration\n",
"AZUREML_CONFIG_PATH = \"./.azureml\"\n",
"DATA_FOLDER = './squad'\n",
"PROJECT_FOLDER = './pytorch-transformers'\n",
"EXPERIMENT_NAME = 'NLP-QA-BERT-deepdive'\n",
"BERT_MODEL = 'bert-large-uncased'\n",
"TARGET_GRADIENT_STEPS = 16\n",
"INIT_GRADIENT_STEPS = 2\n",
"MAX_SEQ_LENGTH = 384\n",
"NUM_TRAIN_EPOCHS = 2.0\n",
"NODE_COUNT = 2\n",
"TRAIN_SCRIPT_PATH = 'bert_run_squad_azureml.py'\n",
"MAX_TOTAL_RUNS = 8\n",
"MAX_CONCURRENT_RUNS = 4\n",
"BERT_UTIL_PATH = '../../utils_nlp/azureml/azureml_bert_util.py'\n",
"EVALUATE_SQAD_PATH = '../../utils_nlp/eval/evaluate_squad.py'\n",
"AZUREML_VERBOSE = False"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -169,47 +200,26 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Performing interactive authentication. Please follow the instructions on the terminal.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING - Note, we have launched a browser for you to login. For old experience with device code, use \"az login --use-device-code\"\n",
"WARNING - You have logged in. Now let us find all the subscriptions to which you have access...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Interactive authentication successfully completed.\n",
"Workspace name: MAIDAIPBERT-eastus\n",
"Azure region: eastus\n",
"Subscription id: 15ae9cb6-95c1-483d-a0e3-b1a1a3b06324\n",
"Resource group: nlprg\n"
]
}
],
"outputs": [],
"source": [
"ws = azureml_utils.get_or_create_workspace(\n",
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
" resource_group=\"<RESOURCE_GROUP>\",\n",
" workspace_name=\"<WORKSPACE_NAME>\",\n",
" workspace_region=\"<WORKSPACE_REGION>\"\n",
")\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep='\\n')"
"if os.path.exists(AZUREML_CONFIG_PATH):\n",
" ws = azureml_utils.get_or_create_workspace(config_path=AZUREML_CONFIG_PATH)\n",
"else:\n",
" ws = azureml_utils.get_or_create_workspace(\n",
" config_path=AZUREML_CONFIG_PATH,\n",
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
" resource_group=\"<RESOURCE_GROUP>\",\n",
" workspace_name=\"<WORKSPACE_NAME>\",\n",
" workspace_region=\"<WORKSPACE_REGION>\",\n",
" )\n",
"\n",
"if AZUREML_VERBOSE:\n",
" print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep='\\n')"
]
},
{
@ -221,7 +231,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -255,11 +265,11 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"data_folder = './squad'"
"data_folder = DATA_FOLDER"
]
},
{
@ -275,7 +285,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@ -288,7 +298,8 @@
],
"source": [
"ds = ws.get_default_datastore()\n",
"print(ds.datastore_type, ds.account_name, ds.container_name, ds.as_mount())"
"if AZUREML_VERBOSE:\n",
" print(ds.datastore_type, ds.account_name, ds.container_name, ds.as_mount())"
]
},
{
@ -309,16 +320,16 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('./squad\\\\dev-v1.1.json', <http.client.HTTPMessage at 0x1e1a85ef198>)"
"('./squad\\\\dev-v1.1.json', <http.client.HTTPMessage at 0x1569b645f28>)"
]
},
"execution_count": 14,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -339,7 +350,7 @@
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -392,7 +403,7 @@
"0 [{'answers': [{'answer_start': 515, 'text': 'S... "
]
},
"execution_count": 57,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -412,7 +423,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@ -499,7 +510,7 @@
"4 5733be284776f4190066117e What sits on top of the Main Building at Notre... "
]
},
"execution_count": 58,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -525,32 +536,34 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING - Target already exists. Skipping upload for ./squad\\dev-v1.1.json\n",
"WARNING - Target already exists. Skipping upload for ./squad\\train-v1.1.json\n"
"Uploading an estimated of 2 files\n",
"Target already exists. Skipping upload for squad\\dev-v1.1.json\n",
"Target already exists. Skipping upload for squad\\train-v1.1.json\n",
"Uploaded 0 files\n"
]
},
{
"data": {
"text/plain": [
"$AZUREML_DATAREFERENCE_5a4cead96ec140b8b5884e917df16e3a"
"$AZUREML_DATAREFERENCE_972d18f476b34d26a1ffd6a11b473114"
]
},
"execution_count": 15,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds.upload(src_dir='./squad', target_path='./squad')"
"ds.upload(src_dir='./squad', target_path='./squad', show_progress=AZUREML_VERBOSE)"
]
},
{
@ -595,7 +608,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -603,7 +616,7 @@
"output_type": "stream",
"text": [
"Found existing compute target.\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-12T22:32:24.801000+00:00', 'errors': None, 'creationTime': '2019-07-12T19:59:45.933132+00:00', 'modifiedTime': '2019-07-12T20:00:01.793458+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}\n"
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-22T22:38:04.496000+00:00', 'errors': None, 'creationTime': '2019-07-12T19:59:45.933132+00:00', 'modifiedTime': '2019-07-12T20:00:01.793458+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}\n"
]
}
],
@ -638,11 +651,11 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"project_folder = './pytorch-transformers'"
"project_folder = PROJECT_FOLDER"
]
},
{
@ -654,7 +667,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -694,49 +707,29 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve and copy the training script [bert_run_squad_azureml.py](.\\bert_run_squad_azureml.py), evaluation script for SQuAD v1.1 [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/498c8026d92a8bcf0286e2d216d092d444d02d76/squad/evaluate-v1.1.py) and the helper utility script for Horovod [azureml_bert_util.py](https://github.com/microsoft/AzureML-BERT/blob/master/PyTorch/azureml_bert_util.py) into our project directory."
"Let's retrieve and copy the training script [bert_run_squad_azureml.py](.\\bert_run_squad_azureml.py), evaluation script for SQuAD v1.1 [evaluate-v1.1.py](../../utils_nlp/eval/evaluate_squad.py) and the helper utility script for Horovod [azureml_bert_util.py](../../utils_nlp/azureml/azureml_bert_util.py) into our project directory."
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('./pytorch-pretrained-BERT\\\\evaluate_squad.py',\n",
" <http.client.HTTPMessage at 0x25103433c88>)"
"'./pytorch-transformers\\\\bert_run_squad_azureml.py'"
]
},
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"urllib.request.urlretrieve('https://raw.githubusercontent.com/allenai/bi-att-flow/498c8026d92a8bcf0286e2d216d092d444d02d76/squad/evaluate-v1.1.py', filename= project_folder + '\\\\evaluate_squad.py')\n",
"urllib.request.urlretrieve('https://raw.githubusercontent.com/microsoft/AzureML-BERT/master/finetune/PyTorch/azureml_bert_util.py', filename= project_folder + '\\\\azureml_bert_util.py')"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'./pytorch-pretrained-BERT\\\\bert_run_squad_azureml.py'"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shutil.copy('bert_run_squad_azureml.py', project_folder)"
"shutil.copy(EVALUATE_SQAD_PATH, project_folder)\n",
"shutil.copy(BERT_UTIL_PATH, project_folder)\n",
"shutil.copy(TRAIN_SCRIPT_PATH, project_folder)"
]
},
{
@ -784,26 +777,26 @@
"estimator = PyTorch(source_directory=project_folder,\n",
" compute_target=gpu_compute_target,\n",
" script_params = {\n",
" '--bert_model':'bert-large-uncased',\n",
" '--bert_model':BERT_MODEL,\n",
" '--do_train' : '',\n",
" '--do_predict': '',\n",
" '--train_file': ds.path('squad/train-v1.1.json').as_mount(),\n",
" '--predict_file': ds.path('squad/dev-v1.1.json').as_mount(),\n",
" '--max_seq_length': 384,\n",
" '--max_seq_length': MAX_SEQ_LENGTH,\n",
" '--train_batch_size': 8,\n",
" '--learning_rate': 6.8e-5,\n",
" '--num_train_epochs': 2.0,\n",
" '--num_train_epochs': NUM_TRAIN_EPOCHS,\n",
" '--doc_stride': 128,\n",
" '--seed': 32,\n",
" '--init_gradient_accumulation_steps':2,\n",
" '--target_gradient_accumulation_steps':16,\n",
" '--init_gradient_accumulation_steps':INIT_GRADIENT_STEPS,\n",
" '--target_gradient_accumulation_steps':TARGET_GRADIENT_STEPS,\n",
" '--accumulation_warmup_proportion':0.25,\n",
" '--output_dir': './outputs',\n",
" '--loss_scale':256,\n",
" },\n",
" custom_docker_image='azuremlsamples/bert:torch-1.0.0-apex-cuda9',\n",
" entry_script='bert_run_squad_azureml.py',\n",
" node_count=2,\n",
" node_count=NODE_COUNT,\n",
" distributed_training=mpiConfig,\n",
" framework_version='1.1',\n",
" use_gpu=True)\n",
@ -814,7 +807,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note: You can try with `--bert_model':'bert-base-uncased`to run a smaller bert model faster.**"
"**Note: You can try with `--bert_model:'bert-base-uncased'`to run a smaller bert model faster.**"
]
},
{
@ -831,7 +824,7 @@
"metadata": {},
"outputs": [],
"source": [
"experiment_name = 'BERT-SQuAD'\n",
"experiment_name = EXPERIMENT_NAME\n",
"experiment = Experiment(ws, name=experiment_name)"
]
},
@ -874,6 +867,15 @@
"RunDetails(run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"_ = run.wait_for_completion(show_output=AZUREML_VERBOSE) # Block until complete"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -936,7 +938,15 @@
"metadata": {},
"source": [
"### 3.1 Start a hyperparameter sweep\n",
"First, we will define the hyperparameter space to sweep over. In this example we will use random sampling to try different configuration sets of hyperparameter to minimize our primary metric, the f1 score (`f1`). For simplicity, we tune the BERT base model with `--bert_model':'bert-base-uncased` and `node_count=1`."
"First, we will define the hyperparameter space to sweep over. In this example we will use random sampling to try different configuration sets of hyperparameter to minimize our primary metric, the f1 score (`f1`). For simplicity, we tune the BERT base model with `--bert_model':'bert-base-uncased` and `node_count=1`.\n",
"\n",
"We can also try with `BayesianParameterSampling` with suggested `max_total_runs=20`.\n",
"```Python\n",
"param_sampling = BayesianParameterSampling( {\n",
" 'learning_rate': uniform(5e-5, 9e-5),\n",
" }\n",
")\n",
"```"
]
},
{
@ -953,8 +963,8 @@
" hyperparameter_sampling=param_sampling, \n",
" primary_metric_name='f1',\n",
" primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\n",
" max_total_runs=8,\n",
" max_concurrent_runs=4)"
" max_total_runs=MAX_TOTAL_RUNS,\n",
" max_concurrent_runs=MAX_CONCURRENT_RUNS)"
]
},
{
@ -971,7 +981,8 @@
"outputs": [],
"source": [
"# start the HyperDrive run\n",
"hyperdrive_run = experiment.submit(hyperdrive_config)"
"hyperdrive_run = experiment.submit(hyperdrive_config)\n",
"RunDetails(hyperdrive_run).show()"
]
},
{
@ -990,7 +1001,7 @@
"metadata": {},
"outputs": [],
"source": [
"RunDetails(hyperdrive_run).show()"
"_ = hyperdrive_run.wait_for_completion(show_output=AZUREML_VERBOSE) # Block until complete"
]
},
{
@ -1017,7 +1028,7 @@
"metadata": {},
"source": [
"### 3.3 Find and register the best model\n",
"Once all the runs complete, we can find the run that produced the model with the highest F1 score. The F1 score with default learning rate is **86.18** in [Submit and Monitor your run](#2.6-Submit-and-Monitor-your-run) . The best F1 score is **87.01** after tuning with `learning rate=0.000090`."
"Once all the runs complete, we can find the run that produced the model with the highest F1 score. The F1 score with default learning rate is **86.18** in [Submit and Monitor your run](#2.6-Submit-and-Monitor-your-run) . The best F1 score is **87.01** after tuning with `learning rate=0.000090` with random sampling. With Bayesian sampling, the best F1 score is **86.87** after tuning with `learning rate=0.0000896`."
]
},
{
@ -1042,10 +1053,20 @@
"source": [
"best_run = hyperdrive_run.get_best_run_by_primary_metric()\n",
"best_run_metrics = best_run.get_metrics()\n",
"print(best_run)\n",
"print('Best Run is:\\n F1 score: %.2f \\n Learning rate: %f' % (best_run_metrics['f1'], best_run_metrics['lr']))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Persist properties of the run so we can access the logged metrics later\n",
"sb.glue(\"f1\", best_run_metrics['f1'])\n",
"sb.glue(\"learning_rate\", best_run_metrics['lr'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -1063,6 +1084,7 @@
"name": "minxia"
}
],
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"language": "python",

Просмотреть файл

@ -49,6 +49,7 @@
"sys.path.append(\"../../\")\n",
"import json\n",
"import urllib\n",
"import scrapbook as sb\n",
"\n",
"#import utils\n",
"from utils_nlp.common.timer import Timer\n",
@ -63,7 +64,11 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"CPU_CORES = 1\n",
@ -72,7 +77,18 @@
"DEPLOYMENT_CONDA_PACKAGES = ['jsonnet','cmake','regex','pytorch','torchvision']\n",
"DEPLOYMENT_PIP_PACKAGES = ['allennlp==0.8.4','azureml-sdk==1.0.48']\n",
"CONTAINER_TAGS = {'area': \"nlp\", 'type': \"question-answering BiDAF\"}\n",
"MODEL_TAGS = {\"bidaf\": \"demo\"}"
"MODEL_TAGS = {\"bidaf\": \"demo\"}\n",
"config_path = (\n",
" \"./.azureml\"\n",
") # Path to the directory containing config.json with azureml credentials\n",
"\n",
"webservice_name = \"aci-bidaf-service\" #name for webservice; must be unique within your workspace\n",
"\n",
"# Azure resources\n",
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
"workspace_region = \"YOUR_WORKSPACE_REGION\" #Possible values eastus, eastus2 and so on."
]
},
{
@ -93,9 +109,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML Workspace. This will create a config.json file containing the values needed below to create a workspace.\n",
"\n",
"**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook"
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML Workspace. Enter the configuration values in the parameter cell above."
]
},
{
@ -130,10 +144,11 @@
],
"source": [
"ws = azureml_utils.get_or_create_workspace(\n",
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
" resource_group=\"<RESOURCE_GROUP>\",\n",
" workspace_name=\"<WORKSPACE_NAME>\",\n",
" workspace_region=\"<WORKSPACE_REGION>\"\n",
" config_path=config_path,\n",
" subscription_id=subscription_id,\n",
" resource_group=resource_group,\n",
" workspace_name=workspace_name,\n",
" workspace_region=workspace_region,\n",
")"
]
},
@ -459,7 +474,7 @@
"source": [
"# deploy image as web service\n",
"aci_service = Webservice.deploy_from_image(workspace = ws, \n",
" name = 'bidaf-aci-service-1',\n",
" name = webservice_name,\n",
" image = image,\n",
" deployment_config = aci_config)\n",
"\n",
@ -593,6 +608,7 @@
"result = json.loads(score)\n",
"try:\n",
" output = result[\"result\"]\n",
" sb.glue(\"answer\", output)\n",
" print(\"Answer:\", output)\n",
"except:\n",
" print(result[\"error\"])"
@ -677,10 +693,23 @@
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,

Просмотреть файл

@ -21,7 +21,8 @@ The following summarizes each notebook for Sentence Similarity. Each notebook pr
|Notebook|Runs Local|Description|
|---|---|---|
|[Creating a Baseline model](baseline_deep_dive.ipynb)| Yes| A baseline model is a basic solution that serves as a point of reference for comparing other models to. The baseline model's performance gives us an indication of how much better our models can perform relative to a naive approach.|
|Senteval |[Local](senteval_local.ipynb), [AzureML](senteval_azureml.ipynb)|SentEval is a widely used benchmarking tool for evaluating general-purpose sentence embeddings. Running SentEval locally is easy, but not necessarily efficient depending on the model specs. We provide an example on how to do this efficiently in Azure Machine Learning Service. |
|[BERT Sentence Encoder](bert_encoder.ipynb)|Yes|In this notebook, we show how to extract features from pretrained BERT as sentence embeddings.|
|[BERT with SentEval](bert_senteval.ipynb)|No|In this notebook, we show how to use SentEval to compare the performance of BERT sequence encodings with various pooling strategies on a sentence similarity task. We leverage AzureML resources such as Datastore and AmlCompute to autoscale our compute cluster and run the experiments in parallel.|
|Gensen | [Local](gensen_local.ipynb), [AzureML](gensen_aml_deep_dive.ipynb)|This notebook serves as an introduction to an end-to-end NLP solution for sentence similarity building one of the State of the Art models, GenSen. We provide two notebooks. One, which runs on the AzureML platform. We show the advantages of AzureML when training large NLP models with GPU in this notebook. The other example walks through using a GPU enabled VM to train and score Gensen.|
|[Automated Machine Learning(AutoML) with Deployment on Azure Container Instance](automl_local_deployment_aci.ipynb)| Yes |This notebook shows users how to use AutoML on local machine and deploy the model as a webservice to Azure Container Instance(ACI) to get a sentence similarity score.
|[Google Universal Sentence Encoder with Azure Machine Learning Pipeline, AutoML with Deployment on Azure Kubernetes Service](automl_with_pipelines_deployment_aks.ipynb)| No | This notebook shows a user how to use AzureML pipelines and deploy the pipeline output model as a webservice to Azure Kubernetes Service which can be used as an end point to get sentence similarity scores.|

Просмотреть файл

@ -125,6 +125,7 @@
"from scipy.spatial import distance\n",
"from sklearn.externals import joblib\n",
"import json\n",
"import scrapbook as sb\n",
"\n",
"# Import utils\n",
"from utils_nlp.azureml import azureml_utils\n",
@ -162,12 +163,36 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"BASE_DATA_PATH = \"../../data\"\n",
"CPU_CORES = 1\n",
"MEMORY_GB = 8"
"MEMORY_GB = 8\n",
"\n",
"# Define the settings for AutoML\n",
"automl_task = \"regression\"\n",
"automl_iteration_timeout = 15\n",
"automl_iterations = 50\n",
"automl_metric = \"spearman_correlation\"\n",
"automl_preprocess = True\n",
"automl_model_blacklist = ['XGBoostRegressor']\n",
"\n",
"config_path = (\n",
" \"./.azureml\"\n",
") # Path to the directory containing config.json with azureml credentials\n",
"\n",
"webservice_name = \"aci-automl-service\" #name for webservice; must be unique within your workspace\n",
"\n",
"# Azure resources\n",
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
"workspace_region = \"YOUR_WORKSPACE_REGION\" #Possible values eastus, eastus2 and so on."
]
},
{
@ -176,16 +201,15 @@
"metadata": {},
"outputs": [],
"source": [
"# Define the settings for AutoML\n",
"automl_settings = {\n",
" \"task\": \"regression\", # type of task: classification, regression or forecasting\n",
" \"task\": automl_task, # type of task: classification, regression or forecasting\n",
" \"debug_log\": \"automated_ml_errors.log\",\n",
" \"path\": \"./automated-ml-regression\",\n",
" \"iteration_timeout_minutes\": 15, # How long each iteration can take before moving on\n",
" \"iterations\": 50, # Number of algorithm options to try\n",
" \"primary_metric\": \"spearman_correlation\", # Metric to optimize\n",
" \"preprocess\": True, # Whether dataset preprocessing should be applied\n",
" \"blacklist_models\": ['XGBoostRegressor'] #exclude this model due to installation issues\n",
" \"iteration_timeout_minutes\": automl_iteration_timeout, # How long each iteration can take before moving on\n",
" \"iterations\": automl_iterations, # Number of algorithm options to try\n",
" \"primary_metric\": automl_metric, # Metric to optimize\n",
" \"preprocess\": automl_preprocess, # Whether dataset preprocessing should be applied\n",
" \"blacklist_models\": automl_model_blacklist #exclude this model due to installation issues\n",
"}"
]
},
@ -429,9 +453,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.\n",
"\n",
"**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook"
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. Enter the configuration values in the cell above."
]
},
{
@ -464,10 +486,11 @@
],
"source": [
"ws = azureml_utils.get_or_create_workspace(\n",
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
" resource_group=\"<RESOURCE_GROUP>\",\n",
" workspace_name=\"<WORKSPACE_NAME>\",\n",
" workspace_region=\"<WORKSPACE_REGION>\",\n",
" config_path=config_path,\n",
" subscription_id=subscription_id,\n",
" resource_group=resource_group,\n",
" workspace_name=workspace_name,\n",
" workspace_region=workspace_region,\n",
")"
]
},
@ -570,7 +593,7 @@
"metadata": {},
"outputs": [],
"source": [
"experiment = Experiment(ws, \"automated-ml-regression\")\n",
"experiment = Experiment(ws, \"NLP-SS-automl\")\n",
"local_run = experiment.submit(automated_ml_config, show_output=True)"
]
},
@ -762,7 +785,6 @@
"metadata": {},
"outputs": [],
"source": [
"experiment = Experiment(ws, \"automated-ml-regression\")\n",
"ml_run = AutoMLRun(experiment=experiment, run_id=local_run.id)"
]
},
@ -980,7 +1002,7 @@
"source": [
"# deploy image as web service\n",
"aci_service = Webservice.deploy_from_image(\n",
" workspace=ws, name=\"aci-automl-service-8\", image=image, deployment_config=aci_config\n",
" workspace=ws, name=webservice_name, image=image, deployment_config=aci_config\n",
")\n",
"\n",
"aci_service.wait_for_deployment(show_output=True)\n",
@ -1141,6 +1163,26 @@
" print(result[\"error\"])"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# get Pearson Correlation\n",
"pearson = pearsonr(output, test_y)[0]\n",
"print(pearson)\n",
"\n",
"sb.glue(\"pearson_correlation\", pearson)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The goal of this notebook is to demonstrate how to use AutoML locally and then deploy the model to Azure Container Instance quickly. The model utilizes the built-in capabilities of AutoML to embed our sentences. The model performance on its own, without tweaking, is not very strong with this particular dataset. For a more advanced model, see [AutoML with Pipelines Deployment AKS](automl_with_pipelines_deployment_aks.ipynb) for much stronger performance on the same task. This notebook utilizes AzureML Pipelines to explicitly embed our sentences using the Google Universal Sentence Encoder (USE) model. For our dataset, the Google USE embeddings result in superior model performance."
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -1155,7 +1197,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@ -1173,7 +1215,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@ -1191,7 +1233,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@ -1205,16 +1247,17 @@
"source": [
"As mentioned above, Azure Container Instances tend to be used to develop and test deployments. They are typically configured with CPUs, which usually suffice when the number of requests per second is not too high. When working with several instances, we can configure them further by specifically allocating CPU resources to each of them.\n",
"\n",
"For production requirements, i.e. when > 100 requests per second are expected, we recommend deploying models to Azure Kubernetes Service (AKS). It is a convenient infrastructure as it manages hosted Kubernetes environments, and makes it easy to deploy and manage containerized applications without container orchestration expertise. It also supports deployments with CPU clusters and deployments with GPU clusters.For more examples on deployment follow [MachineLearningNotebooks](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/deployment) github repository.\n",
"\n",
"For production requirements, i.e. when > 100 requests per second are expected, we recommend deploying models to Azure Kubernetes Service (AKS). It is a convenient infrastructure as it manages hosted Kubernetes environments, and makes it easy to deploy and manage containerized applications without container orchestration expertise. It also supports deployments with CPU clusters and deployments with GPU clusters.\n",
"\n",
"To see an example with Azure Kubernetes Service example, go to [this notebook](automl_with_pipelines_deployment_aks.ipynb)\n",
"## Next Steps\n",
"\n",
"For more examples on deployment follow [MachineLearningNotebooks](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/deployment) github repository."
"Check out [AutoML with Pipelines Deployment AKS](automl_with_pipelines_deployment_aks.ipynb) to see how to construct a AzureML Pipeline with an embedding step (using Google Universal Sentence Encoder model) and an AutoMLStep, increasing our Pearson correlation score. Also, this notebooks demonstrates deployment using AKS versus ACI."
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"language": "python",

Просмотреть файл

@ -20,7 +20,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook demonstrates how to use [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-service/\n",
"This notebook builds off of the [AutoML Local Deployment ACI](automl_local_deployment_aci.ipynb) notebook and demonstrates how to use [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-service/\n",
") pipelines and Automated Machine Learning ([AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml\n",
")) to streamline the creation of a machine learning workflow for predicting sentence similarity. The pipeline contains two steps: \n",
"1. PythonScriptStep: embeds sentences using a popular sentence embedding model, Google Universal Sentence Encoder\n",
@ -228,7 +228,11 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"automl_settings = {\n",
@ -239,7 +243,17 @@
" \"preprocess\": True, # Whether dataset preprocessing should be applied\n",
" \"verbosity\": logging.INFO,\n",
" \"blacklist_models\": ['XGBoostRegressor'] #this model is blacklisted due to installation issues\n",
"}"
"}\n",
"\n",
"config_path = (\n",
" \"./.azureml\"\n",
") # Path to the directory containing config.json with azureml credentials\n",
"\n",
"# Azure resources\n",
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
"workspace_region = \"YOUR_WORKSPACE_REGION\" #Possible values eastus, eastus2 and so on."
]
},
{
@ -494,9 +508,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.\n",
"\n",
"**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook"
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. Enter the configuration values in the cell above."
]
},
{
@ -529,10 +541,11 @@
],
"source": [
"ws = azureml_utils.get_or_create_workspace(\n",
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
" resource_group=\"<RESOURCE_GROUP>\",\n",
" workspace_name=\"<WORKSPACE_NAME>\",\n",
" workspace_region=\"<WORKSPACE_REGION>\",\n",
" config_path=config_path,\n",
" subscription_id=subscription_id,\n",
" resource_group=resource_group,\n",
" workspace_name=workspace_name,\n",
" workspace_region=workspace_region,\n",
")"
]
},
@ -569,7 +582,7 @@
"os.makedirs(project_folder, exist_ok=True)\n",
"\n",
"# Set up an experiment\n",
"experiment_name = \"automl-sentence-similarity\"\n",
"experiment_name = \"NLP-SS-googleUSE\"\n",
"experiment = Experiment(ws, experiment_name)\n",
"\n",
"# Add logging to our experiment\n",
@ -587,7 +600,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"To use AzureML Pipelines we need to link a compute target as they can not be run locally (see [compute options](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#supported-compute-targets) for explanation of the different options). We will use an AmlCompute target in this example."
"To use AzureML Pipelines we need to link a compute target as they can not be run locally. The different options include AmlCompute, Azure Databricks, Remote VMs, etc. All [compute options](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#supported-compute-targets) can be found in this table with details about whether the given options work with automated ML, pipelines, and GPU. For the following example, we will use an AmlCompute target because it supports Azure Pipelines and GPU. "
]
},
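The compute-provisioning cell itself is not shown in this diff, but creating an AmlCompute target generally follows the get-or-create pattern sketched below. This is a minimal sketch, not the notebook's exact cell: the cluster name and VM size are illustrative assumptions, and `ws` is the workspace created earlier.

```python
# Minimal sketch of provisioning an AmlCompute target (cluster name and VM
# size are illustrative assumptions; `ws` is the workspace created earlier).
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException

cluster_name = "gpu-cluster"  # hypothetical cluster name

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found existing compute target.")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_NC6",  # GPU SKU; adjust to your quota
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True)
```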
{
@ -738,7 +751,9 @@
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"metadata": {
"format": "row"
},
"outputs": [
{
"name": "stdout",
@ -758,9 +773,8 @@
"conda_run_config.environment.docker.enabled = True\n",
"conda_run_config.environment.docker.base_image = aml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"\n",
"# Use conda_dependencies.yml to create a conda environment in the Docker image for execution\n",
"# Specify our own conda dependencies for the execution environment\n",
"conda_run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"conda_run_config.environment.python.conda_dependencies = CondaDependencies.create(\n",
" pip_packages=[\n",
" \"azureml-sdk[automl]==1.0.48\",\n",
@ -961,7 +975,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This step defines the `PythonScriptStep`. We give the step a name, tell the step which python script to run (embed.py) and what directory that script is located in (source_directory). Note that the hash_paths parameter will be deprecated but currently is needed to check for any updates to the embed.py file.\n",
"This step defines the `PythonScriptStep`. We give the step a name, tell the step which python script to run (embed.py) and what directory that script is located in (source_directory). \n",
"\n",
"We also link the compute target and run configuration that we made previously. Our input is the `DataReference` object (input_data) where our raw sentence data was uploaded and our ouput is the `PipelineData` object (embedded_data) where the embedded data produced by this step will be stored. These are also passed in as arguments so that we have access to the correct data paths."
]
@ -1011,7 +1025,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Define the get_data.py file and get_data() function that the `AutoMLStep` will execute to collect data. Note that we can directly access the path of the intermediate data (called embedded_data) through `os.environ['AZUREML_DATAREFERENCE_embedded_data']`. This is necessary because the AutoMLStep does not accept additional parameters like the PythonScriptStep does with `arguments`."
"Define the get_data.py file and get_data() function that the `AutoMLStep` will execute to collect data. When AutoML is used with a remote compute, the data can not be passed directly as parameters. Rather, a get_data function must be defined to access the data (see [this resource](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-auto-train-remote) for further details). Note that we can directly access the path of the intermediate data (called embedded_data) through `os.environ['AZUREML_DATAREFERENCE_embedded_data']`. This is necessary because the AutoMLStep does not accept additional parameters like the PythonScriptStep does with `arguments`."
]
},
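As a rough illustration, a get_data.py for this step might look like the sketch below. The CSV file name and the `score` column are assumptions for illustration only; the notebook writes its own version of this script with its actual schema.

```python
# Hypothetical sketch of a get_data.py for the AutoMLStep; the CSV file name
# and the "score" column are assumptions, not the notebook's actual schema.
import os

import pandas as pd


def get_data():
    # AzureML exposes the intermediate PipelineData path via this variable
    data_dir = os.environ["AZUREML_DATAREFERENCE_embedded_data"]
    df = pd.read_csv(os.path.join(data_dir, "data.csv"))
    X = df.drop(columns=["score"]).values  # sentence embedding features
    y = df["score"].values  # similarity scores to predict
    return {"X": X, "y": y}  # dictionary contract expected by AutoML
```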
{
@ -1104,7 +1118,7 @@
" run_configuration=conda_run_config,\n",
" data_script=project_folder\n",
" + \"/get_data.py\", # local path to script with get_data() function\n",
" **automl_settings #where the autoML main settings are defined\n",
" **automl_settings #where the AutoML main settings are defined\n",
")"
]
},
@ -1119,7 +1133,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, we create `PipelineData` objects for the model data (our outputs) and then create the `AutoMLStep`. The `AutoMLStep` requires a `AutoMLConfig` object and we pass our intermediate data (embedded_data) in as the inputs. Again, note that the hash_paths parameter will be deprecated but currently is needed to check for any updates to the get_data.py file."
"Finally, we create `PipelineData` objects for the model data (our outputs) and then create the `AutoMLStep`. The `AutoMLStep` requires a `AutoMLConfig` object and we pass our intermediate data (embedded_data) in as the inputs. "
]
},
{
@ -1528,7 +1542,6 @@
" scores: list of target variables\n",
" \"\"\"\n",
" google_USE_emb1, google_USE_emb2 = google_encoder(dataset)\n",
" n_google = google_USE_emb1.shape[1] # length of the embeddings\n",
" return np.concatenate((google_USE_emb1, google_USE_emb2), axis=1)\n",
"\n",
"\n",
@ -1751,7 +1764,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We are now ready to deploy our web service. We will deploy from the Docker image. It contains our AutoML model as well as the Google Universal Sentence Encoder model and the conda environment needed for the scoring script to work properly. The parameters to pass to the Webservice.deploy_from_image() command are similar to those used for the deployment on ACI. The only major difference is the compute target (aks_target), i.e. the CPU cluster we just spun up.\n",
"We are now ready to deploy our web service. We will deploy from the Docker image. It contains our AutoML model as well as the Google Universal Sentence Encoder model and the conda environment needed for the scoring script to work properly. The parameters to pass to the Webservice.deploy_from_image() command are similar to those used for deployment on Azure Container Instance ([ACI](https://azure.microsoft.com/en-us/services/container-instances/\n",
")). The only major difference is the compute target (aks_target), i.e. the CPU cluster we just spun up.\n",
"\n",
"**Note:** This deployment takes a few minutes to complete."
]
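As a rough sketch (not the notebook's exact cell), the deployment call looks like the following, assuming `image` is the Docker image built earlier and `aks_target` is the AKS cluster created above; the service name is a placeholder.

```python
# Minimal sketch of deploying the image to AKS; names are placeholders.
from azureml.core.webservice import AksWebservice, Webservice

aks_config = AksWebservice.deploy_configuration(autoscale_enabled=True)

aks_service = Webservice.deploy_from_image(
    workspace=ws,
    name="automl-sentence-similarity-aks",  # hypothetical service name
    image=image,  # Docker image built earlier in the notebook
    deployment_config=aks_config,
    deployment_target=aks_target,  # AKS cluster spun up above
)
aks_service.wait_for_deployment(show_output=True)
print(aks_service.state)
```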
@ -1947,11 +1961,13 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": []
"source": [
"## Conclusion\n",
"\n",
"This notebook demonstrated how to use AzureML Pipelines and AutoML to streamline the creation of a machine learning workflow for predicting sentence similarity. After creating the pipeline, the notebook demonstrated the deployment of our sentence similarity model using AKS. The model results reported in this notebook (using Google USE embeddings) are much stronger than the results from using AutoML with its built-in embedding capabilities (as in [AutoML Local Deployment ACI](automl_local_deployment_aci.ipynb)). "
]
}
],
"metadata": {

View file

@ -0,0 +1,253 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sentence Similarity with Pretrained BERT\n",
"In this notebook, we use pretrained [BERT](https://arxiv.org/abs/1810.04805) as a sentence encoder to measure sentence similarity. We use a [feature extractor](../../utils_nlp/bert/extract_features.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of Google's [BERT](https://github.com/google-research/bert). "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 00 Global Settings"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import torch\n",
"import itertools\n",
"import numpy as np\n",
"import pandas as pd\n",
"import scrapbook as sb\n",
"from collections import OrderedDict\n",
"\n",
"sys.path.append(\"../../\")\n",
"from utils_nlp.models.bert.common import Language, Tokenizer\n",
"from utils_nlp.models.bert.sequence_encoding import BERTSentenceEncoder, PoolingStrategy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# device config\n",
"NUM_GPUS = 0\n",
"\n",
"# model config\n",
"LANGUAGE = Language.ENGLISH\n",
"TO_LOWER = True\n",
"MAX_SEQ_LENGTH = 128\n",
"LAYER_INDEX = -2\n",
"POOLING_STRATEGY = PoolingStrategy.MEAN\n",
"\n",
"# path config\n",
"CACHE_DIR = \"./temp\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(CACHE_DIR):\n",
" os.makedirs(CACHE_DIR, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 01 Define the Sentence Encoder with Pretrained BERT"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `BERTSentenceEncoder` defaults to Pretrained BERT."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 407873900/407873900 [00:15<00:00, 26602678.27B/s]\n",
"100%|██████████| 231508/231508 [00:00<00:00, 905295.88B/s]\n"
]
}
],
"source": [
"se = BERTSentenceEncoder(\n",
" language=LANGUAGE,\n",
" num_gpus=NUM_GPUS,\n",
" cache_dir=CACHE_DIR,\n",
" to_lower=TO_LOWER,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" layer_index=LAYER_INDEX,\n",
" pooling_strategy=POOLING_STRATEGY,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 02 Compute the Sentence Encodings"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `encode` method of the sentence encoder accepts a list of text to encode, as well as the layers we want to extract the embeddings from and the pooling strategy we want to use. The embedding size is 768. We can also return just the values column as a list of numpy arrays by setting the `as_numpy` parameter to True."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2/2 [00:00<00:00, 2917.78it/s]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text_index</th>\n",
" <th>layer_index</th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>-2</td>\n",
" <td>[0.038080588, 0.0926698, 0.0366186, -0.1218368...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>-2</td>\n",
" <td>[0.084241375, 0.099506006, -0.38437817, 0.2164...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text_index layer_index values\n",
"0 0 -2 [0.038080588, 0.0926698, 0.0366186, -0.1218368...\n",
"1 1 -2 [0.084241375, 0.099506006, -0.38437817, 0.2164..."
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result = se.encode(\n",
" [\"Coffee is good\", \"The moose is across the street\"],\n",
" as_numpy=False\n",
")\n",
"result"
]
},
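For reference, the `as_numpy` option mentioned above can be used as in this brief sketch, which assumes the method then returns one pooled 768-dimensional vector per input sentence:

```python
# Sketch: return only the pooled embeddings as numpy arrays (assumed shape:
# one 768-dimensional vector per input sentence).
embeddings = se.encode(
    ["Coffee is good", "The moose is across the street"],
    as_numpy=True,
)
print(len(embeddings), len(embeddings[0]))  # expected: 2 768
```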
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 768,
"encoder": "json",
"name": "result",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "result"
}
},
"output_type": "display_data"
}
],
"source": [
"# for testing\n",
"size_emb = len(result[\"values\"].iloc[0])\n",
"sb.glue(\"size_emb\", size_emb)\n"
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python (nlp_gpu)",
"language": "python",
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

Diff between files not shown because of its large size. Download diff

View file

@ -2,11 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "75caf421-c00a-4d6d-8a3d-47ebe7493af5"
}
},
"metadata": {},
"source": [
"\n",
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
@ -16,11 +12,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "0738bb22-14af-45ca-9ad7-e0c068f280cf"
}
},
"metadata": {},
"source": [
"# GenSen with Pytorch\n",
"In this tutorial, you will train a GenSen model for the sentence similarity task. We use the [SNLI](https://nlp.stanford.edu/projects/snli/) dataset in this example. For a more detailed walkthrough about data processing jump to [SNLI Data Prep](../01-prep-data/snli.ipynb). A quickstart version of this notebook can be found [here](../00-quick-start/)\n",
@ -59,21 +51,17 @@
},
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "e91468d4-7bb8-469b-95a6-4e6f4dfcdf55"
}
},
"metadata": {},
"source": [
"## 0. Global Settings"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 11,
"metadata": {
"nbpresent": {
"id": "a6e277ee-edbb-44a5-81d4-93565d2f3a83"
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
@ -92,46 +80,50 @@
"\n",
"import os\n",
"import papermill as pm\n",
"import scrapbook as sb\n",
"\n",
"from utils_nlp.dataset.preprocess import to_lowercase, to_nltk_tokens\n",
"from utils_nlp.dataset import snli, preprocess\n",
"from utils_nlp.models.pretrained_embeddings.glove import download_and_extract\n",
"from utils_nlp.dataset import Split\n",
"from scenarios.sentence_similarity.gensen_wrapper import GenSenClassifier\n",
"from utils_nlp.models.pretrained_embeddings.glove import download_and_extract \n",
"\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"BASE_DATA_PATH = '../../data'"
"print(\"System version: {}\".format(sys.version))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"max_epoch = None\n",
"config_filepath = 'gensen_config.json'\n",
"base_data_path = '../../data'\n",
"nrows = None"
]
},
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "aee768e5-f317-4dfb-807c-cb4f5f0c0204"
}
},
"metadata": {},
"source": [
"## 1. Data Preparation and inspection"
]
},
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "4c570c1b-0e4e-41e9-aa27-5ab1ce8c13a1"
}
},
"metadata": {},
"source": [
"The [SNLI](https://nlp.stanford.edu/projects/snli/) corpus (version 1.0) is a collection of 570k human-written English sentence pairs manually labeled for balanced classification with the labels entailment, contradiction, and neutral, supporting the task of natural language inference (NLI), also known as recognizing textual entailment (RTE). "
]
},
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "99c241e1-2f23-4fb3-9d3c-8f479c6b0030"
}
},
"metadata": {},
"source": [
"### 1.1 Load the dataset\n",
"\n",
@ -152,10 +144,10 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 3,
"metadata": {
"nbpresent": {
"id": "5952e06d-1dae-462d-8fce-66eb7ef536dd"
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
@ -337,38 +329,34 @@
"4 2267923837.jpg#2r1e entailment NaN NaN NaN NaN "
]
},
"execution_count": 7,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n",
"dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\")\n",
"test = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"test\")\n",
"train = snli.load_pandas_df(base_data_path, file_split=Split.TRAIN, nrows=nrows)\n",
"dev = snli.load_pandas_df(base_data_path, file_split=Split.DEV, nrows=nrows)\n",
"test = snli.load_pandas_df(base_data_path, file_split=Split.TEST, nrows=nrows)\n",
"\n",
"train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "6d5f7565-1f84-4489-8d06-dabd6bd99190"
}
},
"metadata": {},
"source": [
"### 1.2 Tokenize\n",
"\n",
"We have loaded the dataset into pandas.DataFrame, we now convert sentences to tokens. We also clean the data before tokenizing. This includes dropping unneccessary columns and renaming the relevant columns as score, sentence_1, and sentence_2. Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK] (https://www.nltk.org/) library for tokenization."
"We have loaded the dataset into pandas.DataFrame, we now convert sentences to tokens. We also clean the data before tokenizing. This includes dropping unneccessary columns and renaming the relevant columns as score, sentence_1, and sentence_2."
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 4,
"metadata": {
"nbpresent": {
"id": "e6160617-03f0-4809-9360-8b040dc4395f"
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
@ -385,12 +373,19 @@
"test = clean_and_tokenize(test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK] (https://www.nltk.org/) library for tokenization."
]
},
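As a rough sketch of what this step amounts to (assuming the cleaned frames expose the `sentence_1`/`sentence_2` columns described above; the repo's own `to_lowercase`/`to_nltk_tokens` helpers wrap similar logic):

```python
# Sketch of lowercase standardization + NLTK tokenization on the cleaned
# frames; the column names follow the description above and are assumptions.
import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt", quiet=True)  # tokenizer models used by word_tokenize


def lowercase_and_tokenize(df):
    for col in ["sentence_1", "sentence_2"]:
        df[col] = df[col].str.lower()
        df[col + "_tokens"] = df[col].apply(word_tokenize)
    return df
```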
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"metadata": {
"nbpresent": {
"id": "4912b609-8141-4212-a6ad-814d73f724ed"
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
@ -497,7 +492,7 @@
"4 [two, kids, at, a, ballgame, wash, their, hand... "
]
},
"execution_count": 9,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -508,11 +503,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "59494d88-c7c9-4efc-a191-f16d6ac2ac40"
}
},
"metadata": {},
"source": [
"## 2. Model application, performance and analysis of the results\n",
"The model has been implemented as a GenSen class with the specifics hidden inside the fit() method, so that no explicit call is needed. The algorithm operates in three different steps:\n",
@ -538,8 +529,12 @@
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"execution_count": 14,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -550,69 +545,47 @@
}
],
"source": [
"pretrained_embedding_path = download_and_extract(BASE_DATA_PATH)"
"pretrained_embedding_path = download_and_extract(base_data_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "ab565124-43de-4862-b286-2b5db3a868fe"
}
},
"metadata": {},
"source": [
"### 2.1 Initialize Model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 15,
"metadata": {
"nbpresent": {
"id": "641a9c74-974c-4aac-8c16-3b44d686f0f3"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
"pycharm": {
"name": "#%%\n"
}
],
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"config_filepath = 'gensen_config.json'\n",
"clf = GenSenClassifier(config_file = config_filepath, \n",
" pretrained_embedding_path = pretrained_embedding_path,\n",
" learning_rate = 0.0001, \n",
" cache_dir=BASE_DATA_PATH)"
" cache_dir=base_data_path,\n",
" max_epoch=max_epoch)"
]
},
{
"cell_type": "markdown",
"metadata": {
"nbpresent": {
"id": "5f87d13c-d04f-4d38-820e-fb82082153c4"
}
},
"metadata": {},
"source": [
"### 2.2 Train Model"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 8,
"metadata": {
"nbpresent": {
"id": "6ea45671-c7a5-4fe8-a450-8b54161f26c5"
},
"scrolled": false
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
@ -621,7 +594,7 @@
"text": [
"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/modules/rnn.py:46: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.8 and num_layers=1\n",
" \"num_layers={}\".format(dropout, num_layers))\n",
"../../scenarios/sentence_similarity/gensen_train.py:428: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\n",
"../../scenarios/sentence_similarity/gensen_train.py:431: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\n",
" torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)\n",
"../../utils_nlp/models/gensen/utils.py:364: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
" Variable(torch.LongTensor(sorted_src_lens), volatile=True)\n",
@ -629,13 +602,13 @@
" warnings.warn(\"nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\")\n",
"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/functional.py:1320: UserWarning: nn.functional.tanh is deprecated. Use torch.tanh instead.\n",
" warnings.warn(\"nn.functional.tanh is deprecated. Use torch.tanh instead.\")\n",
"../../scenarios/sentence_similarity/gensen_train.py:520: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\n",
"../../scenarios/sentence_similarity/gensen_train.py:523: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\n",
" torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)\n",
"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/horovod/torch/__init__.py:163: UserWarning: optimizer.step(synchronize=True) called after optimizer.synchronize(). This can cause training slowdown. You may want to consider using optimizer.step(synchronize=False) if you use optimizer.synchronize() in your code.\n",
" warnings.warn(\"optimizer.step(synchronize=True) called after \"\n",
"../../scenarios/sentence_similarity/gensen_train.py:241: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
"../../scenarios/sentence_similarity/gensen_train.py:243: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
" f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n",
"../../scenarios/sentence_similarity/gensen_train.py:260: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
"../../scenarios/sentence_similarity/gensen_train.py:262: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
" f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n"
]
},
@ -643,8 +616,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 29min 21s, sys: 8min 11s, total: 37min 32s\n",
"Wall time: 37min 29s\n"
"CPU times: user 1h 19min 28s, sys: 22min 1s, total: 1h 41min 30s\n",
"Wall time: 1h 41min 22s\n"
]
}
],
@ -657,13 +630,19 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3 Predict"
"### 2.3 Predict\n",
"\n",
"In the predict method we perform Pearson's Correlation computation [\\[2\\]](#References) on the outputs of the model. The predictions of the model can be further improved by hyperparameter tuning which we walk through in the other example [here](gensen_aml_deep_dive.ipynb). "
]
},
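In essence (mirroring the wrapper code changed in this commit), the pairwise score reduces to Pearson correlation between the GenSen sentence representations:

```python
# Sketch of the pairwise Pearson computation over GenSen sentence vectors.
import numpy as np
import pandas as pd


def pairwise_pearson(embeddings):
    # embeddings: (n_sentences, embedding_dim) array of representations
    return pd.DataFrame(np.corrcoef(embeddings))
```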
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"execution_count": 16,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -671,20 +650,23 @@
"text": [
"******** Similarity Score for sentences **************\n",
" 0 1\n",
"0 1.000000 0.936469\n",
"1 0.936469 1.000000\n"
"0 1.000000 0.966793\n",
"1 0.966793 1.000000\n"
]
}
],
"source": [
"sentences = [\n",
" 'the quick brown fox jumped over the lazy dog',\n",
" 'bright sunshiny day tomorrow.'\n",
" 'The sky is blue and beautiful',\n",
" 'Love this blue and beautiful sky!'\n",
" ]\n",
"\n",
"results = clf.predict(sentences)\n",
"print(\"******** Similarity Score for sentences **************\")\n",
"print(results)"
"print(results)\n",
"\n",
"# Record results with scrapbook for tests\n",
"sb.glue(\"results\", results.to_dict())"
]
},
{
@ -694,11 +676,15 @@
"## References\n",
"\n",
"1. Subramanian, Sandeep and Trischler, Adam and Bengio, Yoshua and Pal, Christopher J, [*Learning general purpose distributed sentence representations via large scale multi-task learning*](https://arxiv.org/abs/1804.00079), ICLR, 2018.\n",
"3. Semantic textual similarity. url: http://nlpprogress.com/english/semantic_textual_similarity.html"
"2. Pearson's Correlation Coefficient. url: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient\n",
"3. Semantic textual similarity. url: http://nlpprogress.com/english/semantic_textual_similarity.html\n",
"4. Minh-Thang Luong, Quoc V Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. [*Multi-task sequence to sequence learning*](https://arxiv.org/abs/1511.06114), 2015.\n",
"5. Bryan McCann, James Bradbury, Caiming Xiong, and Richard Socher. [*Learned in translation: Contextualized word vectors](https://arxiv.org/abs/1708.00107), 2017. "
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python (nlp_gpu)",
"language": "python",
@ -715,6 +701,15 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,

View file

@ -134,10 +134,12 @@ def evaluate(
save_dir,
starting_time,
model_state,
max_epoch,
):
""" Function to validate the model.
Args:
max_epoch(int): Limit training to specified number of epochs.
model_state(dict): Saved model weights.
config(dict): Config object.
train_iterator(BufferedDataIterator): BufferedDataIterator object.
@ -197,7 +199,7 @@ def evaluate(
)
if (monitor_epoch - min_val_loss_epoch) > config["training"][
"stop_patience"
]:
] or (max_epoch is not None and monitor_epoch >= max_epoch):
logging.info("Saving model ...")
# Save the name with validation loss.
torch.save(
@ -269,10 +271,11 @@ def evaluate_nli(nli_iterator, model, batch_size, n_gpus):
logging.info("******************************************************")
def train(config, data_folder, learning_rate=0.0001):
def train(config, data_folder, learning_rate=0.0001, max_epoch=None):
""" Train the Gensen model.
Args:
max_epoch(int): Limit training to specified number of epochs.
config(dict): Loaded json file as a python object.
data_folder(str): Path to the folder containing the data.
learning_rate(float): Learning rate for the model.
@ -562,7 +565,7 @@ def train(config, data_folder, learning_rate=0.0001):
)
logging.info(
"Average time per mininbatch : %.5f"
"Average time per minibatch : %.5f"
% (np.mean(mbatch_times))
)
mlflow.log_metric(
@ -588,8 +591,11 @@ def train(config, data_folder, learning_rate=0.0001):
save_dir=save_dir,
starting_time=start,
model_state=model_state,
max_epoch=max_epoch,
)
if training_complete:
mlflow.log_metric("min_val_loss", float(min_val_loss))
mlflow.log_metric("learning_rate", learning_rate)
break
logging.info("Evaluating on NLI")
@ -621,11 +627,18 @@ if __name__ == "__main__":
parser.add_argument(
"--learning_rate", type=float, default=0.0001, help="learning rate"
)
parser.add_argument(
"--max_epoch",
type=int,
default=None,
help="Limit training to specified number of epochs.",
)
args = parser.parse_args()
data_path = args.data_folder
lr = args.learning_rate
config_file_path = args.config
max_epoch = args.max_epoch
config_obj = read_config(config_file_path)
train(config_obj, data_path, lr)
train(config_obj, data_path, lr, max_epoch)

View file

@ -3,11 +3,11 @@
import json
import os
import numpy as np
import pandas as pd
from scenarios.sentence_similarity.gensen_train import train
from utils_nlp.models.gensen.create_gensen_model import create_multiseq2seq_model
from utils_nlp.eval.classification import compute_correlation_coefficients
from utils_nlp.models.gensen.create_gensen_model import (
create_multiseq2seq_model,
)
from utils_nlp.models.gensen.gensen import GenSenSingle
from utils_nlp.models.gensen.preprocess_utils import gensen_preprocess
@ -30,12 +30,14 @@ class GenSenClassifier:
pretrained_embedding_path,
learning_rate=0.0001,
cache_dir=".",
max_epoch=None,
):
self.learning_rate = learning_rate
self.config_file = config_file
self.cache_dir = cache_dir
self.pretrained_embedding_path = pretrained_embedding_path
self.model_name = "gensen_multiseq2seq"
self.max_epoch = max_epoch
self._validate_params()
@ -118,6 +120,7 @@ class GenSenClassifier:
data_folder=os.path.abspath(self.cache_dir),
config=self.config,
learning_rate=self.learning_rate,
max_epoch=self.max_epoch,
)
self._create_multiseq2seq_model()
@ -132,13 +135,13 @@ class GenSenClassifier:
sentences(list) : List of sentences.
Returns
array: A pairwise cosine similarity for the sentences provided based on their gensen
vector representations.
pd.Dataframe: Pairwise Pearson correlation coefficients for the sentences
provided, based on their gensen vector representations.
"""
# self.cache_dir = os.path.join(self.cache_dir, "clean/snli_1.0")
self._create_multiseq2seq_model()
# self._create_multiseq2seq_model()
gensen_model = GenSenSingle(
model_folder=os.path.join(
@ -149,7 +152,7 @@ class GenSenClassifier:
)
reps_h, reps_h_t = gensen_model.get_representation(
sentences, pool="last", return_numpy=True
sentences, pool="last", return_numpy=True, tokenize=True
)
return pd.DataFrame(np.corrcoef(reps_h_t))
return compute_correlation_coefficients(reps_h_t)

View file

@ -1,3 +1,13 @@
# Text Classification
Text classification is a supervised learning task: given the text content of a document, predict its category or class. The state-of-the-art methods are based on neural networks of different architectures as well as pretrained language models or word embeddings. Text classification is a core task in natural language processing and has numerous applications, such as sentiment analysis, document indexing in digital libraries, hate speech detection, and general-purpose categorization in medical, academic, legal, and many other domains. A toy illustration of the task is sketched below.
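The sketch below uses scikit-learn rather than the neural models covered by these notebooks, and the data is made up; it only illustrates the input/output shape of the task.

```python
# Toy text classifier: TF-IDF features + logistic regression.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

texts = ["the senate passed the bill", "the team won the final match"]
labels = ["government", "sports"]

clf = make_pipeline(TfidfVectorizer(), LogisticRegression())
clf.fit(texts, labels)
print(clf.predict(["the senate will vote on the new bill"]))  # likely ['government']
```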
## Summary
The following summarizes each text classification notebook. Each notebook provides more details and guiding principles on building state-of-the-art models.
|Notebook|Runs Local|Description|
|---|---|---|
|[BERT for TC with MNLI](tc_mnli_bert.ipynb)| Yes| A notebook which walks through fine-tuning and evaluating a pretrained BERT model on a subset of the MultiNLI dataset|
|[BERT for TC on AzureML](tc_bert_azureml.ipynb) | No |A notebook which walks through fine-tuning and evaluating a pretrained BERT model in a distributed setup with AzureML. |

View file

@ -0,0 +1,903 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Copyright (c) Microsoft Corporation. All rights reserved.*\n",
"\n",
"*Licensed under the MIT License.*\n",
"\n",
"# Text Classification of MultiNLI Sentences using BERT with Azure ML Pipelines"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 0. Introduction\n",
"\n",
"In this notebook, we fine-tune and evaluate a pretrained [BERT](https://arxiv.org/abs/1810.04805) model on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset using [AzureML](https://azure.microsoft.com/en-us/services/machine-learning-service/) Pipelines.\n",
"\n",
"We use a [distributed sequence classifier](../../utils_nlp/bert/sequence_classification_distributed.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of Google's [BERT](https://github.com/google-research/bert).\n",
"\n",
"The notebooks acts as a template to,\n",
"1. Process a massive dataset in parallel by dividing the dataset into chunks using [DASK](https://dask.org/) .\n",
"2. Perform distributed training on AzureML compute on these processed chunks.\n",
"\n",
"We create an [AzureML Pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines) for the two steps mentioned above. With this pipeline, the notebook can be scheduled regularly to fine tune BERT with new data and get a model which can be further deployed on [Azure Container Instance](https://docs.microsoft.com/en-us/azure/container-service/).\n",
"\n",
"AzureML Pipeline define reusable machine learning workflows that can be used as a template for your machine learning scenarios. Pipelines allow you to optimize your workflow and spend time on machine learning rather than infrastructure. If you are new to the concept of pipelines, [this would be a good place to get started](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines)."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n",
"Azure ML SDK Version: 1.0.48\n"
]
}
],
"source": [
"import sys\n",
"sys.path.append(\"../../\")\n",
"import os\n",
"import json\n",
"import random\n",
"import shutil\n",
"import pandas as pd\n",
"\n",
"from utils_nlp.azureml import azureml_utils\n",
"from utils_nlp.dataset.multinli import get_generator\n",
"\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import azureml.core\n",
"from azureml.core import Datastore, Experiment, get_run\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.exceptions import ComputeTargetException\n",
"from azureml.data.data_reference import DataReference\n",
"from azureml.pipeline.steps import PythonScriptStep\n",
"from azureml.pipeline.core import Pipeline, PipelineData\n",
"from azureml.widgets import RunDetails\n",
"from azureml.train.dnn import PyTorch\n",
"from azureml.core.runconfig import MpiConfiguration\n",
"from azureml.pipeline.steps import EstimatorStep\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"Azure ML SDK Version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's define a few variables before we get started, these variables define the folder where the data would reside, the batch size and the number of epochs we are training for. \n",
"We also define the variables for AzureML workspace, which you can use to create a new workspace. You can ignore these variables if you have `config.json` in `.azureml` directory."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"LABEL_COL = \"genre\"\n",
"DATA_FOLDER = \"../../data/temp\"\n",
"TRAIN_FOLDER = \"../../data/temp/train\"\n",
"TEST_FOLDER = \"../../data/temp/test\"\n",
"ENCODED_LABEL_COL = \"label\"\n",
"NUM_PARTITIONS = None\n",
"LABELS = ['telephone', 'government', 'travel', 'slate', 'fiction']\n",
"PROJECT_FOLDER = \"../../\"\n",
"NODE_COUNT = 4\n",
"\n",
"config_path = (\n",
" \"./.azureml\"\n",
") # Path to the directory containing config.json with azureml credentials\n",
"\n",
"# Azure resources\n",
"subscription_id = \"YOUR_SUBSCRIPTION_ID\"\n",
"resource_group = \"YOUR_RESOURCE_GROUP_NAME\" \n",
"workspace_name = \"YOUR_WORKSPACE_NAME\" \n",
"workspace_region = \"YOUR_WORKSPACE_REGION\" #Possible values eastus, eastus2 and so on.\n",
"cluster_name = \"pipelines-tc-12\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this example we will use AzureML pipelines to execute training pipelines. Each preprocessing step is included as a step in the pipeline. For a more detailed walkthrough of what pipelines are with a getting started guidelines check this [notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb). We start by doing some AzureML related setup below."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 0.1 Create a workspace\n",
"\n",
"First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.\n",
"\n",
"**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"ws = azureml_utils.get_or_create_workspace(\n",
" config_path=config_path,\n",
" subscription_id=subscription_id,\n",
" resource_group=resource_group,\n",
" workspace_name=workspace_name,\n",
" workspace_region=workspace_region,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 0.2 Create a compute target\n",
"We create and attach a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training the model. Here we use the AzureML-managed compute target ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) as our remote training compute resource. Our cluster autoscales from 0 to 8 `STANDARD_NC12` GPU nodes.\n",
"\n",
"Creating and configuring the AmlCompute cluster takes approximately 5 minutes the first time around. Once a cluster with the given configuration is created, it does not need to be created again.\n",
"\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Read more about the default limits and how to request more quota [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas)."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found existing compute target.\n",
"{'currentNodeCount': 2, 'targetNodeCount': 2, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 2, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-31T22:29:42.732000+00:00', 'errors': None, 'creationTime': '2019-07-25T04:16:20.598768+00:00', 'modifiedTime': '2019-07-25T04:16:36.486727+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 2, 'maxNodeCount': 10, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}\n"
]
}
],
"source": [
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print(\"Found existing compute target.\")\n",
"except ComputeTargetException:\n",
" print(\"Creating a new compute target...\")\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size=\"STANDARD_NC12\", max_nodes=8\n",
" )\n",
"\n",
" # create the cluster\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
"\n",
" compute_target.wait_for_completion(show_output=True)\n",
"\n",
"# use get_status() to get a detailed status for the current AmlCompute.\n",
"print(compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Preprocessing\n",
"\n",
"The pipeline is defined by a series of steps, the first being a PythonScriptStep which utilizes [DASK](https://dask.org/) to load dataframes in partitions allowing us to load and preprocess different sets of data in parallel."
]
},
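As a rough sketch of that pattern (the file path is a placeholder and the per-batch processing hook is hypothetical; `get_generator` below wraps similar logic for the MultiNLI files):

```python
# Sketch: load a large CSV in ~10 MB partitions with DASK and process each
# partition as an ordinary pandas DataFrame.
import dask.dataframe as dd

ddf = dd.read_csv("../../data/temp/train.csv", blocksize="10MB")  # placeholder path
for delayed_part in ddf.to_delayed():
    batch = delayed_part.compute()  # materialize one pandas DataFrame at a time
    # process(batch)  # hypothetical per-batch preprocessing hook
```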
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1 Read Dataset"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"train_batches = get_generator(DATA_FOLDER, \"train\", num_batches=NUM_PARTITIONS, batch_size=10e6)\n",
"test_batches = get_generator(DATA_FOLDER, \"dev_matched\", num_batches=NUM_PARTITIONS, batch_size=10e6)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2 Preprocess and Tokenize\n",
"\n",
"In the classification task, we use the first sentence only as the text input, and the corresponding genre as the label. Select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are.\n",
"\n",
"Once filtered, we encode the labels. To do this, fit a label encoder with the known labels in a MNLI dataset."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"os.makedirs(TRAIN_FOLDER, exist_ok=True)\n",
"os.makedirs(TEST_FOLDER, exist_ok=True)\n",
"\n",
"labels = LABELS\n",
"label_encoder = LabelEncoder()\n",
"label_encoder.fit(labels)\n",
"\n",
"num_train_partitions = 0\n",
"for batch in train_batches:\n",
" batch = batch[batch[\"gold_label\"]==\"neutral\"]\n",
" batch[ENCODED_LABEL_COL] = label_encoder.transform(batch[LABEL_COL])\n",
" batch.to_csv(TRAIN_FOLDER+\"/batch{}.csv\".format(str(num_train_partitions)))\n",
" num_train_partitions += 1\n",
" \n",
"num_test_partitions = 0\n",
"for batch in test_batches:\n",
" batch = batch[batch[\"gold_label\"]==\"neutral\"]\n",
" batch[ENCODED_LABEL_COL] = label_encoder.transform(batch[LABEL_COL])\n",
" batch.to_csv(TEST_FOLDER+\"/batch{}.csv\".format(str(num_test_partitions)))\n",
" num_test_partitions += 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once we have the partitions of data ready they are uploaded to the datastore."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ds = ws.get_default_datastore()\n",
"ds.upload(src_dir=TRAIN_FOLDER, target_path=\"mnli_data/train\", overwrite=True, show_progress=False)\n",
"ds.upload(src_dir=TEST_FOLDER, target_path=\"mnli_data/test\", overwrite=True, show_progress=False)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"shutil.rmtree(TRAIN_FOLDER)\n",
"shutil.rmtree(TEST_FOLDER)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now parallely operate on each batch to tokenize the data and preprocess the tokens. To do this, we create a PythonScript step below."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing preprocess.py\n"
]
}
],
"source": [
"%%writefile preprocess.py\n",
"# Copyright (c) Microsoft Corporation. All rights reserved.\n",
"# Licensed under the MIT License.\n",
"import argparse\n",
"import logging\n",
"import os\n",
"\n",
"import pandas as pd\n",
"\n",
"from utils_nlp.models.bert.common import Language, Tokenizer\n",
"\n",
"LABEL_COL = \"genre\"\n",
"TEXT_COL = \"sentence1\"\n",
"LANGUAGE = Language.ENGLISH\n",
"TO_LOWER = True\n",
"MAX_LEN = 150\n",
"\n",
"logger = logging.getLogger(__name__)\n",
"\n",
"\n",
"def tokenize(df):\n",
" \"\"\"Tokenize the text documents and convert them to lists of tokens using the BERT tokenizer.\n",
" Args:\n",
" df(pd.Dataframe): Dataframe with training or test samples\n",
"\n",
" Returns:\n",
"\n",
" list: List of lists of tokens for train set.\n",
"\n",
" \"\"\"\n",
" tokenizer = Tokenizer(\n",
" LANGUAGE, to_lower=TO_LOWER)\n",
" tokens = tokenizer.tokenize(list(df[TEXT_COL]))\n",
"\n",
" return tokens\n",
"\n",
"\n",
"def preprocess(tokens):\n",
" \"\"\" Preprocess method that does the following,\n",
" Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\n",
" Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n",
" Pad or truncate the token lists to the specified max length\n",
" Return mask lists that indicate paddings' positions\n",
" Return token type id lists that indicate which sentence the tokens belong to (not needed\n",
" for one-sequence classification)\n",
"\n",
" Args:\n",
" tokens(pd.Dataframe): Dataframe with tokens for train set.\n",
"\n",
" Returns:\n",
" list: List of lists of tokens for train or test set with special tokens added.\n",
" list: Input mask.\n",
" \"\"\"\n",
" tokenizer = Tokenizer(\n",
" LANGUAGE, to_lower=TO_LOWER)\n",
" tokens, mask, _ = tokenizer.preprocess_classification_tokens(\n",
" tokens, MAX_LEN\n",
" )\n",
"\n",
" return tokens, mask\n",
"\n",
"\n",
"parser = argparse.ArgumentParser()\n",
"parser.add_argument(\"--input_data\", type=str, help=\"input data\")\n",
"parser.add_argument(\"--output_data\", type=str, help=\"Path to the output file.\")\n",
"\n",
"args = parser.parse_args()\n",
"input_data = args.input_data\n",
"output_data = args.output_data\n",
"output_dir = os.path.dirname(os.path.abspath(output_data))\n",
"\n",
"if output_dir is not None:\n",
" os.makedirs(output_dir, exist_ok=True)\n",
" logger.info(\"%s created\" % output_dir)\n",
"\n",
"df = pd.read_csv(args.input_data)\n",
"tokens_array = tokenize(df)\n",
"tokens_array, mask_array = preprocess(tokens_array)\n",
"\n",
"df['tokens'] = tokens_array\n",
"df['mask'] = mask_array\n",
"\n",
"# Filter columns\n",
"cols = ['tokens', 'mask', 'label']\n",
"df = df[cols]\n",
"df.to_csv(output_data, header=False, index=False)\n",
"logger.info(\"Completed\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'../../utils_nlp/models/bert/preprocess.py'"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preprocess_file = os.path.join(PROJECT_FOLDER,'utils_nlp/models/bert/preprocess.py')\n",
"shutil.move('preprocess.py',preprocess_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a conda environment for the steps below."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"conda_dependencies = CondaDependencies.create(\n",
" conda_packages=[\n",
" \"numpy\",\n",
" \"scikit-learn\",\n",
" \"pandas\",\n",
" ],\n",
" pip_packages=[\"azureml-sdk==1.0.43.*\", \n",
" \"torch==1.1\", \n",
" \"tqdm==4.31.1\",\n",
" \"pytorch-pretrained-bert>=0.6\"],\n",
" python_version=\"3.6.8\",\n",
")\n",
"run_config = RunConfiguration(conda_dependencies=conda_dependencies)\n",
"run_config.environment.docker.enabled = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then create the list of steps that use the preprocess.py created above. We use the output of these steps as input to training in the next section."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"processed_train_files = []\n",
"processed_test_files = []\n",
"ds = ws.get_default_datastore()\n",
"\n",
"for i in range(num_train_partitions):\n",
" input_data = DataReference(datastore=ds, \n",
" data_reference_name='train_batch_{}'.format(str(i)), \n",
" path_on_datastore='mnli_data/train/batch{}.csv'.format(str(i)),\n",
" overwrite=False)\n",
"\n",
" output_data = PipelineData(name=\"train{}\".format(str(i)), datastore=ds,\n",
" output_path_on_compute='mnli_data/processed_train/batch{}.csv'.format(str(i)))\n",
"\n",
" step = PythonScriptStep(\n",
" name='preprocess_step_train_{}'.format(str(i)),\n",
" arguments=[\"--input_data\", input_data, \"--output_data\", output_data],\n",
" script_name= 'utils_nlp/models/bert/preprocess.py',\n",
" inputs=[input_data],\n",
" outputs=[output_data],\n",
" source_directory=PROJECT_FOLDER,\n",
" compute_target=compute_target,\n",
" runconfig=run_config,\n",
" allow_reuse=False,\n",
" )\n",
" \n",
" processed_train_files.append(output_data) \n",
" \n",
"for i in range(num_test_partitions):\n",
" input_data = DataReference(datastore=ds, \n",
" data_reference_name='test_batch_{}'.format(str(i)), \n",
" path_on_datastore='mnli_data/test/batch{}.csv'.format(str(i)),\n",
" overwrite=False)\n",
" \n",
" output_data = PipelineData(name=\"test{}\".format(str(i)), datastore=ds,\n",
" output_path_on_compute='mnli_data/processed_test/batch{}.csv'.format(str(i)))\n",
" \n",
" step = PythonScriptStep(\n",
" name='preprocess_step_test_{}'.format(str(i)),\n",
" arguments=[\"--input_data\", input_data, \"--output_data\", output_data],\n",
" script_name= 'utils_nlp/models/bert/preprocess.py',\n",
" inputs=[input_data],\n",
" outputs=[output_data],\n",
" source_directory=PROJECT_FOLDER,\n",
" compute_target=compute_target,\n",
" runconfig=run_config,\n",
" allow_reuse=False,\n",
" )\n",
" \n",
" processed_test_files.append(output_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Train and Score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once the data is processed and available on datastore, we train the classifier using the training examples. This involves fine-tuning the BERT Transformer and learning a linear classification layer on top of that. After training is complete we score the performance of the model on the test dataset\n",
"\n",
"The training is distributed and is done AzureML's capability to support distributed using MPI with horovod. \n",
"\n",
"**Please note** that training requires a GPU enabled cluster in AzureML Compute. We suggest using NC12. If you would like to change the GPU configuration, please changes `NUM_GPUS` variable accordingly.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1 Setup training script"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing train.py\n"
]
}
],
"source": [
"%%writefile train.py\n",
"# Copyright (c) Microsoft Corporation. All rights reserved.\n",
"# Licensed under the MIT License.\n",
"\n",
"import argparse\n",
"import json\n",
"import logging\n",
"import os\n",
"import torch\n",
"from sklearn.metrics import classification_report\n",
"\n",
"from utils_nlp.models.bert.common import Language\n",
"from utils_nlp.models.bert.sequence_classification_distributed import (\n",
" BERTSequenceDistClassifier,\n",
")\n",
"from utils_nlp.common.timer import Timer\n",
"\n",
"BATCH_SIZE = 32\n",
"NUM_GPUS = 2\n",
"NUM_EPOCHS = 1\n",
"LABELS = [\"telephone\", \"government\", \"travel\", \"slate\", \"fiction\"]\n",
"OUTPUT_DIR = \"./outputs/\"\n",
"\n",
"logger = logging.getLogger(__name__)\n",
"\n",
"parser = argparse.ArgumentParser()\n",
"parser.add_argument(\n",
" \"--train_files\",\n",
" nargs=\"+\",\n",
" default=[],\n",
" help=\"List of file paths to all the files in train dataset.\",\n",
")\n",
"\n",
"parser.add_argument(\n",
" \"--test_files\",\n",
" nargs=\"+\",\n",
" default=[],\n",
" help=\"List of file paths to all the files in test dataset.\",\n",
")\n",
"\n",
"args = parser.parse_args()\n",
"train_files = [file.strip() for file in args.train_files]\n",
"test_files = [file.strip() for file in args.test_files]\n",
"\n",
"# Handle square brackets from train list\n",
"train_files[0] = train_files[0][1:]\n",
"train_files[len(train_files) - 1] = train_files[len(train_files) - 1][:-1]\n",
"\n",
"# Handle square brackets from test list\n",
"test_files[0] = test_files[0][1:]\n",
"test_files[len(test_files) - 1] = test_files[len(test_files) - 1][:-1]\n",
"\n",
"# Train\n",
"classifier = BERTSequenceDistClassifier(\n",
" language=Language.ENGLISH, num_labels=len(LABELS)\n",
")\n",
"with Timer() as t:\n",
" classifier.fit(\n",
" train_files,\n",
" num_gpus=NUM_GPUS,\n",
" num_epochs=NUM_EPOCHS,\n",
" batch_size=BATCH_SIZE,\n",
" verbose=True,\n",
" )\n",
"\n",
"# Predict\n",
"preds, labels_test = classifier.predict(\n",
" test_files, num_gpus=NUM_GPUS, batch_size=BATCH_SIZE\n",
")\n",
"\n",
"results = classification_report(\n",
" labels_test, preds, target_names=LABELS, output_dict=True\n",
")\n",
"\n",
"# Write out results.\n",
"result_file = os.path.join(OUTPUT_DIR, \"results.json\")\n",
"with open(result_file, \"w+\") as fp:\n",
" json.dump(results, fp)\n",
"\n",
"# Save model\n",
"model_file = os.path.join(OUTPUT_DIR, \"model.pt\")\n",
"torch.save(classifier.model.state_dict(), model_file)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'../../utils_nlp/models/bert/train.py'"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_file = os.path.join(PROJECT_FOLDER,'utils_nlp/models/bert/train.py')\n",
"shutil.move('train.py',train_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.2 Create a Pytorch Estimator\n",
"\n",
"We create a Pytorch Estimator using AzureML SDK and additonally define an EstimatorStep to run it on AzureML pipelines.\n",
"\n",
"The Azure ML SDK's PyTorch Estimator allows us to submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch).\n",
"\n",
"This Estimator specifies that the training script will run on 4 nodes, with 2 worker per node. In order to execute a distributed run using GPU, we must define `use_gpu` and `distributed_backend` to use MPI/Horovod. PyTorch, Horovod, and other necessary dependencies are installed automatically."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING - framework_version is not specified, defaulting to version 1.1.\n",
"WARNING - 'process_count_per_node' parameter will be deprecated. Please use it as part of 'distributed_training' parameter.\n"
]
}
],
"source": [
"estimator = PyTorch(source_directory=PROJECT_FOLDER,\n",
" compute_target=compute_target,\n",
" entry_script='utils_nlp/models/bert/train.py',\n",
" node_count= NODE_COUNT,\n",
" distributed_training=MpiConfiguration(),\n",
" process_count_per_node=2,\n",
" use_gpu=True,\n",
" conda_packages=['scikit-learn=0.20.3', 'numpy>=1.16.0', 'pandas'],\n",
" pip_packages=[\"tqdm==4.31.1\",\"pytorch-pretrained-bert>=0.6\"]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"inputs = processed_train_files + processed_test_files\n",
"\n",
"est_step = EstimatorStep(name=\"Estimator-Train\", \n",
" estimator=estimator, \n",
" estimator_entry_script_arguments=[\n",
" '--train_files', str(processed_train_files),\n",
" '--test_files', str(processed_test_files)],\n",
" inputs = inputs,\n",
" runconfig_pipeline_params=None, \n",
" compute_target=compute_target)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3 Submit the pipeline\n",
"\n",
"The model is fine tuned on AML Compute and takes **45 minutes** to train. The total time to run the pipeline will be around **1h 30 minutes** if you use the default value `max_epoch=1`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"pipeline = Pipeline(workspace=ws, steps=[est_step])\n",
"experiment = Experiment(ws, 'NLP-TC-BERT-distributed')\n",
"pipeline_run = experiment.submit(pipeline)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "48df85f533834264a8a8b65a57d60d59",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"RunDetails(pipeline_run).show()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"#If you would like to cancel the job for any reasons uncomment the code below.\n",
"#pipeline_run.cancel()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"#wait for the run to complete before continuing in the notebook\n",
"pipeline_run.wait_for_completion()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.4 Download and analyze results"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading file outputs/results.json to ./outputs\\results.json...\n",
"Downloading file outputs/model.pt to ./outputs\\model.pt...\n"
]
}
],
"source": [
"step_run = pipeline_run.find_step_run(\"Estimator-Train\")[0]\n",
"file_names = ['outputs/results.json', 'outputs/model.pt']\n",
"azureml_utils.get_output_files(step_run, './outputs', file_names=file_names)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" f1-score precision recall support\n",
"telephone 0.920217 0.897281 0.944356 629.0\n",
"government 0.967905 0.979487 0.956594 599.0\n",
"travel 0.856683 0.900169 0.817204 651.0\n",
"slate 0.991093 0.991896 0.990291 618.0\n",
"fiction 0.936434 0.906907 0.967949 624.0\n",
"micro avg 0.933996 0.933996 0.933996 3121.0\n",
"macro avg 0.934466 0.935148 0.935279 3121.0\n",
"weighted avg 0.933394 0.934321 0.933996 3121.0\n"
]
}
],
"source": [
"with open('outputs/results.json', 'r') as handle:\n",
" parsed = json.load(handle)\n",
" print(pd.DataFrame.from_dict(parsed).transpose())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"From the above chart we can notice the performance of the model trained on a distributed setup in AzureML Compute. From our comparison to fine tuning the same model on MNLI dataset on a `STANDARD_NC12` machine [here](tc_mnli_bert.ipynb) we notice a gain of 20% in the model training time with no drop in performance for AzureML Compute. We present the comparison of weight avg of the metrics along with the training time below,\n",
"\n",
"| Training Setup | F1-Score | Precision | Recall | Training Time |\n",
"| --- | --- | --- | --- | --- |\n",
"|Standard NC12 | 0.93 |0.93 |0.93 | 58 min |\n",
"|AzureML Compute*|0.934| 0.934 | 0.934| 46 min |\n",
"\n",
"* AzureML Compute - The setup used 4 nodes with `STANDARD_NC12` machines.\n",
"\n",
"We also observe common tradeoffs associated with distributed training. We make use of [Horovod](https://github.com/horovod/horovod), a distributed training tool for many popular deep learning frameworks that enables parallelization of work across the nodes in the cluster. Distributed training decreases the time it takes for the model to converge in theory, but the model may also take more time in communicating with each node. Note that the communication time will eventually become negligible when training on larger and larger datasets, but being aware of this tradeoff is helpful for choosing the node configuration when training on smaller datasets. We expect the gains of using AzureML to increase with increased dataset size."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally clean up any intermediate files we created."
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"os.remove(train_file)\n",
"os.remove(preprocess_file)"
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python nlp_cpu",
"language": "python",
"name": "ame"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -13,25 +13,28 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"../../\")\n",
"import os\n",
"import json\n",
"import pandas as pd\n",
"import numpy as np\n",
"import scrapbook as sb\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"import torch\n",
"import torch.nn as nn\n",
"\n",
"from utils_nlp.dataset.multinli import load_pandas_df\n",
"from utils_nlp.eval.classification import eval_classification\n",
"from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier\n",
"from utils_nlp.models.bert.common import Language, Tokenizer\n",
"from utils_nlp.common.timer import Timer\n",
"import torch\n",
"import torch.nn as nn\n",
"import numpy as np"
"from utils_nlp.common.timer import Timer"
]
},
{
@ -46,8 +49,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"execution_count": 17,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"DATA_FOLDER = \"../../../temp\"\n",
@ -56,7 +63,8 @@
"TO_LOWER = True\n",
"MAX_LEN = 150\n",
"BATCH_SIZE = 32\n",
"NUM_GPUS = 2\n",
"BATCH_SIZE_PRED = 512\n",
"NUM_GPUS = 1\n",
"NUM_EPOCHS = 1\n",
"TRAIN_SIZE = 0.6\n",
"LABEL_COL = \"genre\"\n",
@ -256,15 +264,15 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 78540/78540 [00:26<00:00, 2991.68it/s]\n",
"100%|██████████| 52360/52360 [00:17<00:00, 2981.71it/s]\n"
"100%|██████████| 78540/78540 [00:26<00:00, 2968.10it/s]\n",
"100%|██████████| 52360/52360 [00:17<00:00, 2960.85it/s]\n"
]
}
],
@ -291,7 +299,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@ -313,7 +321,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@ -332,7 +340,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {
"scrolled": true
},
@ -341,24 +349,162 @@
"name": "stderr",
"output_type": "stream",
"text": [
"t_total value of -1 results in schedule not being applied\n"
"t_total value of -1 results in schedule not being applied\n",
"Iteration: 0%| | 1/2455 [00:00<35:04, 1.17it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/1; batch:1->246/2454; loss:1.584357\n",
"epoch:1/1; batch:247->492/2454; loss:0.110689\n",
"epoch:1/1; batch:493->738/2454; loss:0.208907\n",
"epoch:1/1; batch:739->984/2454; loss:0.423804\n",
"epoch:1/1; batch:985->1230/2454; loss:0.035525\n",
"epoch:1/1; batch:1231->1476/2454; loss:0.189890\n",
"epoch:1/1; batch:1477->1722/2454; loss:0.216201\n",
"epoch:1/1; batch:1723->1968/2454; loss:0.245825\n",
"epoch:1/1; batch:1969->2214/2454; loss:0.138958\n",
"epoch:1/1; batch:2215->2454/2454; loss:0.066018\n",
"[Training time: 0.963 hrs]\n"
"epoch:1/1; batch:1->246/2455; average training loss:1.610151\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 10%|█ | 247/2455 [02:21<21:02, 1.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/1; batch:247->492/2455; average training loss:0.376939\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 20%|██ | 493/2455 [04:42<18:42, 1.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/1; batch:493->738/2455; average training loss:0.305378\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 30%|███ | 739/2455 [07:03<16:22, 1.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/1; batch:739->984/2455; average training loss:0.279816\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 40%|████ | 985/2455 [09:24<13:59, 1.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/1; batch:985->1230/2455; average training loss:0.262505\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 50%|█████ | 1231/2455 [11:44<11:38, 1.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/1; batch:1231->1476/2455; average training loss:0.250177\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 60%|██████ | 1477/2455 [14:05<09:17, 1.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/1; batch:1477->1722/2455; average training loss:0.241982\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 70%|███████ | 1723/2455 [16:25<06:57, 1.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/1; batch:1723->1968/2455; average training loss:0.232584\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 80%|████████ | 1969/2455 [18:46<04:37, 1.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/1; batch:1969->2214/2455; average training loss:0.226051\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 90%|█████████ | 2215/2455 [21:06<02:16, 1.75it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch:1/1; batch:2215->2455/2455; average training loss:0.221012\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Iteration: 100%|██████████| 2455/2455 [23:23<00:00, 2.09it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Training time: 0.390 hrs]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
@ -386,21 +532,14 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"52384it [11:51, 88.76it/s] \n"
]
}
],
"outputs": [],
"source": [
"preds = classifier.predict(\n",
" token_ids=tokens_test, input_mask=mask_test, num_gpus=NUM_GPUS, batch_size=BATCH_SIZE\n",
")"
"preds = classifier.predict(token_ids=tokens_test, \n",
" input_mask=mask_test, \n",
" num_gpus=NUM_GPUS, \n",
" batch_size=BATCH_SIZE_PRED)"
]
},
{
@ -413,45 +552,159 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" fiction 0.88 0.96 0.91 10275\n",
" government 0.94 0.94 0.94 10292\n",
" slate 0.91 0.80 0.85 10277\n",
" telephone 0.99 1.00 0.99 11205\n",
" travel 0.95 0.97 0.96 10311\n",
"\n",
" accuracy 0.93 52360\n",
" macro avg 0.93 0.93 0.93 52360\n",
"weighted avg 0.93 0.93 0.93 52360\n",
"\n"
"{\n",
" \"accuracy\": 0.9343964858670741,\n",
" \"fiction\": {\n",
" \"f1-score\": 0.9240671732081498,\n",
" \"precision\": 0.9190412013862148,\n",
" \"recall\": 0.9291484184914842,\n",
" \"support\": 10275\n",
" },\n",
" \"government\": {\n",
" \"f1-score\": 0.943645744627561,\n",
" \"precision\": 0.9739427153345053,\n",
" \"recall\": 0.9151768363777691,\n",
" \"support\": 10292\n",
" },\n",
" \"macro avg\": {\n",
" \"f1-score\": 0.9329061626350004,\n",
" \"precision\": 0.9340480538608924,\n",
" \"recall\": 0.9332503791830062,\n",
" \"support\": 52360\n",
" },\n",
" \"slate\": {\n",
" \"f1-score\": 0.8626293944091614,\n",
" \"precision\": 0.8873456790123457,\n",
" \"recall\": 0.8392527002043398,\n",
" \"support\": 10277\n",
" },\n",
" \"telephone\": {\n",
" \"f1-score\": 0.9943437402574267,\n",
" \"precision\": 0.9924431009957326,\n",
" \"recall\": 0.9962516733601071,\n",
" \"support\": 11205\n",
" },\n",
" \"travel\": {\n",
" \"f1-score\": 0.9398447606727038,\n",
" \"precision\": 0.897467572575664,\n",
" \"recall\": 0.9864222674813307,\n",
" \"support\": 10311\n",
" },\n",
" \"weighted avg\": {\n",
" \"f1-score\": 0.9340029685187979,\n",
" \"precision\": 0.9350712643460813,\n",
" \"recall\": 0.9343964858670741,\n",
" \"support\": 52360\n",
" }\n",
"}\n"
]
}
],
"source": [
"print(classification_report(labels_test, preds, target_names=label_encoder.classes_))"
"report = classification_report(labels_test, preds, target_names=label_encoder.classes_, output_dict=True) \n",
"print(json.dumps(report, indent=4, sort_keys=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.9343964858670741,
"encoder": "json",
"name": "accuracy",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "accuracy"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.9340480538608924,
"encoder": "json",
"name": "precision",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "precision"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.9332503791830062,
"encoder": "json",
"name": "recall",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "recall"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.9329061626350004,
"encoder": "json",
"name": "f1",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "f1"
}
},
"output_type": "display_data"
}
],
"source": [
"# for testing\n",
"sb.glue(\"accuracy\", report[\"accuracy\"])\n",
"sb.glue(\"precision\", report[\"macro avg\"][\"precision\"])\n",
"sb.glue(\"recall\", report[\"macro avg\"][\"recall\"])\n",
"sb.glue(\"f1\", report[\"macro avg\"][\"f1-score\"])\n"
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python (nlp_gpu)",
"language": "python",
"name": "python3"
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {


@ -11,9 +11,6 @@ from os.path import basename, dirname, join, splitext
from setuptools import find_packages, setup
VERSION = __import__("__init__").VERSION
def read(*names, **kwargs):
with io.open(
join(dirname(__file__), *names),
@ -24,7 +21,6 @@ def read(*names, **kwargs):
setup(
name="utils_nlp",
version=VERSION,
license="MIT License",
description="NLP Utility functions that are used for best practices in building state-of-the-art NLP methods and scenarios. Developed by Microsoft AI CAT",
long_description="%s\n%s"
@ -73,8 +69,9 @@ setup(
"Word Embedding",
],
python_requires=">=3.6",
install_requires=[],
install_requires=['setuptools_scm>=3.2.0',],
dependency_links=[],
extras_require={},
setup_requires=[],
use_scm_version=True,
setup_requires=['setuptools_scm'],
)


@ -1,33 +1,45 @@
# Tests
This project uses unit, smoke and integration tests with Python files and notebooks. For more information, see a [quick introduction to unit, smoke and integration tests](https://miguelgfierro.com/blog/2018/a-beginners-guide-to-python-testing/). To manually execute the unit tests in the different environments, first **make sure you are in the correct environment as described in the [SETUP.md](/SETUP.md)**.
This project uses unit, smoke and integration tests with Python files and notebooks.
Tests are automatically run as part of a DevOps pipeline. The pipelines are defined in .yml files in tests/ci with filenames that align with pipeline names.
* In the unit tests, we just make sure the notebooks run.
* In the smoke tests, we run them with a small dataset or a small number of epochs to make sure that, apart from running, they provide reasonable metrics.
* In the integration tests, we use a bigger dataset and more epochs, and we test that the metrics are what we expect.
For more information, see a [quick introduction to unit, smoke and integration tests](https://miguelgfierro.com/blog/2018/a-beginners-guide-to-python-testing/). To manually execute the unit tests in the different environments, first **make sure you are in the correct environment as described in the [SETUP.md](../SETUP.md)**.
Tests are automatically run as part of a DevOps pipeline. The pipelines are defined in the `.yml` files in [tests/ci](./ci) with filenames that align with pipeline names.
## Test execution
Click on the following menus to see more details on how to execute the unit, smoke and integration tests:
**Click on the following menus** to see more details on how to execute the unit, smoke and integration tests:
<details>
<summary><strong><em>Unit tests</em></strong></summary>
Unit tests ensure that each class or function behaves as it should. Every time a developer makes a pull request to the staging or master branch, a battery of unit tests is executed.
**Note that the next instructions execute the tests from the root folder.**
For executing the Python unit tests for the utilities:
pytest tests/unit -m "not notebooks and not gpu"
pytest tests/unit -m "not notebooks and not gpu and not azureml"
For executing the Python unit tests for the notebooks:
pytest tests/unit -m "notebooks and not gpu"
pytest tests/unit -m "notebooks and not gpu and not azureml"
For executing the Python GPU unit tests for the utilities:
pytest tests/unit -m "not notebooks and gpu"
pytest tests/unit -m "not notebooks and gpu and not azureml"
For executing the Python GPU unit tests for the notebooks:
pytest tests/unit -m "notebooks and gpu"
pytest tests/unit -m "notebooks and gpu and not azureml"
For executing the AzureML unit tests:
pytest tests/unit -m "azureml"
</details>
@ -37,13 +49,19 @@ For executing the Python GPU unit tests for the notebooks:
Smoke tests make sure that the system works and are executed just before the integration tests every night.
**Note that the next instructions execute the tests from the root folder.**
For executing the Python smoke tests:
pytest tests/smoke -m "smoke and not gpu"
pytest --durations=0 tests/smoke -m "smoke and not gpu and not azureml"
For executing the Python GPU smoke tests:
pytest tests/smoke -m "smoke and gpu"
pytest --durations=0 tests/smoke -m "smoke and gpu and not azureml"
For executing the AzureML smoke tests:
pytest --durations=0 tests/smoke -m "azureml"
</details>
@ -52,13 +70,19 @@ For executing the Python GPU smoke tests:
Integration tests make sure that the program results are acceptable.
**Note that the next instructions execute the tests from the root folder.**
For executing the Python integration tests:
pytest tests/integration -m "integration and not gpu"
pytest --durations=0 tests/integration -m "integration and not gpu and not azureml"
For executing the Python GPU integration tests:
pytest tests/integration -m "integration and gpu"
pytest --durations=0 tests/integration -m "integration and gpu and not azureml"
For executing the AzureML integration tests:
pytest --durations=0 tests/integration -m "azureml"
</details>


@ -0,0 +1,62 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers
# The schedule is currently implemented from the dashboard.
# Uncomment the block below to define it in the YAML instead.
#schedules:
#- cron: "56 22 * * *"
# displayName: Daily computation of nightly builds
# branches:
# include:
# - master
# always: true
# no PR builds
pr: none
# no CI trigger
trigger: none
jobs:
- job: nightly
displayName : 'Nightly tests'
timeoutInMinutes: 180 # how long to run the job before automatically cancelling
pool:
name: nlpagentpool
steps:
- bash: |
echo "##vso[task.prependpath]/data/anaconda/bin"
conda env list
displayName: 'Add Conda to PATH'
# Conda creation can take around 10min
- bash: |
python tools/generate_conda_file.py
conda env create -n integration_cpu -f nlp_cpu.yaml
displayName: 'Creating Conda Environment with dependencies'
- bash: |
source activate integration_cpu
pytest --durations=0 tests/smoke -m "smoke and not gpu and not azureml" --junitxml=junit/test-smoke-test.xml
displayName: 'Run smoke tests'
- bash: |
source activate integration_cpu
pytest --durations=0 tests/integration -m "integration and not gpu and not azureml" --junitxml=junit/test-integration-test.xml
displayName: 'Run integration tests'
- bash: |
echo Remove Conda Environment
conda remove -n integration_cpu --all -q --force -y
echo Done Cleanup
displayName: 'Cleanup Task'
condition: always()
- task: PublishTestResults@2
inputs:
testResultsFiles: '**/test-*-test.xml'
testRunTitle: 'Test results for PyTest'


@ -43,7 +43,7 @@ jobs:
- bash: |
source activate nlp_cpu
pytest --durations=0 tests/unit -m "not notebooks and not gpu" --junitxml=junit/test-unitttest.xml
pytest --durations=0 tests/unit -m "not notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml
displayName: 'Run Unit tests'
# Uncomment if needed


@ -0,0 +1,62 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers
# The schedule is currently implemented from the dashboard.
# Uncomment the block below to define it in the YAML instead.
#schedules:
#- cron: "56 11 * * *"
# displayName: Daily computation of nightly builds
# branches:
# include:
# - master
# always: true
# no PR builds
pr: none
# no CI trigger
trigger: none
jobs:
- job: nightly
displayName : 'Nightly tests'
timeoutInMinutes: 180 # how long to run the job before automatically cancelling
pool:
name: nlpagentpool
steps:
- bash: |
echo "##vso[task.prependpath]/data/anaconda/bin"
conda env list
displayName: 'Add Conda to PATH'
# Conda creation can take around 10min
- bash: |
python tools/generate_conda_file.py --gpu
conda env create -n integration_gpu -f nlp_gpu.yaml
displayName: 'Creating Conda Environment with dependencies'
- bash: |
source activate integration_gpu
pytest --durations=0 tests/smoke -m "smoke and gpu and not azureml" --junitxml=junit/test-smoke-test.xml
displayName: 'Run smoke tests'
- bash: |
source activate integration_gpu
pytest --durations=0 tests/integration -m "integration and gpu and not azureml" --junitxml=junit/test-integration-test.xml
displayName: 'Run integration tests'
- bash: |
echo Remove Conda Environment
conda remove -n integration_gpu --all -q --force -y
echo Done Cleanup
displayName: 'Cleanup Task'
condition: always()
- task: PublishTestResults@2
inputs:
testResultsFiles: '**/test-*-test.xml'
testRunTitle: 'Test results for PyTest'


@ -32,7 +32,7 @@ jobs:
- bash: |
source activate nlp_gpu
pytest --durations=0 tests/unit -m "not notebooks and gpu" --junitxml=junit/test-unitttest.xml
pytest --durations=0 tests/unit -m "not notebooks and gpu and not azureml" --junitxml=junit/test-unitttest.xml
displayName: 'Run Unit tests'
# Uncomment if needed


@ -32,7 +32,7 @@ jobs:
- bash: |
source activate nlp_cpu
pytest --durations=0 tests/unit -m "notebooks and not gpu" --junitxml=junit/test-unitttest.xml
pytest --durations=0 tests/unit -m "notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml
displayName: 'Run Unit tests'
# Uncomment if needed


@ -32,7 +32,7 @@ jobs:
- bash: |
source activate nlp_gpu
pytest --durations=0 tests/unit -m "notebooks and gpu" --junitxml=junit/test-unitttest.xml
pytest --durations=0 tests/unit -m "notebooks and gpu and not azureml" --junitxml=junit/test-unitttest.xml
displayName: 'Run Unit tests'
# Uncomment if needed


@ -1,12 +1,18 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# More info on scheduling: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops&tabs=yaml#scheduled-triggers
schedules:
- cron: "56 22 * * *"
displayName: Daily track of metrics
branches:
include:
- master
always: true
# The schedule is currently implemented from the dashboard.
# Uncomment the block below to define it in the YAML instead.
# schedules:
# - cron: "56 22 * * *"
# displayName: Daily track of metrics
# branches:
# include:
# - master
# always: true
# no PR builds
pr: none


@ -17,6 +17,9 @@ from tests.notebooks_common import path_notebooks
from utils_nlp.models.bert.common import Language
from utils_nlp.models.bert.common import Tokenizer as BERTTokenizer
from utils_nlp.azureml import azureml_utils
from azureml.core.webservice import Webservice
@pytest.fixture(scope="module")
@ -25,11 +28,55 @@ def notebooks():
# Path for the notebooks
paths = {
"embedding_trainer": os.path.join(
folder_notebooks, "embeddings", "embedding_trainer.ipynb"
),
"similarity_embeddings_baseline": os.path.join(
folder_notebooks, "sentence_similarity", "baseline_deep_dive.ipynb"
),
"embedding_trainer": os.path.join(
folder_notebooks, "embeddings", "embedding_trainer.ipynb"
"bert_encoder": os.path.join(
folder_notebooks, "sentence_similarity", "bert_encoder.ipynb"
),
"gensen_local": os.path.join(
folder_notebooks, "sentence_similarity", "gensen_local.ipynb"
),
"gensen_azureml": os.path.join(
folder_notebooks, "sentence_similarity", "gensen_aml_deep_dive.ipynb"
),
"similarity_automl_local": os.path.join(
folder_notebooks,
"sentence_similarity",
"automl_local_deployment_aci.ipynb",
),
"automl_with_pipelines_deployment_aks": os.path.join(
folder_notebooks,
"sentence_similarity",
"automl_with_pipelines_deployment_aks.ipynb",
),
"bert_qa_trainer": os.path.join(
folder_notebooks,
"question_answering",
"pretrained-BERT-SQuAD-deep-dive-aml.ipynb",
),
"bidaf_deep_dive": os.path.join(
folder_notebooks, "question_answering", "bidaf_aml_deep_dive.ipynb"
),
"bidaf_quickstart": os.path.join(
folder_notebooks,
"question_answering",
"question_answering_system_bidaf_quickstart.ipynb",
),
"entailment_multinli_bert": os.path.join(
folder_notebooks, "entailment", "entailment_multinli_bert.ipynb"
),
"tc_bert_azureml": os.path.join(
folder_notebooks, "text_classification", "tc_bert_azureml.ipynb"
),
"tc_mnli_bert": os.path.join(
folder_notebooks, "text_classification", "tc_mnli_bert.ipynb"
),
"deep_and_unified_understanding": os.path.join(
folder_notebooks, "interpret_NLP_models", "understand_models.ipynb"
),
}
return paths
@ -52,22 +99,10 @@ def ner_test_data():
false_pos = [1, 2]
for p in false_pos:
TRAILING_TOKEN_MASK[0][p] = False
INPUT_LABEL_IDS = [
[3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]
INPUT_LABEL_IDS = [[3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
return {
"INPUT_TEXT": [
[
"Johnathan",
"is",
"studying",
"in",
"the",
"University",
"of",
"Michigan",
".",
]
["Johnathan", "is", "studying", "in", "the", "University", "of", "Michigan", "."]
],
"INPUT_TEXT_SINGLE": [
"Johnathan",
@ -80,23 +115,9 @@ def ner_test_data():
"Michigan",
".",
],
"INPUT_LABELS": [
["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "O"]
],
"INPUT_LABELS_SINGLE": [
"I-PER",
"O",
"O",
"O",
"O",
"I-ORG",
"I-ORG",
"I-ORG",
"O",
],
"INPUT_LABELS_WRONG": [
["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]
],
"INPUT_LABELS": [["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "O"]],
"INPUT_LABELS_SINGLE": ["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "O"],
"INPUT_LABELS_WRONG": [["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]],
"INPUT_TOKEN_IDS": [
[
1287,
@ -123,26 +144,12 @@ def ner_test_data():
],
"INPUT_LABEL_IDS": INPUT_LABEL_IDS,
"INPUT_MASK": [[1] * 11 + [0] * 9],
"PREDICTED_LABELS": [
[3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
],
"PREDICTED_LABELS": [[3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
"TRAILING_TOKEN_MASK": TRAILING_TOKEN_MASK,
"UNIQUE_LABELS": UNIQUE_LABELS,
"LABEL_MAP": LABEL_MAP,
"EXPECTED_TOKENS_NO_PADDING": [
[
"I-PER",
"X",
"X",
"O",
"O",
"O",
"O",
"I-ORG",
"I-ORG",
"I-ORG",
"O",
]
["I-PER", "X", "X", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "O"]
],
"EXPECTED_TOKENS_NO_PADDING_NO_TRAILING": [
["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "O"]
@ -152,6 +159,62 @@ def ner_test_data():
}
def pytest_addoption(parser):
parser.addoption("--subscription_id", help="Azure Subscription Id to create resources in")
parser.addoption("--resource_group", help="Name of the resource group")
parser.addoption("--workspace_name", help="Name of Azure ML Workspace")
parser.addoption("--workspace_region", help="Azure region to create the workspace in")
parser.addoption("--cluster_name", help="Name of the AzureML Cluster.")
@pytest.fixture(scope="module")
def subscription_id(request):
return request.config.getoption("--subscription_id")
@pytest.fixture(scope="module")
def resource_group(request):
return request.config.getoption("--resource_group")
@pytest.fixture(scope="module")
def workspace_name(request):
return request.config.getoption("--workspace_name")
@pytest.fixture(scope="module")
def workspace_region(request):
return request.config.getoption("--workspace_region")
@pytest.fixture(scope="module")
def cluster_name(request):
return request.config.getoption("--cluster_name")
@pytest.fixture()
def bert_english_tokenizer():
return BERTTokenizer(language=Language.ENGLISHCASED, to_lower=False)
@pytest.fixture(scope="module")
def teardown_service(
subscription_id, resource_group, workspace_name, workspace_region
):
yield
# connect to workspace
ws = azureml_utils.get_or_create_workspace(
config_path="tests/ci",
subscription_id=subscription_id,
resource_group=resource_group,
workspace_name=workspace_name,
workspace_region=workspace_region,
)
# connect to aci_service
aci_service = Webservice(workspace=ws, name="aci-test-service")
# delete aci_service
aci_service.delete()


@ -0,0 +1,11 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
import torch
@pytest.mark.gpu
@pytest.mark.integration
def test_machine_is_gpu_machine():
assert torch.cuda.is_available() is True


@ -15,4 +15,4 @@ def test_embedding_trainer_runs(notebooks):
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(NLP_REPO_PATH=".")
)
)


@ -0,0 +1,22 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
import papermill as pm
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
@pytest.mark.gpu
@pytest.mark.integration
def test_entailment_multinli_bert(notebooks):
notebook_path = notebooks["entailment_multinli_bert"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
parameters={
"TRAIN_DATA_USED_PERCENT": 0.001,
"DEV_DATA_USED_PERCENT": 0.01,
"NUM_EPOCHS": 1,
},
kernel_name=KERNEL_NAME,
)


@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
import numpy as np
import papermill as pm
import scrapbook as sb
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
@pytest.mark.gpu
@pytest.mark.integration
def test_deep_and_unified_understanding(notebooks):
notebook_path = notebooks["deep_and_unified_understanding"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME)
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
sigma_numbers = [0.00317593, 0.00172284, 0.00634005, 0.00164305, 0.00317159]
sigma_bert = [0.1735696 , 0.14028822, 0.14590865, 0.2263149 , 0.20640415,
0.21249843, 0.18685372, 0.14112663, 0.25824168, 0.22399105,
0.2393731 , 0.12868434, 0.27386534, 0.35876372]
np.testing.assert_array_almost_equal(result["sigma_numbers"], sigma_numbers, decimal=4)
np.testing.assert_array_almost_equal(result["sigma_bert"], sigma_bert, decimal=1)


@ -0,0 +1,86 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
import papermill as pm
import scrapbook as sb
from tests.notebooks_common import OUTPUT_NOTEBOOK
ABS_TOL = 0.2
@pytest.mark.integration
@pytest.mark.azureml
def test_bidaf_deep_dive(notebooks,
subscription_id,
resource_group,
workspace_name,
workspace_region):
notebook_path = notebooks["bidaf_deep_dive"]
pm.execute_notebook(notebook_path,
OUTPUT_NOTEBOOK,
parameters = {'NUM_EPOCHS':2,
'config_path': "tests/ci",
'PROJECT_FOLDER': "scenarios/question_answering/bidaf-question-answering",
'SQUAD_FOLDER': "scenarios/question_answering/squad",
'LOGS_FOLDER': "scenarios/question_answering/",
'BIDAF_CONFIG_PATH': "scenarios/question_answering/",
'subscription_id': subscription_id,
'resource_group': resource_group,
'workspace_name': workspace_name,
'workspace_region': workspace_region})
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["validation_EM"]
assert result == pytest.approx(0.5, abs=ABS_TOL)
@pytest.mark.usefixtures("teardown_service")
@pytest.mark.integration
@pytest.mark.azureml
def test_bidaf_quickstart(notebooks,
subscription_id,
resource_group,
workspace_name,
workspace_region):
notebook_path = notebooks["bidaf_quickstart"]
pm.execute_notebook(notebook_path,
OUTPUT_NOTEBOOK,
parameters = {'config_path': "tests/ci",
'subscription_id': subscription_id,
'resource_group': resource_group,
'workspace_name': workspace_name,
'workspace_region': workspace_region,
'webservice_name': "aci-test-service"})
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["answer"]
assert result == "Bi-Directional Attention Flow"
@pytest.mark.integration
@pytest.mark.azureml
@pytest.mark.gpu
def test_bert_qa_runs(notebooks):
notebook_path = notebooks["bert_qa_trainer"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
parameters=dict(
AZUREML_CONFIG_PATH="./tests/integration/.azureml",
DATA_FOLDER='./tests/integration/squad',
PROJECT_FOLDER='./tests/integration/pytorch-transformers',
EXPERIMENT_NAME='NLP-QA-BERT-deepdive',
BERT_UTIL_PATH='./utils_nlp/azureml/azureml_bert_util.py',
EVALUATE_SQAD_PATH = './utils_nlp/eval/evaluate_squad.py',
TRAIN_SCRIPT_PATH="./scenarios/question_answering/bert_run_squad_azureml.py",
BERT_MODEL="bert-base-uncased",
NUM_TRAIN_EPOCHS=1.0,
NODE_COUNT=1,
MAX_TOTAL_RUNS=1,
MAX_CONCURRENT_RUNS=1,
TARGET_GRADIENT_STEPS=1,
INIT_GRADIENT_STEPS=1,
),
)
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
assert result["f1"] > 70
assert result["learning_rate"] >= 5e-5
assert result["learning_rate"] <= 9e-5


@ -1,15 +1,14 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import sys
import pytest
import papermill as pm
import scrapbook as sb
from tests.notebooks_common import OUTPUT_NOTEBOOK
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
ABS_TOL = 0.2
ABS_TOL_PEARSONS = 0.05
@pytest.fixture(scope="module")
@ -34,11 +33,109 @@ def baseline_results():
}
@pytest.mark.notebooks
@pytest.mark.gpu
@pytest.mark.integration
def test_gensen_local(notebooks):
notebook_path = notebooks["gensen_local"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(
max_epoch=1,
config_filepath="scenarios/sentence_similarity/gensen_config.json",
base_data_path="data",
),
)
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["results"]
expected = {"0": {"0": 1, "1": 0.95}, "1": {"0": 0.95, "1": 1}}
for key, value in expected.items():
for k, v in value.items():
assert results[key][k] == pytest.approx(v, abs=ABS_TOL_PEARSONS)
@pytest.mark.gpu
@pytest.mark.integration
def test_bert_encoder(notebooks, tmp):
notebook_path = notebooks["bert_encoder"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(NUM_GPUS=1,
MAX_SEQ_LENGTH=128,
CACHE_DIR=tmp),
)
size_emb = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["size_emb"]
assert size_emb == 768
@pytest.mark.integration
@pytest.mark.azureml
def test_similarity_embeddings_baseline_runs(notebooks, baseline_results):
notebook_path = notebooks["similarity_embeddings_baseline"]
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK)
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["results"]
for key, value in baseline_results.items():
assert results[key] == pytest.approx(value, abs=ABS_TOL)
@pytest.mark.usefixtures("teardown_service")
@pytest.mark.integration
@pytest.mark.azureml
def test_automl_local_runs(notebooks,
subscription_id,
resource_group,
workspace_name,
workspace_region):
notebook_path = notebooks["similarity_automl_local"]
pm.execute_notebook(notebook_path,
OUTPUT_NOTEBOOK,
parameters = {'automl_iterations': 2,
'automl_iteration_timeout':7,
'config_path': "tests/ci",
'webservice_name': "aci-test-service",
'subscription_id': subscription_id,
'resource_group': resource_group,
'workspace_name': workspace_name,
'workspace_region': workspace_region})
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["pearson_correlation"]
assert result == pytest.approx(0.5, abs=ABS_TOL)
@pytest.mark.integration
@pytest.mark.azureml
def test_similarity_gensen_azureml_runs(notebooks):
notebook_path = notebooks["gensen_azureml"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
parameters=dict(
CACHE_DIR="./tests/integration/temp",
AZUREML_CONFIG_PATH="./tests/integration/.azureml",
UTIL_NLP_PATH="./utils_nlp",
MAX_EPOCH=1,
TRAIN_SCRIPT="./scenarios/sentence_similarity/gensen_train.py",
CONFIG_PATH="./scenarios/sentence_similarity/gensen_config.json",
MAX_TOTAL_RUNS=1,
MAX_CONCURRENT_RUNS=1,
),
)
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
assert result["min_val_loss"] > 5
assert result["learning_rate"] >= 0.0001
assert result["learning_rate"] <= 0.001
@pytest.mark.integration
@pytest.mark.azureml
@pytest.mark.skip(reason="can't run programmatically, AKS cluster takes ~20 minutes to create and there is no blocking call in the notebook to tell that the cluster creation is in progress")
def test_automl_with_pipelines_deployment_aks(notebooks):
notebook_path = notebooks["automl_with_pipelines_deployment_aks"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK)


@ -0,0 +1,74 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import json
import shutil
import pytest
import papermill as pm
import scrapbook as sb
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
ABS_TOL = 0.1
@pytest.mark.gpu
@pytest.mark.integration
def test_tc_mnli_bert(notebooks, tmp):
notebook_path = notebooks["tc_mnli_bert"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(NUM_GPUS=1,
DATA_FOLDER=tmp,
BERT_CACHE_DIR=tmp,
BATCH_SIZE=32,
BATCH_SIZE_PRED=512,
NUM_EPOCHS=1
)
)
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
assert pytest.approx(result["accuracy"], 0.93, abs=ABS_TOL)
assert pytest.approx(result["precision"], 0.93, abs=ABS_TOL)
assert pytest.approx(result["recall"], 0.93, abs=ABS_TOL)
assert pytest.approx(result["f1"], 0.93, abs=ABS_TOL)
@pytest.mark.integration
@pytest.mark.azureml
@pytest.mark.gpu
def test_tc_bert_azureml(
notebooks, subscription_id, resource_group, workspace_name, workspace_region, cluster_name, tmp
):
notebook_path = notebooks["tc_bert_azureml"]
train_folder = os.path.join(tmp, "train")
test_folder = os.path.join(tmp, "test")
parameters = {
"config_path": "tests/ci",
"subscription_id": subscription_id,
"resource_group": resource_group,
"workspace_name": workspace_name,
"workspace_region": workspace_region,
"cluster_name": cluster_name,
"DATA_FOLDER": tmp,
"TRAIN_FOLDER": train_folder,
"TEST_FOLDER": test_folder,
"PROJECT_FOLDER": "./",
"NUM_PARTITIONS": 1,
"NODE_COUNT": 1,
}
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=parameters
)
with open("outputs/results.json", "r") as handle:
result_dict = json.load(handle)
assert result_dict["weighted avg"]["f1-score"] == pytest.approx(0.85, abs=ABS_TOL)
if os.path.exists("outputs"):
shutil.rmtree("outputs")


@ -0,0 +1,31 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import pytest
from utils_nlp.dataset import msrpc
from utils_nlp.dataset import xnli
@pytest.mark.smoke
def test_msrpc_download(tmp_path):
filepath = msrpc.download_msrpc(tmp_path)
statinfo = os.stat(filepath)
assert statinfo.st_size == 1359872
@pytest.mark.skip(reason="Can't test it programmatically, needs input")
@pytest.mark.smoke
def test_msrpc_load_df(tmp_path):
df_train = msrpc.load_pandas_df(
local_cache_path=tmp_path, dataset_type="train"
)
@pytest.mark.smoke
def test_xnli(tmp_path):
df_train = xnli.load_pandas_df(
local_cache_path=tmp_path, file_split="train"
)
assert df_train.shape == (392702, 2)


@ -0,0 +1,12 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
import torch
@pytest.mark.smoke
@pytest.mark.gpu
def test_machine_is_gpu_machine():
assert torch.cuda.is_available() is True


@ -1,14 +0,0 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import pytest
from utils_nlp.dataset import msrpc
@pytest.mark.smoke
def test_download_msrpc(tmp_path):
filepath = msrpc.download_msrpc(tmp_path)
statinfo = os.stat(filepath)
assert statinfo.st_size == 1359872


@ -5,51 +5,14 @@ import os
import pytest
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.dataset.msrpc import load_pandas_df
import utils_nlp.dataset.wikigold as wg
import utils_nlp.dataset.xnli as xnli
from utils_nlp.dataset import msrpc
from utils_nlp.dataset import wikigold
from utils_nlp.dataset import xnli
from utils_nlp.dataset import snli
from utils_nlp.dataset import Split
from utils_nlp.dataset.ner_utils import preprocess_conll
def test_maybe_download():
# ToDo: Change this url when repo goes public.
file_url = (
"https://raw.githubusercontent.com/Microsoft/Recommenders/"
"master/LICENSE"
)
filepath = "license.txt"
assert not os.path.exists(filepath)
filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
assert os.path.exists(filepath)
os.remove(filepath)
with pytest.raises(IOError):
filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
def test_load_pandas_df_msrpc():
with pytest.raises(Exception):
load_pandas_df(dataset_type="Dummy")
def test_wikigold(tmp_path):
wg_sentence_count = 1841
wg_test_percentage = 0.5
wg_test_sentence_count = round(wg_sentence_count * wg_test_percentage)
wg_train_sentence_count = wg_sentence_count - wg_test_sentence_count
downloaded_file = os.path.join(tmp_path, "wikigold.conll.txt")
assert not os.path.exists(downloaded_file)
train_df, test_df = wg.load_train_test_dfs(
tmp_path, test_percentage=wg_test_percentage
)
assert os.path.exists(downloaded_file)
assert train_df.shape == (wg_train_sentence_count, 2)
assert test_df.shape == (wg_test_sentence_count, 2)
@pytest.fixture
def ner_utils_test_data(scope="module"):
return {
@ -115,6 +78,45 @@ def ner_utils_test_data(scope="module"):
}
def test_maybe_download():
# ToDo: Change this url when repo goes public.
file_url = (
"https://raw.githubusercontent.com/Microsoft/Recommenders/"
"master/LICENSE"
)
filepath = "license.txt"
assert not os.path.exists(filepath)
filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
assert os.path.exists(filepath)
os.remove(filepath)
with pytest.raises(IOError):
filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
def test_msrpc():
with pytest.raises(Exception):
msrpc.load_pandas_df(dataset_type="Dummy")
def test_wikigold(tmp_path):
wg_sentence_count = 1841
wg_test_percentage = 0.5
wg_test_sentence_count = round(wg_sentence_count * wg_test_percentage)
wg_train_sentence_count = wg_sentence_count - wg_test_sentence_count
downloaded_file = os.path.join(tmp_path, "wikigold.conll.txt")
assert not os.path.exists(downloaded_file)
train_df, test_df = wikigold.load_train_test_dfs(
tmp_path, test_percentage=wg_test_percentage
)
assert os.path.exists(downloaded_file)
assert train_df.shape == (wg_train_sentence_count, 2)
assert test_df.shape == (wg_test_sentence_count, 2)
def test_ner_utils(ner_utils_test_data):
output = preprocess_conll(ner_utils_test_data["input"])
assert output == ner_utils_test_data["expected_output"]
@ -123,5 +125,21 @@ def test_ner_utils(ner_utils_test_data):
def test_xnli(tmp_path):
# only test for the dev df as the train dataset takes several
# minutes to download
dev_df = xnli.load_pandas_df(local_cache_path=tmp_path)
dev_df = xnli.load_pandas_df(local_cache_path=tmp_path, file_split="dev")
assert dev_df.shape == (2490, 2)
def test_snli(tmp_path):
df_train = snli.load_pandas_df(
local_cache_path=tmp_path, file_split=Split.TRAIN
)
assert df_train.shape == (550152, 14)
df_test = snli.load_pandas_df(
local_cache_path=tmp_path, file_split=Split.TEST
)
assert df_test.shape == (10000, 14)
df_dev = snli.load_pandas_df(
local_cache_path=tmp_path, file_split=Split.DEV
)
assert df_dev.shape == (10000, 14)


@ -0,0 +1,16 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import numpy as np
from utils_nlp.eval.classification import compute_correlation_coefficients
def test_compute():
x = np.random.rand(2, 100)
df = compute_correlation_coefficients(x)
assert df.shape == (2, 2)
y = np.random.rand(2, 100)
df = compute_correlation_coefficients(x, y)
assert df.shape == (4, 4)


@ -0,0 +1,62 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import pandas as pd
from utils_nlp.models.gensen.preprocess_utils import gensen_preprocess
from utils_nlp.models.gensen.utils import DataIterator
def test_gensen_preprocess(tmp_path):
data = [
[
"neutral",
"it is a lovely day",
"the weather is great outside.",
["it", "is", "lovely", "day"],
["the", "weather", "is", "great", "outside"],
]
]
df = pd.DataFrame(data)
df.columns = [
"score",
"sentence1",
"sentence2",
"sentence1_tokens",
"sentence2_tokens",
]
expected_files = [
"snli_1.0_test.txt.lab",
"snli_1.0_test.txt.s1.tok",
"snli_1.0_dev.txt.clean.noblank",
"snli_1.0_train.txt.s1.tok",
"snli_1.0_train.txt.lab",
"snli_1.0_dev.txt.s1.tok",
"snli_1.0_dev.txt.s2.tok",
"snli_1.0_test.txt.s2.tok",
"snli_1.0_train.txt.clean",
"snli_1.0_train.txt.s2.tok",
"snli_1.0_test.txt.clean.noblank",
"snli_1.0_test.txt.clean",
"snli_1.0_train.txt.clean.noblank",
"snli_1.0_dev.txt.lab",
"snli_1.0_dev.txt.clean",
]
path = gensen_preprocess(df, df, df, tmp_path)
assert os.path.isdir(path) is True
assert set(os.listdir(path)) == set(expected_files)
def test_data_iterator():
sentences = ["it is a lovely day", "the weather is great outside.", ]
expected_vocab = ["it", "is", "a", "lovely", "day", "the", "weather", "is", "great", "outside."]
vocab_size = 10
di = DataIterator()
word2id, id2word = di.construct_vocab(sentences, vocab_size)
assert set(expected_vocab).issubset(word2id.keys())
assert set(expected_vocab).issubset(id2word.values())


@ -0,0 +1,25 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import pytest
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
import papermill as pm
from utils_nlp.models.bert.common import Language
@pytest.mark.notebooks
def test_bert_encoder(notebooks):
notebook_path = notebooks["bert_encoder"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(
NUM_GPUS=0,
LANGUAGE=Language.ENGLISH,
TO_LOWER=True,
MAX_SEQ_LENGTH=128,
CACHE_DIR="./temp",
),
)


@ -0,0 +1,26 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import pytest
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
import papermill as pm
from utils_nlp.models.bert.common import Language
@pytest.mark.notebooks
@pytest.mark.gpu
def test_bert_encoder(notebooks):
notebook_path = notebooks["bert_encoder"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(
NUM_GPUS=1,
LANGUAGE=Language.ENGLISH,
TO_LOWER=True,
MAX_SEQ_LENGTH=128,
CACHE_DIR="./temp",
),
)


@ -13,6 +13,8 @@
import argparse
import textwrap
from sys import platform
HELP_MSG = """
To create the conda environment:
@ -50,6 +52,7 @@ CONDA_GPU = {
"numba": "numba>=0.38.1",
"pytorch": "pytorch>=1.0.0",
"tensorflow": "tensorflow-gpu==1.12.0",
"cudatoolkit": "cudatoolkit==9.2",
}
PIP_BASE = {
@ -69,6 +72,7 @@ PIP_BASE = {
"ipywebrtc": "ipywebrtc==0.4.3",
"pre-commit": "pre-commit>=1.14.4",
"scikit-learn": "scikit-learn>=0.19.0,<=0.20.3",
"sklearn-crfsuite": "sklearn-crfsuite>=0.3.6",
"spacy": "spacy>=2.1.4",
"spacy-models": (
"https://github.com/explosion/spacy-models/releases/download/"
@ -80,7 +84,18 @@ PIP_BASE = {
"seqeval": "seqeval>=0.0.12",
}
PIP_GPU = {"horovod": "horovod>=0.16.1"}
PIP_GPU = {}
PIP_DARWIN = {}
PIP_DARWIN_GPU = {"horovod": "horovod>=0.16.1"}
PIP_LINUX = {}
PIP_LINUX_GPU = {"horovod": "horovod>=0.16.1"}
PIP_WIN32 = {}
PIP_WIN32_GPU = {}
CONDA_WIN32 = {"pytorch": "pytorch==1.0.0", "cudatoolkit": "cuda90"}
if __name__ == "__main__":
parser = argparse.ArgumentParser(
@ -111,6 +126,23 @@ if __name__ == "__main__":
# update conda and pip packages based on flags provided
conda_packages = CONDA_BASE
pip_packages = PIP_BASE
# check for os platform support
if platform == "darwin":
pip_packages.update(PIP_DARWIN)
PIP_GPU.update(PIP_DARWIN_GPU)
elif platform.startswith("linux"):
pip_packages.update(PIP_LINUX)
PIP_GPU.update(PIP_LINUX_GPU)
elif platform == "win32":
conda_packages.update(CONDA_WIN32)
pip_packages.update(PIP_WIN32)
PIP_GPU.update(PIP_WIN32_GPU)
else:
raise Exception(
"Unsupported platform, must be Windows, Linux, or macOS"
)
if args.gpu:
conda_packages.update(CONDA_GPU)
pip_packages.update(PIP_GPU)


@ -0,0 +1,107 @@
# NLP Utilities
This module (utils_nlp) contains functions to simplify common tasks used when developing and evaluating NLP systems. A short description of the sub-modules is provided below. For more details about what functions are available and how to use them, please review the doc-strings provided with the code.
## Sub-Modules
### [AzureML](azureml)
The AzureML submodule contains utilities to connect to a workspace, train, tune and operationalize NLP systems at scale using AzureML.
```python
from utils_nlp.azureml.azureml_utils import get_or_create_workspace
# Note: you do not need to fill in these values if you have a config.json in the same folder as this notebook.
ws = get_or_create_workspace(
config_path=config_path,
subscription_id=subscription_id,
resource_group=resource_group,
workspace_name=workspace_name,
workspace_region=workspace_region,
)
```
### [Common](common)
This submodule contains high-level utilities for defining constants used in most algorithms, as well as helper functions for managing aspects of different frameworks such as PyTorch.
### [Dataset](dataset)
Dataset includes helper functions for interacting with different datasets and formatting them appropriately as well as utilities for splitting data for training / testing.
#### Data Loading
There are dataloaders for several datasets. For example, the snli module will allow you to load a dataframe in pandas from the SNLI dataset, with the option to set the number of rows to load in order to test algorithms and evaluate performance benchmarks. Information on the datasets used in the repo can be found [here](https://github.com/microsoft/nlp/tree/staging/utils_nlp/dataset#datasets).
Most datasets may be split into `train`, `dev`, and `test`.
```python
from utils_nlp.dataset.snli import load_pandas_df
df = load_pandas_df(DATA_FOLDER, file_split="train", nrows=1000)
```
### [Evaluation (Eval)](eval)
The evaluation (eval) submodule includes functionality for computing common classification evaluation metrics such as accuracy, precision, recall, and F1 score; for normalizing answers and computing F1 scores for question-answering datasets such as SQuAD; and for reporting means and correlation coefficients for benchmarks such as SentEval.
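As a minimal sketch of the correlation helpers (the input shapes are illustrative and follow the unit tests in this repo):

```python
import numpy as np

from utils_nlp.eval.classification import compute_correlation_coefficients

x = np.random.rand(2, 100)  # two vectors of 100 scores each
y = np.random.rand(2, 100)

# Pairwise Pearson correlations of all four vectors, as a 4x4 DataFrame
df = compute_correlation_coefficients(x, y)
print(df)
```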
### [Models](models)
The models submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new natural language processing systems. A description of which algorithms are used in each scenario can be found on [this table](../README.md#content)
This includes:
* BERT
* GenSen
* Pretrained embeddings (Word2Vec, fastText, GloVe)
* PyTorch's conditional Gated Recurrent Unit (GRU)
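As a hedged sketch of the BERT utilities (the method names follow the text classification notebooks in this repo; the data, labels, and parameters are illustrative):

```python
from utils_nlp.models.bert.common import Language, Tokenizer
from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier

# Tokenize and preprocess raw text into token ids and attention masks
tokenizer = Tokenizer(Language.ENGLISH, to_lower=True)
tokens = tokenizer.tokenize(["it is a lovely day", "the weather is great outside."])
token_ids, input_mask = tokenizer.preprocess_classification_tokens(tokens, max_len=150)

# Fine-tune and predict
classifier = BERTSequenceClassifier(language=Language.ENGLISH, num_labels=2)
classifier.fit(token_ids=token_ids, input_mask=input_mask, labels=[0, 1], num_epochs=1)
preds = classifier.predict(token_ids=token_ids, input_mask=input_mask)
```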
### [Interpreter](interpreter)
The interpreter submodule contains implementations to explain hidden states of models. It is a code implementation of the paper [Towards a Deep and Unified Understanding of Deep Neural Models in NLP](http://proceedings.mlr.press/v97/guan19a/guan19a.pdf).
### [Semantic Versioning](versioning)
This library is configured to use
[setuptools_scm](https://github.com/pypa/setuptools_scm/), following the
instructions there, to automatically get package version from git commit histories.
> NOTE: **There shouldn't be any references to manually coded versions**.
Verify what git tag to use by running:
```bash
python setup.py --version
```
It should look something like `0.1.0.dev4+gdfedba7.d20190209`
Using the information above, the master branch, after a merge commit, can be _**tagged**_ with the semantic version `0.1.0` (ignoring the `dev4+gdfedba7.d20190209` suffix).
For example:
git tag v0.1.0
Now verify the semantic version for the package:
python setup.py --version
All new merged commits on master must have a
[Semantic Versioning](https://semver.org/) release version with an
accompanying tag. TL;DR:
* `major.minor.patch`
* Patch is for bugfix
* Minor is for new features
* Major is for backwards-incompatible changes
* tags should be of the form `v0.1.2`
If you install this library into another clean git repository at a tagged version, you should get a clean version like `0.2.1`.
However, if you inspect `__version__` inside this repo,
you'll get a **'dirty'** development version number like `'0.2.1.dev0+g850a76d.d20180908'`.
This is useful for debugging, building sphinx docs in dev, and so on.
You should never have to specify a version manually; just tag your commit based on the version calculated by running
python setup.py --version


@ -0,0 +1,5 @@
from setuptools_scm import get_version
# Determine semantic versioning automatically
# from git commits
__version__ = get_version()


@ -0,0 +1,133 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from horovod.torch.mpi_ops import allreduce, allreduce_async_, synchronize
from horovod.torch.compression import Compression
import horovod.torch as hvd
import torch
import time
from collections import OrderedDict
try:
from apex_C import flatten
from apex_C import unflatten
except ImportError:
try:
_ = warned_flatten
except NameError:
print("Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten.")
warned_flatten = True
from torch._utils import _flatten_dense_tensors as flatten
from torch._utils import _unflatten_dense_tensors as unflatten
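# Linear warmup followed by linear decay: ramp from 0 to 1 over the `warmup`
# fraction of training, then decay linearly to 0 as x (the fraction of
# training completed) approaches 1.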
def warmup_linear(x, warmup=0.002):
if x < warmup:
return x/warmup
return 1.0 - x
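# Ramp the gradient-accumulation step count from `initial_steps` up to
# `target_steps` over the `warmup` fraction of training, clamped to that range.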
def adjust_gradient_accumulation_steps(x, initial_steps, target_steps, warmup):
return min(max(int(x/warmup*target_steps), initial_steps), target_steps)
class DistributedCommunicator:
def __init__(self, accumulation_step=1):
hvd.init()
self.local_rank = hvd.local_rank()
self.world_size = hvd.size()
self.rank = hvd.rank()
self.n_gpu = torch.cuda.device_count()
self.node_count = self.world_size // self.n_gpu
self.accumulation_step = accumulation_step
self.count_down = accumulation_step - 1
self._multi_node = self.node_count > 1
if not self._multi_node:
# use PyTorch build-in NCCL backend for single node training
torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:6000',
world_size=self.n_gpu, rank=self.local_rank)
def register_model(self, model, fp16):
# broadcast model parameters
if self.node_count > 1:
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
else:
for param in model.parameters():
torch.distributed.broadcast_multigpu([param], 0)
# register hook for reduce when backpropagate
self._parameter_names = {v: k for k, v in sorted(model.named_parameters())}
self._handles = {}
self._requires_update = set()
self._grad_accs = []
self._grad = []
self._compression = hvd.Compression.fp16 if fp16 else hvd.Compression.none
for p in model.parameters():
if p.requires_grad:
p.grad = p.data.new(p.size()).zero_()
self._requires_update.add(p)
p_tmp = p.expand_as(p)
grad_acc = p_tmp.grad_fn.next_functions[0][0]
grad_acc.register_hook(self._make_hook(p))
self._grad_accs.append(grad_acc)
def _allreduce_tensor(self, p):
assert p not in self._handles
assert not p.grad.requires_grad
tensor = p.grad
name = self._parameter_names.get(p)
if self._multi_node:
tensor_compressed, ctx = self._compression.compress(tensor)
handle = allreduce_async_(tensor_compressed, average=True, name=name)
self._handles[p] = (handle, ctx)
else:
self._handles[p] = tensor
def _make_hook(self, p):
def hook(*ignore):
if self.count_down == 0:
self._allreduce_tensor(p)
return hook
def synchronize(self):
synced = False
if self.count_down == 0:
missing_p = self._requires_update - set(self._handles.keys())
for p in missing_p:
self._allreduce_tensor(p)
if self._multi_node:
for p, value in self._handles.items():
handle, ctx = value
output = synchronize(handle)
p.grad.set_(self._compression.decompress(output, ctx) / self.accumulation_step)
else:
buckets = OrderedDict()
for tensor in self._handles.values():
tp = tensor.type()
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(tensor)
for tp in buckets:
bucket = buckets[tp]
coalesced = flatten(bucket) / self.world_size / self.accumulation_step
torch.distributed.all_reduce_multigpu([coalesced])
for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
buf.copy_(synced)
self._handles.clear()
synced = True
self.count_down = self.accumulation_step
self.count_down -= 1
return synced
def set_accumulation_step(self, accumulation_step):
self.accumulation_step = accumulation_step
self.count_down = self.accumulation_step - 1
# Original source:
# https://github.com/microsoft/AzureML-BERT/blob/dec79be13befdd51fa72c05419cf9288d32eb263/finetune/PyTorch/azureml_bert_util.py
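# Hypothetical usage sketch (illustrative model/optimizer; assumes Horovod can
# bind to the available GPUs). It shows the intended call order, not a tuned
# training loop:
#
#     comm = DistributedCommunicator(accumulation_step=2)
#     comm.register_model(model, fp16=False)
#     for batch in loader:
#         loss = model(batch).sum()
#         loss.backward()          # hooks all-reduce grads when count_down == 0
#         if comm.synchronize():   # returns True once per accumulation cycle
#             optimizer.step()
#             model.zero_grad()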

View file

@ -2,8 +2,27 @@
# Licensed under the MIT License.
import os
from azureml.core.authentication import AzureCliAuthentication
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.authentication import AuthenticationException
from azureml.core import Workspace
from azureml.exceptions import WorkspaceException
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
def get_auth():
"""
Method to get the correct Azure ML Authentication type
Always start with CLI Authentication and if it fails, fall back
to interactive login
"""
try:
auth_type = AzureCliAuthentication()
auth_type.get_authentication_header()
except AuthenticationException:
auth_type = InteractiveLoginAuthentication()
return auth_type
def get_or_create_workspace(
@ -13,92 +32,114 @@ def get_or_create_workspace(
workspace_name=None,
workspace_region=None,
):
"""Get or create AzureML Workspace this will save the config to the path specified for later use
"""
Method to get or create workspace.
Args:
config_path (str): optional directory to look for / store config.json file (defaults to current directory)
subscription_id (str): subscription id
resource_group (str): resource group
workspace_name (str): workspace name
workspace_region (str): region
config_path: optional directory to look for / store config.json file (defaults to current directory)
subscription_id: Azure subscription id
resource_group: Azure resource group to create workspace and related resources
workspace_name: name of azure ml workspace
workspace_region: region for workspace
Returns:
Workspace
obj: AzureML workspace if one exists already with the name otherwise creates a new one.
"""
# use environment variables if needed
if subscription_id is None:
subscription_id = os.getenv("SUBSCRIPTION_ID")
if resource_group is None:
resource_group = os.getenv("RESOURCE_GROUP")
if workspace_name is None:
workspace_name = os.getenv("WORKSPACE_NAME")
if workspace_region is None:
workspace_region = os.getenv("WORKSPACE_REGION")
# define fallback options in order to try
options = [
(
Workspace,
dict(
subscription_id=subscription_id,
resource_group=resource_group,
workspace_name=workspace_name,
),
),
(Workspace.from_config, dict(path=config_path)),
(
Workspace.create,
dict(
subscription_id=subscription_id,
resource_group=resource_group,
try:
# get existing azure ml workspace
if config_path is not None:
ws = Workspace.from_config(config_path, auth=get_auth())
else:
ws = Workspace.get(
name=workspace_name,
location=workspace_region,
create_resource_group=True,
exist_ok=True,
),
),
]
subscription_id=subscription_id,
resource_group=resource_group,
auth=get_auth(),
)
for function, kwargs in options:
try:
ws = function(**kwargs)
break
except Exception:
continue
else:
raise ValueError(
"Failed to get or create AzureML Workspace with the configuration information provided"
except WorkspaceException:
# this call might take a minute or two.
print("Creating new workspace")
ws = Workspace.create(
name=workspace_name,
subscription_id=subscription_id,
resource_group=resource_group,
create_resource_group=True,
location=workspace_region,
auth=get_auth(),
)
ws.write_config(path=config_path)
ws.write_config(path=config_path)
return ws
def log_metrics_scalar(value, run, name="", description=None):
"""Log scalar metric to the AzureML run
def get_or_create_amlcompute(
workspace,
compute_name,
vm_size="",
min_nodes=0,
max_nodes=None,
idle_seconds_before_scaledown=None,
verbose=False,
):
"""Get or create AmlCompute as the compute target. If a cluster of the same name is found, attach it and rescale
accordingly. Otherwise, create a new cluster.
Args:
value : numerical or string value to log
run : AzureML Run object
name : name of metric
description : description of metric
workspace (Workspace): workspace
compute_name (str): name
vm_size (str, optional): vm size
min_nodes (int, optional): minimum number of nodes in cluster
max_nodes (None, optional): maximum number of nodes in cluster
idle_seconds_before_scaledown (None, optional): how long to wait before the cluster autoscales down
verbose (bool, optional): if true, print logs
Returns:
Compute target
"""
run.log(name, value, description)
try:
if verbose:
print("Found compute target: {}".format(compute_name))
compute_target = ComputeTarget(workspace=workspace, name=compute_name)
if len(compute_target.list_nodes()) < max_nodes:
if verbose:
print("Rescaling to {} nodes".format(max_nodes))
compute_target.update(max_nodes=max_nodes)
compute_target.wait_for_completion(show_output=verbose)
except ComputeTargetException:
if verbose:
print("Creating new compute target: {}".format(compute_name))
compute_config = AmlCompute.provisioning_configuration(
vm_size=vm_size,
min_nodes=min_nodes,
max_nodes=max_nodes,
idle_seconds_before_scaledown=idle_seconds_before_scaledown,
)
compute_target = ComputeTarget.create(workspace, compute_name, compute_config)
compute_target.wait_for_completion(show_output=verbose)
return compute_target
def get_output_files(run, output_path, file_names=None):
"""
Method to get the output files from an AzureML output directory.
def log_metrics_table(df, run, name="", description=None, as_scalar=False):
"""Log data from pd.DataFrame to the AzureML run
Args:
df : pd.DataFrame containing metrics to log
run : AzureML Run object
name : name of metric
description : description of metric
as_scalar : when True, logs each cell of the table as a scalar metric; defaults to False
"""
if as_scalar:
for rn in df.index:
for cn in df.columns:
log_metrics_scalar(df.loc[rn, cn], run, name="{0}::{1}".format(rn, cn), description=description)
file_names(list): Names of the files to download.
run(azureml.core.run.Run): Run object of the run.
output_path(str): Path to download the output files.
else:
run.log_table(name, df.to_dict(), description)
Returns: None
"""
os.makedirs(output_path, exist_ok=True)
if file_names is None:
file_names = run.get_file_names()
for f in file_names:
dest = os.path.join(output_path, f.split("/")[-1])
print("Downloading file {} to {}...".format(f, dest))
run.download_file(f, dest)
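# Hypothetical end-to-end sketch tying the helpers above together (all
# identifiers below are illustrative placeholders):
#
#     ws = get_or_create_workspace(
#         subscription_id="<subscription-id>",
#         resource_group="nlp-rg",
#         workspace_name="nlp-ws",
#         workspace_region="eastus",
#     )
#     target = get_or_create_amlcompute(
#         ws, compute_name="gpu-cluster", vm_size="STANDARD_NC6", max_nodes=2
#     )
#     # ... submit a run against `target`, then fetch its outputs:
#     # get_output_files(run, "./outputs")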

View file

@ -36,6 +36,12 @@ Alexis Conneau, Guillaume Lample, Ruty Rinott, Holger Schwenk, Ves Stoyanov. 201
Original source: https://www.nyu.edu/projects/bowman/xnli/
The dataset is preprocessed to remove unused columns.
### The SQuAD dataset
>This dataset is provided under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode).
Redistributing the datasets "train-v1.1.json" and "dev-v1.1.json" with attribution:
Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. SQuAD: 100,000+ Questions for Machine Comprehension of Text. Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing (EMNLP).
Original source: https://github.com/rajpurkar/SQuAD-explorer
### The STSbenchmark dataset
>Redistributing the dataset "Stsbenchmark.tar.gz" with attribution:

View file

@ -8,7 +8,7 @@ nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
class Split(Enum):
TRAIN = "train"
DEV = "dev"
TEST = "test"
class Split(str, Enum):
TRAIN : str = "train"
DEV : str = "dev"
TEST : str = "test"
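# Why the str mixin: members now compare equal to (and serialize as) plain
# strings, so a Split member can be passed anywhere a "train"/"dev"/"test"
# string is expected, e.g.:
#
#     assert Split.TRAIN == "train"
#     assert "snli_{}.txt".format(Split.TRAIN.value) == "snli_train.txt"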

View file

@ -8,12 +8,12 @@ class DaskCSVLoader:
at a time), before sampling batches from the partitions."""
def __init__(
self,
file_path,
sep=",",
header="infer",
block_size=10e6,
random_seed=None,
self,
file_path,
sep=",",
header="infer",
block_size=10e6,
random_seed=None,
):
"""Initializes the loader.
@ -69,7 +69,7 @@ class DaskCSVLoader:
for i in range(self.df.npartitions):
part = self.df.partitions[i].compute()
for j in range(0, part.shape[0], batch_size):
yield part.iloc[j : j + batch_size, :]
yield part.iloc[j: j + batch_size, :]
class DaskJSONLoader:
@ -78,7 +78,7 @@ class DaskJSONLoader:
batches from the partitions."""
def __init__(
self, file_path, block_size=10e6, random_seed=None, lines=True
self, file_path, block_size=10e6, random_seed=None, lines=True
):
"""Initializes the loader.
@ -118,15 +118,19 @@ class DaskJSONLoader:
else:
yield sample_part
def get_sequential_batches(self, batch_size):
def get_sequential_batches(self, batch_size, num_batches=None):
"""Creates a sequential generator.
Batches returned are pandas dataframes of length=batch_size.
Note: Final batch might be of smaller size.
Args:
num_batches: Number of batches to generate.
batch_size (int): Batch size.
"""
for i in range(self.df.npartitions):
if num_batches is None:
num_batches = self.df.npartitions
for i in range(num_batches):
part = self.df.partitions[i].compute()
for j in range(0, part.shape[0], batch_size):
yield part.iloc[j : j + batch_size, :]
yield part.iloc[j: j + batch_size, :]
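# Minimal usage sketch of the updated sequential generator (the file path and
# sizes are illustrative); it walks the first num_batches partitions, yielding
# pandas DataFrames of up to batch_size rows each:
#
#     loader = DaskJSONLoader("reviews.json", block_size=10e6)
#     for batch in loader.get_sequential_batches(batch_size=32, num_batches=2):
#         print(batch.shape)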

View file

@ -50,9 +50,8 @@ def get_generator(
local_cache_path=".",
file_split="train",
block_size=10e6,
random_seed=None,
num_batches=1000,
batch_size=1000,
batch_size=10e6,
num_batches=None,
):
""" Downloads and extracts the dataset files and then returns a random batch generator that
yields pandas dataframes.
@ -81,10 +80,8 @@ def get_generator(
loader = DaskJSONLoader(
os.path.join(local_cache_path, DATA_FILES[file_split]),
block_size=block_size,
random_seed=random_seed,
)
block_size=block_size,)
return loader.get_random_batches(
num_batches=num_batches, batch_size=batch_size
return loader.get_sequential_batches(
batch_size=int(batch_size), num_batches=num_batches
)

View file

@ -74,14 +74,14 @@ def _maybe_download_and_extract(zip_path, file_split, file_type):
os.makedirs(dir_path)
# format csv filename
file_name = "{0}_{1}.{2}".format(SNLI_FILE_PREFIX, file_split, file_type)
file_name = "{0}_{1}.{2}".format(SNLI_FILE_PREFIX, file_split.value, file_type)
extract_path = os.path.join(dir_path, file_name)
if not os.path.exists(extract_path):
download_snli(zip_path)
dpath = download_snli(zip_path)
extract_snli(
zip_path,
source_path=os.path.join(SNLI_DIRNAME, file_name),
source_path=SNLI_DIRNAME + "/" + file_name,
dest_path=extract_path,
)
@ -156,8 +156,10 @@ def clean_cols(df):
def clean_rows(df, label_col=LABEL_COL):
"""Drop badly formatted rows from the input dataframe
Args: df (pd.DataFrame): Input dataframe label_col (str): Name of label column. Defaults to
the standardized column name that is set after running the clean_col method.
Args:
df (pd.DataFrame): Input dataframe
label_col (str): Name of label column.
Defaults to the standardized column name that is set after running the clean_col method.
Returns:
pd.DataFrame
@ -167,6 +169,11 @@ def clean_rows(df, label_col=LABEL_COL):
return snli_df
def clean_df(df, label_col=LABEL_COL):
df = clean_cols(df)
df = clean_rows(df, label_col)
return df
def load_azureml_df(
local_cache_path=None, file_split=Split.TRAIN, file_type="txt"

View file

@ -1,12 +1,8 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""XNLI dataset utils
https://www.nyu.edu/projects/bowman/xnli/
"""
import os
import pandas as pd
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
@ -16,9 +12,11 @@ URL_XNLI = "https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip"
URL_XNLI_MT = "https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip"
def load_pandas_df(local_cache_path="./", file_split="dev", language="zh"):
def load_pandas_df(local_cache_path=".", file_split="dev", language="zh"):
"""Downloads and extracts the dataset files.
Utilities information can be found `on this link <https://www.nyu.edu/projects/bowman/xnli/>`_.
Args:
local_cache_path (str, optional): Path to store the data.
Defaults to "./".

15
utils_nlp/eval/SentEval/.gitignore vendored Normal file
View file

@ -0,0 +1,15 @@
# SentEval data and .pyc files
# python
__pycache__/
*.py[cod]
*$py.class
# log files
*.log
*.txt
# data files
data/senteval_data*

View file

@ -0,0 +1,30 @@
BSD License
For SentEval software
Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name Facebook nor the names of its contributors may be used to
endorse or promote products derived from this software without specific
prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

244
utils_nlp/eval/SentEval/README.md Executable file
View file

@ -0,0 +1,244 @@
# SentEval: evaluation toolkit for sentence embeddings
SentEval is a library for evaluating the quality of sentence embeddings. We assess their generalization power by using them as features on a broad and diverse set of "transfer" tasks. **SentEval currently includes 17 downstream tasks**. We also include a suite of **10 probing tasks** which evaluate what linguistic properties are encoded in sentence embeddings. Our goal is to ease the study and the development of general-purpose fixed-size sentence representations.
**(04/22) SentEval new tasks: Added probing tasks for evaluating what linguistic properties are encoded in sentence embeddings**
**(10/04) SentEval example scripts for three sentence encoders: [SkipThought-LN](https://github.com/ryankiros/layer-norm#skip-thoughts)/[GenSen](https://github.com/Maluuba/gensen)/[Google-USE](https://tfhub.dev/google/universal-sentence-encoder/1)**
## Dependencies
This code is written in Python. The dependencies are:
* Python 2/3 with [NumPy](http://www.numpy.org/)/[SciPy](http://www.scipy.org/)
* [Pytorch](http://pytorch.org/)>=0.4
* [scikit-learn](http://scikit-learn.org/stable/index.html)>=0.18.0
## Transfer tasks
### Downstream tasks
SentEval allows you to evaluate your sentence embeddings as features for the following *downstream* tasks:
| Task | Type | #train | #test | needs_train | set_classifier |
|---------- |------------------------------ |-----------:|----------:|:-----------:|:----------:|
| [MR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | movie review | 11k | 11k | 1 | 1 |
| [CR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | product review | 4k | 4k | 1 | 1 |
| [SUBJ](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | subjectivity status | 10k | 10k | 1 | 1 |
| [MPQA](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | opinion-polarity | 11k | 11k | 1 | 1 |
| [SST](https://nlp.stanford.edu/sentiment/index.html) | binary sentiment analysis | 67k | 1.8k | 1 | 1 |
| **[SST](https://nlp.stanford.edu/sentiment/index.html)** | **fine-grained sentiment analysis** | 8.5k | 2.2k | 1 | 1 |
| [TREC](http://cogcomp.cs.illinois.edu/Data/QA/QC/) | question-type classification | 6k | 0.5k | 1 | 1 |
| [SICK-E](http://clic.cimec.unitn.it/composes/sick.html) | natural language inference | 4.5k | 4.9k | 1 | 1 |
| [SNLI](https://nlp.stanford.edu/projects/snli/) | natural language inference | 550k | 9.8k | 1 | 1 |
| [MRPC](https://aclweb.org/aclwiki/Paraphrase_Identification_(State_of_the_art)) | paraphrase detection | 4.1k | 1.7k | 1 | 1 |
| [STS 2012](https://www.cs.york.ac.uk/semeval-2012/task6/) | semantic textual similarity | N/A | 3.1k | 0 | 0 |
| [STS 2013](http://ixa2.si.ehu.es/sts/) | semantic textual similarity | N/A | 1.5k | 0 | 0 |
| [STS 2014](http://alt.qcri.org/semeval2014/task10/) | semantic textual similarity | N/A | 3.7k | 0 | 0 |
| [STS 2015](http://alt.qcri.org/semeval2015/task2/) | semantic textual similarity | N/A | 8.5k | 0 | 0 |
| [STS 2016](http://alt.qcri.org/semeval2016/task1/) | semantic textual similarity | N/A | 9.2k | 0 | 0 |
| [STS B](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#Results) | semantic textual similarity | 5.7k | 1.4k | 1 | 0 |
| [SICK-R](http://clic.cimec.unitn.it/composes/sick.html) | semantic textual similarity | 4.5k | 4.9k | 1 | 0 |
| [COCO](http://mscoco.org/) | image-caption retrieval | 567k | 5*1k | 1 | 0 |
where **needs_train** means a model with parameters is learned on top of the sentence embeddings, and **set_classifier** means you can define the parameters of the classifier in the case of a classification task (see below).
Note: COCO comes with ResNet-101 2048d image embeddings. [More details on the tasks.](https://arxiv.org/pdf/1705.02364.pdf)
### Probing tasks
SentEval also includes a series of [*probing* tasks](https://github.com/facebookresearch/SentEval/tree/master/data/probing) to evaluate what linguistic properties are encoded in your sentence embeddings:
| Task | Type | #train | #test | needs_train | set_classifier |
|---------- |------------------------------ |-----------:|----------:|:-----------:|:----------:|
| [SentLen](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Length prediction | 100k | 10k | 1 | 1 |
| [WC](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Word Content analysis | 100k | 10k | 1 | 1 |
| [TreeDepth](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Tree depth prediction | 100k | 10k | 1 | 1 |
| [TopConst](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Top Constituents prediction | 100k | 10k | 1 | 1 |
| [BShift](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Word order analysis | 100k | 10k | 1 | 1 |
| [Tense](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Verb tense prediction | 100k | 10k | 1 | 1 |
| [SubjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Subject number prediction | 100k | 10k | 1 | 1 |
| [ObjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Object number prediction | 100k | 10k | 1 | 1 |
| [SOMO](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Semantic odd man out | 100k | 10k | 1 | 1 |
| [CoordInv](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Coordination Inversion | 100k | 10k | 1 | 1 |
## Download datasets
To get all the transfer tasks datasets, run (in data/downstream/):
```bash
./get_transfer_data.bash
```
This will automatically download and preprocess the downstream datasets, and store them in data/downstream (warning: for MacOS users, you may have to use p7zip instead of unzip). The probing tasks are already in data/probing by default.
## How to use SentEval: examples
### examples/bow.py
In examples/bow.py, we evaluate the quality of the average of word embeddings.
To download state-of-the-art GloVe or fastText embeddings:
```bash
curl -Lo glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
curl -Lo crawl-300d-2M.vec.zip https://s3-us-west-1.amazonaws.com/fasttext-vectors/crawl-300d-2M.vec.zip
```
To reproduce the results for bag-of-vectors, run (in examples/):
```bash
python bow.py
```
As required by SentEval, this script implements two functions: **prepare** (optional) and **batcher** (required) that turn text sentences into sentence embeddings. Then SentEval takes care of the evaluation on the transfer tasks using the embeddings as features.
### examples/infersent.py
To get the **[InferSent](https://www.github.com/facebookresearch/InferSent)** model and reproduce our results, download our best models and run infersent.py (in examples/):
```bash
curl -Lo examples/infersent1.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent1.pkl
curl -Lo examples/infersent2.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent2.pkl
```
### examples/skipthought.py - examples/gensen.py - examples/googleuse.py
We also provide example scripts for three other encoders:
* [SkipThought with Layer-Normalization](https://github.com/ryankiros/layer-norm#skip-thoughts) in Theano
* [GenSen encoder](https://github.com/Maluuba/gensen) in Pytorch
* [Google encoder](https://tfhub.dev/google/universal-sentence-encoder/1) in TensorFlow
Note that for SkipThought and GenSen, you need to follow the setup steps in the associated GitHub repositories.
The Google encoder script should work as-is.
## How to use SentEval
To evaluate your sentence embeddings, SentEval requires that you implement two functions:
1. **prepare** (sees the whole dataset of each task and can thus construct the word vocabulary, the dictionary of word vectors etc)
2. **batcher** (transforms a batch of text sentences into sentence embeddings)
### 1.) prepare(params, samples) (optional)
*batcher* only sees one batch at a time while the *samples* argument of *prepare* contains all the sentences of a task.
```
prepare(params, samples)
```
* *params*: senteval parameters.
* *samples*: list of all sentences from the transfer task.
* *output*: No output. Arguments stored in "params" can further be used by *batcher*.
*Example*: in bow.py, prepare is used to build the vocabulary of words and construct the *params.word_vec* dictionary of word vectors.
### 2.) batcher(params, batch)
```
batcher(params, batch)
```
* *params*: senteval parameters.
* *batch*: numpy array of text sentences (of size params.batch_size)
* *output*: numpy array of sentence embeddings (of size params.batch_size)
*Example*: in bow.py, batcher is used to compute the mean of the word vectors for each sentence in the batch using params.word_vec. Use your own encoder in that function to encode sentences.
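Concretely, a toy pair of these functions might look like the following sketch (random vectors stand in for a real encoder; the 128-dimensional embedding size is arbitrary):
```python
import numpy as np

def prepare(params, samples):
    # Nothing to precompute for this toy encoder; a real one might build a vocabulary here.
    return

def batcher(params, batch):
    # batch is a list of tokenized sentences; return one embedding per sentence.
    return np.random.rand(len(batch), 128)
```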
### 3.) evaluation on transfer tasks
After having implemented the batcher and prepare functions for your own sentence encoder,
1) to perform the actual evaluation, first import senteval and set its parameters:
```python
import senteval
params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
```
2) (optional) set the parameters of the classifier (when applicable):
```python
params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
'tenacity': 5, 'epoch_size': 4}
```
You can choose **nhid=0** (Logistic Regression) or **nhid>0** (MLP) and define the parameters for training.
3) Create an instance of the class SE:
```python
se = senteval.engine.SE(params, batcher, prepare)
```
4) define the set of transfer tasks and run the evaluation:
```python
transfer_tasks = ['MR', 'SICKEntailment', 'STS14', 'STSBenchmark']
results = se.eval(transfer_tasks)
```
The current list of available tasks is:
```python
['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SNLI',
'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'ImageCaptionRetrieval',
'STS12', 'STS13', 'STS14', 'STS15', 'STS16',
'Length', 'WordContent', 'Depth', 'TopConstituents','BigramShift', 'Tense',
'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion']
```
## SentEval parameters
Global parameters of SentEval:
```bash
# senteval parameters
task_path # path to SentEval datasets (required)
seed # seed
usepytorch # use cuda-pytorch (else scikit-learn) where possible
kfold # k-fold validation for MR/CR/SUB/MPQA.
```
Parameters of the classifier:
```bash
nhid: # number of hidden units (0: Logistic Regression, >0: MLP); Default nonlinearity: Tanh
optim: # optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..)
tenacity: # how many times dev acc does not increase before training stops
epoch_size: # each epoch corresponds to epoch_size pass on the train set
max_epoch: # max number of epochs
dropout: # dropout for MLP
```
Note that to get a proxy of the results while **dramatically reducing computation time**,
we suggest the **prototyping config**:
```python
params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
'tenacity': 3, 'epoch_size': 2}
```
which will result in a 5x speedup for classification tasks.
To produce results that are **comparable to the literature**, use the **default config**:
```python
params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
'tenacity': 5, 'epoch_size': 4}
```
which takes longer but will produce better and comparable results.
For probing tasks, we used an MLP with a Sigmoid nonlinearity and tuned nhid (in [50, 100, 200]) and dropout (in [0.0, 0.1, 0.2]) on the dev set.
## References
Please consider citing [[1]](https://arxiv.org/abs/1803.05449) if using this code for evaluating sentence embedding methods.
### SentEval: An Evaluation Toolkit for Universal Sentence Representations
[1] A. Conneau, D. Kiela, [*SentEval: An Evaluation Toolkit for Universal Sentence Representations*](https://arxiv.org/abs/1803.05449)
```
@article{conneau2018senteval,
title={SentEval: An Evaluation Toolkit for Universal Sentence Representations},
author={Conneau, Alexis and Kiela, Douwe},
journal={arXiv preprint arXiv:1803.05449},
year={2018}
}
```
Contact: [aconneau@fb.com](mailto:aconneau@fb.com), [dkiela@fb.com](mailto:dkiela@fb.com)
### Related work
* [J. R Kiros, Y. Zhu, R. Salakhutdinov, R. S. Zemel, A. Torralba, R. Urtasun, S. Fidler - SkipThought Vectors, NIPS 2015](https://arxiv.org/abs/1506.06726)
* [S. Arora, Y. Liang, T. Ma - A Simple but Tough-to-Beat Baseline for Sentence Embeddings, ICLR 2017](https://openreview.net/pdf?id=SyK00v5xx)
* [Y. Adi, E. Kermany, Y. Belinkov, O. Lavi, Y. Goldberg - Fine-grained analysis of sentence embeddings using auxiliary prediction tasks, ICLR 2017](https://arxiv.org/abs/1608.04207)
* [A. Conneau, D. Kiela, L. Barrault, H. Schwenk, A. Bordes - Supervised Learning of Universal Sentence Representations from Natural Language Inference Data, EMNLP 2017](https://arxiv.org/abs/1705.02364)
* [S. Subramanian, A. Trischler, Y. Bengio, C. J Pal - Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning, ICLR 2018](https://arxiv.org/abs/1804.00079)
* [A. Nie, E. D. Bennett, N. D. Goodman - DisSent: Sentence Representation Learning from Explicit Discourse Relations, 2018](https://arxiv.org/abs/1710.04334)
* [D. Cer, Y. Yang, S. Kong, N. Hua, N. Limtiaco, R. St. John, N. Constant, M. Guajardo-Cespedes, S. Yuan, C. Tar, Y. Sung, B. Strope, R. Kurzweil - Universal Sentence Encoder, 2018](https://arxiv.org/abs/1803.11175)
* [A. Conneau, G. Kruszewski, G. Lample, L. Barrault, M. Baroni - What you can cram into a single vector: Probing sentence embeddings for linguistic properties, ACL 2018](https://arxiv.org/abs/1805.01070)

View file

@ -0,0 +1,10 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
from __future__ import absolute_import
from senteval.engine import SE

View file

@ -0,0 +1,92 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
Binary classifier and corresponding datasets : MR, CR, SUBJ, MPQA
'''
from __future__ import absolute_import, division, unicode_literals
import io
import os
import numpy as np
import logging
from senteval.tools.validation import InnerKFoldClassifier
class BinaryClassifierEval(object):
def __init__(self, pos, neg, seed=1111):
self.seed = seed
self.samples, self.labels = pos + neg, [1] * len(pos) + [0] * len(neg)
self.n_samples = len(self.samples)
def do_prepare(self, params, prepare):
# prepare is given the whole text
return prepare(params, self.samples)
# prepare puts everything it outputs in "params" : params.word2id etc
# Those output will be further used by "batcher".
def loadFile(self, fpath):
with io.open(fpath, 'r', encoding='latin-1') as f:
return [line.split() for line in f.read().splitlines()]
def run(self, params, batcher):
enc_input = []
# Sort to reduce padding
sorted_corpus = sorted(zip(self.samples, self.labels),
key=lambda z: (len(z[0]), z[1]))
sorted_samples = [x for (x, y) in sorted_corpus]
sorted_labels = [y for (x, y) in sorted_corpus]
logging.info('Generating sentence embeddings')
for ii in range(0, self.n_samples, params.batch_size):
batch = sorted_samples[ii:ii + params.batch_size]
embeddings = batcher(params, batch)
enc_input.append(embeddings)
enc_input = np.vstack(enc_input)
logging.info('Generated sentence embeddings')
config = {'nclasses': 2, 'seed': self.seed,
'usepytorch': params.usepytorch,
'classifier': params.classifier,
'nhid': params.nhid, 'kfold': params.kfold}
clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config)
devacc, testacc = clf.run()
logging.debug('Dev acc : {0} Test acc : {1}\n'.format(devacc, testacc))
return {'devacc': devacc, 'acc': testacc, 'ndev': self.n_samples,
'ntest': self.n_samples}
class CREval(BinaryClassifierEval):
def __init__(self, task_path, seed=1111):
logging.debug('***** Transfer task : CR *****\n\n')
pos = self.loadFile(os.path.join(task_path, 'custrev.pos'))
neg = self.loadFile(os.path.join(task_path, 'custrev.neg'))
super(self.__class__, self).__init__(pos, neg, seed)
class MREval(BinaryClassifierEval):
def __init__(self, task_path, seed=1111):
logging.debug('***** Transfer task : MR *****\n\n')
pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos'))
neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg'))
super(self.__class__, self).__init__(pos, neg, seed)
class SUBJEval(BinaryClassifierEval):
def __init__(self, task_path, seed=1111):
logging.debug('***** Transfer task : SUBJ *****\n\n')
obj = self.loadFile(os.path.join(task_path, 'subj.objective'))
subj = self.loadFile(os.path.join(task_path, 'subj.subjective'))
super(self.__class__, self).__init__(obj, subj, seed)
class MPQAEval(BinaryClassifierEval):
def __init__(self, task_path, seed=1111):
logging.debug('***** Transfer task : MPQA *****\n\n')
pos = self.loadFile(os.path.join(task_path, 'mpqa.pos'))
neg = self.loadFile(os.path.join(task_path, 'mpqa.neg'))
super(self.__class__, self).__init__(pos, neg, seed)

View file

@ -0,0 +1,123 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
Generic sentence evaluation scripts wrapper
'''
from __future__ import absolute_import, division, unicode_literals
from senteval import utils
from senteval.binary import CREval, MREval, MPQAEval, SUBJEval
from senteval.snli import SNLIEval
from senteval.trec import TRECEval
from senteval.sick import SICKRelatednessEval, SICKEntailmentEval
from senteval.mrpc import MRPCEval
from senteval.sts import STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBenchmarkEval
from senteval.sst import SSTEval
from senteval.rank import ImageCaptionRetrievalEval
from senteval.probing import *
class SE(object):
def __init__(self, params, batcher, prepare=None):
# parameters
params = utils.dotdict(params)
params.usepytorch = True if 'usepytorch' not in params else params.usepytorch
params.seed = 1111 if 'seed' not in params else params.seed
params.batch_size = 128 if 'batch_size' not in params else params.batch_size
params.nhid = 0 if 'nhid' not in params else params.nhid
params.kfold = 5 if 'kfold' not in params else params.kfold
if 'classifier' not in params or not params['classifier']:
params.classifier = {'nhid': 0}
assert 'nhid' in params.classifier, 'Set number of hidden units in classifier config!!'
self.params = params
# batcher and prepare
self.batcher = batcher
self.prepare = prepare if prepare else lambda x, y: None
self.list_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
'SICKRelatedness', 'SICKEntailment', 'STSBenchmark',
'SNLI', 'ImageCaptionRetrieval', 'STS12', 'STS13',
'STS14', 'STS15', 'STS16',
'Length', 'WordContent', 'Depth', 'TopConstituents',
'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
'OddManOut', 'CoordinationInversion']
def eval(self, name):
# evaluate on evaluation [name], either takes string or list of strings
if (isinstance(name, list)):
self.results = {x: self.eval(x) for x in name}
return self.results
tpath = self.params.task_path
assert name in self.list_tasks, str(name) + ' not in ' + str(self.list_tasks)
# Original SentEval tasks
if name == 'CR':
self.evaluation = CREval(tpath + '/downstream/CR', seed=self.params.seed)
elif name == 'MR':
self.evaluation = MREval(tpath + '/downstream/MR', seed=self.params.seed)
elif name == 'MPQA':
self.evaluation = MPQAEval(tpath + '/downstream/MPQA', seed=self.params.seed)
elif name == 'SUBJ':
self.evaluation = SUBJEval(tpath + '/downstream/SUBJ', seed=self.params.seed)
elif name == 'SST2':
self.evaluation = SSTEval(tpath + '/downstream/SST/binary', nclasses=2, seed=self.params.seed)
elif name == 'SST5':
self.evaluation = SSTEval(tpath + '/downstream/SST/fine', nclasses=5, seed=self.params.seed)
elif name == 'TREC':
self.evaluation = TRECEval(tpath + '/downstream/TREC', seed=self.params.seed)
elif name == 'MRPC':
self.evaluation = MRPCEval(tpath + '/downstream/MRPC', seed=self.params.seed)
elif name == 'SICKRelatedness':
self.evaluation = SICKRelatednessEval(tpath + '/downstream/SICK', seed=self.params.seed)
elif name == 'STSBenchmark':
self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed)
elif name == 'SICKEntailment':
self.evaluation = SICKEntailmentEval(tpath + '/downstream/SICK', seed=self.params.seed)
elif name == 'SNLI':
self.evaluation = SNLIEval(tpath + '/downstream/SNLI', seed=self.params.seed)
elif name in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']:
fpath = name + '-en-test'
self.evaluation = eval(name + 'Eval')(tpath + '/downstream/STS/' + fpath, seed=self.params.seed)
elif name == 'ImageCaptionRetrieval':
self.evaluation = ImageCaptionRetrievalEval(tpath + '/downstream/COCO', seed=self.params.seed)
# Probing Tasks
elif name == 'Length':
self.evaluation = LengthEval(tpath + '/probing', seed=self.params.seed)
elif name == 'WordContent':
self.evaluation = WordContentEval(tpath + '/probing', seed=self.params.seed)
elif name == 'Depth':
self.evaluation = DepthEval(tpath + '/probing', seed=self.params.seed)
elif name == 'TopConstituents':
self.evaluation = TopConstituentsEval(tpath + '/probing', seed=self.params.seed)
elif name == 'BigramShift':
self.evaluation = BigramShiftEval(tpath + '/probing', seed=self.params.seed)
elif name == 'Tense':
self.evaluation = TenseEval(tpath + '/probing', seed=self.params.seed)
elif name == 'SubjNumber':
self.evaluation = SubjNumberEval(tpath + '/probing', seed=self.params.seed)
elif name == 'ObjNumber':
self.evaluation = ObjNumberEval(tpath + '/probing', seed=self.params.seed)
elif name == 'OddManOut':
self.evaluation = OddManOutEval(tpath + '/probing', seed=self.params.seed)
elif name == 'CoordinationInversion':
self.evaluation = CoordinationInversionEval(tpath + '/probing', seed=self.params.seed)
self.params.current_task = name
self.evaluation.do_prepare(self.params, self.prepare)
self.results = self.evaluation.run(self.params, self.batcher)
return self.results

View file

@ -0,0 +1,104 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
MRPC : Microsoft Research Paraphrase (detection) Corpus
'''
from __future__ import absolute_import, division, unicode_literals
import os
import logging
import numpy as np
import io
from senteval.tools.validation import KFoldClassifier
from sklearn.metrics import f1_score
class MRPCEval(object):
def __init__(self, task_path, seed=1111):
logging.info('***** Transfer task : MRPC *****\n\n')
self.seed = seed
train = self.loadFile(os.path.join(task_path,
'msr_paraphrase_train.txt'))
test = self.loadFile(os.path.join(task_path,
'msr_paraphrase_test.txt'))
self.mrpc_data = {'train': train, 'test': test}
def do_prepare(self, params, prepare):
# TODO : Should we separate samples in "train, test"?
samples = self.mrpc_data['train']['X_A'] + \
self.mrpc_data['train']['X_B'] + \
self.mrpc_data['test']['X_A'] + self.mrpc_data['test']['X_B']
return prepare(params, samples)
def loadFile(self, fpath):
mrpc_data = {'X_A': [], 'X_B': [], 'y': []}
with io.open(fpath, 'r', encoding='utf-8') as f:
for line in f:
text = line.strip().split('\t')
mrpc_data['X_A'].append(text[3].split())
mrpc_data['X_B'].append(text[4].split())
mrpc_data['y'].append(text[0])
mrpc_data['X_A'] = mrpc_data['X_A'][1:]
mrpc_data['X_B'] = mrpc_data['X_B'][1:]
mrpc_data['y'] = [int(s) for s in mrpc_data['y'][1:]]
return mrpc_data
def run(self, params, batcher):
mrpc_embed = {'train': {}, 'test': {}}
for key in self.mrpc_data:
logging.info('Computing embedding for {0}'.format(key))
# Sort to reduce padding
text_data = {}
sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'],
self.mrpc_data[key]['X_B'],
self.mrpc_data[key]['y']),
key=lambda z: (len(z[0]), len(z[1]), z[2]))
text_data['A'] = [x for (x, y, z) in sorted_corpus]
text_data['B'] = [y for (x, y, z) in sorted_corpus]
text_data['y'] = [z for (x, y, z) in sorted_corpus]
for txt_type in ['A', 'B']:
mrpc_embed[key][txt_type] = []
for ii in range(0, len(text_data['y']), params.batch_size):
batch = text_data[txt_type][ii:ii + params.batch_size]
embeddings = batcher(params, batch)
mrpc_embed[key][txt_type].append(embeddings)
mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type])
mrpc_embed[key]['y'] = np.array(text_data['y'])
logging.info('Computed {0} embeddings'.format(key))
# Train
trainA = mrpc_embed['train']['A']
trainB = mrpc_embed['train']['B']
trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
trainY = mrpc_embed['train']['y']
# Test
testA = mrpc_embed['test']['A']
testB = mrpc_embed['test']['B']
testF = np.c_[np.abs(testA - testB), testA * testB]
testY = mrpc_embed['test']['y']
config = {'nclasses': 2, 'seed': self.seed,
'usepytorch': params.usepytorch,
'classifier': params.classifier,
'nhid': params.nhid, 'kfold': params.kfold}
clf = KFoldClassifier(train={'X': trainF, 'y': trainY},
test={'X': testF, 'y': testY}, config=config)
devacc, testacc, yhat = clf.run()
testf1 = round(100*f1_score(testY, yhat), 2)
logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'
.format(devacc, testacc, testf1))
return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
'ndev': len(trainA), 'ntest': len(testA)}

View file

@ -0,0 +1,171 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
probing tasks
'''
from __future__ import absolute_import, division, unicode_literals
import os
import io
import copy
import logging
import numpy as np
from senteval.tools.validation import SplitClassifier
class PROBINGEval(object):
def __init__(self, task, task_path, seed=1111):
self.seed = seed
self.task = task
logging.debug('***** (Probing) Transfer task : %s classification *****', self.task.upper())
self.task_data = {'train': {'X': [], 'y': []},
'dev': {'X': [], 'y': []},
'test': {'X': [], 'y': []}}
self.loadFile(task_path)
logging.info('Loaded %s train - %s dev - %s test for %s' %
(len(self.task_data['train']['y']), len(self.task_data['dev']['y']),
len(self.task_data['test']['y']), self.task))
def do_prepare(self, params, prepare):
samples = self.task_data['train']['X'] + self.task_data['dev']['X'] + \
self.task_data['test']['X']
return prepare(params, samples)
def loadFile(self, fpath):
self.tok2split = {'tr': 'train', 'va': 'dev', 'te': 'test'}
with io.open(fpath, 'r', encoding='utf-8') as f:
for line in f:
line = line.rstrip().split('\t')
self.task_data[self.tok2split[line[0]]]['X'].append(line[-1].split())
self.task_data[self.tok2split[line[0]]]['y'].append(line[1])
labels = sorted(np.unique(self.task_data['train']['y']))
self.tok2label = dict(zip(labels, range(len(labels))))
self.nclasses = len(self.tok2label)
for split in self.task_data:
for i, y in enumerate(self.task_data[split]['y']):
self.task_data[split]['y'][i] = self.tok2label[y]
def run(self, params, batcher):
task_embed = {'train': {}, 'dev': {}, 'test': {}}
bsize = params.batch_size
logging.info('Computing embeddings for train/dev/test')
for key in self.task_data:
# Sort to reduce padding
sorted_data = sorted(zip(self.task_data[key]['X'],
self.task_data[key]['y']),
key=lambda z: (len(z[0]), z[1]))
self.task_data[key]['X'], self.task_data[key]['y'] = map(list, zip(*sorted_data))
task_embed[key]['X'] = []
for ii in range(0, len(self.task_data[key]['y']), bsize):
batch = self.task_data[key]['X'][ii:ii + bsize]
embeddings = batcher(params, batch)
task_embed[key]['X'].append(embeddings)
task_embed[key]['X'] = np.vstack(task_embed[key]['X'])
task_embed[key]['y'] = np.array(self.task_data[key]['y'])
logging.info('Computed embeddings')
config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
'usepytorch': params.usepytorch,
'classifier': params.classifier}
if self.task == "WordContent" and params.classifier['nhid'] > 0:
config_classifier = copy.deepcopy(config_classifier)
config_classifier['classifier']['nhid'] = 0
print(params.classifier['nhid'])
clf = SplitClassifier(X={'train': task_embed['train']['X'],
'valid': task_embed['dev']['X'],
'test': task_embed['test']['X']},
y={'train': task_embed['train']['y'],
'valid': task_embed['dev']['y'],
'test': task_embed['test']['y']},
config=config_classifier)
devacc, testacc = clf.run()
logging.debug('\nDev acc : %.1f Test acc : %.1f for %s classification\n' % (devacc, testacc, self.task.upper()))
return {'devacc': devacc, 'acc': testacc,
'ndev': len(task_embed['dev']['X']),
'ntest': len(task_embed['test']['X'])}
"""
Surface Information
"""
class LengthEval(PROBINGEval):
def __init__(self, task_path, seed=1111):
task_path = os.path.join(task_path, 'sentence_length.txt')
# labels: bins
PROBINGEval.__init__(self, 'Length', task_path, seed)
class WordContentEval(PROBINGEval):
def __init__(self, task_path, seed=1111):
task_path = os.path.join(task_path, 'word_content.txt')
# labels: 200 target words
PROBINGEval.__init__(self, 'WordContent', task_path, seed)
"""
Latent Structural Information
"""
class DepthEval(PROBINGEval):
def __init__(self, task_path, seed=1111):
task_path = os.path.join(task_path, 'tree_depth.txt')
# labels: bins
PROBINGEval.__init__(self, 'Depth', task_path, seed)
class TopConstituentsEval(PROBINGEval):
def __init__(self, task_path, seed=1111):
task_path = os.path.join(task_path, 'top_constituents.txt')
# labels: 'PP_NP_VP_.' .. (20 classes)
PROBINGEval.__init__(self, 'TopConstituents', task_path, seed)
class BigramShiftEval(PROBINGEval):
def __init__(self, task_path, seed=1111):
task_path = os.path.join(task_path, 'bigram_shift.txt')
# labels: 0 or 1
PROBINGEval.__init__(self, 'BigramShift', task_path, seed)
# TODO: Voice?
"""
Latent Semantic Information
"""
class TenseEval(PROBINGEval):
def __init__(self, task_path, seed=1111):
task_path = os.path.join(task_path, 'past_present.txt')
# labels: 'PRES', 'PAST'
PROBINGEval.__init__(self, 'Tense', task_path, seed)
class SubjNumberEval(PROBINGEval):
def __init__(self, task_path, seed=1111):
task_path = os.path.join(task_path, 'subj_number.txt')
# labels: 'NN', 'NNS'
PROBINGEval.__init__(self, 'SubjNumber', task_path, seed)
class ObjNumberEval(PROBINGEval):
def __init__(self, task_path, seed=1111):
task_path = os.path.join(task_path, 'obj_number.txt')
# labels: 'NN', 'NNS'
PROBINGEval.__init__(self, 'ObjNumber', task_path, seed)
class OddManOutEval(PROBINGEval):
def __init__(self, task_path, seed=1111):
task_path = os.path.join(task_path, 'odd_man_out.txt')
# labels: 'O', 'C'
PROBINGEval.__init__(self, 'OddManOut', task_path, seed)
class CoordinationInversionEval(PROBINGEval):
def __init__(self, task_path, seed=1111):
task_path = os.path.join(task_path, 'coordination_inversion.txt')
# labels: 'O', 'I'
PROBINGEval.__init__(self, 'CoordinationInversion', task_path, seed)

View file

@ -0,0 +1,108 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
Image-Caption Retrieval with COCO dataset
'''
from __future__ import absolute_import, division, unicode_literals
import os
import sys
import logging
import numpy as np
try:
import cPickle as pickle
except ImportError:
import pickle
from senteval.tools.ranking import ImageSentenceRankingPytorch
class ImageCaptionRetrievalEval(object):
def __init__(self, task_path, seed=1111):
logging.debug('***** Transfer task: Image Caption Retrieval *****\n\n')
# Get captions and image features
self.seed = seed
train, dev, test = self.loadFile(task_path)
self.coco_data = {'train': train, 'dev': dev, 'test': test}
def do_prepare(self, params, prepare):
samples = self.coco_data['train']['sent'] + \
self.coco_data['dev']['sent'] + \
self.coco_data['test']['sent']
prepare(params, samples)
def loadFile(self, fpath):
coco = {}
for split in ['train', 'valid', 'test']:
list_sent = []
list_img_feat = []
if sys.version_info < (3, 0):
with open(os.path.join(fpath, split + '.pkl')) as f:
cocodata = pickle.load(f)
else:
with open(os.path.join(fpath, split + '.pkl'), 'rb') as f:
cocodata = pickle.load(f, encoding='latin1')
for imgkey in range(len(cocodata['features'])):
assert len(cocodata['image_to_caption_ids'][imgkey]) >= 5, \
cocodata['image_to_caption_ids'][imgkey]
for captkey in cocodata['image_to_caption_ids'][imgkey][0:5]:
sent = cocodata['captions'][captkey]['cleaned_caption']
sent += ' .' # add punctuation to end of sentence in COCO
list_sent.append(sent.encode('utf-8').split())
list_img_feat.append(cocodata['features'][imgkey])
assert len(list_sent) == len(list_img_feat) and \
len(list_sent) % 5 == 0
list_img_feat = np.array(list_img_feat).astype('float32')
coco[split] = {'sent': list_sent, 'imgfeat': list_img_feat}
return coco['train'], coco['valid'], coco['test']
def run(self, params, batcher):
coco_embed = {'train': {'sentfeat': [], 'imgfeat': []},
'dev': {'sentfeat': [], 'imgfeat': []},
'test': {'sentfeat': [], 'imgfeat': []}}
for key in self.coco_data:
logging.info('Computing embedding for {0}'.format(key))
# Sort to reduce padding
self.coco_data[key]['sent'] = np.array(self.coco_data[key]['sent'])
self.coco_data[key]['sent'], idx_sort = np.sort(self.coco_data[key]['sent']), np.argsort(self.coco_data[key]['sent'])
idx_unsort = np.argsort(idx_sort)
coco_embed[key]['X'] = []
nsent = len(self.coco_data[key]['sent'])
for ii in range(0, nsent, params.batch_size):
batch = self.coco_data[key]['sent'][ii:ii + params.batch_size]
embeddings = batcher(params, batch)
coco_embed[key]['sentfeat'].append(embeddings)
coco_embed[key]['sentfeat'] = np.vstack(coco_embed[key]['sentfeat'])[idx_unsort]
coco_embed[key]['imgfeat'] = np.array(self.coco_data[key]['imgfeat'])
logging.info('Computed {0} embeddings'.format(key))
config = {'seed': self.seed, 'projdim': 1000, 'margin': 0.2}
clf = ImageSentenceRankingPytorch(train=coco_embed['train'],
valid=coco_embed['dev'],
test=coco_embed['test'],
config=config)
bestdevscore, r1_i2t, r5_i2t, r10_i2t, medr_i2t, \
r1_t2i, r5_t2i, r10_t2i, medr_t2i = clf.run()
logging.debug("\nTest scores | Image to text: \
{0}, {1}, {2}, {3}".format(r1_i2t, r5_i2t, r10_i2t, medr_i2t))
logging.debug("Test scores | Text to image: \
{0}, {1}, {2}, {3}\n".format(r1_t2i, r5_t2i, r10_t2i, medr_t2i))
return {'devacc': bestdevscore,
'acc': [(r1_i2t, r5_i2t, r10_i2t, medr_i2t),
(r1_t2i, r5_t2i, r10_t2i, medr_t2i)],
'ndev': len(coco_embed['dev']['sentfeat']),
'ntest': len(coco_embed['test']['sentfeat'])}

View file

@ -0,0 +1,217 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
SICK Relatedness and Entailment
'''
from __future__ import absolute_import, division, unicode_literals
import os
import io
import logging
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr
from senteval.tools.relatedness import RelatednessPytorch
from senteval.tools.validation import SplitClassifier
class SICKRelatednessEval(object):
def __init__(self, task_path, seed=1111):
logging.debug('***** Transfer task : SICK-Relatedness*****\n\n')
self.seed = seed
train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))
dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))
test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))
self.sick_data = {'train': train, 'dev': dev, 'test': test}
def do_prepare(self, params, prepare):
samples = self.sick_data['train']['X_A'] + \
self.sick_data['train']['X_B'] + \
self.sick_data['dev']['X_A'] + \
self.sick_data['dev']['X_B'] + \
self.sick_data['test']['X_A'] + self.sick_data['test']['X_B']
return prepare(params, samples)
def loadFile(self, fpath):
skipFirstLine = True
sick_data = {'X_A': [], 'X_B': [], 'y': []}
with io.open(fpath, 'r', encoding='utf-8') as f:
for line in f:
if skipFirstLine:
skipFirstLine = False
else:
text = line.strip().split('\t')
sick_data['X_A'].append(text[1].split())
sick_data['X_B'].append(text[2].split())
sick_data['y'].append(text[3])
sick_data['y'] = [float(s) for s in sick_data['y']]
return sick_data
def run(self, params, batcher):
sick_embed = {'train': {}, 'dev': {}, 'test': {}}
bsize = params.batch_size
for key in self.sick_data:
logging.info('Computing embedding for {0}'.format(key))
# Sort to reduce padding
sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],
self.sick_data[key]['X_B'],
self.sick_data[key]['y']),
key=lambda z: (len(z[0]), len(z[1]), z[2]))
self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus]
self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus]
self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus]
for txt_type in ['X_A', 'X_B']:
sick_embed[key][txt_type] = []
for ii in range(0, len(self.sick_data[key]['y']), bsize):
batch = self.sick_data[key][txt_type][ii:ii + bsize]
embeddings = batcher(params, batch)
sick_embed[key][txt_type].append(embeddings)
sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])
sick_embed[key]['y'] = np.array(self.sick_data[key]['y'])
logging.info('Computed {0} embeddings'.format(key))
# Train
trainA = sick_embed['train']['X_A']
trainB = sick_embed['train']['X_B']
trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
trainY = self.encode_labels(self.sick_data['train']['y'])
# Dev
devA = sick_embed['dev']['X_A']
devB = sick_embed['dev']['X_B']
devF = np.c_[np.abs(devA - devB), devA * devB]
devY = self.encode_labels(self.sick_data['dev']['y'])
# Test
testA = sick_embed['test']['X_A']
testB = sick_embed['test']['X_B']
testF = np.c_[np.abs(testA - testB), testA * testB]
testY = self.encode_labels(self.sick_data['test']['y'])
config = {'seed': self.seed, 'nclasses': 5}
clf = RelatednessPytorch(train={'X': trainF, 'y': trainY},
valid={'X': devF, 'y': devY},
test={'X': testF, 'y': testY},
devscores=self.sick_data['dev']['y'],
config=config)
devpr, yhat = clf.run()
pr = pearsonr(yhat, self.sick_data['test']['y'])[0]
sr = spearmanr(yhat, self.sick_data['test']['y'])[0]
pr = 0 if pr != pr else pr
sr = 0 if sr != sr else sr
se = mean_squared_error(yhat, self.sick_data['test']['y'])
logging.debug('Dev : Pearson {0}'.format(devpr))
logging.debug('Test : Pearson {0} Spearman {1} MSE {2} \
for SICK Relatedness\n'.format(pr, sr, se))
return {'devpearson': devpr, 'pearson': pr, 'spearman': sr, 'mse': se,
'yhat': yhat, 'ndev': len(devA), 'ntest': len(testA)}
def encode_labels(self, labels, nclass=5):
"""
Label encoding from Tree LSTM paper (Tai, Socher, Manning)
"""
Y = np.zeros((len(labels), nclass)).astype('float32')
for j, y in enumerate(labels):
for i in range(nclass):
if i+1 == np.floor(y) + 1:
Y[j, i] = y - np.floor(y)
if i+1 == np.floor(y):
Y[j, i] = np.floor(y) - y + 1
return Y
class SICKEntailmentEval(SICKRelatednessEval):
def __init__(self, task_path, seed=1111):
logging.debug('***** Transfer task : SICK-Entailment*****\n\n')
self.seed = seed
train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))
dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))
test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))
self.sick_data = {'train': train, 'dev': dev, 'test': test}
def loadFile(self, fpath):
label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2}
skipFirstLine = True
sick_data = {'X_A': [], 'X_B': [], 'y': []}
with io.open(fpath, 'r', encoding='utf-8') as f:
for line in f:
if skipFirstLine:
skipFirstLine = False
else:
text = line.strip().split('\t')
sick_data['X_A'].append(text[1].split())
sick_data['X_B'].append(text[2].split())
sick_data['y'].append(text[4])
sick_data['y'] = [label2id[s] for s in sick_data['y']]
return sick_data
def run(self, params, batcher):
sick_embed = {'train': {}, 'dev': {}, 'test': {}}
bsize = params.batch_size
for key in self.sick_data:
logging.info('Computing embedding for {0}'.format(key))
# Sort to reduce padding
sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],
self.sick_data[key]['X_B'],
self.sick_data[key]['y']),
key=lambda z: (len(z[0]), len(z[1]), z[2]))
self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus]
self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus]
self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus]
for txt_type in ['X_A', 'X_B']:
sick_embed[key][txt_type] = []
for ii in range(0, len(self.sick_data[key]['y']), bsize):
batch = self.sick_data[key][txt_type][ii:ii + bsize]
embeddings = batcher(params, batch)
sick_embed[key][txt_type].append(embeddings)
sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])
logging.info('Computed {0} embeddings'.format(key))
# Train
trainA = sick_embed['train']['X_A']
trainB = sick_embed['train']['X_B']
trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
trainY = np.array(self.sick_data['train']['y'])
# Dev
devA = sick_embed['dev']['X_A']
devB = sick_embed['dev']['X_B']
devF = np.c_[np.abs(devA - devB), devA * devB]
devY = np.array(self.sick_data['dev']['y'])
# Test
testA = sick_embed['test']['X_A']
testB = sick_embed['test']['X_B']
testF = np.c_[np.abs(testA - testB), testA * testB]
testY = np.array(self.sick_data['test']['y'])
config = {'nclasses': 3, 'seed': self.seed,
'usepytorch': params.usepytorch,
'classifier': params.classifier,
'nhid': params.nhid}
clf = SplitClassifier(X={'train': trainF, 'valid': devF, 'test': testF},
y={'train': trainY, 'valid': devY, 'test': testY},
config=config)
devacc, testacc = clf.run()
logging.debug('\nDev acc : {0} Test acc : {1} for \
SICK entailment\n'.format(devacc, testacc))
return {'devacc': devacc, 'acc': testacc,
'ndev': len(devA), 'ntest': len(testA)}


@@ -0,0 +1,113 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
SNLI - Entailment
'''
from __future__ import absolute_import, division, unicode_literals
import codecs
import os
import io
import copy
import logging
import numpy as np
from senteval.tools.validation import SplitClassifier
class SNLIEval(object):
def __init__(self, taskpath, seed=1111):
logging.debug('***** Transfer task : SNLI Entailment*****\n\n')
self.seed = seed
train1 = self.loadFile(os.path.join(taskpath, 's1.train'))
train2 = self.loadFile(os.path.join(taskpath, 's2.train'))
trainlabels = io.open(os.path.join(taskpath, 'labels.train'),
encoding='utf-8').read().splitlines()
valid1 = self.loadFile(os.path.join(taskpath, 's1.dev'))
valid2 = self.loadFile(os.path.join(taskpath, 's2.dev'))
validlabels = io.open(os.path.join(taskpath, 'labels.dev'),
encoding='utf-8').read().splitlines()
test1 = self.loadFile(os.path.join(taskpath, 's1.test'))
test2 = self.loadFile(os.path.join(taskpath, 's2.test'))
testlabels = io.open(os.path.join(taskpath, 'labels.test'),
encoding='utf-8').read().splitlines()
# sort data (by s2 first) to reduce padding
sorted_train = sorted(zip(train2, train1, trainlabels),
key=lambda z: (len(z[0]), len(z[1]), z[2]))
train2, train1, trainlabels = map(list, zip(*sorted_train))
sorted_valid = sorted(zip(valid2, valid1, validlabels),
key=lambda z: (len(z[0]), len(z[1]), z[2]))
valid2, valid1, validlabels = map(list, zip(*sorted_valid))
sorted_test = sorted(zip(test2, test1, testlabels),
key=lambda z: (len(z[0]), len(z[1]), z[2]))
test2, test1, testlabels = map(list, zip(*sorted_test))
self.samples = train1 + train2 + valid1 + valid2 + test1 + test2
self.data = {'train': (train1, train2, trainlabels),
'valid': (valid1, valid2, validlabels),
'test': (test1, test2, testlabels)
}
def do_prepare(self, params, prepare):
return prepare(params, self.samples)
def loadFile(self, fpath):
with codecs.open(fpath, 'rb', 'latin-1') as f:
return [line.split() for line in
f.read().splitlines()]
def run(self, params, batcher):
self.X, self.y = {}, {}
dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
for key in self.data:
if key not in self.X:
self.X[key] = []
if key not in self.y:
self.y[key] = []
input1, input2, mylabels = self.data[key]
enc_input = []
n_labels = len(mylabels)
for ii in range(0, n_labels, params.batch_size):
batch1 = input1[ii:ii + params.batch_size]
batch2 = input2[ii:ii + params.batch_size]
if len(batch1) == len(batch2) and len(batch1) > 0:
enc1 = batcher(params, batch1)
enc2 = batcher(params, batch2)
enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
np.abs(enc1 - enc2))))
if ii % 20000 == 0:
logging.info("PROGRESS (encoding): %.2f%%" %
(100 * ii / n_labels))
self.X[key] = np.vstack(enc_input)
self.y[key] = [dico_label[y] for y in mylabels]
config = {'nclasses': 3, 'seed': self.seed,
'usepytorch': params.usepytorch,
'cudaEfficient': True,
'nhid': params.nhid, 'noreg': True}
config_classifier = copy.deepcopy(params.classifier)
config_classifier['max_epoch'] = 15
config_classifier['epoch_size'] = 1
config['classifier'] = config_classifier
clf = SplitClassifier(self.X, self.y, config)
devacc, testacc = clf.run()
logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'
.format(devacc, testacc))
return {'devacc': devacc, 'acc': testacc,
'ndev': len(self.data['valid'][0]),
'ntest': len(self.data['test'][0])}


@@ -0,0 +1,96 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
SST - binary classification
'''
from __future__ import absolute_import, division, unicode_literals
import os
import io
import logging
import numpy as np
from senteval.tools.validation import SplitClassifier
class SSTEval(object):
def __init__(self, task_path, nclasses=2, seed=1111):
self.seed = seed
# binary or fine-grained
assert nclasses in [2, 5]
self.nclasses = nclasses
self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained'
logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name)
train = self.loadFile(os.path.join(task_path, 'sentiment-train'))
dev = self.loadFile(os.path.join(task_path, 'sentiment-dev'))
test = self.loadFile(os.path.join(task_path, 'sentiment-test'))
self.sst_data = {'train': train, 'dev': dev, 'test': test}
def do_prepare(self, params, prepare):
samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \
self.sst_data['test']['X']
return prepare(params, samples)
def loadFile(self, fpath):
sst_data = {'X': [], 'y': []}
with io.open(fpath, 'r', encoding='utf-8') as f:
for line in f:
if self.nclasses == 2:
sample = line.strip().split('\t')
sst_data['y'].append(int(sample[1]))
sst_data['X'].append(sample[0].split())
elif self.nclasses == 5:
sample = line.strip().split(' ', 1)
sst_data['y'].append(int(sample[0]))
sst_data['X'].append(sample[1].split())
assert max(sst_data['y']) == self.nclasses - 1
return sst_data
def run(self, params, batcher):
sst_embed = {'train': {}, 'dev': {}, 'test': {}}
bsize = params.batch_size
for key in self.sst_data:
logging.info('Computing embedding for {0}'.format(key))
# Sort to reduce padding
sorted_data = sorted(zip(self.sst_data[key]['X'],
self.sst_data[key]['y']),
key=lambda z: (len(z[0]), z[1]))
self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data))
sst_embed[key]['X'] = []
for ii in range(0, len(self.sst_data[key]['y']), bsize):
batch = self.sst_data[key]['X'][ii:ii + bsize]
embeddings = batcher(params, batch)
sst_embed[key]['X'].append(embeddings)
sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
logging.info('Computed {0} embeddings'.format(key))
config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
'usepytorch': params.usepytorch,
'classifier': params.classifier}
clf = SplitClassifier(X={'train': sst_embed['train']['X'],
'valid': sst_embed['dev']['X'],
'test': sst_embed['test']['X']},
y={'train': sst_embed['train']['y'],
'valid': sst_embed['dev']['y'],
'test': sst_embed['test']['y']},
config=config_classifier)
devacc, testacc = clf.run()
logging.debug('\nDev acc : {0} Test acc : {1} for \
SST {2} classification\n'.format(devacc, testacc, self.task_name))
return {'devacc': devacc, 'acc': testacc,
'ndev': len(sst_embed['dev']['X']),
'ntest': len(sst_embed['test']['X'])}


@@ -0,0 +1,171 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
STS-{2012,2013,2014,2015,2016} (unsupervised) and
STS-benchmark (supervised) tasks
'''
from __future__ import absolute_import, division, unicode_literals
import os
import io
import numpy as np
import logging
from scipy.stats import spearmanr, pearsonr
from senteval.utils import cosine
from senteval.sick import SICKRelatednessEval
class STSEval(object):
def loadFile(self, fpath):
self.data = {}
self.samples = []
for dataset in self.datasets:
sent1, sent2 = zip(*[l.split("\t") for l in
io.open(fpath + '/STS.input.%s.txt' % dataset,
encoding='utf8').read().splitlines()])
raw_scores = np.array([x for x in
io.open(fpath + '/STS.gs.%s.txt' % dataset,
encoding='utf8')
.read().splitlines()])
not_empty_idx = raw_scores != ''
gs_scores = [float(x) for x in raw_scores[not_empty_idx]]
sent1 = np.array([s.split() for s in sent1])[not_empty_idx]
sent2 = np.array([s.split() for s in sent2])[not_empty_idx]
# sort data by length to minimize padding in batcher
sorted_data = sorted(zip(sent1, sent2, gs_scores),
key=lambda z: (len(z[0]), len(z[1]), z[2]))
sent1, sent2, gs_scores = map(list, zip(*sorted_data))
self.data[dataset] = (sent1, sent2, gs_scores)
self.samples += sent1 + sent2
def do_prepare(self, params, prepare):
if 'similarity' in params:
self.similarity = params.similarity
else: # Default similarity is cosine
self.similarity = lambda s1, s2: np.nan_to_num(cosine(np.nan_to_num(s1), np.nan_to_num(s2)))
return prepare(params, self.samples)
def run(self, params, batcher):
results = {}
for dataset in self.datasets:
sys_scores = []
input1, input2, gs_scores = self.data[dataset]
for ii in range(0, len(gs_scores), params.batch_size):
batch1 = input1[ii:ii + params.batch_size]
batch2 = input2[ii:ii + params.batch_size]
# we assume get_batch already throws out the faulty ones
if len(batch1) == len(batch2) and len(batch1) > 0:
enc1 = batcher(params, batch1)
enc2 = batcher(params, batch2)
for kk in range(enc2.shape[0]):
sys_score = self.similarity(enc1[kk], enc2[kk])
sys_scores.append(sys_score)
results[dataset] = {'pearson': pearsonr(sys_scores, gs_scores),
'spearman': spearmanr(sys_scores, gs_scores),
'nsamples': len(sys_scores)}
logging.debug('%s : pearson = %.4f, spearman = %.4f' %
(dataset, results[dataset]['pearson'][0],
results[dataset]['spearman'][0]))
weights = [results[dset]['nsamples'] for dset in results.keys()]
list_prs = np.array([results[dset]['pearson'][0] for
dset in results.keys()])
list_spr = np.array([results[dset]['spearman'][0] for
dset in results.keys()])
avg_pearson = np.average(list_prs)
avg_spearman = np.average(list_spr)
wavg_pearson = np.average(list_prs, weights=weights)
wavg_spearman = np.average(list_spr, weights=weights)
results['all'] = {'pearson': {'mean': avg_pearson,
'wmean': wavg_pearson},
'spearman': {'mean': avg_spearman,
'wmean': wavg_spearman}}
logging.debug('ALL (weighted average) : Pearson = %.4f, \
Spearman = %.4f' % (wavg_pearson, wavg_spearman))
logging.debug('ALL (average) : Pearson = %.4f, \
Spearman = %.4f\n' % (avg_pearson, avg_spearman))
return results
class STS12Eval(STSEval):
def __init__(self, taskpath, seed=1111):
logging.debug('***** Transfer task : STS12 *****\n\n')
self.seed = seed
self.datasets = ['MSRpar', 'MSRvid', 'SMTeuroparl',
'surprise.OnWN', 'surprise.SMTnews']
self.loadFile(taskpath)
class STS13Eval(STSEval):
# STS13 here does not contain the "SMT" subtask due to a LICENSE issue
def __init__(self, taskpath, seed=1111):
logging.debug('***** Transfer task : STS13 (-SMT) *****\n\n')
self.seed = seed
self.datasets = ['FNWN', 'headlines', 'OnWN']
self.loadFile(taskpath)
class STS14Eval(STSEval):
def __init__(self, taskpath, seed=1111):
logging.debug('***** Transfer task : STS14 *****\n\n')
self.seed = seed
self.datasets = ['deft-forum', 'deft-news', 'headlines',
'images', 'OnWN', 'tweet-news']
self.loadFile(taskpath)
class STS15Eval(STSEval):
def __init__(self, taskpath, seed=1111):
logging.debug('***** Transfer task : STS15 *****\n\n')
self.seed = seed
self.datasets = ['answers-forums', 'answers-students',
'belief', 'headlines', 'images']
self.loadFile(taskpath)
class STS16Eval(STSEval):
def __init__(self, taskpath, seed=1111):
logging.debug('***** Transfer task : STS16 *****\n\n')
self.seed = seed
self.datasets = ['answer-answer', 'headlines', 'plagiarism',
'postediting', 'question-question']
self.loadFile(taskpath)
class STSBenchmarkEval(SICKRelatednessEval):
def __init__(self, task_path, seed=1111):
logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n')
self.seed = seed
train = self.loadFile(os.path.join(task_path, 'sts-train.csv'))
dev = self.loadFile(os.path.join(task_path, 'sts-dev.csv'))
test = self.loadFile(os.path.join(task_path, 'sts-test.csv'))
self.sick_data = {'train': train, 'dev': dev, 'test': test}
def loadFile(self, fpath):
sick_data = {'X_A': [], 'X_B': [], 'y': []}
with io.open(fpath, 'r', encoding='utf-8') as f:
for line in f:
text = line.strip().split('\t')
sick_data['X_A'].append(text[5].split())
sick_data['X_B'].append(text[6].split())
sick_data['y'].append(text[4])
sick_data['y'] = [float(s) for s in sick_data['y']]
return sick_data


@@ -0,0 +1,202 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
"""
Pytorch Classifier class in the style of scikit-learn
Classifiers include Logistic Regression and MLP
"""
from __future__ import absolute_import, division, unicode_literals
import numpy as np
import copy
from senteval import utils
import torch
from torch import nn
import torch.nn.functional as F
class PyTorchClassifier(object):
def __init__(self, inputdim, nclasses, l2reg=0., batch_size=64, seed=1111,
cudaEfficient=False):
# fix seed
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
self.inputdim = inputdim
self.nclasses = nclasses
self.l2reg = l2reg
self.batch_size = batch_size
self.cudaEfficient = cudaEfficient
def prepare_split(self, X, y, validation_data=None, validation_split=None):
# Preparing validation data
assert validation_split or validation_data
if validation_data is not None:
trainX, trainy = X, y
devX, devy = validation_data
else:
permutation = np.random.permutation(len(X))
trainidx = permutation[int(validation_split * len(X)):]
devidx = permutation[0:int(validation_split * len(X))]
trainX, trainy = X[trainidx], y[trainidx]
devX, devy = X[devidx], y[devidx]
device = torch.device('cpu') if self.cudaEfficient else torch.device('cuda')
trainX = torch.from_numpy(trainX).to(device, dtype=torch.float32)
trainy = torch.from_numpy(trainy).to(device, dtype=torch.int64)
devX = torch.from_numpy(devX).to(device, dtype=torch.float32)
devy = torch.from_numpy(devy).to(device, dtype=torch.int64)
return trainX, trainy, devX, devy
def fit(self, X, y, validation_data=None, validation_split=None,
early_stop=True):
self.nepoch = 0
bestaccuracy = -1
stop_train = False
early_stop_count = 0
# Preparing validation data
trainX, trainy, devX, devy = self.prepare_split(X, y, validation_data,
validation_split)
# Training
while not stop_train and self.nepoch <= self.max_epoch:
self.trainepoch(trainX, trainy, epoch_size=self.epoch_size)
accuracy = self.score(devX, devy)
if accuracy > bestaccuracy:
bestaccuracy = accuracy
bestmodel = copy.deepcopy(self.model)
elif early_stop:
if early_stop_count >= self.tenacity:
stop_train = True
early_stop_count += 1
self.model = bestmodel
return bestaccuracy
def trainepoch(self, X, y, epoch_size=1):
self.model.train()
for _ in range(self.nepoch, self.nepoch + epoch_size):
permutation = np.random.permutation(len(X))
all_costs = []
for i in range(0, len(X), self.batch_size):
# forward
idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().to(X.device)
Xbatch = X[idx]
ybatch = y[idx]
if self.cudaEfficient:
Xbatch = Xbatch.cuda()
ybatch = ybatch.cuda()
output = self.model(Xbatch)
# loss
loss = self.loss_fn(output, ybatch)
all_costs.append(loss.data.item())
# backward
self.optimizer.zero_grad()
loss.backward()
# Update parameters
self.optimizer.step()
self.nepoch += epoch_size
def score(self, devX, devy):
self.model.eval()
correct = 0
if not isinstance(devX, torch.cuda.FloatTensor) or self.cudaEfficient:
devX = torch.FloatTensor(devX).cuda()
devy = torch.LongTensor(devy).cuda()
with torch.no_grad():
for i in range(0, len(devX), self.batch_size):
Xbatch = devX[i:i + self.batch_size]
ybatch = devy[i:i + self.batch_size]
if self.cudaEfficient:
Xbatch = Xbatch.cuda()
ybatch = ybatch.cuda()
output = self.model(Xbatch)
pred = output.data.max(1)[1]
correct += pred.long().eq(ybatch.data.long()).sum().item()
accuracy = 1.0 * correct / len(devX)
return accuracy
def predict(self, devX):
self.model.eval()
if not isinstance(devX, torch.cuda.FloatTensor):
devX = torch.FloatTensor(devX).cuda()
yhat = np.array([])
with torch.no_grad():
for i in range(0, len(devX), self.batch_size):
Xbatch = devX[i:i + self.batch_size]
output = self.model(Xbatch)
yhat = np.append(yhat,
output.data.max(1)[1].cpu().numpy())
yhat = np.vstack(yhat)
return yhat
def predict_proba(self, devX):
self.model.eval()
probas = []
with torch.no_grad():
for i in range(0, len(devX), self.batch_size):
Xbatch = devX[i:i + self.batch_size]
# softmax over the class logits, then move to numpy
vals = F.softmax(self.model(Xbatch), dim=-1).data.cpu().numpy()
probas.append(vals)
return np.concatenate(probas, axis=0)
"""
MLP with Pytorch (nhid=0 --> Logistic Regression)
"""
class MLP(PyTorchClassifier):
def __init__(self, params, inputdim, nclasses, l2reg=0., batch_size=64,
seed=1111, cudaEfficient=False):
super(self.__class__, self).__init__(inputdim, nclasses, l2reg,
batch_size, seed, cudaEfficient)
"""
PARAMETERS:
-nhid: number of hidden units (0: Logistic Regression)
-optim: optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..)
-tenacity: how many times dev acc does not increase before stopping
-epoch_size: each epoch corresponds to epoch_size passes over the train set
-max_epoch: max number of epochs
-dropout: dropout for MLP
"""
self.nhid = 0 if "nhid" not in params else params["nhid"]
self.optim = "adam" if "optim" not in params else params["optim"]
self.tenacity = 5 if "tenacity" not in params else params["tenacity"]
self.epoch_size = 4 if "epoch_size" not in params else params["epoch_size"]
self.max_epoch = 200 if "max_epoch" not in params else params["max_epoch"]
self.dropout = 0. if "dropout" not in params else params["dropout"]
self.batch_size = 64 if "batch_size" not in params else params["batch_size"]
if self.nhid == 0:
self.model = nn.Sequential(
nn.Linear(self.inputdim, self.nclasses),
).cuda()
else:
self.model = nn.Sequential(
nn.Linear(self.inputdim, self.nhid),
nn.Dropout(p=self.dropout),
nn.Sigmoid(),
nn.Linear(self.nhid, self.nclasses),
).cuda()
self.loss_fn = nn.CrossEntropyLoss().cuda()
self.loss_fn.size_average = False
optim_fn, optim_params = utils.get_optimizer(self.optim)
self.optimizer = optim_fn(self.model.parameters(), **optim_params)
self.optimizer.param_groups[0]['weight_decay'] = self.l2reg
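# Minimal usage sketch (illustrative, not part of the original file; assumes a
# CUDA device and numpy arrays trainX/trainy, devX/devy):
# clf = MLP({"nhid": 50, "optim": "adam,lr=0.001"}, inputdim=trainX.shape[1], nclasses=3)
# devacc = clf.fit(trainX, trainy, validation_data=(devX, devy))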


@@ -0,0 +1,359 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
"""
Image Annotation/Search for COCO with Pytorch
"""
from __future__ import absolute_import, division, unicode_literals
import logging
import copy
import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
import torch.optim as optim
class COCOProjNet(nn.Module):
def __init__(self, config):
super(COCOProjNet, self).__init__()
self.imgdim = config['imgdim']
self.sentdim = config['sentdim']
self.projdim = config['projdim']
self.imgproj = nn.Sequential(
nn.Linear(self.imgdim, self.projdim),
)
self.sentproj = nn.Sequential(
nn.Linear(self.sentdim, self.projdim),
)
def forward(self, img, sent, imgc, sentc):
# imgc : (bsize, ncontrast, imgdim)
# sentc : (bsize, ncontrast, sentdim)
# img : (bsize, imgdim)
# sent : (bsize, sentdim)
img = img.unsqueeze(1).expand_as(imgc).contiguous()
img = img.view(-1, self.imgdim)
imgc = imgc.view(-1, self.imgdim)
sent = sent.unsqueeze(1).expand_as(sentc).contiguous()
sent = sent.view(-1, self.sentdim)
sentc = sentc.view(-1, self.sentdim)
imgproj = self.imgproj(img)
imgproj = imgproj / torch.sqrt(torch.pow(imgproj, 2).sum(1, keepdim=True)).expand_as(imgproj)
imgcproj = self.imgproj(imgc)
imgcproj = imgcproj / torch.sqrt(torch.pow(imgcproj, 2).sum(1, keepdim=True)).expand_as(imgcproj)
sentproj = self.sentproj(sent)
sentproj = sentproj / torch.sqrt(torch.pow(sentproj, 2).sum(1, keepdim=True)).expand_as(sentproj)
sentcproj = self.sentproj(sentc)
sentcproj = sentcproj / torch.sqrt(torch.pow(sentcproj, 2).sum(1, keepdim=True)).expand_as(sentcproj)
# (bsize*ncontrast, projdim)
anchor1 = torch.sum((imgproj*sentproj), 1)
anchor2 = torch.sum((sentproj*imgproj), 1)
img_sentc = torch.sum((imgproj*sentcproj), 1)
sent_imgc = torch.sum((sentproj*imgcproj), 1)
# (bsize*ncontrast)
return anchor1, anchor2, img_sentc, sent_imgc
def proj_sentence(self, sent):
output = self.sentproj(sent)
output = output / torch.sqrt(torch.pow(output, 2).sum(1, keepdim=True)).expand_as(output)
return output # (bsize, projdim)
def proj_image(self, img):
output = self.imgproj(img)
output = output / torch.sqrt(torch.pow(output, 2).sum(1, keepdim=True)).expand_as(output)
return output # (bsize, projdim)
class PairwiseRankingLoss(nn.Module):
"""
Pairwise ranking loss
"""
def __init__(self, margin):
super(PairwiseRankingLoss, self).__init__()
self.margin = margin
def forward(self, anchor1, anchor2, img_sentc, sent_imgc):
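# Each contrastive pair contributes a hinge term max(0, margin - s_anchor + s_contrast),
# pushing matched image-sentence scores above mismatched ones by the margin.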
cost_sent = torch.clamp(self.margin - anchor1 + img_sentc,
min=0.0).sum()
cost_img = torch.clamp(self.margin - anchor2 + sent_imgc,
min=0.0).sum()
loss = cost_sent + cost_img
return loss
class ImageSentenceRankingPytorch(object):
# Image Sentence Ranking on COCO with Pytorch
def __init__(self, train, valid, test, config):
# fix seed
self.seed = config['seed']
np.random.seed(self.seed)
torch.manual_seed(self.seed)
torch.cuda.manual_seed(self.seed)
self.train = train
self.valid = valid
self.test = test
self.imgdim = len(train['imgfeat'][0])
self.sentdim = len(train['sentfeat'][0])
self.projdim = config['projdim']
self.margin = config['margin']
self.batch_size = 128
self.ncontrast = 30
self.maxepoch = 20
self.early_stop = True
config_model = {'imgdim': self.imgdim,'sentdim': self.sentdim,
'projdim': self.projdim}
self.model = COCOProjNet(config_model).cuda()
self.loss_fn = PairwiseRankingLoss(margin=self.margin).cuda()
self.optimizer = optim.Adam(self.model.parameters())
def prepare_data(self, trainTxt, trainImg, devTxt, devImg,
testTxt, testImg):
trainTxt = torch.FloatTensor(trainTxt)
trainImg = torch.FloatTensor(trainImg)
devTxt = torch.FloatTensor(devTxt).cuda()
devImg = torch.FloatTensor(devImg).cuda()
testTxt = torch.FloatTensor(testTxt).cuda()
testImg = torch.FloatTensor(testImg).cuda()
return trainTxt, trainImg, devTxt, devImg, testTxt, testImg
def run(self):
self.nepoch = 0
bestdevscore = -1
early_stop_count = 0
stop_train = False
# Preparing data
logging.info('prepare data')
trainTxt, trainImg, devTxt, devImg, testTxt, testImg = \
self.prepare_data(self.train['sentfeat'], self.train['imgfeat'],
self.valid['sentfeat'], self.valid['imgfeat'],
self.test['sentfeat'], self.test['imgfeat'])
# Training
while not stop_train and self.nepoch <= self.maxepoch:
logging.info('start epoch')
self.trainepoch(trainTxt, trainImg, devTxt, devImg, nepoches=1)
logging.info('Epoch {0} finished'.format(self.nepoch))
results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
'dev': bestdevscore}
score = 0
for i in range(5):
devTxt_i = devTxt[i*5000:(i+1)*5000]
devImg_i = devImg[i*5000:(i+1)*5000]
# Compute dev ranks img2txt
r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg_i,
devTxt_i)
results['i2t']['r1'] += r1_i2t / 5
results['i2t']['r5'] += r5_i2t / 5
results['i2t']['r10'] += r10_i2t / 5
results['i2t']['medr'] += medr_i2t / 5
logging.info("Image to text: {0}, {1}, {2}, {3}"
.format(r1_i2t, r5_i2t, r10_i2t, medr_i2t))
# Compute dev ranks txt2img
r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg_i,
devTxt_i)
results['t2i']['r1'] += r1_t2i / 5
results['t2i']['r5'] += r5_t2i / 5
results['t2i']['r10'] += r10_t2i / 5
results['t2i']['medr'] += medr_t2i / 5
logging.info("Text to Image: {0}, {1}, {2}, {3}"
.format(r1_t2i, r5_t2i, r10_t2i, medr_t2i))
score += (r1_i2t + r5_i2t + r10_i2t +
r1_t2i + r5_t2i + r10_t2i) / 5
logging.info("Dev mean Text to Image: {0}, {1}, {2}, {3}".format(
results['t2i']['r1'], results['t2i']['r5'],
results['t2i']['r10'], results['t2i']['medr']))
logging.info("Dev mean Image to text: {0}, {1}, {2}, {3}".format(
results['i2t']['r1'], results['i2t']['r5'],
results['i2t']['r10'], results['i2t']['medr']))
# early stop on the dev ranking score
if score > bestdevscore:
bestdevscore = score
bestmodel = copy.deepcopy(self.model)
elif self.early_stop:
if early_stop_count >= 3:
stop_train = True
early_stop_count += 1
self.model = bestmodel
# Compute test for the 5 splits
results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0},
'dev': bestdevscore}
for i in range(5):
testTxt_i = testTxt[i*5000:(i+1)*5000]
testImg_i = testImg[i*5000:(i+1)*5000]
# Compute test ranks img2txt
r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(testImg_i, testTxt_i)
results['i2t']['r1'] += r1_i2t / 5
results['i2t']['r5'] += r5_i2t / 5
results['i2t']['r10'] += r10_i2t / 5
results['i2t']['medr'] += medr_i2t / 5
# Compute test ranks txt2img
r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(testImg_i, testTxt_i)
results['t2i']['r1'] += r1_t2i / 5
results['t2i']['r5'] += r5_t2i / 5
results['t2i']['r10'] += r10_t2i / 5
results['t2i']['medr'] += medr_t2i / 5
return bestdevscore, results['i2t']['r1'], results['i2t']['r5'], \
results['i2t']['r10'], results['i2t']['medr'], \
results['t2i']['r1'], results['t2i']['r5'], \
results['t2i']['r10'], results['t2i']['medr']
def trainepoch(self, trainTxt, trainImg, devTxt, devImg, nepoches=1):
self.model.train()
for _ in range(self.nepoch, self.nepoch + nepoches):
permutation = list(np.random.permutation(len(trainTxt)))
all_costs = []
for i in range(0, len(trainTxt), self.batch_size):
# forward
if i % (self.batch_size*500) == 0 and i > 0:
logging.info('samples : {0}'.format(i))
r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg,
devTxt)
logging.info("Image to text: {0}, {1}, {2}, {3}".format(
r1_i2t, r5_i2t, r10_i2t, medr_i2t))
# Compute test ranks txt2img
r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg,
devTxt)
logging.info("Text to Image: {0}, {1}, {2}, {3}".format(
r1_t2i, r5_t2i, r10_t2i, medr_t2i))
idx = torch.LongTensor(permutation[i:i + self.batch_size])
imgbatch = Variable(trainImg.index_select(0, idx)).cuda()
sentbatch = Variable(trainTxt.index_select(0, idx)).cuda()
idximgc = np.random.choice(permutation[:i] +
permutation[i + self.batch_size:],
self.ncontrast*idx.size(0))
idxsentc = np.random.choice(permutation[:i] +
permutation[i + self.batch_size:],
self.ncontrast*idx.size(0))
idximgc = torch.LongTensor(idximgc)
idxsentc = torch.LongTensor(idxsentc)
# Get indexes for contrastive images and sentences
imgcbatch = Variable(trainImg.index_select(0, idximgc)).view(
-1, self.ncontrast, self.imgdim).cuda()
sentcbatch = Variable(trainTxt.index_select(0, idxsentc)).view(
-1, self.ncontrast, self.sentdim).cuda()
anchor1, anchor2, img_sentc, sent_imgc = self.model(
imgbatch, sentbatch, imgcbatch, sentcbatch)
# loss
loss = self.loss_fn(anchor1, anchor2, img_sentc, sent_imgc)
all_costs.append(loss.data.item())
# backward
self.optimizer.zero_grad()
loss.backward()
# Update parameters
self.optimizer.step()
self.nepoch += nepoches
def t2i(self, images, captions):
"""
Images: (5N, imgdim) matrix of images
Captions: (5N, sentdim) matrix of captions
"""
with torch.no_grad():
# Project images and captions
img_embed, sent_embed = [], []
for i in range(0, len(images), self.batch_size):
img_embed.append(self.model.proj_image(
Variable(images[i:i + self.batch_size])))
sent_embed.append(self.model.proj_sentence(
Variable(captions[i:i + self.batch_size])))
img_embed = torch.cat(img_embed, 0).data
sent_embed = torch.cat(sent_embed, 0).data
npts = int(img_embed.size(0) / 5)
idxs = torch.cuda.LongTensor(range(0, len(img_embed), 5))
ims = img_embed.index_select(0, idxs)
ranks = np.zeros(5 * npts)
for index in range(npts):
# Get query captions
queries = sent_embed[5*index: 5*index + 5]
# Compute scores
scores = torch.mm(queries, ims.transpose(0, 1)).cpu().numpy()
inds = np.zeros(scores.shape)
for i in range(len(inds)):
inds[i] = np.argsort(scores[i])[::-1]
ranks[5 * index + i] = np.where(inds[i] == index)[0][0]
# Compute metrics
r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
medr = np.floor(np.median(ranks)) + 1
return (r1, r5, r10, medr)
def i2t(self, images, captions):
"""
Images: (5N, imgdim) matrix of images
Captions: (5N, sentdim) matrix of captions
"""
with torch.no_grad():
# Project images and captions
img_embed, sent_embed = [], []
for i in range(0, len(images), self.batch_size):
img_embed.append(self.model.proj_image(
Variable(images[i:i + self.batch_size])))
sent_embed.append(self.model.proj_sentence(
Variable(captions[i:i + self.batch_size])))
img_embed = torch.cat(img_embed, 0).data
sent_embed = torch.cat(sent_embed, 0).data
npts = int(img_embed.size(0) / 5)
index_list = []
ranks = np.zeros(npts)
for index in range(npts):
# Get query image
query_img = img_embed[5 * index]
# Compute scores
scores = torch.mm(query_img.view(1, -1),
sent_embed.transpose(0, 1)).view(-1)
scores = scores.cpu().numpy()
inds = np.argsort(scores)[::-1]
index_list.append(inds[0])
# Score
rank = 1e20
for i in range(5*index, 5*index + 5, 1):
tmp = np.where(inds == i)[0][0]
if tmp < rank:
rank = tmp
ranks[index] = rank
# Compute metrics
r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
medr = np.floor(np.median(ranks)) + 1
return (r1, r5, r10, medr)
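# Note: in both t2i and i2t, r1/r5/r10 are Recall@K in percent (share of
# queries whose correct match ranks in the top K); medr is the 1-indexed
# median rank.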


@@ -0,0 +1,134 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
"""
Semantic Relatedness (supervised) with Pytorch
"""
from __future__ import absolute_import, division, unicode_literals
import copy
import numpy as np
import torch
from torch import nn
import torch.optim as optim
from scipy.stats import pearsonr
class RelatednessPytorch(object):
# Can be used for SICK-Relatedness and STS14
def __init__(self, train, valid, test, devscores, config):
# fix seed
np.random.seed(config['seed'])
torch.manual_seed(config['seed'])
assert torch.cuda.is_available(), 'torch.cuda required for Relatedness'
torch.cuda.manual_seed(config['seed'])
self.train = train
self.valid = valid
self.test = test
self.devscores = devscores
self.inputdim = train['X'].shape[1]
self.nclasses = config['nclasses']
self.seed = config['seed']
self.l2reg = 0.
self.batch_size = 64
self.maxepoch = 1000
self.early_stop = True
self.model = nn.Sequential(
nn.Linear(self.inputdim, self.nclasses),
nn.Softmax(dim=-1),
)
self.loss_fn = nn.MSELoss()
if torch.cuda.is_available():
self.model = self.model.cuda()
self.loss_fn = self.loss_fn.cuda()
self.loss_fn.size_average = False
self.optimizer = optim.Adam(self.model.parameters(),
weight_decay=self.l2reg)
def prepare_data(self, trainX, trainy, devX, devy, testX, testy):
# Move the numpy arrays onto the GPU as float tensors
trainX = torch.from_numpy(trainX).float().cuda()
trainy = torch.from_numpy(trainy).float().cuda()
devX = torch.from_numpy(devX).float().cuda()
devy = torch.from_numpy(devy).float().cuda()
testX = torch.from_numpy(testX).float().cuda()
testy = torch.from_numpy(testy).float().cuda()
return trainX, trainy, devX, devy, testX, testy
def run(self):
self.nepoch = 0
bestpr = -1
early_stop_count = 0
r = np.arange(1, 6)
stop_train = False
# Preparing data
trainX, trainy, devX, devy, testX, testy = self.prepare_data(
self.train['X'], self.train['y'],
self.valid['X'], self.valid['y'],
self.test['X'], self.test['y'])
# Training
while not stop_train and self.nepoch <= self.maxepoch:
self.trainepoch(trainX, trainy, nepoches=50)
yhat = np.dot(self.predict_proba(devX), r)
pr = pearsonr(yhat, self.devscores)[0]
pr = 0 if pr != pr else pr # if NaN bc std=0
# early stop on Pearson
if pr > bestpr:
bestpr = pr
bestmodel = copy.deepcopy(self.model)
elif self.early_stop:
if early_stop_count >= 3:
stop_train = True
early_stop_count += 1
self.model = bestmodel
yhat = np.dot(self.predict_proba(testX), r)
return bestpr, yhat
def trainepoch(self, X, y, nepoches=1):
self.model.train()
for _ in range(self.nepoch, self.nepoch + nepoches):
permutation = np.random.permutation(len(X))
all_costs = []
for i in range(0, len(X), self.batch_size):
# forward
idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda()
Xbatch = X[idx]
ybatch = y[idx]
output = self.model(Xbatch)
# loss
loss = self.loss_fn(output, ybatch)
all_costs.append(loss.item())
# backward
self.optimizer.zero_grad()
loss.backward()
# Update parameters
self.optimizer.step()
self.nepoch += nepoches
def predict_proba(self, devX):
self.model.eval()
probas = []
with torch.no_grad():
for i in range(0, len(devX), self.batch_size):
Xbatch = devX[i:i + self.batch_size]
if len(probas) == 0:
probas = self.model(Xbatch).data.cpu().numpy()
else:
probas = np.concatenate((probas, self.model(Xbatch).data.cpu().numpy()), axis=0)
return probas


@@ -0,0 +1,246 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
"""
Validation and classification
(train) : inner-kfold classifier
(train, test) : kfold classifier
(train, dev, test) : split classifier
"""
from __future__ import absolute_import, division, unicode_literals
import logging
import numpy as np
from senteval.tools.classifier import MLP
import sklearn
assert(sklearn.__version__ >= "0.18.0"), \
"need to update sklearn to version >= 0.18.0"
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
def get_classif_name(classifier_config, usepytorch):
if not usepytorch:
modelname = 'sklearn-LogReg'
else:
nhid = classifier_config['nhid']
optim = 'adam' if 'optim' not in classifier_config else classifier_config['optim']
bs = 64 if 'batch_size' not in classifier_config else classifier_config['batch_size']
modelname = 'pytorch-MLP-nhid%s-%s-bs%s' % (nhid, optim, bs)
return modelname
# Pytorch version
class InnerKFoldClassifier(object):
"""
(train) split classifier : InnerKfold.
"""
def __init__(self, X, y, config):
self.X = X
self.y = y
self.featdim = X.shape[1]
self.nclasses = config['nclasses']
self.seed = config['seed']
self.devresults = []
self.testresults = []
self.usepytorch = config['usepytorch']
self.classifier_config = config['classifier']
self.modelname = get_classif_name(self.classifier_config, self.usepytorch)
self.k = 5 if 'kfold' not in config else config['kfold']
def run(self):
logging.info('Training {0} with (inner) {1}-fold cross-validation'
.format(self.modelname, self.k))
regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
[2**t for t in range(-2, 4, 1)]
skf = StratifiedKFold(n_splits=self.k, shuffle=True, random_state=1111)
innerskf = StratifiedKFold(n_splits=self.k, shuffle=True,
random_state=1111)
count = 0
for train_idx, test_idx in skf.split(self.X, self.y):
count += 1
X_train, X_test = self.X[train_idx], self.X[test_idx]
y_train, y_test = self.y[train_idx], self.y[test_idx]
scores = []
for reg in regs:
regscores = []
for inner_train_idx, inner_test_idx in innerskf.split(X_train, y_train):
X_in_train, X_in_test = X_train[inner_train_idx], X_train[inner_test_idx]
y_in_train, y_in_test = y_train[inner_train_idx], y_train[inner_test_idx]
if self.usepytorch:
clf = MLP(self.classifier_config, inputdim=self.featdim,
nclasses=self.nclasses, l2reg=reg,
seed=self.seed)
clf.fit(X_in_train, y_in_train,
validation_data=(X_in_test, y_in_test))
else:
clf = LogisticRegression(C=reg, random_state=self.seed)
clf.fit(X_in_train, y_in_train)
regscores.append(clf.score(X_in_test, y_in_test))
scores.append(round(100*np.mean(regscores), 2))
optreg = regs[np.argmax(scores)]
logging.info('Best param found at split {0}: l2reg = {1} \
with score {2}'.format(count, optreg, np.max(scores)))
self.devresults.append(np.max(scores))
if self.usepytorch:
clf = MLP(self.classifier_config, inputdim=self.featdim,
nclasses=self.nclasses, l2reg=optreg,
seed=self.seed)
clf.fit(X_train, y_train, validation_split=0.05)
else:
clf = LogisticRegression(C=optreg, random_state=self.seed)
clf.fit(X_train, y_train)
self.testresults.append(round(100*clf.score(X_test, y_test), 2))
devaccuracy = round(np.mean(self.devresults), 2)
testaccuracy = round(np.mean(self.testresults), 2)
return devaccuracy, testaccuracy
class KFoldClassifier(object):
"""
(train, test) split classifier : cross-validation on train.
"""
def __init__(self, train, test, config):
self.train = train
self.test = test
self.featdim = self.train['X'].shape[1]
self.nclasses = config['nclasses']
self.seed = config['seed']
self.usepytorch = config['usepytorch']
self.classifier_config = config['classifier']
self.modelname = get_classif_name(self.classifier_config, self.usepytorch)
self.k = 5 if 'kfold' not in config else config['kfold']
def run(self):
# cross-validation
logging.info('Training {0} with {1}-fold cross-validation'
.format(self.modelname, self.k))
regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
[2**t for t in range(-1, 6, 1)]
skf = StratifiedKFold(n_splits=self.k, shuffle=True,
random_state=self.seed)
scores = []
for reg in regs:
scanscores = []
for train_idx, test_idx in skf.split(self.train['X'],
self.train['y']):
# Split data
X_train, y_train = self.train['X'][train_idx], self.train['y'][train_idx]
X_test, y_test = self.train['X'][test_idx], self.train['y'][test_idx]
# Train classifier
if self.usepytorch:
clf = MLP(self.classifier_config, inputdim=self.featdim,
nclasses=self.nclasses, l2reg=reg,
seed=self.seed)
clf.fit(X_train, y_train, validation_data=(X_test, y_test))
else:
clf = LogisticRegression(C=reg, random_state=self.seed)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
scanscores.append(score)
# Append mean score
scores.append(round(100*np.mean(scanscores), 2))
# evaluation
logging.info([('reg:' + str(regs[idx]), scores[idx])
for idx in range(len(scores))])
optreg = regs[np.argmax(scores)]
devaccuracy = np.max(scores)
logging.info('Cross-validation : best param found is reg = {0} \
with score {1}'.format(optreg, devaccuracy))
logging.info('Evaluating...')
if self.usepytorch:
clf = MLP(self.classifier_config, inputdim=self.featdim,
nclasses=self.nclasses, l2reg=optreg,
seed=self.seed)
clf.fit(self.train['X'], self.train['y'], validation_split=0.05)
else:
clf = LogisticRegression(C=optreg, random_state=self.seed)
clf.fit(self.train['X'], self.train['y'])
yhat = clf.predict(self.test['X'])
testaccuracy = clf.score(self.test['X'], self.test['y'])
testaccuracy = round(100*testaccuracy, 2)
return devaccuracy, testaccuracy, yhat
class SplitClassifier(object):
"""
(train, valid, test) split classifier.
"""
def __init__(self, X, y, config):
self.X = X
self.y = y
self.nclasses = config['nclasses']
self.featdim = self.X['train'].shape[1]
self.seed = config['seed']
self.usepytorch = config['usepytorch']
self.classifier_config = config['classifier']
self.cudaEfficient = False if 'cudaEfficient' not in config else \
config['cudaEfficient']
self.modelname = get_classif_name(self.classifier_config, self.usepytorch)
self.noreg = False if 'noreg' not in config else config['noreg']
self.config = config
def run(self):
logging.info('Training {0} with standard validation..'
.format(self.modelname))
regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
[2**t for t in range(-2, 4, 1)]
if self.noreg:
regs = [1e-9 if self.usepytorch else 1e9]
scores = []
for reg in regs:
if self.usepytorch:
clf = MLP(self.classifier_config, inputdim=self.featdim,
nclasses=self.nclasses, l2reg=reg,
seed=self.seed, cudaEfficient=self.cudaEfficient)
# TODO: Find a hack for reducing nb epochs in SNLI
clf.fit(self.X['train'], self.y['train'],
validation_data=(self.X['valid'], self.y['valid']))
else:
clf = LogisticRegression(C=reg, random_state=self.seed)
clf.fit(self.X['train'], self.y['train'])
scores.append(round(100*clf.score(self.X['valid'],
self.y['valid']), 2))
logging.info([('reg:'+str(regs[idx]), scores[idx])
for idx in range(len(scores))])
optreg = regs[np.argmax(scores)]
devaccuracy = np.max(scores)
logging.info('Validation : best param found is reg = {0} with score \
{1}'.format(optreg, devaccuracy))
logging.info('Evaluating...')
if self.usepytorch:
clf = MLP(self.classifier_config, inputdim=self.featdim,
nclasses=self.nclasses, l2reg=optreg,
seed=self.seed, cudaEfficient=self.cudaEfficient)
# TODO: Find a hack for reducing nb epochs in SNLI
clf.fit(self.X['train'], self.y['train'],
validation_data=(self.X['valid'], self.y['valid']))
else:
clf = LogisticRegression(C=optreg, random_state=self.seed)
clf.fit(self.X['train'], self.y['train'])
testaccuracy = clf.score(self.X['test'], self.y['test'])
testaccuracy = round(100*testaccuracy, 2)
return devaccuracy, testaccuracy


@@ -0,0 +1,89 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
TREC question-type classification
'''
from __future__ import absolute_import, division, unicode_literals
import os
import io
import logging
import numpy as np
from senteval.tools.validation import KFoldClassifier
class TRECEval(object):
def __init__(self, task_path, seed=1111):
logging.info('***** Transfer task : TREC *****\n\n')
self.seed = seed
self.train = self.loadFile(os.path.join(task_path, 'train_5500.label'))
self.test = self.loadFile(os.path.join(task_path, 'TREC_10.label'))
def do_prepare(self, params, prepare):
samples = self.train['X'] + self.test['X']
return prepare(params, samples)
def loadFile(self, fpath):
trec_data = {'X': [], 'y': []}
tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2,
'HUM': 3, 'LOC': 4, 'NUM': 5}
with io.open(fpath, 'r', encoding='latin-1') as f:
for line in f:
target, sample = line.strip().split(':', 1)
sample = sample.split(' ', 1)[1].split()
assert target in tgt2idx, target
trec_data['X'].append(sample)
trec_data['y'].append(tgt2idx[target])
return trec_data
def run(self, params, batcher):
train_embeddings, test_embeddings = [], []
# Sort to reduce padding
sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']),
key=lambda z: (len(z[0]), z[1]))
train_samples = [x for (x, y) in sorted_corpus_train]
train_labels = [y for (x, y) in sorted_corpus_train]
sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']),
key=lambda z: (len(z[0]), z[1]))
test_samples = [x for (x, y) in sorted_corpus_test]
test_labels = [y for (x, y) in sorted_corpus_test]
# Get train embeddings
for ii in range(0, len(train_labels), params.batch_size):
batch = train_samples[ii:ii + params.batch_size]
embeddings = batcher(params, batch)
train_embeddings.append(embeddings)
train_embeddings = np.vstack(train_embeddings)
logging.info('Computed train embeddings')
# Get test embeddings
for ii in range(0, len(test_labels), params.batch_size):
batch = test_samples[ii:ii + params.batch_size]
embeddings = batcher(params, batch)
test_embeddings.append(embeddings)
test_embeddings = np.vstack(test_embeddings)
logging.info('Computed test embeddings')
config_classifier = {'nclasses': 6, 'seed': self.seed,
'usepytorch': params.usepytorch,
'classifier': params.classifier,
'kfold': params.kfold}
clf = KFoldClassifier({'X': train_embeddings,
'y': np.array(train_labels)},
{'X': test_embeddings,
'y': np.array(test_labels)},
config_classifier)
devacc, testacc, _ = clf.run()
logging.debug('\nDev acc : {0} Test acc : {1} \
for TREC\n'.format(devacc, testacc))
return {'devacc': devacc, 'acc': testacc,
'ndev': len(self.train['X']), 'ntest': len(self.test['X'])}


@@ -0,0 +1,95 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
from __future__ import absolute_import, division, unicode_literals
import numpy as np
import re
import inspect
from torch import optim
def create_dictionary(sentences):
words = {}
for s in sentences:
for word in s:
if word in words:
words[word] += 1
else:
words[word] = 1
words['<s>'] = 1e9 + 4
words['</s>'] = 1e9 + 3
words['<p>'] = 1e9 + 2
# words['<UNK>'] = 1e9 + 1
sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort
id2word = []
word2id = {}
for i, (w, _) in enumerate(sorted_words):
id2word.append(w)
word2id[w] = i
return id2word, word2id
def cosine(u, v):
return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
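# Note: returns NaN for an all-zero vector (0/0); callers such as STSEval's
# default similarity wrap this in np.nan_to_num before use.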
class dotdict(dict):
""" dot.notation access to dictionary attributes """
__getattr__ = dict.get
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
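# Usage sketch (illustrative): params = dotdict({"batch_size": 64}) allows
# params.batch_size; missing keys return None since __getattr__ is dict.get.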
def get_optimizer(s):
"""
Parse optimizer parameters.
Input should be of the form:
- "sgd,lr=0.01"
- "adagrad,lr=0.1,lr_decay=0.05"
"""
if "," in s:
method = s[:s.find(',')]
optim_params = {}
for x in s[s.find(',') + 1:].split(','):
split = x.split('=')
assert len(split) == 2
assert re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None
optim_params[split[0]] = float(split[1])
else:
method = s
optim_params = {}
if method == 'adadelta':
optim_fn = optim.Adadelta
elif method == 'adagrad':
optim_fn = optim.Adagrad
elif method == 'adam':
optim_fn = optim.Adam
elif method == 'adamax':
optim_fn = optim.Adamax
elif method == 'asgd':
optim_fn = optim.ASGD
elif method == 'rmsprop':
optim_fn = optim.RMSprop
elif method == 'rprop':
optim_fn = optim.Rprop
elif method == 'sgd':
optim_fn = optim.SGD
assert 'lr' in optim_params
else:
raise Exception('Unknown optimization method: "%s"' % method)
# check that we give good parameters to the optimizer
expected_args = inspect.getargspec(optim_fn.__init__)[0]
assert expected_args[:2] == ['self', 'params']
if not all(k in expected_args[2:] for k in optim_params.keys()):
raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
str(expected_args[2:]), str(optim_params.keys())))
return optim_fn, optim_params
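# Usage sketch (illustrative):
# optim_fn, optim_params = get_optimizer("sgd,lr=0.1")
# optimizer = optim_fn(model.parameters(), **optim_params)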


@@ -0,0 +1,21 @@
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
import io
from setuptools import setup, find_packages
with io.open('./README.md', encoding='utf-8') as f:
readme = f.read()
setup(
name='SentEval',
version='0.1.0',
url='https://github.com/facebookresearch/SentEval',
packages=find_packages(exclude=['examples']),
license='Attribution-NonCommercial 4.0 International',
long_description=readme,
)


@@ -8,6 +8,9 @@ from sklearn.metrics import (
f1_score,
)
from numpy import corrcoef
import pandas as pd
def eval_classification(actual, predicted, round_decimals=4):
"""Returns common classification evaluation metrics.
@@ -32,3 +35,23 @@ def eval_classification(actual, predicted, round_decimals=4):
f1_score(actual, predicted, average=None).round(round_decimals)
),
}
def compute_correlation_coefficients(x, y=None):
"""
Compute Pearson product-moment correlation coefficients.
Args:
x: array_like
A 1-D or 2-D array containing multiple variables and observations.
Each row of `x` represents a variable, and each column a single
observation of all those variables.
y: array_like, optional
An additional set of variables and observations. `y` has the same
shape as `x`.
Returns:
pd.DataFrame : A pandas dataframe from the correlation coefficient matrix of the variables.
"""
return pd.DataFrame(corrcoef(x, y))
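# Usage sketch (illustrative): compute_correlation_coefficients([1, 2, 3], [1, 2, 3.1])
# returns a 2x2 DataFrame whose off-diagonal entries are close to 1.0.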


@@ -0,0 +1,97 @@
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score(prediction, ground_truth):
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
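# Worked example (illustrative): prediction "cat sat on mat" vs. ground truth
# "cat sat down" share {"cat", "sat"}, so precision = 2/4, recall = 2/3 and
# F1 = 2 * (1/2) * (2/3) / (1/2 + 2/3) = 4/7 ~= 0.571.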
def exact_match_score(prediction, ground_truth):
return (normalize_answer(prediction) == normalize_answer(ground_truth))
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def evaluate(dataset, predictions):
f1 = exact_match = total = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
total += 1
if qa['id'] not in predictions:
message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.'
print(message, file=sys.stderr)
continue
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = predictions[qa['id']]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(
f1_score, prediction, ground_truths)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {'exact_match': exact_match, 'f1': f1}
if __name__ == '__main__':
expected_version = '1.1'
parser = argparse.ArgumentParser(
description='Evaluation for SQuAD ' + expected_version)
parser.add_argument('dataset_file', help='Dataset file')
parser.add_argument('prediction_file', help='Prediction File')
args = parser.parse_args()
with open(args.dataset_file) as dataset_file:
dataset_json = json.load(dataset_file)
if (dataset_json['version'] != expected_version):
print('Evaluation expects v-' + expected_version +
', but got dataset with v-' + dataset_json['version'],
file=sys.stderr)
dataset = dataset_json['data']
with open(args.prediction_file) as prediction_file:
predictions = json.load(prediction_file)
print(json.dumps(evaluate(dataset, predictions)))
# Original source:
# https://github.com/allenai/bi-att-flow/blob/498c8026d92a8bcf0286e2d216d092d444d02d76/squad/evaluate-v1.1.py


@@ -1,43 +1,49 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import itertools
import pandas as pd
from collections import OrderedDict
from copy import deepcopy
class SentEvalRunner:
def __init__(self, path_to_senteval="."):
"""Wrapper class interfacing with the original implementation of SentEval
class SentEvalConfig:
"""Object to store static properties of senteval experiments
Attributes:
model_params (dict): model parameters that stay consistent across all runs
senteval_params (dict): senteval parameters that stay consistent across all runs
"""
def __init__(
self,
model_params,
senteval_params,
):
"""Summary
Args:
path_to_senteval (str, optional): Path to the SentEval source code.
model_params (dict): model parameters that stay consistent across all runs
senteval_params (dict): senteval parameters that stay consistent across all runs
"""
self.path_to_senteval = path_to_senteval
self.params_senteval = {}
self.model_params = model_params
self.senteval_params = senteval_params
def set_transfer_data_path(self, relative_path):
"""Set the datapath that contains the datasets for the SentEval transfer tasks
Args:
relative_path (str): Relative datapath
"""
self.transfer_data_path = os.path.join(
self.path_to_senteval, relative_path
)
self.params_senteval["task_path"] = self.transfer_data_path
@property
def model_params(self):
return self._model_params
def set_transfer_tasks(self, task_list):
"""Set the transfer tasks to use for evaluation
Args:
task_list (list(str)): List of downstream transfer tasks
"""
self.transfer_tasks = task_list
@model_params.setter
def model_params(self, model_params):
self._model_params = model_params
def set_model(self, model):
"""Set the model to evaluate"""
self.params_senteval["model"] = model
def set_params(self, params):
self.params_senteval = dict(self.params_senteval, **params)
def append_senteval_params(self, params):
"""Util to append any params to senteval_params after initialization"""
self.senteval_params = dict(self.senteval_params, **params)
classifying_tasks = {
"MR",
@@ -54,7 +60,7 @@ class SentEvalRunner:
if any(t in classifying_tasks for t in self.transfer_tasks):
try:
a = "classifier" in self.params_senteval
a = "classifier" in self.senteval_params
if not a:
raise ValueError(
"Include param['classifier'] to run task {}".format(t)
@@ -68,7 +74,7 @@
"tenacity",
"epoch_size",
)
in self.params_senteval["classifier"].keys()
in self.senteval_params["classifier"].keys()
)
if not b:
raise ValueError(
@@ -78,50 +84,3 @@
)
except ValueError as ve:
print(ve)
def run(self, batcher_func, prepare_func):
"""Run the SentEval engine on the model on the transfer tasks
Args:
batcher_func (function): Function required by SentEval that transforms a batch of text sentences into
sentence embeddings
prepare_func (function): Function that sees the whole dataset of each task and can thus construct the word
vocabulary, the dictionary of word vectors, etc
Returns:
dict: Dictionary of results
"""
sys.path.insert(0, self.path_to_senteval)
import senteval
se = senteval.engine.SE(
self.params_senteval, batcher_func, prepare_func
)
return se.eval(self.transfer_tasks)
def log_mean(self, results, selected_metrics=[], round_decimals=3):
"""Log the means of selected metrics of the transfer tasks
Args:
results (dict): Results from the SentEval evaluation engine
selected_metrics (list(str), optional): List of metric names
round_decimals (int, optional): Number of decimal digits to round to; defaults to 3
Returns:
pd.DataFrame table of formatted results
"""
data = []
for task in self.transfer_tasks:
if "all" in results[task]:
row = [
results[task]["all"][metric]["mean"]
for metric in selected_metrics
]
else:
row = [results[task][metric] for metric in selected_metrics]
data.append(row)
table = pd.DataFrame(
data=data, columns=selected_metrics, index=self.transfer_tasks
)
return table.round(round_decimals)


@@ -17,7 +17,7 @@ tqdm
## How to use
We provide a notebook tutorial [here](../../scenarios/interpret_NLP_models/explain_simple_model.ipynb) to help you start quickly. The important class we need to utilize is the `Interpreter` in [Interpreter.py](Interpreter.py). Given any input word embeddings and a forward function $\Phi$ that transforms the word embeddings $\bf x$ to a hidden state $\bf s$, Interpreter helps understand how much each input word contributes to the hidden state. Suppose the $\Phi$, the input $\bf x$ and the input words are defined as:
We provide a notebook tutorial [here](../../scenarios/interpret_NLP_models/understand_models.ipynb) to help you start quickly. The important class we need to utilize is the `Interpreter` in [Interpreter.py](Interpreter.py). Given any input word embeddings and a forward function $\Phi$ that transforms the word embeddings $\bf x$ to a hidden state $\bf s$, Interpreter helps understand how much each input word contributes to the hidden state. Suppose the $\Phi$, the input $\bf x$ and the input words are defined as:
```
import torch


@@ -6,42 +6,44 @@
# https://github.com/huggingface/pytorch-transformers/blob/master/examples
# /run_glue.py
from enum import Enum
import csv
import linecache
import subprocess
import warnings
from collections import Iterable
from enum import Enum
import torch
from tqdm import tqdm
from pytorch_pretrained_bert.tokenization import BertTokenizer
from torch.utils.data import (
DataLoader,
Dataset,
RandomSampler,
SequentialSampler,
TensorDataset,
ConcatDataset,
)
from tqdm import tqdm
# Max supported sequence length
BERT_MAX_LEN = 512
class Language(Enum):
class Language(str, Enum):
"""An enumeration of the supported pretrained models and languages."""
ENGLISH = "bert-base-uncased"
ENGLISHCASED = "bert-base-cased"
ENGLISHLARGE = "bert-large-uncased"
ENGLISHLARGECASED = "bert-large-cased"
ENGLISHLARGEWWM = "bert-large-uncased-whole-word-masking"
ENGLISHLARGECASEDWWM = "bert-large-cased-whole-word-masking"
CHINESE = "bert-base-chinese"
MULTILINGUAL = "bert-base-multilingual-cased"
ENGLISH: str = "bert-base-uncased"
ENGLISHCASED: str = "bert-base-cased"
ENGLISHLARGE: str = "bert-large-uncased"
ENGLISHLARGECASED: str = "bert-large-cased"
ENGLISHLARGEWWM: str = "bert-large-uncased-whole-word-masking"
ENGLISHLARGECASEDWWM: str = "bert-large-cased-whole-word-masking"
CHINESE: str = "bert-base-chinese"
MULTILINGUAL: str = "bert-base-multilingual-cased"
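# Note: mixing str into the Enum lets a Language member be passed directly
# wherever a model-name string is expected (see the from_pretrained call
# below, which now takes `language` instead of `language.value`).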
class Tokenizer:
def __init__(
self, language=Language.ENGLISH, to_lower=False, cache_dir="."
):
def __init__(self, language=Language.ENGLISH, to_lower=False, cache_dir="."):
"""Initializes the underlying pretrained BERT tokenizer.
Args:
@@ -51,7 +53,7 @@ class Tokenizer:
Defaults to ".".
"""
self.tokenizer = BertTokenizer.from_pretrained(
language.value, do_lower_case=to_lower, cache_dir=cache_dir
language, do_lower_case=to_lower, cache_dir=cache_dir
)
self.language = language
@@ -69,10 +71,7 @@ class Tokenizer:
if isinstance(text[0], str):
return [self.tokenizer.tokenize(x) for x in tqdm(text)]
else:
return [
[self.tokenizer.tokenize(x) for x in sentences]
for sentences in tqdm(text)
]
return [[self.tokenizer.tokenize(x) for x in sentences] for sentences in tqdm(text)]
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
@@ -121,11 +120,7 @@ class Tokenizer:
list of token type id lists
"""
if max_len > BERT_MAX_LEN:
print(
"setting max_len to max allowed tokens: {}".format(
BERT_MAX_LEN
)
)
print("setting max_len to max allowed tokens: {}".format(BERT_MAX_LEN))
max_len = BERT_MAX_LEN
if isinstance(tokens[0][0], str):
@@ -141,23 +136,16 @@ class Tokenizer:
# construct token_type_ids
        # [[0, 0, 0, 0, ..., 0, 1, 1, 1, ..., 1], [0, 0, 0, ..., 1, 1], ...]
token_type_ids = [
[[i] * len(sentence) for i, sentence in enumerate(example)]
for example in tokens
[[i] * len(sentence) for i, sentence in enumerate(example)] for example in tokens
]
# merge sentences
tokens = [
[token for sentence in example for token in sentence]
for example in tokens
]
tokens = [[token for sentence in example for token in sentence] for example in tokens]
# prefix with [0] for [CLS]
token_type_ids = [
[0] + [i for sentence in example for i in sentence]
for example in token_type_ids
[0] + [i for sentence in example for i in sentence] for example in token_type_ids
]
# pad sequence
token_type_ids = [
x + [0] * (max_len - len(x)) for x in token_type_ids
]
token_type_ids = [x + [0] * (max_len - len(x)) for x in token_type_ids]
tokens = [["[CLS]"] + x for x in tokens]
# convert tokens to indices
@@ -168,13 +156,65 @@ class Tokenizer:
input_mask = [[min(1, x) for x in y] for y in tokens]
return tokens, input_mask, token_type_ids
def preprocess_encoder_tokens(self, tokens, max_len=BERT_MAX_LEN):
"""Preprocessing of input tokens:
- add BERT sentence markers ([CLS] and [SEP])
- map tokens to token indices in the BERT vocabulary
- pad and truncate sequences
- create an input_mask
- create token type ids, aka. segment ids
Args:
tokens (list): List of token lists to preprocess.
max_len (int, optional): Maximum number of tokens
(documents will be truncated or padded).
Defaults to 512.
Returns:
tuple: A tuple containing the following four lists
                list of preprocessed token lists
list of input id lists
list of input mask lists
list of token type id lists
"""
if max_len > BERT_MAX_LEN:
print("setting max_len to max allowed tokens: {}".format(BERT_MAX_LEN))
max_len = BERT_MAX_LEN
if isinstance(tokens[0][0], str):
tokens = [x[0 : max_len - 2] + ["[SEP]"] for x in tokens]
token_type_ids = None
else:
# get tokens for each sentence [[t00, t01, ...] [t10, t11,... ]]
tokens = [
self._truncate_seq_pair(sentence[0], sentence[1], max_len - 3)
for sentence in tokens
]
# construct token_type_ids
            # [[0, 0, 0, 0, ..., 0, 1, 1, 1, ..., 1], [0, 0, 0, ..., 1, 1], ...]
token_type_ids = [
[[i] * len(sentence) for i, sentence in enumerate(example)] for example in tokens
]
# merge sentences
tokens = [[token for sentence in example for token in sentence] for example in tokens]
# prefix with [0] for [CLS]
token_type_ids = [
[0] + [i for sentence in example for i in sentence] for example in token_type_ids
]
# pad sequence
token_type_ids = [x + [0] * (max_len - len(x)) for x in token_type_ids]
tokens = [["[CLS]"] + x for x in tokens]
# convert tokens to indices
input_ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokens]
# pad sequence
input_ids = [x + [0] * (max_len - len(x)) for x in input_ids]
# create input mask
input_mask = [[min(1, x) for x in y] for y in input_ids]
return tokens, input_ids, input_mask, token_type_ids
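    # Usage sketch for preprocess_encoder_tokens (the sample sentences are
    # invented for illustration):
    #
    #     tokenizer = Tokenizer(Language.ENGLISH, to_lower=True)
    #     tokens = tokenizer.tokenize(["the quick brown fox", "jumps over the dog"])
    #     tokens, input_ids, input_mask, token_type_ids = (
    #         tokenizer.preprocess_encoder_tokens(tokens, max_len=64)
    #     )
    #     # token_type_ids is None for single-sentence inputs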
def tokenize_ner(
self,
text,
max_len=BERT_MAX_LEN,
labels=None,
label_map=None,
trailing_piece_tag="X",
self, text, max_len=BERT_MAX_LEN, labels=None, label_map=None, trailing_piece_tag="X"
):
"""
Tokenize and preprocesses input word lists, involving the following steps
@@ -232,18 +272,12 @@ class Tokenizer:
return isinstance(obj, Iterable) and not isinstance(obj, str)
if max_len > BERT_MAX_LEN:
warnings.warn(
"setting max_len to max allowed tokens: {}".format(
BERT_MAX_LEN
)
)
warnings.warn("setting max_len to max allowed tokens: {}".format(BERT_MAX_LEN))
max_len = BERT_MAX_LEN
if not _is_iterable_but_not_string(text):
            # The input text must be a non-string Iterable
raise ValueError(
"Input text must be an iterable and not a string."
)
raise ValueError("Input text must be an iterable and not a string.")
else:
# If the input text is a single list of words, convert it to
# list of lists for later iteration
@@ -251,9 +285,7 @@ class Tokenizer:
text = [text]
if labels is not None:
if not _is_iterable_but_not_string(labels):
raise ValueError(
"labels must be an iterable and not a string."
)
raise ValueError("labels must be an iterable and not a string.")
else:
if not _is_iterable_but_not_string(labels[0]):
labels = [labels]
@@ -316,10 +348,7 @@ class Tokenizer:
new_labels += label_padding
trailing_token_mask_all.append(
[
True if label != trailing_piece_tag else False
for label in new_labels
]
[True if label != trailing_piece_tag else False for label in new_labels]
)
if label_map:
@@ -332,22 +361,13 @@ class Tokenizer:
label_ids_all.append(label_ids)
if label_available:
return (
input_ids_all,
input_mask_all,
trailing_token_mask_all,
label_ids_all,
)
return (input_ids_all, input_mask_all, trailing_token_mask_all, label_ids_all)
else:
return input_ids_all, input_mask_all, trailing_token_mask_all, None
def create_data_loader(
input_ids,
input_mask,
label_ids=None,
sample_method="random",
batch_size=32,
input_ids, input_mask, label_ids=None, sample_method="random", batch_size=32
):
"""
Create a dataloader for sampling and serving data batches.
@@ -377,9 +397,7 @@ def create_data_loader(
if label_ids:
label_ids_tensor = torch.tensor(label_ids, dtype=torch.long)
tensor_data = TensorDataset(
input_ids_tensor, input_mask_tensor, label_ids_tensor
)
tensor_data = TensorDataset(input_ids_tensor, input_mask_tensor, label_ids_tensor)
else:
tensor_data = TensorDataset(input_ids_tensor, input_mask_tensor)
@@ -389,12 +407,73 @@ def create_data_loader(
sampler = SequentialSampler(tensor_data)
else:
raise ValueError(
"Invalid sample_method value, accepted values are: "
"random and sequential."
"Invalid sample_method value, accepted values are: " "random and sequential."
)
dataloader = DataLoader(
tensor_data, sampler=sampler, batch_size=batch_size
)
dataloader = DataLoader(tensor_data, sampler=sampler, batch_size=batch_size)
return dataloader
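# Usage sketch (input_ids / input_mask / label_ids are the outputs of the
# tokenizer's preprocessing step; the names are illustrative):
#
#     dataloader = create_data_loader(
#         input_ids, input_mask, label_ids=label_ids,
#         sample_method="random", batch_size=32,
#     )
#     for token_ids_batch, mask_batch, label_batch in dataloader:
#         ...  # one training step per batch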
class TextDataset(Dataset):
"""
    A PyTorch Dataset backed by a file in which each row is a training example.
    Rows are loaded lazily, so large files do not need to fit in memory.
"""
def __init__(self, filename):
"""
Initialization. We set the filename and number of lines in the file.
Args:
filename(str): Name of the file.
"""
self._filename = filename
        # Count lines with `wc -l`; subtract 1, assuming the file has a header row.
        self._total_data = (
            int(subprocess.check_output("wc -l " + filename, shell=True).split()[0]) - 1
        )
def __len__(self):
"""Denotes the total number of samples in the file."""
return self._total_data
@staticmethod
def _cast(row):
return [int(x.strip()) for x in row]
def __getitem__(self, index):
"""
Generates one sample of data. We assume that the last column is label here. We use
linecache to load files lazily.
Args:
index(int): Index of the test case.
Returns(list, list, int): Returns the tokens, mask and label for a single item.
"""
line = linecache.getline(self._filename, index + 1)
row = next(csv.reader([line]))
tokens = self._cast(row[0][1:-1].split(","))
mask = self._cast(row[1][1:-1].split(","))
return (
torch.tensor(tokens, dtype=torch.long),
torch.tensor(mask, dtype=torch.long),
torch.tensor(int(row[2]), dtype=torch.long),
)
def get_dataset_multiple_files(files):
""" Get dataset from multiple files
Args:
files(list): List of paths to the files.
Returns:
torch.utils.data.Dataset : A combined dataset of all files in the directory.
"""
datasets = [TextDataset(x) for x in files]
return ConcatDataset(datasets)
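# Usage sketch (file paths are invented; each row of the CSV files is expected
# to hold a token-id list, a mask list, and an integer label, as parsed above):
#
#     dataset = get_dataset_multiple_files(["train_0.csv", "train_1.csv"])
#     loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
#     tokens, mask, label = dataset[0]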

View file

@@ -44,7 +44,7 @@ class BERTSequenceClassifier:
# create classifier
self.model = BertForSequenceClassification.from_pretrained(
language.value, cache_dir=cache_dir, num_labels=num_labels
language, cache_dir=cache_dir, num_labels=num_labels
)
def fit(

View file

@@ -0,0 +1,325 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import logging
import horovod.torch as hvd
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data.distributed
from torch.utils.data import TensorDataset
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from tqdm import tqdm
from utils_nlp.models.bert.common import Language
from utils_nlp.models.bert.common import get_dataset_multiple_files
from utils_nlp.common.pytorch_utils import get_device, move_to_device
logger = logging.getLogger(__name__)
hvd.init()
torch.manual_seed(42)
if torch.cuda.is_available():
# Horovod: pin GPU to local rank.
torch.cuda.set_device(hvd.local_rank())
torch.cuda.manual_seed(42)
class BERTSequenceDistClassifier:
"""Distributed BERT-based sequence classifier"""
def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir="."):
"""Initializes the classifier and the underlying pretrained model.
Args:
language (Language, optional): The pretrained model's language.
Defaults to Language.ENGLISH.
num_labels (int, optional): The number of unique labels in the
training data. Defaults to 2.
cache_dir (str, optional): Location of BERT's cache directory.
Defaults to ".".
"""
if num_labels < 2:
raise ValueError("Number of labels should be at least 2.")
self.language = language
self.num_labels = num_labels
self.cache_dir = cache_dir
self.kwargs = (
{"num_workers": 1, "pin_memory": True}
if torch.cuda.is_available()
else {}
)
# create classifier
self.model = BertForSequenceClassification.from_pretrained(
language.value, num_labels=num_labels
)
    def fit(
        self,
        token_ids=None,
        input_mask=None,
        labels=None,
        token_type_ids=None,
        input_files=None,
        num_gpus=1,
        num_epochs=1,
        batch_size=32,
        lr=2e-5,
        warmup_proportion=None,
        verbose=True,
        fp16_allreduce=False,
    ):
"""fine-tunes the bert classifier using the given training data.
args:
input_files(list, required): list of paths to the training data files.
token_ids (list): List of training token id lists.
input_mask (list): List of input mask lists.
labels (list): List of training labels.
token_type_ids (list, optional): List of lists. Each sublist
contains segment ids indicating if the token belongs to
the first sentence(0) or second sentence(1). Only needed
for two-sentence tasks.
num_gpus (int, optional): the number of gpus to use.
if none is specified, all available gpus
will be used. defaults to none.
num_epochs (int, optional): number of training epochs.
defaults to 1.
batch_size (int, optional): training batch size. defaults to 32.
lr (float): learning rate of the adam optimizer. defaults to 2e-5.
warmup_proportion (float, optional): proportion of training to
perform linear learning rate warmup for. e.g., 0.1 = 10% of
training. defaults to none.
verbose (bool, optional): if true, shows the training progress and
loss values. defaults to true.
fp16_allreduce(bool, optional)L if true, use fp16 compression during allreduce
"""
if input_files is not None:
train_dataset = get_dataset_multiple_files(input_files)
else:
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
labels_tensor = torch.tensor(labels, dtype=torch.long)
if token_type_ids:
token_type_ids_tensor = torch.tensor(
token_type_ids, dtype=torch.long
)
train_dataset = TensorDataset(
token_ids_tensor,
input_mask_tensor,
token_type_ids_tensor,
labels_tensor,
)
else:
train_dataset = TensorDataset(
token_ids_tensor, input_mask_tensor, labels_tensor
)
train_sampler = torch.utils.data.distributed.DistributedSampler(
train_dataset, num_replicas=hvd.size(), rank=hvd.rank()
)
train_loader = torch.utils.data.DataLoader(
train_dataset,
batch_size=batch_size,
sampler=train_sampler,
**self.kwargs
)
device = get_device()
self.model.cuda()
hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
# hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# define loss function
loss_func = nn.CrossEntropyLoss().to(device)
# define optimizer and model parameters
param_optimizer = list(self.model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [
p
for n, p in param_optimizer
if not any(nd in n for nd in no_decay)
],
"weight_decay": 0.01,
},
{
"params": [
p
for n, p in param_optimizer
if any(nd in n for nd in no_decay)
]
},
]
num_examples = len(train_dataset)
num_batches = int(num_examples / batch_size)
num_train_optimization_steps = num_batches * num_epochs
if warmup_proportion is None:
optimizer = BertAdam(
optimizer_grouped_parameters, lr=lr * hvd.size()
)
else:
optimizer = BertAdam(
optimizer_grouped_parameters,
lr=lr * hvd.size(),
t_total=num_train_optimization_steps,
warmup=warmup_proportion,
)
# Horovod: (optional) compression algorithm.
compression = (
hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none
)
# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
optimizer,
named_parameters=self.model.named_parameters(),
compression=compression,
)
# Horovod: set epoch to sampler for shuffling.
for epoch in range(num_epochs):
self.model.train()
train_sampler.set_epoch(epoch)
for batch_idx, batch in enumerate(train_loader):
if token_type_ids:
x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
t.to(device) for t in batch
)
else:
token_type_ids_batch = None
x_batch, mask_batch, y_batch = tuple(
t.to(device) for t in batch
)
optimizer.zero_grad()
output = self.model(
input_ids=x_batch, attention_mask=mask_batch, labels=None
)
loss = loss_func(output, y_batch).mean()
loss.backward()
optimizer.step()
if verbose and (batch_idx % ((num_batches // 10) + 1)) == 0:
# Horovod: use train_sampler to determine the number of examples in
# this worker's partition.
print(
"Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
epoch,
batch_idx * len(x_batch),
len(train_sampler),
100.0 * batch_idx / len(train_loader),
loss.item(),
)
)
# empty cache
torch.cuda.empty_cache()
    def predict(
        self,
        token_ids=None,
        input_mask=None,
        token_type_ids=None,
        input_files=None,
        num_gpus=1,
        batch_size=32,
        probabilities=False,
    ):
"""Scores the given set of train files and returns the predicted classes.
Args:
input_files(list, required): list of paths to the test data files.
token_ids (list): List of training token lists.
input_mask (list): List of input mask lists.
token_type_ids (list, optional): List of lists. Each sublist
contains segment ids indicating if the token belongs to
the first sentence(0) or second sentence(1). Only needed
for two-sentence tasks.
num_gpus (int, optional): The number of gpus to use.
If None is specified, all available GPUs
will be used. Defaults to None.
batch_size (int, optional): Scoring batch size. Defaults to 32.
probabilities (bool, optional):
If True, the predicted probability distribution
is also returned. Defaults to False.
Returns:
1darray, dict(1darray, 1darray, ndarray): Predicted classes and target labels or
a dictionary with classes, target labels, probabilities) if probabilities is True.
"""
if input_files is not None:
test_dataset = get_dataset_multiple_files(input_files)
else:
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
if token_type_ids:
token_type_ids_tensor = torch.tensor(
token_type_ids, dtype=torch.long
)
test_dataset = TensorDataset(
token_ids_tensor, input_mask_tensor, token_type_ids_tensor
)
else:
test_dataset = TensorDataset(token_ids_tensor, input_mask_tensor)
        # A SequentialSampler (not a DistributedSampler) is used here, so each
        # worker scores the full test set in order.
        test_sampler = torch.utils.data.sampler.SequentialSampler(test_dataset)
test_loader = torch.utils.data.DataLoader(
test_dataset,
batch_size=batch_size,
sampler=test_sampler,
**self.kwargs
)
device = get_device()
self.model = move_to_device(self.model, device, num_gpus)
self.model.eval()
preds = []
labels_test = []
with tqdm(total=len(test_loader)) as pbar:
for i, (tokens, mask, target) in enumerate(test_loader):
if torch.cuda.is_available():
tokens, mask, target = (
tokens.cuda(),
mask.cuda(),
target.cuda(),
)
with torch.no_grad():
p_batch = self.model(
input_ids=tokens, attention_mask=mask, labels=None
)
preds.append(p_batch.cpu())
labels_test.append(target.cpu())
if i % batch_size == 0:
pbar.update(batch_size)
preds = np.concatenate(preds)
labels_test = np.concatenate(labels_test)
if probabilities:
return {
"Predictions": preds.argmax(axis=1),
"Target": labels_test,
"classes probabilities": nn.Softmax(dim=1)(
torch.Tensor(preds)
).numpy(),
}
else:
return preds.argmax(axis=1), labels_test
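# Usage sketch: the classifier is intended to be launched under Horovod,
# e.g. `horovodrun -np 4 python train.py`. The file paths are illustrative:
#
#     classifier = BERTSequenceDistClassifier(language=Language.ENGLISH, num_labels=2)
#     classifier.fit(input_files=["train_0.csv", "train_1.csv"], num_epochs=1)
#     preds, labels = classifier.predict(input_files=["test_0.csv"])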

View file

@@ -0,0 +1,251 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# This script reuses code from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples
# /extract_features.py, with necessary modifications.
from pytorch_pretrained_bert.modeling import BertModel
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from enum import Enum
import numpy as np
import pandas as pd
import torch
from torch.utils.data import (
    DataLoader,
    SequentialSampler,
    TensorDataset,
)
from utils_nlp.models.bert.common import Language, Tokenizer
class PoolingStrategy(str, Enum):
"""Enumerate pooling strategies"""
    MAX: str = "max"
    MEAN: str = "mean"
    CLS: str = "cls"
class BERTSentenceEncoder:
"""BERT-based sentence encoder"""
def __init__(
self,
bert_model=None,
tokenizer=None,
language=Language.ENGLISH,
num_gpus=None,
cache_dir=".",
to_lower=True,
max_len=512,
layer_index=-1,
pooling_strategy=PoolingStrategy.MEAN,
):
"""Initialize the encoder's underlying model and tokenizer
Args:
bert_model: BERT model to use for encoding. Defaults to pretrained BertModel.
tokenizer: Tokenizer to use for preprocessing. Defaults to pretrained BERT tokenizer.
language: The pretrained model's language. Defaults to Language.ENGLISH.
            num_gpus: The number of GPUs to use. Defaults to None, in which case all available GPUs are used.
            cache_dir: Location of BERT's cache directory. Defaults to "."
            to_lower: True to lowercase before tokenization. Defaults to True.
max_len: Maximum number of tokens.
layer_index: The layer from which to extract features.
Defaults to the last layer; can also be a list of integers for experimentation.
pooling_strategy: Pooling strategy to aggregate token embeddings into sentence embedding.
"""
self.model = (
bert_model.model.bert
if bert_model
else BertModel.from_pretrained(language, cache_dir=cache_dir)
)
self.tokenizer = (
tokenizer
if tokenizer
else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir)
)
self.num_gpus = num_gpus
self.max_len = max_len
self.layer_index = layer_index
self.pooling_strategy = pooling_strategy
@property
def layer_index(self):
return self._layer_index
@layer_index.setter
def layer_index(self, layer_index):
if isinstance(layer_index, int):
self._layer_index = [layer_index]
else:
self._layer_index = layer_index
@property
def pooling_strategy(self):
return self._pooling_strategy
@pooling_strategy.setter
def pooling_strategy(self, pooling_strategy):
self._pooling_strategy = pooling_strategy
def get_hidden_states(self, text, batch_size=32):
"""Extract the hidden states from the pretrained model
Args:
text: List of documents to extract features from.
batch_size: Batch size, defaults to 32.
Returns:
pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]).
"""
device = get_device("cpu" if self.num_gpus == 0 else "gpu")
self.model = move_to_device(self.model, device, self.num_gpus)
self.model.eval()
tokens = self.tokenizer.tokenize(text)
tokens, input_ids, input_mask, input_type_ids = self.tokenizer.preprocess_encoder_tokens(
tokens, max_len=self.max_len
)
input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
input_mask = torch.tensor(input_mask, dtype=torch.long, device=device)
        # Note: these are example indices used to map batches back to their
        # source texts (not BERT segment ids).
        example_indices = torch.arange(
            input_ids.size(0), dtype=torch.long, device=device
        )
        eval_data = TensorDataset(input_ids, input_mask, example_indices)
eval_dataloader = DataLoader(
eval_data,
sampler=SequentialSampler(eval_data),
batch_size=batch_size,
)
hidden_states = {
"text_index": [],
"token": [],
"layer_index": [],
"values": [],
}
for (
input_ids_tensor,
input_mask_tensor,
example_indices_tensor,
) in eval_dataloader:
with torch.no_grad():
all_encoder_layers, _ = self.model(
input_ids_tensor,
token_type_ids=None,
attention_mask=input_mask_tensor,
)
self.embedding_dim = all_encoder_layers[0].size()[-1]
for b, example_index in enumerate(example_indices_tensor):
for (i, token) in enumerate(tokens[example_index.item()]):
for (j, layer_index) in enumerate(self.layer_index):
layer_output = (
all_encoder_layers[int(layer_index)]
.detach()
.cpu()
.numpy()
)
layer_output = layer_output[b]
hidden_states["text_index"].append(
example_index.item()
)
hidden_states["token"].append(token)
hidden_states["layer_index"].append(layer_index)
hidden_states["values"].append(
[round(x.item(), 6) for x in layer_output[i]]
)
# empty cache
del [input_ids_tensor, input_mask_tensor, example_indices_tensor]
torch.cuda.empty_cache()
# empty cache
        del [input_ids, input_mask, example_indices]
torch.cuda.empty_cache()
return pd.DataFrame.from_dict(hidden_states)
def pool(self, df):
"""Pooling to aggregate token-wise embeddings to sentence embeddings
Args:
df: pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float])
pooling_strategy: The pooling strategy to use
Returns:
pd.DataFrame grouped by text index and layer index
"""
def max_pool(x):
values = np.array(
[
np.reshape(np.array(x.values[i]), self.embedding_dim)
for i in range(x.values.shape[0])
]
)
m, _ = torch.max(torch.tensor(values, dtype=torch.float), 0)
return m.numpy()
def mean_pool(x):
values = np.array(
[
np.reshape(np.array(x.values[i]), self.embedding_dim)
for i in range(x.values.shape[0])
]
)
return torch.mean(
torch.tensor(values, dtype=torch.float), 0
).numpy()
def cls_pool(x):
values = np.array(
[
np.reshape(np.array(x.values[i]), self.embedding_dim)
for i in range(x.values.shape[0])
]
)
return values[0]
        if self.pooling_strategy == "max":
            pool_func = max_pool
        elif self.pooling_strategy == "mean":
            pool_func = mean_pool
        elif self.pooling_strategy == "cls":
            pool_func = cls_pool
        else:
            # Fail fast instead of swallowing the error: a caught-and-printed
            # exception here would resurface later as a NameError on pool_func.
            raise ValueError("Please enter a valid pooling strategy")
return df.groupby(["text_index", "layer_index"])["values"].apply(lambda x: pool_func(x)).reset_index()
    def encode(self, text, batch_size=32, as_numpy=False):
"""Computes sentence encodings
Args:
text: List of documents to encode.
batch_size: Batch size, defaults to 32.
"""
df = self.get_hidden_states(text, batch_size)
pooled = self.pool(df)
if as_numpy:
return np.array(pooled["values"].tolist())
else:
return pooled
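# Usage sketch (the sentences are invented for illustration):
#
#     encoder = BERTSentenceEncoder(
#         language=Language.ENGLISH,
#         layer_index=-2,
#         pooling_strategy=PoolingStrategy.MEAN,
#     )
#     embeddings = encoder.encode(["hello world", "how are you"], as_numpy=True)
#     # one row per (sentence, layer); each row has hidden-size dimensions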

View file

@@ -52,7 +52,7 @@ class BERTTokenClassifier:
self.cache_dir = cache_dir
self.model = BertForTokenClassification.from_pretrained(
language.value, cache_dir=cache_dir, num_labels=num_labels
language, cache_dir=cache_dir, num_labels=num_labels
)
def _get_optimizer(

View file

@@ -89,13 +89,13 @@ def _split_and_cleanup(split_map, data_path):
"snli_1.0_{}.txt.s2.tok".format(file_split),
)
with open(s1_tok_path, "r") as fin, open(
"{}.tmp".format(s1_tok_path), "w"
"{}.tmp".format(s1_tok_path), "w"
) as tmp:
for line in fin:
s = line.replace('"', "")
tmp.write(s)
with open(s2_tok_path, "r") as fin, open(
"{}.tmp".format(s2_tok_path), "w"
"{}.tmp".format(s2_tok_path), "w"
) as tmp:
for line in fin:
s = line.replace('"', "")

View file

@@ -11,14 +11,11 @@ import pickle
import numpy as np
import torch
from azureml.core.run import Run
from sklearn.utils import shuffle
from torch.autograd import Variable
# Change to python3+.
# from itertools import zip
# get the Azure ML run object
run = Run.get_context()
class DataIterator(object):
@@ -393,7 +390,7 @@ class NLIIterator(DataIterator):
test(torch.Tensor): Testing dataset.
vocab_size(int): The size of the vocabulary.
lowercase(bool): If lowercase the dataset.
vocab(list): The list of the vocabulary.
        vocab(Union[bytes, str]): The vocabulary.
"""
self.seed = seed
self.train = train