Added test code for NER utils.

This commit is contained in:
Ubuntu 2019-06-13 18:25:27 +00:00
Parent 6d671b6221
Commit 5438d76596
6 changed files: 225 additions and 32 deletions

View file

@@ -3,8 +3,8 @@ repos:
  rev: stable
  hooks:
  - id: black
    language_version: python3.6
    language_version: python3.7
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: v1.2.3
  hooks:
  - id: flake8
  - id: flake8

View file

@@ -4,6 +4,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"*Copyright (c) Microsoft Corporation. All rights reserved.* \n",
"*Licensed under the MIT License.*\n",
"# Named Entity Recognition Using BERT\n",
"## Summary\n",
"This notebook demonstrates how to fine tune [pretrained BERT model](https://github.com/huggingface/pytorch-pretrained-BERT) for named entity recognition (NER) task. Utility functions and classes in the NLP Best Practices repo are used to facilitate data preprocessing, model training, and model evaluation. \n",

View file

@@ -1,5 +1,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import pytest
nlp_path = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -7,23 +11,95 @@ nlp_path = os.path.dirname(
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)
from utils_nlp.bert.common import Tokenizer
from utils_nlp.bert.common import Tokenizer, create_data_loader, Language
INPUT_TEXT = ["Johnathan is studying in the University of Michigan."]
INPUT_LABELS = [["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]]
INPUT_TOKEN_IDS = [
    [
        1287,
        9779,
        1389,
        1110,
        5076,
        1107,
        1103,
        1239,
        1104,
        3312,
        119,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]
]
INPUT_LABEL_IDS = [
    [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]
INPUT_MASK = [[1] * 11 + [0] * 9]
INPUT_TEXT = ["Sarah is studying in the library."]
INPUT_LABELS = [["I-PER", "O", "O", "O", "O", "I-LOC"]]
UNIQUE_LABELS = ["O", "I-LOC", "I-MISC", "I-PER", "I-ORG", "X"]
LABEL_MAP = {label: i for i, label in enumerate(UNIQUE_LABELS)}
def test_tokenizer_preprocess_ner_tokens():
    pass
    # tokenizer = Tokenizer()
    expected_trailing_token_mask = [[True] * 20]
    false_pos = [1, 2, 10]
    for p in false_pos:
        expected_trailing_token_mask[0][p] = False
    expected_label_ids = [
        [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    ]
    seq_length = 20
    # preprocessed_tokens = tokenizer.preprocess_ner_tokens(
    #     text=INPUT_TEXT, labels=INPUT_LABELS, label_map=LABEL_MAP
    # )
    tokenizer = Tokenizer(language=Language.ENGLISHCASED, to_lower=False)

    # test providing labels
    preprocessed_tokens = tokenizer.preprocess_ner_tokens(
        text=INPUT_TEXT,
        labels=INPUT_LABELS,
        label_map=LABEL_MAP,
        max_len=seq_length,
    )
    assert len(preprocessed_tokens[0][0]) == seq_length
    assert len(preprocessed_tokens[1][0]) == seq_length
    assert preprocessed_tokens[2] == expected_trailing_token_mask
    assert preprocessed_tokens[3] == expected_label_ids

    # test not providing labels
    preprocessed_tokens = tokenizer.preprocess_ner_tokens(
        text=INPUT_TEXT, label_map=LABEL_MAP, max_len=20
    )
    assert preprocessed_tokens[2] == expected_trailing_token_mask


def test_create_data_loader():
    pass
    with pytest.raises(ValueError):
        create_data_loader(
            input_ids=INPUT_TOKEN_IDS,
            input_mask=INPUT_MASK,
            label_ids=INPUT_LABEL_IDS,
            sample_method="dummy",
        )

    create_data_loader(
        input_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        label_ids=INPUT_LABEL_IDS,
        sample_method="sequential",
    )

    create_data_loader(
        input_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        label_ids=INPUT_LABEL_IDS,
        sample_method="random",
    )

View file

@@ -1,5 +1,10 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import shutil
import pytest
nlp_path = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -7,7 +12,111 @@ nlp_path = os.path.dirname(
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)
# from utils_nlp.bert.token_classification import (
#     BERTTokenClassifier,
#     postprocess_token_labels,
# )
from utils_nlp.bert.token_classification import (
    BERTTokenClassifier,
    postprocess_token_labels,
)
# Test data
INPUT_TOKEN_IDS = [
    [
        1287,
        9779,
        1389,
        1110,
        5076,
        1107,
        1103,
        1239,
        1104,
        3312,
        119,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]
]
INPUT_LABEL_IDS = [
    [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]
INPUT_MASK = [[1] * 11 + [0] * 9]
PREDICTED_LABELS = [
    [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]
TRAILING_TOKEN_MASK = [[True] * 20]
false_pos = [1, 2, 10]
for p in false_pos:
    TRAILING_TOKEN_MASK[0][p] = False
UNIQUE_LABELS = ["O", "I-LOC", "I-MISC", "I-PER", "I-ORG", "X"]
LABEL_MAP = {label: i for i, label in enumerate(UNIQUE_LABELS)}
CACHE_DIR = "./test_bert_token_cache"
def test_token_classifier_num_labels():
    with pytest.raises(ValueError):
        BERTTokenClassifier(num_labels=1)


def test_token_classifier_fit_predict():
    token_classifier = BERTTokenClassifier(num_labels=6, cache_dir=CACHE_DIR)

    # test fit, no warmup
    token_classifier.fit(
        token_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        labels=INPUT_LABEL_IDS,
    )

    # test fit, with warmup
    token_classifier.fit(
        token_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        labels=INPUT_LABEL_IDS,
        warmup_proportion=0.1,
    )

    # test predict, no labels
    token_classifier.predict(token_ids=INPUT_TOKEN_IDS, input_mask=INPUT_MASK)

    # test predict, with labels
    token_classifier.predict(
        token_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        labels=INPUT_LABEL_IDS,
    )

    shutil.rmtree(CACHE_DIR)


def test_postprocess_token_labels():
    expected_labels_no_padding = [
        ["I-PER", "X", "X", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "X"]
    ]
    labels_no_padding = postprocess_token_labels(
        labels=PREDICTED_LABELS, input_mask=INPUT_MASK, label_map=LABEL_MAP
    )
    assert labels_no_padding == expected_labels_no_padding


def test_postprocess_token_labels_remove_trailing():
    expected_postprocessed_labels = [
        ["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]
    ]
    labels_no_padding_no_trailing = postprocess_token_labels(
        labels=PREDICTED_LABELS,
        input_mask=INPUT_MASK,
        label_map=LABEL_MAP,
        remove_trailing_word_pieces=True,
        trailing_token_mask=TRAILING_TOKEN_MASK,
    )
    assert labels_no_padding_no_trailing == expected_postprocessed_labels

View file

@@ -12,7 +12,6 @@ from torch.utils.data import (
    SequentialSampler,
    TensorDataset,
)
from torch.utils.data.distributed import DistributedSampler
# Max supported sequence length
BERT_MAX_LEN = 512
@@ -157,7 +156,7 @@ class Tokenizer:
        if labels is None:
            label_available = False
            # create an artificial label list for creating trailing token mask
            labels = ["O"] * len(text)
            labels = [["O"] * len(text)]

        input_ids_all = []
        input_mask_all = []
@@ -166,18 +165,28 @@ class Tokenizer:
        for t, t_labels in zip(text, labels):
            new_labels = []
            tokens = []
            for word, tag in zip(t.split(), t_labels):
                sub_words = self.tokenizer.tokenize(word)
                for count, sub_word in enumerate(sub_words):
                    if count > 0:
                        tag = trailing_piece_tag
                    new_labels.append(tag)
                    tokens.append(sub_word)
            if label_available:
                for word, tag in zip(t.split(), t_labels):
                    sub_words = self.tokenizer.tokenize(word)
                    for count, sub_word in enumerate(sub_words):
                        if count > 0:
                            tag = trailing_piece_tag
                        new_labels.append(tag)
                        tokens.append(sub_word)
            else:
                for word in t.split():
                    sub_words = self.tokenizer.tokenize(word)
                    for count, sub_word in enumerate(sub_words):
                        if count > 0:
                            tag = trailing_piece_tag
                        else:
                            tag = "O"
                        new_labels.append(tag)
                        tokens.append(sub_word)

            if len(tokens) > max_len:
                tokens = tokens[:max_len]
                new_labels = new_labels[:max_len]

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens.
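As a concrete illustration of the trailing-piece handling above (worked out from the test data added in this commit; the exact WordPiece splits are assumed rather than captured from the tokenizer): in "Johnathan is studying in the University of Michigan.", "Johnathan" splits into three word pieces (roughly "John", "##ath", "##an") and "Michigan." splits into "Michigan" and "."; only the first piece of each word keeps the word's label, and every trailing piece receives the trailing_piece_tag "X":

tokens: John   ##ath  ##an  is  studying  in  the  University  of     Michigan  .
labels: I-PER  X      X     O   O         O   O    I-ORG       I-ORG  I-ORG     X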
@@ -239,8 +248,7 @@ def create_data_loader(
            each sublist contains token labels of an input
            sentence/paragraph. Default value is None.
        sample_method (str, optional): Order of data sampling. Accepted
            values are "random", "sequential" and "distributed". Default
            value is "random".
            values are "random" and "sequential". Default value is "random".
        batch_size (int, optional): Number of samples used in each training
            iteration. Default value is 32.
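As a usage note on the narrowed sample_method values (a sketch built from the keyword arguments exercised by the new test and the batch_size parameter documented above):

# "distributed" is no longer accepted; pass "random" (the default) or "sequential".
data_loader = create_data_loader(
    input_ids=INPUT_TOKEN_IDS,
    input_mask=INPUT_MASK,
    label_ids=INPUT_LABEL_IDS,
    sample_method="sequential",
    batch_size=32,
)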
@@ -264,8 +272,6 @@ def create_data_loader(
        sampler = RandomSampler(tensor_data)
    elif sample_method == "sequential":
        sampler = SequentialSampler(tensor_data)
    elif sample_method == "distributed":
        sampler = DistributedSampler(tensor_data)
    else:
        raise ValueError(
            "Invalid sample_method value, accepted values are: "

View file

@@ -45,7 +45,7 @@ class BERTTokenClassifier:
        """
        if num_labels < 2:
            raise Exception("Number of labels should be at least 2.")
            raise ValueError("Number of labels should be at least 2.")

        self.language = language
        self.num_labels = num_labels
@@ -150,8 +150,8 @@ class BERTTokenClassifier:
        else:
            num_gpus_used = min(num_gpus, torch.cuda.device_count())

        num_train_optimization_steps = (
            int(len(token_ids) / batch_size) * num_epochs
        num_train_optimization_steps = max(
            (int(len(token_ids) / batch_size) * num_epochs), 1
        )
        optimizer = self._get_optimizer(
            learning_rate=learning_rate,
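Note on the change above: int(len(token_ids) / batch_size) rounds down to 0 whenever the dataset is smaller than one batch, as with the single-sentence inputs used by the new unit tests, so wrapping the product in max(..., 1) guarantees at least one optimization step during fit.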