Added test code for NER utils.
Parent: 6d671b6221
Commit: 5438d76596
@@ -3,8 +3,8 @@ repos:
  rev: stable
  hooks:
  - id: black
    language_version: python3.6
    language_version: python3.7
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: v1.2.3
  hooks:
  - id: flake8
  - id: flake8
@@ -4,6 +4,8 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Copyright (c) Microsoft Corporation. All rights reserved.* \n",
    "*Licensed under the MIT License.*\n",
    "# Named Entity Recognition Using BERT\n",
    "## Summary\n",
    "This notebook demonstrates how to fine-tune a [pretrained BERT model](https://github.com/huggingface/pytorch-pretrained-BERT) for the named entity recognition (NER) task. Utility functions and classes in the NLP Best Practices repo are used to facilitate data preprocessing, model training, and model evaluation. \n",
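For orientation, the sketch below strings together the utilities exercised by this commit into an end-to-end NER flow. It is a minimal, hypothetical example: the call signatures and the four-element return order of `preprocess_ner_tokens` (token ids, input mask, trailing-token mask, label ids) are inferred from the new tests further down, the behavior of `predict` is assumed, and the cache directory is a placeholder.

```python
# Hypothetical end-to-end sketch; signatures and return order are assumptions
# inferred from the tests added in this commit.
from utils_nlp.bert.common import Tokenizer, Language
from utils_nlp.bert.token_classification import (
    BERTTokenClassifier,
    postprocess_token_labels,
)

text = ["Johnathan is studying in the University of Michigan."]
labels = [["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]]
unique_labels = ["O", "I-LOC", "I-MISC", "I-PER", "I-ORG", "X"]
label_map = {label: i for i, label in enumerate(unique_labels)}

tokenizer = Tokenizer(language=Language.ENGLISHCASED, to_lower=False)
# Assumed return order: token ids, input mask, trailing-token mask, label ids.
token_ids, input_mask, trailing_mask, label_ids = tokenizer.preprocess_ner_tokens(
    text=text, labels=labels, label_map=label_map, max_len=20
)

classifier = BERTTokenClassifier(num_labels=len(unique_labels), cache_dir="./bert_cache")
classifier.fit(token_ids=token_ids, input_mask=input_mask, labels=label_ids)

# Assuming predict returns per-token label ids that postprocess_token_labels maps back.
predictions = classifier.predict(token_ids=token_ids, input_mask=input_mask)
predicted_tags = postprocess_token_labels(
    labels=predictions, input_mask=input_mask, label_map=label_map
)
```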
@@ -1,5 +1,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import os
import sys
import pytest

nlp_path = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -7,23 +11,95 @@ nlp_path = os.path.dirname(
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)

from utils_nlp.bert.common import Tokenizer
from utils_nlp.bert.common import Tokenizer, create_data_loader, Language

INPUT_TEXT = ["Johnathan is studying in the University of Michigan."]
INPUT_LABELS = [["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]]
INPUT_TOKEN_IDS = [
    [
        1287,
        9779,
        1389,
        1110,
        5076,
        1107,
        1103,
        1239,
        1104,
        3312,
        119,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]
]
INPUT_LABEL_IDS = [
    [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]
INPUT_MASK = [[1] * 11 + [0] * 9]

INPUT_TEXT = ["Sarah is studying in the library."]
INPUT_LABELS = [["I-PER", "O", "O", "O", "O", "I-LOC"]]

UNIQUE_LABELS = ["O", "I-LOC", "I-MISC", "I-PER", "I-ORG", "X"]
LABEL_MAP = {label: i for i, label in enumerate(UNIQUE_LABELS)}


def test_tokenizer_preprocess_ner_tokens():
    pass
    # tokenizer = Tokenizer()
    expected_trailing_token_mask = [[True] * 20]
    false_pos = [1, 2, 10]
    for p in false_pos:
        expected_trailing_token_mask[0][p] = False
    expected_label_ids = [
        [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    ]
    seq_length = 20

    # preprocessed_tokens = tokenizer.preprocess_ner_tokens(
    #     text=INPUT_TEXT, labels=INPUT_LABELS, label_map=LABEL_MAP
    # )
    tokenizer = Tokenizer(language=Language.ENGLISHCASED, to_lower=False)

    # test providing labels
    preprocessed_tokens = tokenizer.preprocess_ner_tokens(
        text=INPUT_TEXT,
        labels=INPUT_LABELS,
        label_map=LABEL_MAP,
        max_len=seq_length,
    )

    assert len(preprocessed_tokens[0][0]) == seq_length
    assert len(preprocessed_tokens[1][0]) == seq_length
    assert preprocessed_tokens[2] == expected_trailing_token_mask
    assert preprocessed_tokens[3] == expected_label_ids

    # test not providing labels
    preprocessed_tokens = tokenizer.preprocess_ner_tokens(
        text=INPUT_TEXT, label_map=LABEL_MAP, max_len=20
    )
    assert preprocessed_tokens[2] == expected_trailing_token_mask


def test_create_data_loader():
    pass
    with pytest.raises(ValueError):
        create_data_loader(
            input_ids=INPUT_TOKEN_IDS,
            input_mask=INPUT_MASK,
            label_ids=INPUT_LABEL_IDS,
            sample_method="dummy",
        )

    create_data_loader(
        input_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        label_ids=INPUT_LABEL_IDS,
        sample_method="sequential",
    )

    create_data_loader(
        input_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        label_ids=INPUT_LABEL_IDS,
        sample_method="random",
    )
@@ -1,5 +1,10 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import os
import sys
import shutil
import pytest

nlp_path = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -7,7 +12,111 @@ nlp_path = os.path.dirname(
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)

# from utils_nlp.bert.token_classification import (
#     BERTTokenClassifier,
#     postprocess_token_labels,
# )
from utils_nlp.bert.token_classification import (
    BERTTokenClassifier,
    postprocess_token_labels,
)

# Test data
INPUT_TOKEN_IDS = [
    [
        1287,
        9779,
        1389,
        1110,
        5076,
        1107,
        1103,
        1239,
        1104,
        3312,
        119,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]
]
INPUT_LABEL_IDS = [
    [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]
INPUT_MASK = [[1] * 11 + [0] * 9]
PREDICTED_LABELS = [
    [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]
TRAILING_TOKEN_MASK = [[True] * 20]
false_pos = [1, 2, 10]
for p in false_pos:
    TRAILING_TOKEN_MASK[0][p] = False

UNIQUE_LABELS = ["O", "I-LOC", "I-MISC", "I-PER", "I-ORG", "X"]
LABEL_MAP = {label: i for i, label in enumerate(UNIQUE_LABELS)}

CACHE_DIR = "./test_bert_token_cache"


def test_token_classifier_num_labels():
    with pytest.raises(ValueError):
        BERTTokenClassifier(num_labels=1)


def test_token_classifier_fit_predict():
    token_classifier = BERTTokenClassifier(num_labels=6, cache_dir=CACHE_DIR)

    # test fit, no warmup
    token_classifier.fit(
        token_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        labels=INPUT_LABEL_IDS,
    )

    # test fit, with warmup
    token_classifier.fit(
        token_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        labels=INPUT_LABEL_IDS,
        warmup_proportion=0.1,
    )
    # test predict, no labels
    token_classifier.predict(token_ids=INPUT_TOKEN_IDS, input_mask=INPUT_MASK)

    # test predict, with labels
    token_classifier.predict(
        token_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        labels=INPUT_LABEL_IDS,
    )
    shutil.rmtree(CACHE_DIR)


def test_postprocess_token_labels():
    expected_labels_no_padding = [
        ["I-PER", "X", "X", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "X"]
    ]

    labels_no_padding = postprocess_token_labels(
        labels=PREDICTED_LABELS, input_mask=INPUT_MASK, label_map=LABEL_MAP
    )

    assert labels_no_padding == expected_labels_no_padding


def test_postprocess_token_labels_remove_trailing():
    expected_postprocessed_labels = [
        ["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]
    ]

    labels_no_padding_no_trailing = postprocess_token_labels(
        labels=PREDICTED_LABELS,
        input_mask=INPUT_MASK,
        label_map=LABEL_MAP,
        remove_trailing_word_pieces=True,
        trailing_token_mask=TRAILING_TOKEN_MASK,
    )

    assert labels_no_padding_no_trailing == expected_postprocessed_labels
@@ -12,7 +12,6 @@ from torch.utils.data import (
    SequentialSampler,
    TensorDataset,
)
from torch.utils.data.distributed import DistributedSampler

# Max supported sequence length
BERT_MAX_LEN = 512
@@ -157,7 +156,7 @@ class Tokenizer:
        if labels is None:
            label_available = False
            # create an artificial label list for creating trailing token mask
            labels = ["O"] * len(text)
            labels = [["O"] * len(text)]

        input_ids_all = []
        input_mask_all = []
@@ -166,18 +165,28 @@ class Tokenizer:
        for t, t_labels in zip(text, labels):
            new_labels = []
            tokens = []
            for word, tag in zip(t.split(), t_labels):
                sub_words = self.tokenizer.tokenize(word)
                for count, sub_word in enumerate(sub_words):
                    if count > 0:
                        tag = trailing_piece_tag
                    new_labels.append(tag)
                    tokens.append(sub_word)
            if label_available:
                for word, tag in zip(t.split(), t_labels):
                    sub_words = self.tokenizer.tokenize(word)
                    for count, sub_word in enumerate(sub_words):
                        if count > 0:
                            tag = trailing_piece_tag
                        new_labels.append(tag)
                        tokens.append(sub_word)
            else:
                for word in t.split():
                    sub_words = self.tokenizer.tokenize(word)
                    for count, sub_word in enumerate(sub_words):
                        if count > 0:
                            tag = trailing_piece_tag
                        else:
                            tag = "O"
                        new_labels.append(tag)
                        tokens.append(sub_word)

            if len(tokens) > max_len:
                tokens = tokens[:max_len]
                new_labels = new_labels[:max_len]

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens.
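The new branch only differs in how tags are filled in when no labels are supplied; in both branches every word piece after the first piece of a word receives `trailing_piece_tag`. Below is a small standalone illustration of that loop logic. The word-piece splits are invented to mirror what the test fixtures imply (three pieces for "Johnathan", a trailing "." piece for "Michigan."); they are not real output of `self.tokenizer.tokenize`.

```python
# Standalone illustration of the trailing-piece tagging above; the splits are
# hypothetical, chosen so positions 1, 2, and 10 end up as trailing pieces.
trailing_piece_tag = "X"

def fake_wordpiece(word):
    # Hypothetical splits; the real ones come from self.tokenizer.tokenize(word).
    splits = {"Johnathan": ["John", "##ath", "##an"], "Michigan.": ["Michigan", "."]}
    return splits.get(word, [word])

sentence = "Johnathan is studying in the University of Michigan."
word_labels = ["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]

tokens, new_labels = [], []
for word, tag in zip(sentence.split(), word_labels):
    for count, sub_word in enumerate(fake_wordpiece(word)):
        if count > 0:
            tag = trailing_piece_tag
        new_labels.append(tag)
        tokens.append(sub_word)

# Trailing word pieces land at positions 1, 2, and 10, matching false_pos in the tests.
assert [i for i, t in enumerate(new_labels) if t == trailing_piece_tag] == [1, 2, 10]
```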
@@ -239,8 +248,7 @@ def create_data_loader(
            each sublist contains token labels of an input
            sentence/paragraph. Default value is None.
        sample_method (str, optional): Order of data sampling. Accepted
            values are "random", "sequential" and "distributed". Default
            value is "random".
            values are "random" and "sequential". Default value is "random".
        batch_size (int, optional): Number of samples used in each training
            iteration. Default value is 32.
@@ -264,8 +272,6 @@ def create_data_loader(
        sampler = RandomSampler(tensor_data)
    elif sample_method == "sequential":
        sampler = SequentialSampler(tensor_data)
    elif sample_method == "distributed":
        sampler = DistributedSampler(tensor_data)
    else:
        raise ValueError(
            "Invalid sample_method value, accepted values are: "
@@ -45,7 +45,7 @@ class BERTTokenClassifier:
        """

        if num_labels < 2:
            raise Exception("Number of labels should be at least 2.")
            raise ValueError("Number of labels should be at least 2.")

        self.language = language
        self.num_labels = num_labels
@@ -150,8 +150,8 @@ class BERTTokenClassifier:
        else:
            num_gpus_used = min(num_gpus, torch.cuda.device_count())

        num_train_optimization_steps = (
            int(len(token_ids) / batch_size) * num_epochs
        num_train_optimization_steps = max(
            (int(len(token_ids) / batch_size) * num_epochs), 1
        )
        optimizer = self._get_optimizer(
            learning_rate=learning_rate,
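The only change in this hunk is wrapping the step count in `max(..., 1)`. With a dataset smaller than one batch the old expression floors to zero, leaving the optimizer with no steps at all; the quick check below uses illustrative numbers only.

```python
# Illustrative values: one training example, batch size 32, one epoch.
num_examples, batch_size, num_epochs = 1, 32, 1

old_steps = int(num_examples / batch_size) * num_epochs          # int(0.03125) * 1 == 0
new_steps = max(int(num_examples / batch_size) * num_epochs, 1)  # clamped to 1

assert (old_steps, new_steps) == (0, 1)
```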