Added test code for NER utils.

This commit is contained in:
Ubuntu 2019-06-13 18:25:27 +00:00
Parent 6d671b6221
Commit 5438d76596
6 changed files: 225 additions and 32 deletions

View file

@@ -3,8 +3,8 @@ repos:
  rev: stable
  hooks:
  - id: black
    language_version: python3.6
    language_version: python3.7
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: v1.2.3
  hooks:
  - id: flake8
  - id: flake8

View file

@@ -4,6 +4,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"*Copyright (c) Microsoft Corporation. All rights reserved.* \n",
"*Licensed under the MIT License.*\n",
"# Named Entity Recognition Using BERT\n",
"## Summary\n",
"This notebook demonstrates how to fine tune [pretrained BERT model](https://github.com/huggingface/pytorch-pretrained-BERT) for named entity recognition (NER) task. Utility functions and classes in the NLP Best Practices repo are used to facilitate data preprocessing, model training, and model evaluation. \n",

View file

@@ -1,5 +1,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import pytest
nlp_path = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -7,23 +11,95 @@ nlp_path = os.path.dirname(
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)
from utils_nlp.bert.common import Tokenizer
from utils_nlp.bert.common import Tokenizer, create_data_loader, Language
INPUT_TEXT = ["Johnathan is studying in the University of Michigan."]
INPUT_LABELS = [["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]]
INPUT_TOKEN_IDS = [
    [
        1287,
        9779,
        1389,
        1110,
        5076,
        1107,
        1103,
        1239,
        1104,
        3312,
        119,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]
]
INPUT_LABEL_IDS = [
    [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]
INPUT_MASK = [[1] * 11 + [0] * 9]
INPUT_TEXT = ["Sarah is studying in the library."]
INPUT_LABELS = [["I-PER", "O", "O", "O", "O", "I-LOC"]]
UNIQUE_LABELS = ["O", "I-LOC", "I-MISC", "I-PER", "I-ORG", "X"]
LABEL_MAP = {label: i for i, label in enumerate(UNIQUE_LABELS)}
def test_tokenizer_preprocess_ner_tokens():
    pass
    # tokenizer = Tokenizer()
    expected_trailing_token_mask = [[True] * 20]
    false_pos = [1, 2, 10]
    for p in false_pos:
        expected_trailing_token_mask[0][p] = False
    expected_label_ids = [
        [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    ]
    seq_length = 20
    # preprocessed_tokens = tokenizer.preprocess_ner_tokens(
    #     text=INPUT_TEXT, labels=INPUT_LABELS, label_map=LABEL_MAP
    # )
    tokenizer = Tokenizer(language=Language.ENGLISHCASED, to_lower=False)

    # test providing labels
    preprocessed_tokens = tokenizer.preprocess_ner_tokens(
        text=INPUT_TEXT,
        labels=INPUT_LABELS,
        label_map=LABEL_MAP,
        max_len=seq_length,
    )
    assert len(preprocessed_tokens[0][0]) == seq_length
    assert len(preprocessed_tokens[1][0]) == seq_length
    assert preprocessed_tokens[2] == expected_trailing_token_mask
    assert preprocessed_tokens[3] == expected_label_ids

    # test not providing labels
    preprocessed_tokens = tokenizer.preprocess_ner_tokens(
        text=INPUT_TEXT, label_map=LABEL_MAP, max_len=20
    )
    assert preprocessed_tokens[2] == expected_trailing_token_mask


def test_create_data_loader():
    pass
    with pytest.raises(ValueError):
        create_data_loader(
            input_ids=INPUT_TOKEN_IDS,
            input_mask=INPUT_MASK,
            label_ids=INPUT_LABEL_IDS,
            sample_method="dummy",
        )

    create_data_loader(
        input_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        label_ids=INPUT_LABEL_IDS,
        sample_method="sequential",
    )

    create_data_loader(
        input_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        label_ids=INPUT_LABEL_IDS,
        sample_method="random",
    )

View file

@@ -1,5 +1,10 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import shutil
import pytest
nlp_path = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -7,7 +12,111 @@ nlp_path = os.path.dirname(
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)
# from utils_nlp.bert.token_classification import (
#     BERTTokenClassifier,
#     postprocess_token_labels,
# )
from utils_nlp.bert.token_classification import (
    BERTTokenClassifier,
    postprocess_token_labels,
)
# Test data
INPUT_TOKEN_IDS = [
    [
        1287,
        9779,
        1389,
        1110,
        5076,
        1107,
        1103,
        1239,
        1104,
        3312,
        119,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]
]
INPUT_LABEL_IDS = [
    [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]
INPUT_MASK = [[1] * 11 + [0] * 9]
PREDICTED_LABELS = [
    [3, 5, 5, 0, 0, 0, 0, 4, 4, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]
TRAILING_TOKEN_MASK = [[True] * 20]
false_pos = [1, 2, 10]
for p in false_pos:
    TRAILING_TOKEN_MASK[0][p] = False
UNIQUE_LABELS = ["O", "I-LOC", "I-MISC", "I-PER", "I-ORG", "X"]
LABEL_MAP = {label: i for i, label in enumerate(UNIQUE_LABELS)}
CACHE_DIR = "./test_bert_token_cache"
def test_token_classifier_num_labels():
    with pytest.raises(ValueError):
        BERTTokenClassifier(num_labels=1)


def test_token_classifier_fit_predict():
    token_classifier = BERTTokenClassifier(num_labels=6, cache_dir=CACHE_DIR)

    # test fit, no warmup
    token_classifier.fit(
        token_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        labels=INPUT_LABEL_IDS,
    )

    # test fit, with warmup
    token_classifier.fit(
        token_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        labels=INPUT_LABEL_IDS,
        warmup_proportion=0.1,
    )

    # test predict, no labels
    token_classifier.predict(token_ids=INPUT_TOKEN_IDS, input_mask=INPUT_MASK)

    # test predict, with labels
    token_classifier.predict(
        token_ids=INPUT_TOKEN_IDS,
        input_mask=INPUT_MASK,
        labels=INPUT_LABEL_IDS,
    )

    shutil.rmtree(CACHE_DIR)


def test_postprocess_token_labels():
    expected_labels_no_padding = [
        ["I-PER", "X", "X", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG", "X"]
    ]
    labels_no_padding = postprocess_token_labels(
        labels=PREDICTED_LABELS, input_mask=INPUT_MASK, label_map=LABEL_MAP
    )
    assert labels_no_padding == expected_labels_no_padding


def test_postprocess_token_labels_remove_trailing():
    expected_postprocessed_labels = [
        ["I-PER", "O", "O", "O", "O", "I-ORG", "I-ORG", "I-ORG"]
    ]
    labels_no_padding_no_trailing = postprocess_token_labels(
        labels=PREDICTED_LABELS,
        input_mask=INPUT_MASK,
        label_map=LABEL_MAP,
        remove_trailing_word_pieces=True,
        trailing_token_mask=TRAILING_TOKEN_MASK,
    )
    assert labels_no_padding_no_trailing == expected_postprocessed_labels

View file

@@ -12,7 +12,6 @@ from torch.utils.data import (
    SequentialSampler,
    TensorDataset,
)
from torch.utils.data.distributed import DistributedSampler
# Max supported sequence length
BERT_MAX_LEN = 512
@@ -157,7 +156,7 @@ class Tokenizer:
        if labels is None:
            label_available = False
            # create an artificial label list for creating trailing token mask
            labels = ["O"] * len(text)
            labels = [["O"] * len(text)]

        input_ids_all = []
        input_mask_all = []
@@ -166,18 +165,28 @@ class Tokenizer:
        for t, t_labels in zip(text, labels):
            new_labels = []
            tokens = []
            for word, tag in zip(t.split(), t_labels):
                sub_words = self.tokenizer.tokenize(word)
                for count, sub_word in enumerate(sub_words):
                    if count > 0:
                        tag = trailing_piece_tag
                    new_labels.append(tag)
                    tokens.append(sub_word)
            if label_available:
                for word, tag in zip(t.split(), t_labels):
                    sub_words = self.tokenizer.tokenize(word)
                    for count, sub_word in enumerate(sub_words):
                        if count > 0:
                            tag = trailing_piece_tag
                        new_labels.append(tag)
                        tokens.append(sub_word)
            else:
                for word in t.split():
                    sub_words = self.tokenizer.tokenize(word)
                    for count, sub_word in enumerate(sub_words):
                        if count > 0:
                            tag = trailing_piece_tag
                        else:
                            tag = "O"
                        new_labels.append(tag)
                        tokens.append(sub_word)

            if len(tokens) > max_len:
                tokens = tokens[:max_len]
                new_labels = new_labels[:max_len]

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens.
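As a concrete illustration of the trailing-piece handling above (worked out from the test data added in this commit; the exact WordPiece splits are assumed rather than captured from the tokenizer): in "Johnathan is studying in the University of Michigan.", "Johnathan" splits into three word pieces (roughly "John", "##ath", "##an") and "Michigan." splits into "Michigan" and "."; only the first piece of each word keeps the word's label, and every trailing piece receives the trailing_piece_tag "X":

tokens: John   ##ath  ##an  is  studying  in  the  University  of     Michigan  .
labels: I-PER  X      X     O   O         O   O    I-ORG       I-ORG  I-ORG     X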
@@ -239,8 +248,7 @@ def create_data_loader(
            each sublist contains token labels of an input
            sentence/paragraph. Default value is None.
        sample_method (str, optional): Order of data sampling. Accepted
            values are "random", "sequential" and "distributed". Default
            value is "random".
            values are "random" and "sequential". Default value is "random".
        batch_size (int, optional): Number of samples used in each training
            iteration. Default value is 32.
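As a usage note on the narrowed sample_method values (a sketch built from the keyword arguments exercised by the new test and the batch_size parameter documented above):

# "distributed" is no longer accepted; pass "random" (the default) or "sequential".
data_loader = create_data_loader(
    input_ids=INPUT_TOKEN_IDS,
    input_mask=INPUT_MASK,
    label_ids=INPUT_LABEL_IDS,
    sample_method="sequential",
    batch_size=32,
)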
@@ -264,8 +272,6 @@ def create_data_loader(
        sampler = RandomSampler(tensor_data)
    elif sample_method == "sequential":
        sampler = SequentialSampler(tensor_data)
    elif sample_method == "distributed":
        sampler = DistributedSampler(tensor_data)
    else:
        raise ValueError(
            "Invalid sample_method value, accepted values are: "

View file

@@ -45,7 +45,7 @@ class BERTTokenClassifier:
        """
        if num_labels < 2:
            raise Exception("Number of labels should be at least 2.")
            raise ValueError("Number of labels should be at least 2.")

        self.language = language
        self.num_labels = num_labels
@@ -150,8 +150,8 @@ class BERTTokenClassifier:
        else:
            num_gpus_used = min(num_gpus, torch.cuda.device_count())

        num_train_optimization_steps = (
            int(len(token_ids) / batch_size) * num_epochs
        num_train_optimization_steps = max(
            (int(len(token_ids) / batch_size) * num_epochs), 1
        )
        optimizer = self._get_optimizer(
            learning_rate=learning_rate,
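Note on the change above: int(len(token_ids) / batch_size) rounds down to 0 whenever the dataset is smaller than one batch, as with the single-sentence inputs used by the new unit tests, so wrapping the product in max(..., 1) guarantees at least one optimization step during fit.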