This commit is contained in:
saidbleik 2020-02-11 05:27:23 +00:00
Parent 6769aa765d
Commit 67b1d8a8ea
3 changed files: 265 additions and 165 deletions

View file

@@ -69,23 +69,19 @@
"source": [
"import os\n",
"import sys\n",
"import scrapbook as sb\n",
"\n",
"import scrapbook as sb\n",
"import torch\n",
"\n",
"nlp_path = os.path.abspath('../../')\n",
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer\n",
"from utils_nlp.dataset.squad import load_pandas_df\n",
"from utils_nlp.eval.question_answering import evaluate_qa\n",
"from utils_nlp.models.transformers.datasets import QADataset\n",
"from utils_nlp.models.transformers.question_answering import (\n",
" AnswerExtractor,\n",
" QAProcessor,\n",
" AnswerExtractor\n",
")\n",
" \n",
"from utils_nlp.eval.question_answering import evaluate_qa\n",
"from utils_nlp.common.timer import Timer"
")"
]
},
{
@@ -559,7 +555,7 @@
"* Pad the concatenated token sequence to `max_seq_length` if it's shorter.\n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"\n",
"`QAProcessor.preprocess` returns a Pytorch Dataloader. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required by postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
"`QAProcessor.preprocess` returns a Pytorch DataSet. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required by postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
]
},
{
@@ -577,24 +573,28 @@
],
"source": [
"qa_processor = QAProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\n",
"train_dataloader = qa_processor.preprocess(\n",
" train_dataset, \n",
" batch_size=PER_GPU_BATCH_SIZE,\n",
" num_gpus=NUM_GPUS,\n",
"train_dataset = qa_processor.preprocess(\n",
" train_dataset,\n",
" is_training=True,\n",
" max_question_length=MAX_QUESTION_LENGTH,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" doc_stride=DOC_STRIDE\n",
" doc_stride=DOC_STRIDE,\n",
")\n",
"\n",
"dev_dataloader = qa_processor.preprocess(\n",
" dev_dataset, \n",
" batch_size=PER_GPU_BATCH_SIZE,\n",
" num_gpus=NUM_GPUS,\n",
"# we keep a copy of the oroginal dev_dataset as it is needed for evaluation\n",
"dev_dataset_processed = qa_processor.preprocess(\n",
" dev_dataset,\n",
" is_training=False,\n",
" max_question_length=MAX_QUESTION_LENGTH,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" doc_stride=DOC_STRIDE\n",
" doc_stride=DOC_STRIDE,\n",
")\n",
"\n",
"train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
")\n",
"dev_dataloader = dataloader_from_dataset(\n",
" dev_dataset_processed, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
")"
]
},
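
For reference, a minimal sketch of the updated notebook flow in plain Python. It assumes `train_dataset` and `dev_dataset` are `QADataset` objects built earlier in the notebook and that the upper-case constants are defined as in the notebook; `preprocess` now returns a PyTorch dataset, and batching is handled separately by `dataloader_from_dataset`.

from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.question_answering import AnswerExtractor, QAProcessor

qa_processor = QAProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)

# preprocess() no longer takes batch_size/num_gpus and returns a dataset
train_ds = qa_processor.preprocess(
    train_dataset,
    is_training=True,
    max_question_length=MAX_QUESTION_LENGTH,
    max_seq_length=MAX_SEQ_LENGTH,
    doc_stride=DOC_STRIDE,
)
dev_ds = qa_processor.preprocess(
    dev_dataset,
    is_training=False,
    max_question_length=MAX_QUESTION_LENGTH,
    max_seq_length=MAX_SEQ_LENGTH,
    doc_stride=DOC_STRIDE,
)

# batching and multi-GPU scaling are handled by dataloader_from_dataset()
train_dataloader = dataloader_from_dataset(
    train_ds, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True
)
dev_dataloader = dataloader_from_dataset(
    dev_ds, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False
)

# fine-tune and score
qa_extractor = AnswerExtractor(model_name=MODEL_NAME)
qa_extractor.fit(train_dataloader, num_gpus=NUM_GPUS)
results = qa_extractor.predict(dev_dataloader, num_gpus=NUM_GPUS)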

View file

@@ -65,22 +65,18 @@ def qa_test_data(qa_test_df, tmp_module):
qa_id_col=qa_test_df["qa_id_col"],
)
# bert
qa_processor_bert = QAProcessor(cache_dir=tmp_module)
train_features_bert = qa_processor_bert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_bert = qa_processor_bert.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@@ -88,22 +84,18 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)
# xlnet
qa_processor_xlnet = QAProcessor(model_name="xlnet-base-cased", cache_dir=tmp_module)
train_features_xlnet = qa_processor_xlnet.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_xlnet = qa_processor_xlnet.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@@ -111,22 +103,20 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)
qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
# distilbert
qa_processor_distilbert = QAProcessor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
train_features_distilbert = qa_processor_distilbert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_distilbert = qa_processor_distilbert.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@@ -151,11 +141,21 @@ def qa_test_data(qa_test_df, tmp_module):
@pytest.mark.gpu
def test_QAProcessor(qa_test_data, tmp_module):
for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]:
for model_name in [
"bert-base-cased",
"xlnet-base-cased",
"distilbert-base-uncased",
]:
qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module,
)
# test unsupported model type
with pytest.raises(ValueError):
@@ -163,18 +163,24 @@ def test_QAProcessor(qa_test_data, tmp_module):
# test training data has no ground truth exception
with pytest.raises(Exception):
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module
)
# test when answer start is a list, but answer text is not
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_start_text_mismatch"],
is_training=True,
feature_cache_dir=tmp_module,
)
# test when training data has multiple answers
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_multi_answers"],
is_training=True,
feature_cache_dir=tmp_module,
)
@@ -190,7 +196,9 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin"))
assert os.path.exists(os.path.join(model_output_dir, "config.json"))
qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir)
qa_extractor_from_cache = AnswerExtractor(
cache_dir=tmp_module, load_model_from_dir=model_output_dir
)
qa_extractor_from_cache.predict(test_loader_bert, verbose=False)
# xlnet
@@ -202,8 +210,12 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
# distilbert
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False)
qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
test_loader_xlnet = dataloader_from_dataset(
qa_test_data["test_features_distilbert"], shuffle=False
)
qa_extractor_distilbert = AnswerExtractor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False)
qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False)

View file

@@ -27,19 +27,41 @@ import jsonlines
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering
from transformers.modeling_albert import (
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForQuestionAnswering,
)
from transformers.modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForQuestionAnswering,
)
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForQuestionAnswering,
)
from transformers.modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForQuestionAnswering,
)
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
from utils_nlp.common.pytorch_utils import (
compute_training_steps,
get_device,
move_model_to_device,
)
from utils_nlp.models.transformers.common import (
MAX_SEQ_LEN,
TOKENIZER_CLASS,
Transformer,
)
MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
# cached files during preprocessing
@@ -62,25 +84,29 @@ class QAProcessor:
Class for preprocessing and postprocessing question answering data.
Args:
model_name (str, optional): Name of the model. Call QAProcessor.list_supported_models() to
get all supported models. Defaults to "bert-base-cased".
model_name (str, optional): Name of the model.
Call QAProcessor.list_supported_models() to get all supported models.
Defaults to "bert-base-cased".
to_lower (bool, optional): Whether to convert all letters to lower case during
tokenization. This is determined by if a cased model is used. Defaults to False,
which corresponds to a cased model.
custom_tokenize (function, optional): A custom tokenize function used to tokenize the
input text. If not provided, the default tokenizer corresponding to the model_name
is loaded and its `tokenize` method is used. NOTE that even this function is
provided, the numerical token ids are still generated by the `convert_tokens_to_ids`
method of the default tokenizer, so there is a risk that tokens generated by the
custom_tokenize function don't have correponding token ids in the default toeknizer.
tokenization. This is determined by whether a cased model is used.
Defaults to False, which corresponds to a cased model.
custom_tokenize (function, optional): A custom tokenize function
used to tokenize the input text. If not provided, the default tokenizer
corresponding to the model_name is loaded and its `tokenize` method is used.
NOTE that even if this function is provided, the numerical token ids are still
generated by the `convert_tokens_to_ids` method of the default tokenizer,
so there is a risk that tokens generated by the custom_tokenize
function don't have corresponding token ids in the default tokenizer.
Defaults to None.
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""
def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."):
def __init__(
self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir=".",
):
self.model_name = model_name
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False,
)
self.do_lower_case = to_lower
self.custom_tokenize = custom_tokenize
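As a usage note, a minimal sketch of constructing `QAProcessor` with a custom tokenize function; the tokenizer below is hypothetical, and, as the docstring warns, token ids are still produced by the default tokenizer's `convert_tokens_to_ids`.

def whitespace_lower_tokenize(text):
    # hypothetical custom tokenizer, for illustration only
    return text.lower().split()

qa_processor = QAProcessor(
    model_name="bert-base-uncased",
    to_lower=True,
    custom_tokenize=whitespace_lower_tokenize,
    cache_dir=".",
)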
@@ -149,9 +175,6 @@ class QAProcessor:
self,
qa_dataset,
is_training,
batch_size=32,
num_gpus=None,
distributed=False,
max_question_length=64,
max_seq_length=MAX_SEQ_LEN,
doc_stride=128,
@@ -161,28 +184,31 @@ class QAProcessor:
Preprocesses raw question answering data and generates train/test features.
Args:
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`): Question answering data in
standard QADataset format.
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`):
Question answering data in standard QADataset format.
is_training (bool): Whether the input data is training data.
max_question_length (int, optional): Maximum number of tokens of the question sequence
after tokenization, so the number of words in the raw question is usually less than
max_question_length. Defaults to 64.
max_seq_length (int, optional): Maximum number of tokens of the entire feature token
sequence after tokenization. The entire feature token sequence is composed
of [CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP] for models other
than XLNet, and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS} for
max_question_length (int, optional): Maximum number of tokens
of the question sequence after tokenization, so the number of words
in the raw question is usually less than max_question_length.
Defaults to 64.
max_seq_length (int, optional): Maximum number of tokens of the entire
feature token sequence after tokenization. The entire feature token
sequence is composed of:
[CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP]
for models other than XLNet,
and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS] for
XLNet. Defaults to MAX_SEQ_LEN.
doc_stride (int, optional): Size (number of tokens) of the sliding window when
breaking down a long document paragraph in to multiple document spans. Defaults
to 128.
feature_cache_dir (int, optional): Directory to save some intermediate preprocessing
results.
doc_stride (int, optional): Size (number of tokens) of the sliding window
when breaking down a long document paragraph into multiple document
spans. Defaults to 128.
feature_cache_dir (str, optional): Directory to save some intermediate
preprocessing results.
If `is_training` is True, CACHED_EXAMPLES_TRAIN_FILE and
CACHED_FEATURES_TRAIN_FILE are saved to this directory. Otherwise,
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved to this
directory. These files are required during postprocessing to generate the final
answer texts from predicted answer start and answer end indices. Defaults to
"./cached_qa_features".
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved
to this directory. These files are required during postprocessing to
generate the final answer texts from predicted answer start and answer
end indices. Defaults to "./cached_qa_features".
Returns:
DataSet: A Pytorch DataSet.
"""
@@ -217,7 +243,9 @@
qa_examples.append(qa_example_cur)
qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens})
qa_examples_json.append(
{"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
)
features_cur = _create_qa_features(
qa_example_cur,
@@ -257,17 +285,25 @@
input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.long)
if is_training:
start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask,
input_ids,
input_mask,
segment_ids,
start_positions,
end_positions,
cls_index,
p_mask,
)
else:
unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
)
return qa_dataset
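For orientation, a hedged sketch of the field order in the returned `TensorDataset`, mirroring the `TensorDataset` construction above (`qa_processor`, `train_dataset`, and `dev_dataset` are assumed to be set up as in the notebook):

# training features: seven tensors per example
train_ds = qa_processor.preprocess(train_dataset, is_training=True)
input_ids, input_mask, segment_ids, start_pos, end_pos, cls_index, p_mask = train_ds[0]

# test features: unique_id replaces the answer start/end positions
dev_ds = qa_processor.preprocess(dev_dataset, is_training=False)
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id = dev_ds[0]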
@@ -397,7 +433,14 @@ class QAResult(QAResult_):
QAResultExtended_ = collections.namedtuple(
"QAResultExtended",
["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",],
[
"unique_id",
"start_top_log_probs",
"start_top_index",
"end_top_log_probs",
"end_top_index",
"cls_logits",
],
)
@@ -481,36 +524,45 @@ class AnswerExtractor(Transformer):
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Otherwise, it's determined by the dataset length,
gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
num_gpus (int, optional): The number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model. If True,
the fine-tuned model is saved to a `fine_tuned` folder under of the `cache_dir`
of AnswerExtractor. Defaults to True.
local_rank (int, optional): Local_rank for distributed training on GPUs.
Defaults to -1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each
parameter update. Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer.
Defaults to 5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase
learning rate from 0 to `learning rate`.
Defaults to 0.
verbose (bool, optional): Whether to print out the training log.
Defaults to True.
seed (int, optional): Random seed used to improve reproducibility.
Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model.
If True, the fine-tuned model is saved to a `fine_tuned` folder
under the `cache_dir` of AnswerExtractor.
Defaults to True.
"""
# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
optimizer = Transformer.get_default_optimizer(
self.model, weight_decay, learning_rate, adam_epsilon
)
# compute the max number of training steps
max_steps = compute_training_steps(
@@ -522,7 +574,7 @@ class AnswerExtractor(Transformer):
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
)
# fine tune
@@ -530,7 +582,7 @@ class AnswerExtractor(Transformer):
train_dataloader=train_dataloader,
get_inputs=QAProcessor.get_inputs,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
gpu_ids=gpu_ids,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
@@ -550,13 +602,15 @@ class AnswerExtractor(Transformer):
Args:
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
num_gpus (int, optional): The number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the predicting log. Defaults to True.
verbose (bool, optional): Whether to print out the predicting log.
Defaults to True.
Returns:
list: List of :class:`QAResult` or :class:`QAResultExtended`.
@@ -589,7 +643,9 @@
)
else:
result = QAResult(
unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]),
unique_id=u_id.item(),
start_logits=_to_list(outputs[0][i]),
end_logits=_to_list(outputs[1][i]),
)
all_results.append(result)
torch.cuda.empty_cache()
@@ -612,53 +668,61 @@ def postprocess_bert_answer(
verbose_logging=False,
):
"""
Postprocesses start and end logits generated by :meth:`AnswerExtractor.fit` for BERT.
Postprocesses start and end logits
generated by :meth:`AnswerExtractor.predict` for BERT.
Args:
results (list): List of :class:`QAResult`.
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
contains the original document tokens that are used to generate the final answers
from the predicted start and end positions.
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
contains the mapping from indices in the processed token list to the original
document tokens that are used to generate the final predicted answers.
do_lower_case (bool): Whether an uncased tokenizer was used during data preprocessing.
This is required during answer finalization by comparing the predicted answer text
and the original text span in :func:`_get_final_text`.
unanswerable_exists (bool, optional): Whether there are unanswerable questions in the
data. If True, the start and end logits of the [CLS] token, which indicate the
probability of the answer being empty, are included in the candidate answer list.
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
This file contains the original document tokens that are used to generate
the final answers from the predicted start and end positions.
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
This file contains the mapping from indices in the processed token list
to the original document tokens that are used to generate the final
predicted answers.
do_lower_case (bool): Whether an uncased tokenizer was used during
data preprocessing. This is required during answer finalization
by comparing the predicted answer text and the original
text span in :func:`_get_final_text`.
unanswerable_exists (bool, optional): Whether there are unanswerable
questions in the data. If True, the start and end logits of the [CLS]
token, which indicate the probability of the answer being empty,
are included in the candidate answer list.
Defaults to False.
n_best_size (int, optional): The number of candidates to choose from each QAResult to
generate the final prediction. It's also the maximum number of n-best answers to
output for each question. Note that the number of n-best answers can be smaller than
`n_best_size` because some unqualified answers, e.g. answer that are too long,
are removed.
n_best_size (int, optional): The number of candidates to choose from each
QAResult to generate the final prediction. It's also the maximum number
of n-best answers to output for each question.
Note that the number of n-best answers can be smaller than `n_best_size`
because some unqualified answers,
e.g. answers that are too long, are removed.
max_answer_length (int, optional): Maximum length of the answer. Defaults to 30.
output_prediction_file (str, optional): Path of the file to save the predicted answers.
Defaults to "./qa_predictions.json".
output_nbest_file (str, optional): Path of the file to save the n-best answers. Defaults
to "./nbest_predictions.json".
output_null_log_odds_file (str, optional): If unanswerable_exists is True, the score
difference between empty prediction and best non-empty prediction are saved to this
file. These scores can be used to find the best threshold for predicting an empty
answer. Defaults to "./null_odds.json".
null_score_diff_threshold (float, optional): If unanswerable_exists=True and the score
difference between empty prediction and best non-empty prediction is higher than this
threshold, the final predicted answer is empty. Defaults to 0.0.
verbose_logging (bool, optional): Whether to log details of answer postprocessing.
Defaults to False.
output_prediction_file (str, optional): Path of the file to save the
predicted answers. Defaults to "./qa_predictions.json".
output_nbest_file (str, optional): Path of the file to save the n-best answers.
Defaults to "./nbest_predictions.json".
output_null_log_odds_file (str, optional): If unanswerable_exists is True,
the score difference between empty prediction and best non-empty prediction
is saved to this file. These scores can be used to find the best threshold
for predicting an empty answer. Defaults to "./null_odds.json".
null_score_diff_threshold (float, optional): If unanswerable_exists=True
and the score difference between empty prediction and best non-empty
prediction is higher than this threshold, the final predicted
answer is empty.
Defaults to 0.0.
verbose_logging (bool, optional): Whether to log details of
answer postprocessing. Defaults to False.
Returns:
tuple: (OrderedDict, OrderedDict, OrderedDict)
The keys of the dictionaries are the `qa_id` in the original
:class:`utils_nlp.dataset.pytorch.QADataset`
The values of the first dictionary are the predicted answer texts in string type.
The values of the second dictionary are the softmax probabilities of the predicted
answers.
The values of the third dictionary are the n-best answers for each qa_id. Note that
the number of n-best answers can be smaller than `n_best_size` because some
unqualified answers, e.g. answers that are too long, are removed.
The values of the first dictionary are the predicted answer texts
in string type. The values of the second dictionary are the softmax
probabilities of the predicted answers.
The values of the third dictionary are the n-best answers for each qa_id.
Note that the number of n-best answers can be smaller than `n_best_size`
because some unqualified answers, e.g. answers that are too long,
are removed.
"""
with jsonlines.open(examples_file) as reader:
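For context, a hedged usage sketch: the result list comes from `AnswerExtractor.predict`, and the file paths assume the default `feature_cache_dir` used by `QAProcessor.preprocess`.

results = qa_extractor.predict(dev_dataloader)
answers, probs, nbest_answers = postprocess_bert_answer(
    results=results,
    examples_file="./cached_qa_features/cached_examples_test.jsonl",
    features_file="./cached_qa_features/cached_features_test.jsonl",
    do_lower_case=DO_LOWER_CASE,
    unanswerable_exists=False,
)
# each returned dict is keyed by the qa_id of the original QADataset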
@@ -753,7 +817,9 @@ def postprocess_bert_answer(
# Sort by the sum of the start and end logits in ascending order,
# so that the first element is the most probable answer
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
)
seen_predictions = {}
nbest = []
@@ -786,11 +852,19 @@ def postprocess_bert_answer(
final_text = ""
seen_predictions[final_text] = True
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
)
)
# if we didn't include the empty option in the n-best, include it
if unanswerable_exists:
if "" not in seen_predictions:
nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
nbest.append(
_NbestPrediction(
text="", start_logit=null_start_logit, end_logit=null_end_logit
)
)
# In very rare edge cases we could only have a single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
@@ -834,7 +908,9 @@ def postprocess_bert_answer(
all_probs[example["qa_id"]] = nbest_json[0]["probability"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
score_diff = (
score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
)
scores_diff_json[example["qa_id"]] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example["qa_id"]] = ""
@@ -1000,7 +1076,9 @@ def postprocess_xlnet_answer(
)
)
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
)
seen_predictions = {}
nbest = []
@@ -1031,14 +1109,20 @@ def postprocess_xlnet_answer(
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
final_text = _get_final_text(
tok_text, orig_text, tokenizer.do_lower_case, verbose_logging
)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
)
)
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
@@ -1185,7 +1269,9 @@ def _create_qa_example(qa_input, is_training):
actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
logger.warning(
"Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text,
)
return
else:
start_position = -1
@@ -1408,7 +1494,7 @@ def _create_qa_features(
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text,
)
# The -3 accounts for [CLS], [SEP] and [SEP]
@@ -1579,7 +1665,7 @@ def _create_qa_features(
# -------------------------------------------------------------------------------------------------
# Post processing helper functions
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
)
_NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"])
@@ -1644,7 +1730,9 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
logger.info(
"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text,
)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using