fix qa notebook
Parent
6769aa765d
Commit
67b1d8a8ea
@@ -69,23 +69,19 @@
"source": [
"import os\n",
"import sys\n",
"import scrapbook as sb\n",
"\n",
"import scrapbook as sb\n",
"import torch\n",
"\n",
"nlp_path = os.path.abspath('../../')\n",
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer\n",
"from utils_nlp.dataset.squad import load_pandas_df\n",
"from utils_nlp.eval.question_answering import evaluate_qa\n",
"from utils_nlp.models.transformers.datasets import QADataset\n",
"from utils_nlp.models.transformers.question_answering import (\n",
" AnswerExtractor,\n",
" QAProcessor,\n",
" AnswerExtractor\n",
")\n",
" \n",
"from utils_nlp.eval.question_answering import evaluate_qa\n",
"from utils_nlp.common.timer import Timer"
")"
]
},
{
@@ -559,7 +555,7 @@
"* Pad the concatenated token sequence to `max_seq_length` if it's shorter.\n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"\n",
"`QAProcessor.preprocess` returns a Pytorch Dataloader. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required for postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
"`QAProcessor.preprocess` returns a Pytorch DataSet. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required for postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
]
},
{
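For reference, a minimal sketch of the flow this change introduces: QAProcessor.preprocess now returns a PyTorch Dataset rather than a DataLoader, and the notebook wraps it with dataloader_from_dataset afterwards. The concrete values below are illustrative only, and train_dataset is assumed to be a QADataset built earlier in the notebook.

from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.question_answering import QAProcessor

qa_processor = QAProcessor(model_name="bert-base-cased", to_lower=False)
train_features = qa_processor.preprocess(
    train_dataset,        # QADataset built from the SQuAD dataframe
    is_training=True,
    max_question_length=64,
    max_seq_length=384,
    doc_stride=128,
)
# wrap the returned Dataset to get batches for fine-tuning
train_dataloader = dataloader_from_dataset(
    train_features, batch_size=4, num_gpus=1, shuffle=True
)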
@@ -577,24 +573,28 @@
],
"source": [
"qa_processor = QAProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\n",
"train_dataloader = qa_processor.preprocess(\n",
" train_dataset, \n",
" batch_size=PER_GPU_BATCH_SIZE,\n",
" num_gpus=NUM_GPUS,\n",
"train_dataset = qa_processor.preprocess(\n",
" train_dataset,\n",
" is_training=True,\n",
" max_question_length=MAX_QUESTION_LENGTH,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" doc_stride=DOC_STRIDE\n",
" doc_stride=DOC_STRIDE,\n",
")\n",
"\n",
"dev_dataloader = qa_processor.preprocess(\n",
" dev_dataset, \n",
" batch_size=PER_GPU_BATCH_SIZE,\n",
" num_gpus=NUM_GPUS,\n",
"# we keep a copy of the original dev_dataset as it is needed for evaluation\n",
"dev_dataset_processed = qa_processor.preprocess(\n",
" dev_dataset,\n",
" is_training=False,\n",
" max_question_length=MAX_QUESTION_LENGTH,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" doc_stride=DOC_STRIDE\n",
" doc_stride=DOC_STRIDE,\n",
")\n",
"\n",
"train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
")\n",
"dev_dataloader = dataloader_from_dataset(\n",
" dev_dataset_processed, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
")"
]
},
@@ -65,22 +65,18 @@ def qa_test_data(qa_test_df, tmp_module):
qa_id_col=qa_test_df["qa_id_col"],
)

# bert
qa_processor_bert = QAProcessor(cache_dir=tmp_module)
train_features_bert = qa_processor_bert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)

test_features_bert = qa_processor_bert.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@@ -88,22 +84,18 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)

# xlnet
qa_processor_xlnet = QAProcessor(model_name="xlnet-base-cased", cache_dir=tmp_module)
train_features_xlnet = qa_processor_xlnet.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)

test_features_xlnet = qa_processor_xlnet.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@@ -111,22 +103,20 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)

qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
# distilbert
qa_processor_distilbert = QAProcessor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
train_features_distilbert = qa_processor_distilbert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)

test_features_distilbert = qa_processor_distilbert.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@@ -151,11 +141,21 @@ def qa_test_data(qa_test_df, tmp_module):

@pytest.mark.gpu
def test_QAProcessor(qa_test_data, tmp_module):
for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]:
for model_name in [
"bert-base-cased",
"xlnet-base-cased",
"distilbert-base-uncased",
]:
qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module,
)

# test unsupported model type
with pytest.raises(ValueError):
@@ -163,18 +163,24 @@ def test_QAProcessor(qa_test_data, tmp_module):

# test training data has no ground truth exception
with pytest.raises(Exception):
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module
)

# test when answer start is a list, but answer text is not
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_start_text_mismatch"],
is_training=True,
feature_cache_dir=tmp_module,
)

# test when training data has multiple answers
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_multi_answers"],
is_training=True,
feature_cache_dir=tmp_module,
)
@@ -190,7 +196,9 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin"))
assert os.path.exists(os.path.join(model_output_dir, "config.json"))

qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir)
qa_extractor_from_cache = AnswerExtractor(
cache_dir=tmp_module, load_model_from_dir=model_output_dir
)
qa_extractor_from_cache.predict(test_loader_bert, verbose=False)

# xlnet
@@ -202,8 +210,12 @@ def test_AnswerExtractor(qa_test_data, tmp_module):

# distilbert
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False)
qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
test_loader_xlnet = dataloader_from_dataset(
qa_test_data["test_features_distilbert"], shuffle=False
)
qa_extractor_distilbert = AnswerExtractor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False)
qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False)
@@ -27,19 +27,41 @@ import jsonlines
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering
from transformers.modeling_albert import (
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForQuestionAnswering,
)
from transformers.modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForQuestionAnswering,
)
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForQuestionAnswering,
)
from transformers.modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForQuestionAnswering,
)
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize

from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
from utils_nlp.common.pytorch_utils import (
compute_training_steps,
get_device,
move_model_to_device,
)
from utils_nlp.models.transformers.common import (
MAX_SEQ_LEN,
TOKENIZER_CLASS,
Transformer,
)

MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})

# cached files during preprocessing
@@ -62,25 +84,29 @@ class QAProcessor:
Class for preprocessing and postprocessing question answering data.

Args:
model_name (str, optional): Name of the model. Call QAProcessor.list_supported_models() to
get all supported models. Defaults to "bert-base-cased".
model_name (str, optional): Name of the model.
Call QAProcessor.list_supported_models() to get all supported models.
Defaults to "bert-base-cased".
to_lower (bool, optional): Whether to convert all letters to lower case during
tokenization. This is determined by whether a cased model is used. Defaults to False,
which corresponds to a cased model.
custom_tokenize (function, optional): A custom tokenize function used to tokenize the
input text. If not provided, the default tokenizer corresponding to the model_name
is loaded and its `tokenize` method is used. NOTE that even if this function is
provided, the numerical token ids are still generated by the `convert_tokens_to_ids`
method of the default tokenizer, so there is a risk that tokens generated by the
custom_tokenize function don't have corresponding token ids in the default tokenizer.
tokenization. This is determined by whether a cased model is used.
Defaults to False, which corresponds to a cased model.
custom_tokenize (function, optional): A custom tokenize function
used to tokenize the input text. If not provided, the default tokenizer
corresponding to the model_name is loaded and its `tokenize` method is used.
NOTE that even if this function is provided, the numerical token ids are still
generated by the `convert_tokens_to_ids` method of the default tokenizer,
so there is a risk that tokens generated by the custom_tokenize
function don't have corresponding token ids in the default tokenizer.
Defaults to None.
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""

def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."):
def __init__(
self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir=".",
):
self.model_name = model_name
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False,
)
self.do_lower_case = to_lower
self.custom_tokenize = custom_tokenize
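For reference, a minimal sketch of constructing QAProcessor with the arguments documented above; the model names are ones used elsewhere in this change and the cache directory is illustrative.

from utils_nlp.models.transformers.question_answering import QAProcessor

# cased model: keep the default to_lower=False
cased_processor = QAProcessor(model_name="bert-base-cased", to_lower=False, cache_dir=".")

# uncased model: lower-case during tokenization to match its vocabulary
uncased_processor = QAProcessor(
    model_name="distilbert-base-uncased", to_lower=True, cache_dir="."
)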
@@ -149,9 +175,6 @@ class QAProcessor:
self,
qa_dataset,
is_training,
batch_size=32,
num_gpus=None,
distributed=False,
max_question_length=64,
max_seq_length=MAX_SEQ_LEN,
doc_stride=128,
@@ -161,28 +184,31 @@ class QAProcessor:
Preprocesses raw question answering data and generates train/test features.

Args:
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`): Question answering data in
standard QADataset format.
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`):
Question answering data in standard QADataset format.
is_training (bool): Whether the input data is training data.
max_question_length (int, optional): Maximum number of tokens of the question sequence
after tokenization, so the number of words in the raw question is usually less than
max_question_length. Defaults to 64.
max_seq_length (int, optional): Maximum number of tokens of the entire feature token
sequence after tokenization. The entire feature token sequence is composed
of [CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP] for models other
than XLNet, and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS] for
max_question_length (int, optional): Maximum number of tokens
of the question sequence after tokenization, so the number of words
in the raw question is usually less than max_question_length.
Defaults to 64.
max_seq_length (int, optional): Maximum number of tokens of the entire
feature token sequence after tokenization. The entire feature token
sequence is composed of:
[CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP]
for models other than XLNet,
and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS] for
XLNet. Defaults to MAX_SEQ_LEN.
doc_stride (int, optional): Size (number of tokens) of the sliding window when
breaking down a long document paragraph into multiple document spans. Defaults
to 128.
feature_cache_dir (str, optional): Directory to save some intermediate preprocessing
results.
doc_stride (int, optional): Size (number of tokens) of the sliding window
when breaking down a long document paragraph into multiple document
spans. Defaults to 128.
feature_cache_dir (str, optional): Directory to save some intermediate
preprocessing results.
If `is_training` is True, CACHED_EXAMPLES_TRAIN_FILE and
CACHED_FEATURES_TRAIN_FILE are saved to this directory. Otherwise,
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved to this
directory. These files are required during postprocessing to generate the final
answer texts from predicted answer start and answer end indices. Defaults to
"./cached_qa_features".
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved
to this directory. These files are required during postprocessing to
generate the final answer texts from predicted answer start and answer
end indices. Defaults to "./cached_qa_features".
Returns:
DataSet: A Pytorch DataSet.
"""
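For reference, a hedged sketch of a test-time preprocess call using the arguments documented above; the numeric values and cache path are illustrative, and dev_dataset is assumed to be a QADataset.

dev_features = qa_processor.preprocess(
    dev_dataset,
    is_training=False,
    max_question_length=64,
    max_seq_length=384,
    doc_stride=128,
    feature_cache_dir="./cached_qa_features",
)
# With is_training=False the returned TensorDataset carries unique ids instead of
# answer start/end positions, and the cached example/feature .jsonl files written to
# feature_cache_dir are what postprocessing later reads to produce answer text.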
@@ -217,7 +243,9 @@ class QAProcessor:

qa_examples.append(qa_example_cur)

qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens})
qa_examples_json.append(
{"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
)

features_cur = _create_qa_features(
qa_example_cur,
@@ -257,17 +285,25 @@ class QAProcessor:
input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.long)

if is_training:
start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask,
input_ids,
input_mask,
segment_ids,
start_positions,
end_positions,
cls_index,
p_mask,
)
else:
unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
)

return qa_dataset
@@ -397,7 +433,14 @@ class QAResult(QAResult_):

QAResultExtended_ = collections.namedtuple(
"QAResultExtended",
["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",],
[
"unique_id",
"start_top_log_probs",
"start_top_index",
"end_top_log_probs",
"end_top_index",
"cls_logits",
],
)
@@ -481,36 +524,45 @@ class AnswerExtractor(Transformer):
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Otherwise, it's determined by the dataset length,
gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
num_gpus (int, optional): The number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model. If True,
the fine-tuned model is saved to a `fine_tuned` folder under the `cache_dir`
of AnswerExtractor. Defaults to True.
local_rank (int, optional): Local_rank for distributed training on GPUs.
Defaults to -1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each
parameter update. Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer.
Defaults to 5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase
learning rate from 0 to `learning rate`.
Defaults to 0.
verbose (bool, optional): Whether to print out the training log.
Defaults to True.
seed (int, optional): Random seed used to improve reproducibility.
Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model.
If True, the fine-tuned model is saved to a `fine_tuned` folder
under the `cache_dir` of AnswerExtractor.
Defaults to True.

"""

# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
optimizer = Transformer.get_default_optimizer(
self.model, weight_decay, learning_rate, adam_epsilon
)

# compute the max number of training steps
max_steps = compute_training_steps(
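For reference, a minimal sketch of fine-tuning and scoring with AnswerExtractor using the arguments documented above; the model name and hyperparameter values are illustrative only, and the dataloaders are assumed to come from dataloader_from_dataset.

qa_extractor = AnswerExtractor(model_name="bert-base-cased", cache_dir=".")
qa_extractor.fit(
    train_dataloader,      # DataLoader built with dataloader_from_dataset
    num_epochs=1,
    learning_rate=5e-5,
    cache_model=True,      # saves the fine-tuned weights under cache_dir
    verbose=True,
)
results = qa_extractor.predict(dev_dataloader, verbose=True)  # list of QAResult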
@@ -522,7 +574,7 @@ class AnswerExtractor(Transformer):

# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
)

# fine tune
@@ -530,7 +582,7 @@ class AnswerExtractor(Transformer):
train_dataloader=train_dataloader,
get_inputs=QAProcessor.get_inputs,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
gpu_ids=gpu_ids,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
@@ -550,13 +602,15 @@ class AnswerExtractor(Transformer):

Args:
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
num_gpus (int, optional): The number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the predicting log. Defaults to True.
verbose (bool, optional): Whether to print out the predicting log.
Defaults to True.

Returns:
list: List of :class:`QAResult` or :class:`QAResultExtended`.
@@ -589,7 +643,9 @@ class AnswerExtractor(Transformer):
)
else:
result = QAResult(
unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]),
unique_id=u_id.item(),
start_logits=_to_list(outputs[0][i]),
end_logits=_to_list(outputs[1][i]),
)
all_results.append(result)
torch.cuda.empty_cache()
@@ -612,53 +668,61 @@ def postprocess_bert_answer(
verbose_logging=False,
):
"""
Postprocesses start and end logits generated by :meth:`AnswerExtractor.fit` for BERT.
Postprocesses start and end logits
generated by :meth:`AnswerExtractor.fit` for BERT.

Args:
results (list): List of :class:`QAResult`.
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
contains the original document tokens that are used to generate the final answers
from the predicted start and end positions.
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
contains the mapping from indices in the processed token list to the original
document tokens that are used to generate the final predicted answers.
do_lower_case (bool): Whether an uncased tokenizer was used during data preprocessing.
This is required during answer finalization by comparing the predicted answer text
and the original text span in :func:`_get_final_text`.
unanswerable_exists (bool, optional): Whether there are unanswerable questions in the
data. If True, the start and end logits of the [CLS] token, which indicate the
probability of the answer being empty, are included in the candidate answer list.
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
This file contains the original document tokens that are used to generate
the final answers from the predicted start and end positions.
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
This file contains the mapping from indices in the processed token list
to the original document tokens that are used to generate the final
predicted answers.
do_lower_case (bool): Whether an uncased tokenizer was used during
data preprocessing. This is required during answer finalization
by comparing the predicted answer text and the original
text span in :func:`_get_final_text`.
unanswerable_exists (bool, optional): Whether there are unanswerable
questions in the data. If True, the start and end logits of the [CLS]
token, which indicate the probability of the answer being empty,
are included in the candidate answer list.
Defaults to False.
n_best_size (int, optional): The number of candidates to choose from each QAResult to
generate the final prediction. It's also the maximum number of n-best answers to
output for each question. Note that the number of n-best answers can be smaller than
`n_best_size` because some unqualified answers, e.g. answers that are too long,
are removed.
n_best_size (int, optional): The number of candidates to choose from each
QAResult to generate the final prediction. It's also the maximum number
of n-best answers to output for each question.
Note that the number of n-best answers can be smaller than `n_best_size`
because some unqualified answers,
e.g. answers that are too long, are removed.
max_answer_length (int, optional): Maximum length of the answer. Defaults to 30.
output_prediction_file (str, optional): Path of the file to save the predicted answers.
Defaults to "./qa_predictions.json".
output_nbest_file (str, optional): Path of the file to save the n-best answers. Defaults
to "./nbest_predictions.json".
output_null_log_odds_file (str, optional): If unanswerable_exists is True, the score
difference between empty prediction and best non-empty prediction is saved to this
file. These scores can be used to find the best threshold for predicting an empty
answer. Defaults to "./null_odds.json".
null_score_diff_threshold (float, optional): If unanswerable_exists=True and the score
difference between empty prediction and best non-empty prediction is higher than this
threshold, the final predicted answer is empty. Defaults to 0.0.
verbose_logging (bool, optional): Whether to log details of answer postprocessing.
Defaults to False.
output_prediction_file (str, optional): Path of the file to save the
predicted answers. Defaults to "./qa_predictions.json".
output_nbest_file (str, optional): Path of the file to save the n-best answers.
Defaults to "./nbest_predictions.json".
output_null_log_odds_file (str, optional): If unanswerable_exists is True,
the score difference between empty prediction and best non-empty prediction
is saved to this file. These scores can be used to find the best threshold
for predicting an empty answer. Defaults to "./null_odds.json".
null_score_diff_threshold (float, optional): If unanswerable_exists=True
and the score difference between empty prediction and best non-empty
prediction is higher than this threshold, the final predicted
answer is empty.
Defaults to 0.0.
verbose_logging (bool, optional): Whether to log details of
answer postprocessing. Defaults to False.

Returns:
tuple: (OrderedDict, OrderedDict, OrderedDict)
The keys of the dictionaries are the `qa_id` in the original
:class:`utils_nlp.dataset.pytorch.QADataset`
The values of the first dictionary are the predicted answer texts in string type.
The values of the second dictionary are the softmax probabilities of the predicted
answers.
The values of the third dictionary are the n-best answers for each qa_id. Note that
the number of n-best answers can be smaller than `n_best_size` because some
unqualified answers, e.g. answers that are too long, are removed.
The values of the first dictionary are the predicted answer texts
in string type. The values of the second dictionary are the softmax
probabilities of the predicted answers.
The values of the third dictionary are the n-best answers for each qa_id.
Note that the number of n-best answers can be smaller than `n_best_size`
because some unqualified answers, e.g. answers that are too long,
are removed.

"""
with jsonlines.open(examples_file) as reader:
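For reference, a hedged sketch of calling postprocess_bert_answer with the arguments documented above; the cached file names assume the default ./cached_qa_features directory produced by QAProcessor.preprocess, and the n_best_size value is illustrative only.

answers, probs, nbest = postprocess_bert_answer(
    results=results,   # list of QAResult returned by AnswerExtractor.predict
    examples_file="./cached_qa_features/cached_examples_test.jsonl",
    features_file="./cached_qa_features/cached_features_test.jsonl",
    do_lower_case=False,        # match the tokenizer casing used in preprocessing
    unanswerable_exists=False,
    n_best_size=20,
    max_answer_length=30,
)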
@@ -753,7 +817,9 @@ def postprocess_bert_answer(

# Sort by the sum of the start and end logits in descending order,
# so that the first element is the most probable answer
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
)

seen_predictions = {}
nbest = []
@@ -786,11 +852,19 @@ def postprocess_bert_answer(
final_text = ""
seen_predictions[final_text] = True

nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
)
)
# if we didn't include the empty option in the n-best, include it
if unanswerable_exists:
if "" not in seen_predictions:
nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
nbest.append(
_NbestPrediction(
text="", start_logit=null_start_logit, end_logit=null_end_logit
)
)

# In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
@@ -834,7 +908,9 @@ def postprocess_bert_answer(
all_probs[example["qa_id"]] = nbest_json[0]["probability"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
score_diff = (
score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
)
scores_diff_json[example["qa_id"]] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example["qa_id"]] = ""
@@ -1000,7 +1076,9 @@ def postprocess_xlnet_answer(
)
)

prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
)

seen_predictions = {}
nbest = []
@@ -1031,14 +1109,20 @@ def postprocess_xlnet_answer(
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)

final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
final_text = _get_final_text(
tok_text, orig_text, tokenizer.do_lower_case, verbose_logging
)

if final_text in seen_predictions:
continue

seen_predictions[final_text] = True

nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
)
)

# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
@@ -1185,7 +1269,9 @@ def _create_qa_example(qa_input, is_training):
actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
logger.warning(
"Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text,
)
return
else:
start_position = -1
@@ -1408,7 +1494,7 @@ def _create_qa_features(
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text,
)

# The -3 accounts for [CLS], [SEP] and [SEP]
@@ -1579,7 +1665,7 @@ def _create_qa_features(
# -------------------------------------------------------------------------------------------------
# Post processing helper functions
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
)

_NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"])
@@ -1644,7 +1730,9 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):

if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
logger.info(
"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text,
)
return orig_text

# We then project the characters in `pred_text` back to `orig_text` using