Black 20 release
Parent: e78c110338
Commit: a75c64d80c
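Note for readers of the diff below: this commit reruns Black >= 20 over the example scripts, so nearly every hunk is one of two mechanical changes: the "magic trailing comma" (a call or literal that already ends in a trailing comma is kept exploded, one argument per line) and docstring normalization (the stray space after the opening triple quote is dropped and continuation lines are re-indented). The snippet below is a minimal, hypothetical illustration of the trailing-comma rule; the convert() helper and its arguments are invented for this example and are not taken from the repository.

def convert(examples, tokenizer, max_length, task):
    # Stand-in for the real helpers touched in this diff.
    return examples, tokenizer, max_length, task

examples, tokenizer = ["a premise"], "dummy-tokenizer"

# Accepted by Black 19.x (fits on one line, trailing comma left alone):
features = convert(examples, tokenizer, max_length=128, task="mnli",)

# The same call after Black >= 20 sees the trailing comma:
features = convert(
    examples,
    tokenizer,
    max_length=128,
    task="mnli",
)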
@@ -112,7 +112,10 @@ if is_torch_available():
         cached_features_file = os.path.join(
             data_dir,
             "cached_{}_{}_{}_{}".format(
-                "dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
+                "dev" if evaluate else "train",
+                tokenizer.__class__.__name__,
+                str(max_seq_length),
+                task,
             ),
         )
         label_list = processor.get_labels()

@@ -278,7 +281,10 @@ class HansProcessor(DataProcessor):


 def hans_convert_examples_to_features(
-    examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
 ):
     """
     Loads a data file into a list of ``InputFeatures``
@@ -20,7 +20,9 @@ class PlotArguments:
     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
     """

-    csv_file: str = field(metadata={"help": "The csv file to plot."},)
+    csv_file: str = field(
+        metadata={"help": "The csv file to plot."},
+    )
     plot_along_batch: bool = field(
         default=False,
         metadata={"help": "Whether to plot along batch size or sequence lengh. Defaults to sequence length."},
@@ -30,7 +32,8 @@ class PlotArguments:
         metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
     )
     no_log_scale: bool = field(
-        default=False, metadata={"help": "Disable logarithmic scale when plotting"},
+        default=False,
+        metadata={"help": "Disable logarithmic scale when plotting"},
     )
     is_train: bool = field(
         default=False,
@@ -39,7 +42,8 @@ class PlotArguments:
         },
     )
     figure_png_file: Optional[str] = field(
-        default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
+        default=None,
+        metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
     )
     short_model_names: Optional[List[str]] = list_field(
         default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
@ -101,30 +101,30 @@ class AlbertModelWithPabee(AlbertModel):
|
|||
regression=False,
|
||||
):
|
||||
r"""
|
||||
Return:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
|
||||
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
|
||||
Last layer hidden-state of the first token of the sequence (classification token)
|
||||
further processed by a Linear layer and a Tanh activation function. The Linear
|
||||
layer weights are trained from the next sentence prediction (classification)
|
||||
objective during pre-training.
|
||||
Return:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
|
||||
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
|
||||
Last layer hidden-state of the first token of the sequence (classification token)
|
||||
further processed by a Linear layer and a Tanh activation function. The Linear
|
||||
layer weights are trained from the next sentence prediction (classification)
|
||||
objective during pre-training.
|
||||
|
||||
This output is usually *not* a good summary
|
||||
of the semantic content of the input, you're often better with averaging or pooling
|
||||
the sequence of hidden-states for the whole input sequence.
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
This output is usually *not* a good summary
|
||||
of the semantic content of the input, you're often better with averaging or pooling
|
||||
the sequence of hidden-states for the whole input sequence.
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
|
@@ -157,7 +157,10 @@ class AlbertModelWithPabee(AlbertModel):
             res = []
             for i in range(self.config.num_hidden_layers):
                 encoder_outputs = self.encoder.adaptive_forward(
-                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+                    encoder_outputs,
+                    current_layer=i,
+                    attention_mask=extended_attention_mask,
+                    head_mask=head_mask,
                 )

                 pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))

@@ -174,7 +177,10 @@ class AlbertModelWithPabee(AlbertModel):
             for i in range(self.config.num_hidden_layers):
                 calculated_layer_num += 1
                 encoder_outputs = self.encoder.adaptive_forward(
-                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+                    encoder_outputs,
+                    current_layer=i,
+                    attention_mask=extended_attention_mask,
+                    head_mask=head_mask,
                 )

                 pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@ -236,42 +242,42 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
|
|||
labels=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
|
||||
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
|
||||
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
|
||||
loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
|
||||
loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
from transformers import AlbertTokenizer
|
||||
from pabee import AlbertForSequenceClassificationWithPabee
|
||||
import torch
|
||||
from transformers import AlbertTokenizer
|
||||
from pabee import AlbertForSequenceClassificationWithPabee
|
||||
import torch
|
||||
|
||||
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, logits = outputs[:2]
|
||||
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, logits = outputs[:2]
|
||||
|
||||
"""
|
||||
|
||||
|
|
|
@ -108,30 +108,30 @@ class BertModelWithPabee(BertModel):
|
|||
regression=False,
|
||||
):
|
||||
r"""
|
||||
Return:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
|
||||
Last layer hidden-state of the first token of the sequence (classification token)
|
||||
further processed by a Linear layer and a Tanh activation function. The Linear
|
||||
layer weights are trained from the next sentence prediction (classification)
|
||||
objective during pre-training.
|
||||
Return:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
|
||||
Last layer hidden-state of the first token of the sequence (classification token)
|
||||
further processed by a Linear layer and a Tanh activation function. The Linear
|
||||
layer weights are trained from the next sentence prediction (classification)
|
||||
objective during pre-training.
|
||||
|
||||
This output is usually *not* a good summary
|
||||
of the semantic content of the input, you're often better with averaging or pooling
|
||||
the sequence of hidden-states for the whole input sequence.
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
This output is usually *not* a good summary
|
||||
of the semantic content of the input, you're often better with averaging or pooling
|
||||
the sequence of hidden-states for the whole input sequence.
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
|
@ -266,44 +266,44 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
|
|||
labels=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
from transformers import BertTokenizer, BertForSequenceClassification
|
||||
from pabee import BertForSequenceClassificationWithPabee
|
||||
import torch
|
||||
from transformers import BertTokenizer, BertForSequenceClassification
|
||||
from pabee import BertForSequenceClassificationWithPabee
|
||||
import torch
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
|
||||
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
|
||||
loss, logits = outputs[:2]
|
||||
loss, logits = outputs[:2]
|
||||
|
||||
"""
|
||||
|
||||
|
|
|
@@ -120,7 +120,10 @@ def train(args, train_dataset, model, tokenizer):
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+            model,
+            device_ids=[args.local_rank],
+            output_device=args.local_rank,
+            find_unused_parameters=True,
         )

     # Train!

@@ -151,13 +154,17 @@ def train(args, train_dataset, model, tokenizer):
             logger.info(" Continuing training from epoch %d", epochs_trained)
             logger.info(" Continuing training from global step %d", global_step)
             logger.info(
-                " Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch,
+                " Will skip the first %d steps in the first epoch",
+                steps_trained_in_current_epoch,
             )

     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
+        epochs_trained,
+        int(args.num_train_epochs),
+        desc="Epoch",
+        disable=args.local_rank not in [-1, 0],
     )
     set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:

@@ -372,7 +379,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         )
         features = convert_examples_to_features(
-            examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
@ -434,15 +445,24 @@ def main():
|
|||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--patience", default="0", type=str, required=False,
|
||||
"--patience",
|
||||
default="0",
|
||||
type=str,
|
||||
required=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--regression_threshold", default=0, type=float, required=False,
|
||||
"--regression_threshold",
|
||||
default=0,
|
||||
type=float,
|
||||
required=False,
|
||||
)
|
||||
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
|
||||
"--config_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained config name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
|
@ -466,17 +486,27 @@ def main():
|
|||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
|
||||
"--evaluate_during_training",
|
||||
action="store_true",
|
||||
help="Run evaluation during training at each logging step.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
|
||||
"--do_lower_case",
|
||||
action="store_true",
|
||||
help="Set this flag if you are using an uncased model.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
|
||||
"--per_gpu_train_batch_size",
|
||||
default=8,
|
||||
type=int,
|
||||
help="Batch size per GPU/CPU for training.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=1, type=int, help="Batch size per GPU/CPU for evaluation.",
|
||||
"--per_gpu_eval_batch_size",
|
||||
default=1,
|
||||
type=int,
|
||||
help="Batch size per GPU/CPU for evaluation.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
|
@ -485,13 +515,19 @@ def main():
|
|||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",
|
||||
"--learning_rate",
|
||||
default=5e-5,
|
||||
type=float,
|
||||
help="The initial learning rate for Adam.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
|
||||
"--num_train_epochs",
|
||||
default=3.0,
|
||||
type=float,
|
||||
help="Total number of training epochs to perform.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
|
@ -503,7 +539,10 @@ def main():
|
|||
|
||||
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.",
|
||||
"--save_steps",
|
||||
type=int,
|
||||
default=500,
|
||||
help="Save checkpoint every X updates steps.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
|
@ -512,10 +551,14 @@ def main():
|
|||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
|
||||
"--overwrite_output_dir",
|
||||
action="store_true",
|
||||
help="Overwrite the content of the output directory",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
|
||||
"--overwrite_cache",
|
||||
action="store_true",
|
||||
help="Overwrite the cached training and evaluation sets",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
|
@ -532,7 +575,10 @@ def main():
|
|||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--local_rank", type=int, default=-1, help="For distributed training: local_rank",
|
||||
"--local_rank",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="For distributed training: local_rank",
|
||||
)
|
||||
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
|
||||
|
@@ -634,7 +680,8 @@ def main():
         print("Output Layers Parameters:", output_layers_param_num)
         single_output_layer_param_num = sum(param.numel() for param in model.classifiers[0].parameters())
         print(
-            "Added Output Layers Parameters:", output_layers_param_num - single_output_layer_param_num,
+            "Added Output Layers Parameters:",
+            output_layers_param_num - single_output_layer_param_num,
         )

     logger.info("Training/evaluation parameters %s", args)
@@ -66,9 +66,9 @@ def print_2d_tensor(tensor):
 def compute_heads_importance(
     args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
 ):
-    """ This method shows how to compute:
-        - head attention entropy
-        - head importance scores according to http://arxiv.org/abs/1905.10650
+    """This method shows how to compute:
+    - head attention entropy
+    - head importance scores according to http://arxiv.org/abs/1905.10650
     """
     # Prepare our tensors
     n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads

@@ -150,8 +150,8 @@ compute_heads_importance(


 def mask_heads(args, model, eval_dataloader):
-    """ This method shows how to mask head (set some heads to zero), to test the effect on the network,
-        based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """This method shows how to mask head (set some heads to zero), to test the effect on the network,
+    based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
     """
     _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
     preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)

@@ -201,8 +201,8 @@ def mask_heads(args, model, eval_dataloader):


 def prune_heads(args, model, eval_dataloader, head_mask):
-    """ This method shows how to prune head (remove heads weights) based on
-        the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """This method shows how to prune head (remove heads weights) based on
+    the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
     """
     # Try pruning and test time speedup
     # Pruning is like masking but we actually remove the masked weights

@@ -395,7 +395,8 @@ def main():
         cache_dir=args.cache_dir,
     )
     tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        cache_dir=args.cache_dir,
     )
     model = AutoModelForSequenceClassification.from_pretrained(
         args.model_name_or_path,
@@ -138,6 +138,9 @@ def get_image_transforms():
             transforms.Resize(256),
             transforms.CenterCrop(224),
             transforms.ToTensor(),
-            transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
+            transforms.Normalize(
+                mean=[0.46777044, 0.44531429, 0.40661017],
+                std=[0.12221994, 0.12145835, 0.14380469],
+            ),
         ]
     )

@@ -30,7 +30,11 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
             )
         else:
             topk_filled_outputs.append(
-                (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
+                (
+                    masked_input.replace(masked_token, predicted_token),
+                    values[index].item(),
+                    predicted_token,
+                )
             )
     return topk_filled_outputs

@@ -71,10 +71,10 @@ def load_rocstories_dataset(dataset_path):


 def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
-    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
+    """Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)

-        To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
-        input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
+    To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
+    input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
     """
     tensor_datasets = []
     for dataset in encoded_datasets:

@@ -83,7 +83,10 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
         mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
         lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
         mc_labels = np.zeros((n_batch,), dtype=np.int64)
-        for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
+        for (
+            i,
+            (story, cont1, cont2, mc_label),
+        ) in enumerate(dataset):
             with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
             with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
             input_ids[i, 0, : len(with_cont1)] = with_cont1
@ -629,7 +629,9 @@ def main():
|
|||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
|
||||
config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
)
|
||||
model = AutoModelForMultipleChoice.from_pretrained(
|
||||
args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
|
||||
)
|
||||
|
|
|
@ -358,7 +358,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
||||
)
|
||||
features = convert_examples_to_features(
|
||||
examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
|
||||
examples,
|
||||
tokenizer,
|
||||
label_list=label_list,
|
||||
max_length=args.max_seq_length,
|
||||
output_mode=output_mode,
|
||||
)
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
|
|
|
@@ -14,8 +14,7 @@ from transformers.modeling_bert import (


 def entropy(x):
-    """ Calculate entropy of a pre-softmax logit Tensor
-    """
+    """Calculate entropy of a pre-softmax logit Tensor"""
     exp_x = torch.exp(x)
     A = torch.sum(exp_x, dim=1)  # sum of exp(x_i)
     B = torch.sum(x * exp_x, dim=1)  # sum of x_i * exp(x_i)
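A reader's note on the entropy helper in the hunk above: with A = sum_i exp(x_i) and B = sum_i x_i * exp(x_i), the entropy of softmax(x) reduces to log(A) - B/A. The sketch below checks that identity against a direct -sum p*log(p) computation; the final return line is an assumption (it is not visible in this hunk), so treat the snippet as illustrative rather than the file's exact code.

import torch

def entropy(x):
    # Entropy of softmax(x) along dim=1, following the hunk above.
    # The return expression is assumed; it does not appear in this diff.
    exp_x = torch.exp(x)
    A = torch.sum(exp_x, dim=1)      # sum of exp(x_i)
    B = torch.sum(x * exp_x, dim=1)  # sum of x_i * exp(x_i)
    return torch.log(A) - B / A      # equals -sum_i p_i * log(p_i) for p = softmax(x)

logits = torch.randn(4, 7)
p = torch.softmax(logits, dim=1)
direct = -(p * p.log()).sum(dim=1)
assert torch.allclose(entropy(logits), direct, atol=1e-5)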
@@ -104,7 +103,8 @@ class DeeBertEncoder(nn.Module):


 @add_start_docstrings(
-    "The Bert Model transformer with early exiting (DeeBERT). ", BERT_START_DOCSTRING,
+    "The Bert Model transformer with early exiting (DeeBERT). ",
+    BERT_START_DOCSTRING,
 )
 class DeeBertModel(BertPreTrainedModel):
     def __init__(self, config):

@@ -127,9 +127,9 @@ class DeeBertModel(BertPreTrainedModel):
         self.embeddings.word_embeddings = value

     def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@ -147,33 +147,33 @@ class DeeBertModel(BertPreTrainedModel):
|
|||
encoder_attention_mask=None,
|
||||
):
|
||||
r"""
|
||||
Return:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
|
||||
Last layer hidden-state of the first token of the sequence (classification token)
|
||||
further processed by a Linear layer and a Tanh activation function. The Linear
|
||||
layer weights are trained from the next sentence prediction (classification)
|
||||
objective during pre-training.
|
||||
Return:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
|
||||
Last layer hidden-state of the first token of the sequence (classification token)
|
||||
further processed by a Linear layer and a Tanh activation function. The Linear
|
||||
layer weights are trained from the next sentence prediction (classification)
|
||||
objective during pre-training.
|
||||
|
||||
This output is usually *not* a good summary
|
||||
of the semantic content of the input, you're often better with averaging or pooling
|
||||
the sequence of hidden-states for the whole input sequence.
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
This output is usually *not* a good summary
|
||||
of the semantic content of the input, you're often better with averaging or pooling
|
||||
the sequence of hidden-states for the whole input sequence.
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
|
||||
Tuple of each early exit's results (total length: number of layers)
|
||||
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
|
||||
Tuple of each early exit's results (total length: number of layers)
|
||||
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
|
||||
"""
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
|
@ -302,32 +302,32 @@ class DeeBertForSequenceClassification(BertPreTrainedModel):
|
|||
train_highway=False,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
|
||||
Tuple of each early exit's results (total length: number of layers)
|
||||
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
|
||||
Tuple of each early exit's results (total length: number of layers)
|
||||
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
|
||||
"""
|
||||
|
||||
exit_layer = self.num_layers
|
||||
|
|
|
@@ -11,7 +11,8 @@ from .modeling_highway_bert import BertPreTrainedModel, DeeBertModel, HighwayExc


 @add_start_docstrings(
-    "The RoBERTa Model transformer with early exiting (DeeRoBERTa). ", ROBERTA_START_DOCSTRING,
+    "The RoBERTa Model transformer with early exiting (DeeRoBERTa). ",
+    ROBERTA_START_DOCSTRING,
 )
 class DeeRobertaModel(DeeBertModel):

@ -58,32 +59,32 @@ class DeeRobertaForSequenceClassification(BertPreTrainedModel):
|
|||
train_highway=False,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
|
||||
Tuple of each early exit's results (total length: number of layers)
|
||||
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
|
||||
Tuple of each early exit's results (total length: number of layers)
|
||||
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
|
||||
"""
|
||||
|
||||
exit_layer = self.num_layers
|
||||
|
|
|
@@ -228,14 +228,20 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                 assert end_logits_tea.size() == end_logits_stu.size()

                 loss_fct = nn.KLDivLoss(reduction="batchmean")
-                loss_start = loss_fct(
-                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                    F.softmax(start_logits_tea / args.temperature, dim=-1),
-                ) * (args.temperature ** 2)
-                loss_end = loss_fct(
-                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                    F.softmax(end_logits_tea / args.temperature, dim=-1),
-                ) * (args.temperature ** 2)
+                loss_start = (
+                    loss_fct(
+                        F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        F.softmax(start_logits_tea / args.temperature, dim=-1),
+                    )
+                    * (args.temperature ** 2)
+                )
+                loss_end = (
+                    loss_fct(
+                        F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        F.softmax(end_logits_tea / args.temperature, dim=-1),
+                    )
+                    * (args.temperature ** 2)
+                )
                 loss_ce = (loss_start + loss_end) / 2.0

                 loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
@@ -118,7 +118,8 @@ def init_gpu_params(params):
     if params.multi_gpu:
         logger.info("Initializing PyTorch distributed")
         torch.distributed.init_process_group(
-            init_method="env://", backend="nccl",
+            init_method="env://",
+            backend="nccl",
         )


@ -233,7 +233,9 @@ def main():
|
|||
eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
|
||||
if config.model_type == "xlnet":
|
||||
data_collator = DataCollatorForPermutationLanguageModeling(
|
||||
tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length,
|
||||
tokenizer=tokenizer,
|
||||
plm_probability=data_args.plm_probability,
|
||||
max_span_length=data_args.max_span_length,
|
||||
)
|
||||
else:
|
||||
data_collator = DataCollatorForLanguageModeling(
|
||||
|
|
|
@ -226,10 +226,14 @@ class BaseTransformer(pl.LightningModule):
|
|||
help="Decoder layer dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dropout", type=float, help="Dropout probability (Optional). Goes into model.config",
|
||||
"--dropout",
|
||||
type=float,
|
||||
help="Dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--attention_dropout", type=float, help="Attention dropout probability (Optional). Goes into model.config",
|
||||
"--attention_dropout",
|
||||
type=float,
|
||||
help="Attention dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument(
|
||||
|
|
|
@ -95,7 +95,10 @@ def make_support(question, source="wiki40b", method="dense", n_results=10):
|
|||
)
|
||||
else:
|
||||
support_doc, hit_lst = query_es_index(
|
||||
question, es_client, index_name="english_wiki40b_snippets_100w", n_results=n_results,
|
||||
question,
|
||||
es_client,
|
||||
index_name="english_wiki40b_snippets_100w",
|
||||
n_results=n_results,
|
||||
)
|
||||
support_list = [
|
||||
(res["article_title"], res["section_title"].strip(), res["score"], res["passage_text"]) for res in hit_lst
|
||||
|
@ -154,7 +157,8 @@ header_full = """
|
|||
header_html,
|
||||
)
|
||||
st.sidebar.markdown(
|
||||
header_full, unsafe_allow_html=True,
|
||||
header_full,
|
||||
unsafe_allow_html=True,
|
||||
)
|
||||
|
||||
# Long Form QA with ELI5 and Wikipedia
|
||||
|
@ -173,9 +177,17 @@ action_list = [
|
|||
]
|
||||
demo_options = st.sidebar.checkbox("Demo options")
|
||||
if demo_options:
|
||||
action_st = st.sidebar.selectbox("", action_list, index=3,)
|
||||
action_st = st.sidebar.selectbox(
|
||||
"",
|
||||
action_list,
|
||||
index=3,
|
||||
)
|
||||
action = action_list.index(action_st)
|
||||
show_type = st.sidebar.selectbox("", ["Show full text of passages", "Show passage section titles"], index=0,)
|
||||
show_type = st.sidebar.selectbox(
|
||||
"",
|
||||
["Show full text of passages", "Show passage section titles"],
|
||||
index=0,
|
||||
)
|
||||
show_passages = show_type == "Show full text of passages"
|
||||
else:
|
||||
action = 3
|
||||
|
@ -250,7 +262,9 @@ questions_list = [
|
|||
"How does New Zealand have so many large bird predators?",
|
||||
]
|
||||
question_s = st.selectbox(
|
||||
"What would you like to ask? ---- select <MY QUESTION> to enter a new query", questions_list, index=1,
|
||||
"What would you like to ask? ---- select <MY QUESTION> to enter a new query",
|
||||
questions_list,
|
||||
index=1,
|
||||
)
|
||||
if question_s == "<MY QUESTION>":
|
||||
question = st.text_input("Enter your question here:", "")
|
||||
|
|
|
@ -48,7 +48,11 @@ def make_es_index_snippets(es_client, passages_dset, index_name="english_wiki_ki
|
|||
yield passage
|
||||
|
||||
# create the ES index
|
||||
for ok, action in streaming_bulk(client=es_client, index=index_name, actions=passage_generator(),):
|
||||
for ok, action in streaming_bulk(
|
||||
client=es_client,
|
||||
index=index_name,
|
||||
actions=passage_generator(),
|
||||
):
|
||||
progress.update(1)
|
||||
successes += ok
|
||||
print("Indexed %d documents" % (successes,))
|
||||
|
@ -137,7 +141,11 @@ class RetrievalQAEmbedder(torch.nn.Module):
|
|||
|
||||
# define function for checkpointing
|
||||
def partial_encode(*inputs):
|
||||
encoder_outputs = self.sent_encoder.encoder(inputs[0], attention_mask=inputs[1], head_mask=head_mask,)
|
||||
encoder_outputs = self.sent_encoder.encoder(
|
||||
inputs[0],
|
||||
attention_mask=inputs[1],
|
||||
head_mask=head_mask,
|
||||
)
|
||||
sequence_output = encoder_outputs[0]
|
||||
pooled_output = self.sent_encoder.pooler(sequence_output)
|
||||
return pooled_output
|
||||
|
@ -234,7 +242,11 @@ def train_qa_retriever_epoch(model, dataset, tokenizer, optimizer, scheduler, ar
|
|||
if step % args.print_freq == 0 or step == 1:
|
||||
print(
|
||||
"{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
|
||||
e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
|
||||
e,
|
||||
step,
|
||||
len(dataset) // args.batch_size,
|
||||
loc_loss / loc_steps,
|
||||
time() - st_time,
|
||||
)
|
||||
)
|
||||
loc_loss = 0
|
||||
|
@ -273,7 +285,11 @@ def train_qa_retriever_joint_epoch(model, dataset_list, tokenizer, optimizer, sc
|
|||
if step % args.print_freq == 0:
|
||||
print(
|
||||
"{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
|
||||
e, step, len(dataset_list[0]) // args.batch_size, loc_loss / loc_steps, time() - st_time,
|
||||
e,
|
||||
step,
|
||||
len(dataset_list[0]) // args.batch_size,
|
||||
loc_loss / loc_steps,
|
||||
time() - st_time,
|
||||
)
|
||||
)
|
||||
loc_loss = 0
|
||||
|
@ -354,7 +370,8 @@ class ELI5DatasetS2S(Dataset):
|
|||
self.document_cache[q_id] = self.document_cache.get(q_id, self.make_doc_function(example["title"]))
|
||||
document = self.document_cache[q_id]
|
||||
in_st = "question: {} context: {}".format(
|
||||
question.lower().replace(" --t--", "").strip(), document.lower().strip(),
|
||||
question.lower().replace(" --t--", "").strip(),
|
||||
document.lower().strip(),
|
||||
)
|
||||
out_st = answer
|
||||
return (in_st, out_st)
|
||||
|
@@ -427,7 +444,11 @@ def train_qa_s2s_epoch(model, dataset, tokenizer, optimizer, scheduler, args, e=
|
|||
if step % args.print_freq == 0 or step == 1:
|
||||
print(
|
||||
"{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
|
||||
e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
|
||||
e,
|
||||
step,
|
||||
len(dataset) // args.batch_size,
|
||||
loc_loss / loc_steps,
|
||||
time() - st_time,
|
||||
)
|
||||
)
|
||||
loc_loss = 0
|
||||
|
@@ -456,10 +477,18 @@ def eval_qa_s2s_epoch(model, dataset, tokenizer, args):
|
|||
if step % args.print_freq == 0:
|
||||
print(
|
||||
"{:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
|
||||
step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
|
||||
step,
|
||||
len(dataset) // args.batch_size,
|
||||
loc_loss / loc_steps,
|
||||
time() - st_time,
|
||||
)
|
||||
)
|
||||
print("Total \t L: {:.3f} \t -- {:.3f}".format(loc_loss / loc_steps, time() - st_time,))
|
||||
print(
|
||||
"Total \t L: {:.3f} \t -- {:.3f}".format(
|
||||
loc_loss / loc_steps,
|
||||
time() - st_time,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def train_qa_s2s(qa_s2s_model, qa_s2s_tokenizer, s2s_train_dset, s2s_valid_dset, s2s_args):
|
||||
|
@@ -506,7 +535,12 @@ def qa_s2s_generate(
|
|||
max_input_length=512,
|
||||
device="cuda:0",
|
||||
):
|
||||
model_inputs = make_qa_s2s_batch([(question_doc, "A")], qa_s2s_tokenizer, max_input_length, device=device,)
|
||||
model_inputs = make_qa_s2s_batch(
|
||||
[(question_doc, "A")],
|
||||
qa_s2s_tokenizer,
|
||||
max_input_length,
|
||||
device=device,
|
||||
)
|
||||
n_beams = num_answers if num_beams is None else max(num_beams, num_answers)
|
||||
generated_ids = qa_s2s_model.generate(
|
||||
input_ids=model_inputs["input_ids"],
|
||||
|
|
|
@@ -37,8 +37,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
class BertEmbeddings(nn.Module):
|
||||
"""Construct the embeddings from word, position and token_type embeddings.
|
||||
"""
|
||||
"""Construct the embeddings from word, position and token_type embeddings."""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
|
@@ -385,8 +384,8 @@ class BertPooler(nn.Module):
|
|||
|
||||
|
||||
class MaskedBertPreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = MaskedBertConfig
|
||||
|
@@ -492,9 +491,9 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
|
|||
self.embeddings.word_embeddings = value
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
@@ -685,31 +684,31 @@ class MaskedBertForSequenceClassification(MaskedBertPreTrainedModel):
|
|||
threshold=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
threshold (:obj:`float`):
|
||||
Threshold value (see :class:`~emmental.MaskedLinear`).
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
threshold (:obj:`float`):
|
||||
Threshold value (see :class:`~emmental.MaskedLinear`).
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
outputs = self.bert(
|
||||
|
@@ -770,32 +769,32 @@ class MaskedBertForMultipleChoice(MaskedBertPreTrainedModel):
|
|||
threshold=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
threshold (:obj:`float`):
|
||||
Threshold value (see :class:`~emmental.MaskedLinear`).
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
threshold (:obj:`float`):
|
||||
Threshold value (see :class:`~emmental.MaskedLinear`).
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
|
||||
Classification loss.
|
||||
classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
|
||||
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
|
||||
Classification loss.
|
||||
classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
|
||||
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
|
||||
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
|
||||
"""
|
||||
num_choices = input_ids.shape[1]
|
||||
|
@@ -860,29 +859,29 @@ class MaskedBertForTokenClassification(MaskedBertPreTrainedModel):
|
|||
threshold=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
threshold (:obj:`float`):
|
||||
Threshold value (see :class:`~emmental.MaskedLinear`).
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
threshold (:obj:`float`):
|
||||
Threshold value (see :class:`~emmental.MaskedLinear`).
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
|
||||
Classification loss.
|
||||
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
|
||||
Classification loss.
|
||||
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
|
||||
Classification scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
outputs = self.bert(
|
||||
|
@@ -947,36 +946,36 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
|
|||
threshold=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for position (index) of the start of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for position (index) of the end of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
threshold (:obj:`float`):
|
||||
Threshold value (see :class:`~emmental.MaskedLinear`).
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for position (index) of the start of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for position (index) of the end of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
threshold (:obj:`float`):
|
||||
Threshold value (see :class:`~emmental.MaskedLinear`).
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
|
||||
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
|
||||
start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
|
||||
Span-start scores (before SoftMax).
|
||||
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
|
||||
Span-end scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
|
||||
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
|
||||
start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
|
||||
Span-start scores (before SoftMax).
|
||||
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
|
||||
Span-end scores (before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
outputs = self.bert(
|
||||
|
@@ -996,7 +995,10 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
|
|||
start_logits = start_logits.squeeze(-1)
|
||||
end_logits = end_logits.squeeze(-1)
|
||||
|
||||
outputs = (start_logits, end_logits,) + outputs[2:]
|
||||
outputs = (
|
||||
start_logits,
|
||||
end_logits,
|
||||
) + outputs[2:]
|
||||
if start_positions is not None and end_positions is not None:
|
||||
# If we are on multi-GPU, split add a dimension
|
||||
if len(start_positions.size()) > 1:
|
||||
|
|
|
@@ -173,7 +173,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
|||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
|
||||
model,
|
||||
device_ids=[args.local_rank],
|
||||
output_device=args.local_rank,
|
||||
find_unused_parameters=True,
|
||||
)
|
||||
|
||||
# Train!
|
||||
|
@@ -217,7 +220,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
|||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
|
||||
epochs_trained,
|
||||
int(args.num_train_epochs),
|
||||
desc="Epoch",
|
||||
disable=args.local_rank not in [-1, 0],
|
||||
)
|
||||
set_seed(args)  # Added here for reproducibility
|
||||
for _ in train_iterator:
|
||||
|
@@ -280,11 +286,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
|||
attention_mask=inputs["attention_mask"],
|
||||
)
|
||||
|
||||
loss_logits = F.kl_div(
|
||||
input=F.log_softmax(logits_stu / args.temperature, dim=-1),
|
||||
target=F.softmax(logits_tea / args.temperature, dim=-1),
|
||||
reduction="batchmean",
|
||||
) * (args.temperature ** 2)
|
||||
loss_logits = (
|
||||
F.kl_div(
|
||||
input=F.log_softmax(logits_stu / args.temperature, dim=-1),
|
||||
target=F.softmax(logits_tea / args.temperature, dim=-1),
|
||||
reduction="batchmean",
|
||||
)
|
||||
* (args.temperature ** 2)
|
||||
)
|
||||
|
||||
loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
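# A standalone sketch of the distillation objective reformatted above: KL divergence
# between temperature-softened student and teacher logits, scaled by T**2, then mixed
# with the task loss. The alpha/temperature values and tensor shapes are illustrative only.
import torch
import torch.nn.functional as F

def distillation_loss(logits_stu, logits_tea, task_loss, alpha_distil=0.5, alpha_ce=0.5, temperature=2.0):
    loss_logits = F.kl_div(
        input=F.log_softmax(logits_stu / temperature, dim=-1),
        target=F.softmax(logits_tea / temperature, dim=-1),
        reduction="batchmean",
    ) * (temperature ** 2)  # T**2 keeps gradient magnitudes comparable across temperatures
    return alpha_distil * loss_logits + alpha_ce * task_loss

print(distillation_loss(torch.randn(4, 3), torch.randn(4, 3), torch.tensor(1.0)))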
|
||||
|
||||
|
@@ -529,7 +538,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
||||
)
|
||||
features = convert_examples_to_features(
|
||||
examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode,
|
||||
examples,
|
||||
tokenizer,
|
||||
max_length=args.max_seq_length,
|
||||
label_list=label_list,
|
||||
output_mode=output_mode,
|
||||
)
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
|
@@ -592,7 +605,10 @@ def main():
|
|||
)
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
|
||||
"--config_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained config name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
|
@@ -616,17 +632,27 @@ def main():
|
|||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
|
||||
"--evaluate_during_training",
|
||||
action="store_true",
|
||||
help="Run evaluation during training at each logging step.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
|
||||
"--do_lower_case",
|
||||
action="store_true",
|
||||
help="Set this flag if you are using an uncased model.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
|
||||
"--per_gpu_train_batch_size",
|
||||
default=8,
|
||||
type=int,
|
||||
help="Batch size per GPU/CPU for training.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
|
||||
"--per_gpu_eval_batch_size",
|
||||
default=8,
|
||||
type=int,
|
||||
help="Batch size per GPU/CPU for evaluation.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
|
||||
|
@@ -723,7 +749,10 @@ def main():
|
|||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
|
||||
"--num_train_epochs",
|
||||
default=3.0,
|
||||
type=float,
|
||||
help="Total number of training epochs to perform.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
|
@@ -742,10 +771,14 @@ def main():
|
|||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
|
||||
"--overwrite_output_dir",
|
||||
action="store_true",
|
||||
help="Overwrite the content of the output directory",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
|
||||
"--overwrite_cache",
|
||||
action="store_true",
|
||||
help="Overwrite the cached training and evaluation sets",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
|
|
|
@@ -181,7 +181,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
|||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
|
||||
model,
|
||||
device_ids=[args.local_rank],
|
||||
output_device=args.local_rank,
|
||||
find_unused_parameters=True,
|
||||
)
|
||||
|
||||
# Train!
|
||||
|
@@ -304,16 +307,22 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
|||
attention_mask=inputs["attention_mask"],
|
||||
)
|
||||
|
||||
loss_start = F.kl_div(
|
||||
input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
|
||||
target=F.softmax(start_logits_tea / args.temperature, dim=-1),
|
||||
reduction="batchmean",
|
||||
) * (args.temperature ** 2)
|
||||
loss_end = F.kl_div(
|
||||
input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
|
||||
target=F.softmax(end_logits_tea / args.temperature, dim=-1),
|
||||
reduction="batchmean",
|
||||
) * (args.temperature ** 2)
|
||||
loss_start = (
|
||||
F.kl_div(
|
||||
input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
|
||||
target=F.softmax(start_logits_tea / args.temperature, dim=-1),
|
||||
reduction="batchmean",
|
||||
)
|
||||
* (args.temperature ** 2)
|
||||
)
|
||||
loss_end = (
|
||||
F.kl_div(
|
||||
input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
|
||||
target=F.softmax(end_logits_tea / args.temperature, dim=-1),
|
||||
reduction="batchmean",
|
||||
)
|
||||
* (args.temperature ** 2)
|
||||
)
|
||||
loss_logits = (loss_start + loss_end) / 2.0
|
||||
|
||||
loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
|
||||
|
@@ -859,7 +868,10 @@ def main():
|
|||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
|
||||
"--num_train_epochs",
|
||||
default=3.0,
|
||||
type=float,
|
||||
help="Total number of training epochs to perform.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
|
|
|
@@ -100,7 +100,12 @@ if is_torch_available():
|
|||
|
||||
cached_features_file = os.path.join(
|
||||
data_dir,
|
||||
"cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,),
|
||||
"cached_{}_{}_{}_{}".format(
|
||||
mode.value,
|
||||
tokenizer.__class__.__name__,
|
||||
str(max_seq_length),
|
||||
task,
|
||||
),
|
||||
)
|
||||
|
||||
# Make sure only the first process in distributed training processes the dataset,
|
||||
|
@@ -121,7 +126,12 @@ if is_torch_available():
|
|||
else:
|
||||
examples = processor.get_train_examples(data_dir)
|
||||
logger.info("Training examples: %s", len(examples))
|
||||
self.features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,)
|
||||
self.features = convert_examples_to_features(
|
||||
examples,
|
||||
label_list,
|
||||
max_seq_length,
|
||||
tokenizer,
|
||||
)
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(self.features, cached_features_file)
|
||||
|
||||
|
@@ -164,7 +174,12 @@ if is_tf_available():
|
|||
examples = processor.get_train_examples(data_dir)
|
||||
logger.info("Training examples: %s", len(examples))
|
||||
|
||||
self.features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,)
|
||||
self.features = convert_examples_to_features(
|
||||
examples,
|
||||
label_list,
|
||||
max_seq_length,
|
||||
tokenizer,
|
||||
)
|
||||
|
||||
def gen():
|
||||
for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
|
||||
|
@@ -491,7 +506,10 @@ class ArcProcessor(DataProcessor):
|
|||
|
||||
|
||||
def convert_examples_to_features(
|
||||
examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
|
||||
examples: List[InputExample],
|
||||
label_list: List[str],
|
||||
max_length: int,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
) -> List[InputFeatures]:
|
||||
"""
|
||||
Loads a data file into a list of `InputFeatures`
|
||||
|
|
|
@@ -137,7 +137,12 @@ def main():
|
|||
)
|
||||
|
||||
# Initialize our Trainer
|
||||
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
)
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
|
|
|
@@ -231,7 +231,12 @@ def main():
|
|||
eval_dataset = eval_dataset.apply(tf.data.experimental.assert_cardinality(len(eval_examples)))
|
||||
|
||||
# Initialize our Trainer
|
||||
trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
|
||||
trainer = TFTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
)
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
|
|
|
@@ -28,7 +28,7 @@ BERTABS_FINETUNED_CONFIG_MAP = {
|
|||
|
||||
|
||||
class BertAbsConfig(PretrainedConfig):
|
||||
r""" Class to store the configuration of the BertAbs model.
|
||||
r"""Class to store the configuration of the BertAbs model.
|
||||
|
||||
Arguments:
|
||||
vocab_size: int
|
||||
|
|
|
@@ -62,7 +62,7 @@ BertAbsConfig = namedtuple(
|
|||
|
||||
|
||||
def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
|
||||
""" Copy/paste and tweak the pre-trained weights provided by the creators
|
||||
"""Copy/paste and tweak the pre-trained weights provided by the creators
|
||||
of BertAbs for the internal architecture.
|
||||
"""
|
||||
|
||||
|
@@ -164,13 +164,22 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
|
|||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump.",
|
||||
"--bertabs_checkpoint_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path the official PyTorch dump.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.",
|
||||
"--pytorch_dump_folder_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the output PyTorch model.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
convert_bertabs_checkpoints(
|
||||
args.bertabs_checkpoint_path, args.pytorch_dump_folder_path,
|
||||
args.bertabs_checkpoint_path,
|
||||
args.pytorch_dump_folder_path,
|
||||
)
|
||||
|
|
|
@@ -105,10 +105,17 @@ class BertAbs(BertAbsPreTrainedModel):
|
|||
p.data.zero_()
|
||||
|
||||
def forward(
|
||||
self, encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask,
|
||||
self,
|
||||
encoder_input_ids,
|
||||
decoder_input_ids,
|
||||
token_type_ids,
|
||||
encoder_attention_mask,
|
||||
decoder_attention_mask,
|
||||
):
|
||||
encoder_output = self.bert(
|
||||
input_ids=encoder_input_ids, token_type_ids=token_type_ids, attention_mask=encoder_attention_mask,
|
||||
input_ids=encoder_input_ids,
|
||||
token_type_ids=token_type_ids,
|
||||
attention_mask=encoder_attention_mask,
|
||||
)
|
||||
encoder_hidden_states = encoder_output[0]
|
||||
dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states)
|
||||
|
@@ -117,8 +124,7 @@ class BertAbs(BertAbsPreTrainedModel):
|
|||
|
||||
|
||||
class Bert(nn.Module):
|
||||
""" This class is not really necessary and should probably disappear.
|
||||
"""
|
||||
"""This class is not really necessary and should probably disappear."""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
@@ -307,7 +313,14 @@ class TransformerDecoderLayer(nn.Module):
|
|||
self.register_buffer("mask", mask)
|
||||
|
||||
def forward(
|
||||
self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, previous_input=None, layer_cache=None, step=None,
|
||||
self,
|
||||
inputs,
|
||||
memory_bank,
|
||||
src_pad_mask,
|
||||
tgt_pad_mask,
|
||||
previous_input=None,
|
||||
layer_cache=None,
|
||||
step=None,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
|
@@ -331,13 +344,25 @@ class TransformerDecoderLayer(nn.Module):
|
|||
all_input = torch.cat((previous_input, input_norm), dim=1)
|
||||
dec_mask = None
|
||||
|
||||
query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",)
|
||||
query = self.self_attn(
|
||||
all_input,
|
||||
all_input,
|
||||
input_norm,
|
||||
mask=dec_mask,
|
||||
layer_cache=layer_cache,
|
||||
type="self",
|
||||
)
|
||||
|
||||
query = self.drop(query) + inputs
|
||||
|
||||
query_norm = self.layer_norm_2(query)
|
||||
mid = self.context_attn(
|
||||
memory_bank, memory_bank, query_norm, mask=src_pad_mask, layer_cache=layer_cache, type="context",
|
||||
memory_bank,
|
||||
memory_bank,
|
||||
query_norm,
|
||||
mask=src_pad_mask,
|
||||
layer_cache=layer_cache,
|
||||
type="context",
|
||||
)
|
||||
output = self.feed_forward(self.drop(mid) + query)
|
||||
|
||||
|
@@ -422,7 +447,14 @@ class MultiHeadedAttention(nn.Module):
|
|||
self.final_linear = nn.Linear(model_dim, model_dim)
|
||||
|
||||
def forward(
|
||||
self, key, value, query, mask=None, layer_cache=None, type=None, predefined_graph_1=None,
|
||||
self,
|
||||
key,
|
||||
value,
|
||||
query,
|
||||
mask=None,
|
||||
layer_cache=None,
|
||||
type=None,
|
||||
predefined_graph_1=None,
|
||||
):
|
||||
"""
|
||||
Compute the context vector and the attention vectors.
|
||||
|
@@ -628,7 +660,7 @@ def gelu(x):
|
|||
|
||||
|
||||
class PositionwiseFeedForward(nn.Module):
|
||||
""" A two-layer Feed-Forward-Network with residual layer norm.
|
||||
"""A two-layer Feed-Forward-Network with residual layer norm.
|
||||
|
||||
Args:
|
||||
d_model (int): the size of input for the first-layer of the FFN.
|
||||
|
@@ -770,8 +802,7 @@ class Translator(object):
|
|||
self.max_length = args.max_length
|
||||
|
||||
def translate(self, batch, step, attn_debug=False):
|
||||
""" Generates summaries from one batch of data.
|
||||
"""
|
||||
"""Generates summaries from one batch of data."""
|
||||
self.model.eval()
|
||||
with torch.no_grad():
|
||||
batch_data = self.translate_batch(batch)
|
||||
|
@@ -798,8 +829,7 @@ class Translator(object):
|
|||
# Where the beam search lives
|
||||
# I have no idea why it is being called from the method above
|
||||
def _fast_translate_batch(self, batch, max_length, min_length=0):
|
||||
""" Beam Search using the encoder inputs contained in `batch`.
|
||||
"""
|
||||
"""Beam Search using the encoder inputs contained in `batch`."""
|
||||
|
||||
# The batch object is funny
|
||||
# Instead of just looking at the size of the arguments we encapsulate
|
||||
|
@@ -981,7 +1011,7 @@ def tile(x, count, dim=0):
|
|||
|
||||
|
||||
class BertSumOptimizer(object):
|
||||
""" Specific optimizer for BertSum.
|
||||
"""Specific optimizer for BertSum.
|
||||
|
||||
As described in [1], the authors fine-tune BertSum for abstractive
|
||||
summarization using two Adam Optimizers with different warm-up steps and
|
||||
|
@@ -999,10 +1029,16 @@ class BertSumOptimizer(object):
|
|||
|
||||
self.optimizers = {
|
||||
"encoder": torch.optim.Adam(
|
||||
model.encoder.parameters(), lr=lr["encoder"], betas=(beta_1, beta_2), eps=eps,
|
||||
model.encoder.parameters(),
|
||||
lr=lr["encoder"],
|
||||
betas=(beta_1, beta_2),
|
||||
eps=eps,
|
||||
),
|
||||
"decoder": torch.optim.Adam(
|
||||
model.decoder.parameters(), lr=lr["decoder"], betas=(beta_1, beta_2), eps=eps,
|
||||
model.decoder.parameters(),
|
||||
lr=lr["decoder"],
|
||||
betas=(beta_1, beta_2),
|
||||
eps=eps,
|
||||
),
|
||||
}
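# A toy sketch (assumed module and learning rates, not the repository's code) of the
# two-optimizer pattern above: separate Adam instances for encoder and decoder so each
# can use its own learning rate and warm-up schedule.
import torch

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Linear(8, 8)
        self.decoder = torch.nn.Linear(8, 8)

model = ToyModel()
optimizers = {
    "encoder": torch.optim.Adam(model.encoder.parameters(), lr=2e-3, betas=(0.9, 0.999), eps=1e-8),
    "decoder": torch.optim.Adam(model.decoder.parameters(), lr=2e-1, betas=(0.9, 0.999), eps=1e-8),
}
model.decoder(model.encoder(torch.randn(4, 8))).pow(2).mean().backward()
for opt in optimizers.values():
    opt.step()
    opt.zero_grad()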
|
||||
|
||||
|
|
|
@@ -99,7 +99,7 @@ def evaluate(args):
|
|||
|
||||
|
||||
def save_summaries(summaries, path, original_document_name):
|
||||
""" Write the summaries in fies that are prefixed by the original
|
||||
"""Write the summaries in fies that are prefixed by the original
|
||||
files' name with the `_summary` appended.
|
||||
|
||||
Attributes:
|
||||
|
@@ -125,7 +125,7 @@ def save_summaries(summaries, path, original_document_name):
|
|||
|
||||
|
||||
def format_summary(translation):
|
||||
""" Transforms the output of the `from_batch` function
|
||||
"""Transforms the output of the `from_batch` function
|
||||
into nicely formatted summaries.
|
||||
"""
|
||||
raw_summary, _, _ = translation
|
||||
|
@@ -190,7 +190,12 @@ def build_data_iterator(args, tokenizer):
|
|||
def collate_fn(data):
|
||||
return collate(data, tokenizer, block_size=512, device=args.device)
|
||||
|
||||
iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)
|
||||
iterator = DataLoader(
|
||||
dataset,
|
||||
sampler=sampler,
|
||||
batch_size=args.batch_size,
|
||||
collate_fn=collate_fn,
|
||||
)
|
||||
|
||||
return iterator
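# A self-contained sketch of the DataLoader + custom collate_fn pattern above: the collate
# function receives a list of raw samples and builds the padded batch itself. The toy data
# below is invented for illustration.
import torch
from torch.utils.data import DataLoader

samples = [torch.arange(n) for n in (3, 5, 2, 4)]

def collate_fn(batch):
    max_len = max(len(s) for s in batch)
    return torch.stack([torch.nn.functional.pad(s, (0, max_len - len(s))) for s in batch])

for batch in DataLoader(samples, batch_size=2, collate_fn=collate_fn):
    print(batch.shape)  # padded to the longest sequence in each mini-batch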
|
||||
|
||||
|
@@ -201,7 +206,7 @@ def load_and_cache_examples(args, tokenizer):
|
|||
|
||||
|
||||
def collate(data, tokenizer, block_size, device):
|
||||
""" Collate formats the data passed to the data loader.
|
||||
"""Collate formats the data passed to the data loader.
|
||||
|
||||
In particular we tokenize the data batch after batch to avoid keeping them
|
||||
all in memory. We output the data as a namedtuple to fit the original BertAbs's
|
||||
|
@@ -231,7 +236,7 @@ def collate(data, tokenizer, block_size, device):
|
|||
|
||||
|
||||
def decode_summary(summary_tokens, tokenizer):
|
||||
""" Decode the summary and return it in a format
|
||||
"""Decode the summary and return it in a format
|
||||
suitable for evaluation.
|
||||
"""
|
||||
summary_tokens = summary_tokens.to("cpu").numpy()
|
||||
|
@@ -242,8 +247,7 @@ def decode_summary(summary_tokens, tokenizer):
|
|||
|
||||
|
||||
def main():
|
||||
""" The main function defines the interface with the users.
|
||||
"""
|
||||
"""The main function defines the interface with the users."""
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--documents_dir",
|
||||
|
@@ -268,23 +272,41 @@ def main():
|
|||
)
|
||||
# EVALUATION options
|
||||
parser.add_argument(
|
||||
"--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.",
|
||||
"--no_cuda",
|
||||
default=False,
|
||||
type=bool,
|
||||
help="Whether to force the execution on CPU.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
|
||||
"--batch_size",
|
||||
default=4,
|
||||
type=int,
|
||||
help="Batch size per GPU/CPU for training.",
|
||||
)
|
||||
# BEAM SEARCH arguments
|
||||
parser.add_argument(
|
||||
"--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
|
||||
"--min_length",
|
||||
default=50,
|
||||
type=int,
|
||||
help="Minimum number of tokens for the summaries.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_length", default=200, type=int, help="Maixmum number of tokens for the summaries.",
|
||||
"--max_length",
|
||||
default=200,
|
||||
type=int,
|
||||
help="Maixmum number of tokens for the summaries.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
|
||||
"--beam_size",
|
||||
default=5,
|
||||
type=int,
|
||||
help="The number of beams to start with for each example.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
|
||||
"--alpha",
|
||||
default=0.95,
|
||||
type=float,
|
||||
help="The value of alpha for the length penalty in the beam search.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--block_trigram",
|
||||
|
|
|
@@ -43,8 +43,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
|
|||
self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
|
||||
|
||||
def test_process_story_no_highlights(self):
|
||||
""" Processing a story with no highlights returns an empty list for the summary.
|
||||
"""
|
||||
"""Processing a story with no highlights returns an empty list for the summary."""
|
||||
raw_story = """It was the year of Our Lord one thousand seven hundred and
|
||||
seventy-five.\n\nSpiritual revelations were conceded to England at that
|
||||
favoured period, as at this."""
|
||||
|
@@ -52,8 +51,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
|
|||
self.assertEqual(summary_lines, [])
|
||||
|
||||
def test_process_empty_story(self):
|
||||
""" An empty story returns an empty collection of lines.
|
||||
"""
|
||||
"""An empty story returns an empty collection of lines."""
|
||||
raw_story = ""
|
||||
story_lines, summary_lines = process_story(raw_story)
|
||||
self.assertEqual(story_lines, [])
|
||||
|
|
|
@@ -11,7 +11,7 @@ from torch.utils.data import Dataset
|
|||
|
||||
|
||||
class CNNDMDataset(Dataset):
|
||||
""" Abstracts the dataset used to train seq2seq models.
|
||||
"""Abstracts the dataset used to train seq2seq models.
|
||||
|
||||
The class will process the documents that are located in the specified
|
||||
folder. The preprocessing will work on any document that is reasonably
|
||||
|
@@ -31,7 +31,7 @@ class CNNDMDataset(Dataset):
|
|||
"""
|
||||
|
||||
def __init__(self, path="", prefix="train"):
|
||||
""" We initialize the class by listing all the documents to summarize.
|
||||
"""We initialize the class by listing all the documents to summarize.
|
||||
Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
|
||||
"""
|
||||
assert os.path.isdir(path)
|
||||
|
@@ -60,7 +60,7 @@ class CNNDMDataset(Dataset):
|
|||
|
||||
|
||||
def process_story(raw_story):
|
||||
""" Extract the story and summary from a story file.
|
||||
"""Extract the story and summary from a story file.
|
||||
|
||||
Arguments:
|
||||
raw_story (str): content of the story file as an utf-8 encoded string.
|
||||
|
@@ -108,7 +108,7 @@ def _add_missing_period(line):
|
|||
|
||||
|
||||
def truncate_or_pad(sequence, block_size, pad_token_id):
|
||||
""" Adapt the source and target sequences' lengths to the block size.
|
||||
"""Adapt the source and target sequences' lengths to the block size.
|
||||
If the sequence is shorter we append padding token to the right of the sequence.
|
||||
"""
|
||||
if len(sequence) > block_size:
|
||||
|
@@ -119,8 +119,8 @@ def truncate_or_pad(sequence, block_size, pad_token_id):
|
|||
|
||||
|
||||
def build_mask(sequence, pad_token_id):
|
||||
""" Builds the mask. The attention mechanism will only attend to positions
|
||||
with value 1. """
|
||||
"""Builds the mask. The attention mechanism will only attend to positions
|
||||
with value 1."""
|
||||
mask = torch.ones_like(sequence)
|
||||
idx_pad_tokens = sequence == pad_token_id
|
||||
mask[idx_pad_tokens] = 0
|
||||
|
@@ -128,7 +128,7 @@ def build_mask(sequence, pad_token_id):
|
|||
|
||||
|
||||
def encode_for_summarization(story_lines, summary_lines, tokenizer):
|
||||
""" Encode the story and summary lines, and join them
|
||||
"""Encode the story and summary lines, and join them
|
||||
as specified in [1] by using `[SEP] [CLS]` tokens to separate
|
||||
sentences.
|
||||
"""
|
||||
|
@@ -141,7 +141,7 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
|
|||
|
||||
|
||||
def compute_token_type_ids(batch, separator_token_id):
|
||||
""" Segment embeddings as described in [1]
|
||||
"""Segment embeddings as described in [1]
|
||||
|
||||
The values {0,1} were found in the repository [2].
|
||||
|
||||
|
|
|
@@ -97,4 +97,9 @@ def get_checkpoint_callback(output_dir, metric):
|
|||
|
||||
|
||||
def get_early_stopping_callback(metric, patience):
|
||||
return EarlyStopping(monitor=f"val_{metric}", mode="max", patience=patience, verbose=True,)
|
||||
return EarlyStopping(
|
||||
monitor=f"val_{metric}",
|
||||
mode="max",
|
||||
patience=patience,
|
||||
verbose=True,
|
||||
)
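# A sketch of wiring the callback above into a pytorch-lightning Trainer. The monitored
# metric name and patience are assumptions, and depending on the pytorch-lightning version
# the callback may need to be passed via a dedicated argument instead of `callbacks`.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor="val_rouge2", mode="max", patience=3, verbose=True)
trainer = Trainer(callbacks=[early_stop], max_epochs=10)
# trainer.fit(lightning_module)  # the LightningModule must log "val_rouge2" for monitoring to work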
|
||||
|
|
|
@@ -348,7 +348,10 @@ class T5SummarizationDistiller(BartSummarizationDistiller):
|
|||
if self.different_encoder:
|
||||
with torch.no_grad():
|
||||
teacher_enc_outputs, teacher_enc_hid = self.teacher.encoder(
|
||||
source_ids, attention_mask=source_mask, output_hidden_states=True, use_cache=False,
|
||||
source_ids,
|
||||
attention_mask=source_mask,
|
||||
output_hidden_states=True,
|
||||
use_cache=False,
|
||||
)
|
||||
if self.hparams.alpha_encoder_loss > 0:
|
||||
loss_encoder = self.calc_mse_loss(enc_outputs, teacher_enc_outputs, source_mask)
|
||||
|
|
|
@@ -117,7 +117,12 @@ class TestSummarizationDistiller(unittest.TestCase):
|
|||
|
||||
@require_multigpu
|
||||
def test_multigpu(self):
|
||||
updates = dict(no_teacher=True, freeze_encoder=True, gpus=2, sortish_sampler=False,)
|
||||
updates = dict(
|
||||
no_teacher=True,
|
||||
freeze_encoder=True,
|
||||
gpus=2,
|
||||
sortish_sampler=False,
|
||||
)
|
||||
self._test_distiller_cli(updates)
|
||||
|
||||
def test_distill_no_teacher(self):
|
||||
|
@@ -261,7 +266,8 @@ def test_run_eval_bart(model):
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
["model"], [pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)],
|
||||
["model"],
|
||||
[pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)],
|
||||
)
|
||||
def test_finetune(model):
|
||||
args_d: dict = CHEAP_ARGS.copy()
|
||||
|
@@ -329,7 +335,8 @@ def test_finetune_extra_model_args():
|
|||
output_dir = tempfile.mkdtemp(prefix="output_1_")
|
||||
args_d1 = args_d.copy()
|
||||
args_d1.update(
|
||||
model_name_or_path=model, output_dir=output_dir,
|
||||
model_name_or_path=model,
|
||||
output_dir=output_dir,
|
||||
)
|
||||
extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
|
||||
for p in extra_model_params:
|
||||
|
@@ -344,7 +351,8 @@ def test_finetune_extra_model_args():
|
|||
output_dir = tempfile.mkdtemp(prefix="output_2_")
|
||||
args_d2 = args_d.copy()
|
||||
args_d2.update(
|
||||
model_name_or_path=model, output_dir=output_dir,
|
||||
model_name_or_path=model,
|
||||
output_dir=output_dir,
|
||||
)
|
||||
unsupported_param = "encoder_layerdrop"
|
||||
args_d2[unsupported_param] = 0.5
|
||||
|
@@ -478,7 +486,11 @@ def test_summarization_dataset_truncation(tok):
|
|||
max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
|
||||
trunc_target = 4
|
||||
train_dataset = Seq2SeqDataset(
|
||||
tokenizer, data_dir=tmp_dir, type_path="train", max_source_length=20, max_target_length=trunc_target,
|
||||
tokenizer,
|
||||
data_dir=tmp_dir,
|
||||
type_path="train",
|
||||
max_source_length=20,
|
||||
max_target_length=trunc_target,
|
||||
)
|
||||
dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
|
||||
for batch in dataloader:
|
||||
|
|
|
@@ -63,7 +63,9 @@ def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
|
|||
|
||||
|
||||
def trim_batch(
|
||||
input_ids, pad_token_id, attention_mask=None,
|
||||
input_ids,
|
||||
pad_token_id,
|
||||
attention_mask=None,
|
||||
):
|
||||
"""Remove columns that are populated exclusively by pad_token_id"""
|
||||
keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
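# A sketch completing the trim_batch idea above: columns that contain only pad_token_id
# are dropped so the batch is no wider than its longest real sequence. The body past the
# first line is reconstructed for illustration, not quoted from this diff.
import torch

def trim_batch(input_ids, pad_token_id, attention_mask=None):
    """Remove columns that are populated exclusively by pad_token_id"""
    keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
    if attention_mask is None:
        return input_ids[:, keep_column_mask]
    return input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask]

print(trim_batch(torch.tensor([[5, 6, 0, 0], [7, 0, 0, 0]]), pad_token_id=0))
# tensor([[5, 6], [7, 0]])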
|
||||
|
|
|
@@ -153,7 +153,11 @@ class GLUETransformer(BaseTransformer):
|
|||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--task", default="", type=str, required=True, help="The GLUE task to run",
|
||||
"--task",
|
||||
default="",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The GLUE task to run",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gpus",
|
||||
|
@@ -177,7 +181,10 @@ def main():
|
|||
|
||||
# If output_dir not provided, a folder will be generated in pwd
|
||||
if args.output_dir is None:
|
||||
args.output_dir = os.path.join("./results", f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",)
|
||||
args.output_dir = os.path.join(
|
||||
"./results",
|
||||
f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
|
||||
)
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
model = GLUETransformer(args)
|
||||
|
|
|
@@ -328,7 +328,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||
processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
||||
)
|
||||
features = convert_examples_to_features(
|
||||
examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode,
|
||||
examples,
|
||||
tokenizer,
|
||||
max_length=args.max_seq_length,
|
||||
label_list=label_list,
|
||||
output_mode=output_mode,
|
||||
)
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
|
|
|
@@ -698,7 +698,9 @@ def run_pplm_example(
|
|||
for word_id in pert_gen_tok_text.tolist()[0]:
|
||||
if word_id in bow_word_ids:
|
||||
pert_gen_text += "{}{}{}".format(
|
||||
colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL,
|
||||
colorama.Fore.RED,
|
||||
tokenizer.decode([word_id]),
|
||||
colorama.Style.RESET_ALL,
|
||||
)
|
||||
else:
|
||||
pert_gen_text += tokenizer.decode([word_id])
|
||||
|
@@ -729,7 +731,10 @@ if __name__ == "__main__":
|
|||
parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
|
||||
parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix")
|
||||
parser.add_argument(
|
||||
"--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents",
|
||||
"--num_samples",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of samples to generate from the modified latents",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bag_of_words",
|
||||
|
@@ -751,13 +756,22 @@ if __name__ == "__main__":
|
|||
help="Discriminator to use",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--discrim_weights", type=str, default=None, help="Weights for the generic discriminator",
|
||||
"--discrim_weights",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Weights for the generic discriminator",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator",
|
||||
"--discrim_meta",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Meta information for the generic discriminator",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--class_label", type=int, default=-1, help="Class label used for the discriminator",
|
||||
"--class_label",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="Class label used for the discriminator",
|
||||
)
|
||||
parser.add_argument("--length", type=int, default=100)
|
||||
parser.add_argument("--stepsize", type=float, default=0.02)
|
||||
|
@@ -773,7 +787,10 @@ if __name__ == "__main__":
|
|||
help="Length of past which is being optimized; 0 corresponds to infinite window length",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--horizon_length", type=int, default=1, help="Length of future to optimize over",
|
||||
"--horizon_length",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Length of future to optimize over",
|
||||
)
|
||||
parser.add_argument("--decay", action="store_true", help="whether to decay or not")
|
||||
parser.add_argument("--gamma", type=float, default=1.5)
|
||||
|
@@ -783,7 +800,10 @@ if __name__ == "__main__":
|
|||
parser.add_argument("--no_cuda", action="store_true", help="no cuda")
|
||||
parser.add_argument("--colorama", action="store_true", help="colors keywords")
|
||||
parser.add_argument(
|
||||
"--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
|
||||
"--repetition_penalty",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Penalize repetition. More than 1.0 -> less repetition",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
|
|
@@ -242,7 +242,12 @@ def train_discriminator(
|
|||
|
||||
text = torchtext_data.Field()
|
||||
label = torchtext_data.Field(sequential=False)
|
||||
train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,)
|
||||
train_data, val_data, test_data = datasets.SST.splits(
|
||||
text,
|
||||
label,
|
||||
fine_grained=True,
|
||||
train_subtrees=True,
|
||||
)
|
||||
|
||||
x = []
|
||||
y = []
|
||||
|
|
|
@@ -41,7 +41,9 @@ from transformers import (
|
|||
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@@ -197,7 +199,10 @@ def main():
|
|||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
|
||||
logger.warning(
|
||||
"device: %s, n_gpu: %s, 16-bits training: %s", args.device, args.n_gpu, args.fp16,
|
||||
"device: %s, n_gpu: %s, 16-bits training: %s",
|
||||
args.device,
|
||||
args.n_gpu,
|
||||
args.fp16,
|
||||
)
|
||||
|
||||
set_seed(args)
|
||||
|
|
|
@@ -90,11 +90,11 @@ class TokenClassificationTask:
|
|||
sequence_a_segment_id=0,
|
||||
mask_padding_with_zero=True,
|
||||
) -> List[InputFeatures]:
|
||||
""" Loads a data file into a list of `InputFeatures`
|
||||
`cls_token_at_end` defines the location of the CLS token:
|
||||
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
|
||||
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
|
||||
`cls_token_segment_id` defines the segment id associated with the CLS token (0 for BERT, 2 for XLNet)
|
||||
"""Loads a data file into a list of `InputFeatures`
|
||||
`cls_token_at_end` defines the location of the CLS token:
|
||||
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
|
||||
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
|
||||
`cls_token_segment_id` defines the segment id associated with the CLS token (0 for BERT, 2 for XLNet)
|
||||
"""
|
||||
# TODO clean up all this to leverage built-in features of tokenizers
|
||||
|
||||
|
@@ -230,7 +230,8 @@ if is_torch_available():
|
|||
):
|
||||
# Load data features from cache or dataset file
|
||||
cached_features_file = os.path.join(
|
||||
data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
|
||||
data_dir,
|
||||
"cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
|
||||
)
|
||||
|
||||
# Make sure only the first process in distributed training processes the dataset,
|
||||
|
|
|
@@ -14,18 +14,18 @@ def swish(x):
|
|||
|
||||
|
||||
def _gelu_python(x):
|
||||
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
|
||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
This is now written in C in torch.nn.functional
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
"""Original Implementation of the gelu activation function in Google Bert repo when initially created.
|
||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
This is now written in C in torch.nn.functional
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
"""
|
||||
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
||||
|
||||
|
||||
def gelu_new(x):
|
||||
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
"""Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
"""
|
||||
return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
|
||||
|
||||
|
|
|
@@ -199,11 +199,17 @@ class PyTorchBenchmark(Benchmark):
# run additional 10 times to stabilize compilation for tpu and torchscript
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
timeit.repeat(
func, repeat=1, number=5,
func,
repeat=1,
number=5,
)

# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
runtimes = timeit.repeat(
func,
repeat=self.args.repeat,
number=10,
)

if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
import torch_xla.debug.metrics as met
@@ -32,10 +32,12 @@ logger = logging.get_logger(__name__)
@dataclass
class TensorFlowBenchmarkArguments(BenchmarkArguments):
tpu_name: str = field(
default=None, metadata={"help": "Name of TPU"},
default=None,
metadata={"help": "Name of TPU"},
)
device_idx: int = field(
default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
default=0,
metadata={"help": "CPU / GPU device index. Defaults to 0."},
)
eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager model."})
use_xla: bool = field(
@@ -219,7 +219,11 @@ class TensorFlowBenchmark(Benchmark):
timeit.repeat(func, repeat=1, number=5)

# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
runtimes = timeit.repeat(
func,
repeat=self.args.repeat,
number=10,
)

return min(runtimes) / 10.0
except ResourceExhaustedError as e:
@@ -63,15 +63,15 @@ BenchmarkOutput = namedtuple(

def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
"""
This function wraps another function into its own separated process.
In order to ensure accurate memory measurements it is important that the function
is executed in a separate process
This function wraps another function into its own separated process.
In order to ensure accurate memory measurements it is important that the function
is executed in a separate process

Args:
- `func`: (`callable`): function() -> ...
generic function which will be executed in its own separate process
- `do_multi_processing`: (`bool`)
Whether to run function on separate process or not
Args:
- `func`: (`callable`): function() -> ...
generic function which will be executed in its own separate process
- `do_multi_processing`: (`bool`)
Whether to run function on separate process or not
"""

def multi_process_func(*args, **kwargs):
@@ -106,13 +106,13 @@ def is_memory_tracing_enabled():


class Frame(NamedTuple):
""" `Frame` is a NamedTuple used to gather the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""`Frame` is a NamedTuple used to gather the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""

filename: str
@@ -123,10 +123,10 @@ class Frame(NamedTuple):


class UsedMemoryState(NamedTuple):
""" `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
"""`UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
"""

frame: Frame
@@ -135,9 +135,9 @@ class UsedMemoryState(NamedTuple):


class Memory(NamedTuple):
""" `Memory` NamedTuple have a single field `bytes` and
you can get a human readable str of the number of mega bytes by calling `__repr__`
- `byte` (integer): number of bytes,
"""`Memory` NamedTuple have a single field `bytes` and
you can get a human readable str of the number of mega bytes by calling `__repr__`
- `byte` (integer): number of bytes,
"""

bytes: int
@@ -147,11 +147,11 @@ class Memory(NamedTuple):


class MemoryState(NamedTuple):
""" `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
"""`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
"""

frame: Frame
@@ -161,14 +161,14 @@ class MemoryState(NamedTuple):


class MemorySummary(NamedTuple):
""" `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
"""`MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
"""

sequential: List[MemoryState]
@@ -182,38 +182,38 @@ MemoryTrace = List[UsedMemoryState]

def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
"""
measures peak cpu memory consumption of a given `function`
running the function for at least interval seconds
and at most 20 * interval seconds.
This function is heavily inspired by: `memory_usage`
of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
measures peak cpu memory consumption of a given `function`
running the function for at least interval seconds
and at most 20 * interval seconds.
This function is heavily inspired by: `memory_usage`
of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239

Args:
- `function`: (`callable`): function() -> ...
function without any arguments to measure for which to measure the peak memory
Args:
- `function`: (`callable`): function() -> ...
function without any arguments to measure for which to measure the peak memory

- `interval`: (`float`, `optional`, defaults to `0.5`)
interval in second for which to measure the memory usage
- `interval`: (`float`, `optional`, defaults to `0.5`)
interval in second for which to measure the memory usage

- `device_idx`: (`int`, `optional`, defaults to `None`)
device id for which to measure gpu usage
- `device_idx`: (`int`, `optional`, defaults to `None`)
device id for which to measure gpu usage

Returns:
- `max_memory`: (`int`)
cosumed memory peak in Bytes
Returns:
- `max_memory`: (`int`)
cosumed memory peak in Bytes
"""

def get_cpu_memory(process_id: int) -> int:
"""
measures current cpu memory usage of a given `process_id`
measures current cpu memory usage of a given `process_id`

Args:
- `process_id`: (`int`)
process_id for which to measure memory
Args:
- `process_id`: (`int`)
process_id for which to measure memory

Returns
- `memory`: (`int`)
cosumed memory in Bytes
Returns
- `memory`: (`int`)
cosumed memory in Bytes
"""
process = psutil.Process(process_id)
try:
@@ -234,8 +234,8 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i
class MemoryMeasureProcess(Process):

"""
`MemoryMeasureProcess` inherits from `Process` and overwrites
its `run()` method. Used to measure the memory usage of a process
`MemoryMeasureProcess` inherits from `Process` and overwrites
its `run()` method. Used to measure the memory usage of a process
"""

def __init__(self, process_id: int, child_connection: Connection, interval: float):
@@ -309,37 +309,37 @@ def start_memory_tracing(
events_to_trace: str = "line",
gpus_to_trace: Optional[List[int]] = None,
) -> MemoryTrace:
""" Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
See `./benchmark.py` for usage examples.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size” (the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
"""Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
See `./benchmark.py` for usage examples.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size” (the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info

Args:
- `modules_to_trace`: (None, string, list/tuple of string)
if None, all events are recorded
if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
- `modules_not_to_trace`: (None, string, list/tuple of string)
if None, no module is avoided
if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
- `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
default to line
- `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
Args:
- `modules_to_trace`: (None, string, list/tuple of string)
if None, all events are recorded
if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
- `modules_not_to_trace`: (None, string, list/tuple of string)
if None, no module is avoided
if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
- `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
default to line
- `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs

Return:
- `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
- `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
Return:
- `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
- `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)

`Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
`Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script

"""
if is_psutil_available():
@@ -371,8 +371,8 @@ def start_memory_tracing(
memory_trace = []

def traceit(frame, event, args):
""" Tracing method executed before running each line in a module or sub-module
Record memory allocated in a list with debugging information
"""Tracing method executed before running each line in a module or sub-module
Record memory allocated in a list with debugging information
"""
global _is_memory_tracing_enabled
@@ -456,39 +456,39 @@ def start_memory_tracing(
def stop_memory_tracing(
memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
) -> Optional[MemorySummary]:
""" Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
"""Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.

Args:
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
- `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory
Args:
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
- `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory

Return:
- None if `memory_trace` is None
- `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
Return:
- None if `memory_trace` is None
- `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).

`Memory` named tuple have fields
- `byte` (integer): number of bytes,
- `string` (string): same as human readable string (ex: "3.5MB")
`Memory` named tuple have fields
- `byte` (integer): number of bytes,
- `string` (string): same as human readable string (ex: "3.5MB")

`Frame` are namedtuple used to list the current frame state and have the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
`Frame` are namedtuple used to list the current frame state and have the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script

`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
"""
global _is_memory_tracing_enabled
_is_memory_tracing_enabled = False
@@ -499,15 +499,19 @@ def stop_memory_tracing(

cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])

for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip(
memory_trace[:-1], memory_trace[1:]
):
for (
(frame, cpu_mem, gpu_mem),
(next_frame, next_cpu_mem, next_gpu_mem),
) in zip(memory_trace[:-1], memory_trace[1:]):
cpu_mem_inc = next_cpu_mem - cpu_mem
gpu_mem_inc = next_gpu_mem - gpu_mem
cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
memory_diff_trace.append(
MemoryState(
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
frame=frame,
cpu=Memory(cpu_mem_inc),
gpu=Memory(gpu_mem_inc),
cpu_gpu=Memory(cpu_gpu_mem_inc),
)
)
@@ -529,7 +533,10 @@ def stop_memory_tracing(
) # order by the total CPU + GPU memory increase
cumulative_memory = list(
MemoryState(
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
frame=frame,
cpu=Memory(cpu_mem_inc),
gpu=Memory(gpu_mem_inc),
cpu_gpu=Memory(cpu_gpu_mem_inc),
)
for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
)
@@ -544,15 +551,17 @@ def stop_memory_tracing(
total_memory = Memory(total_memory)

return MemorySummary(
sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory,
sequential=memory_diff_trace,
cumulative=cumulative_memory,
current=memory_curr_trace,
total=total_memory,
)

return None


def bytes_to_mega_bytes(memory_amount: int) -> int:
""" Utility to convert a number of bytes (int) into a number of mega bytes (int)
"""
"""Utility to convert a number of bytes (int) into a number of mega bytes (int)"""
return memory_amount >> 20
@ -32,71 +32,71 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||
|
||||
class AlbertConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
|
||||
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
|
||||
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 30000):
|
||||
Vocabulary size of the ALBERT model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
|
||||
embedding_size (:obj:`int`, optional, defaults to 128):
|
||||
Dimensionality of vocabulary embeddings.
|
||||
hidden_size (:obj:`int`, optional, defaults to 4096):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
num_hidden_layers (:obj:`int`, optional, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_hidden_groups (:obj:`int`, optional, defaults to 1):
|
||||
Number of groups for the hidden layers, parameters in the same group are shared.
|
||||
num_attention_heads (:obj:`int`, optional, defaults to 64):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
intermediate_size (:obj:`int`, optional, defaults to 16384):
|
||||
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
inner_group_num (:obj:`int`, optional, defaults to 1):
|
||||
The number of inner repetition of attention and ffn.
|
||||
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
|
||||
The non-linear activation function (function or string) in the encoder and pooler.
|
||||
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something
|
||||
large (e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size (:obj:`int`, optional, defaults to 2):
|
||||
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for attached classifiers.
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 30000):
|
||||
Vocabulary size of the ALBERT model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
|
||||
embedding_size (:obj:`int`, optional, defaults to 128):
|
||||
Dimensionality of vocabulary embeddings.
|
||||
hidden_size (:obj:`int`, optional, defaults to 4096):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
num_hidden_layers (:obj:`int`, optional, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_hidden_groups (:obj:`int`, optional, defaults to 1):
|
||||
Number of groups for the hidden layers, parameters in the same group are shared.
|
||||
num_attention_heads (:obj:`int`, optional, defaults to 64):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
intermediate_size (:obj:`int`, optional, defaults to 16384):
|
||||
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
inner_group_num (:obj:`int`, optional, defaults to 1):
|
||||
The number of inner repetition of attention and ffn.
|
||||
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
|
||||
The non-linear activation function (function or string) in the encoder and pooler.
|
||||
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something
|
||||
large (e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size (:obj:`int`, optional, defaults to 2):
|
||||
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for attached classifiers.
|
||||
|
||||
Example::
|
||||
Example::
|
||||
|
||||
>>> from transformers import AlbertConfig, AlbertModel
|
||||
>>> # Initializing an ALBERT-xxlarge style configuration
|
||||
>>> albert_xxlarge_configuration = AlbertConfig()
|
||||
>>> from transformers import AlbertConfig, AlbertModel
|
||||
>>> # Initializing an ALBERT-xxlarge style configuration
|
||||
>>> albert_xxlarge_configuration = AlbertConfig()
|
||||
|
||||
>>> # Initializing an ALBERT-base style configuration
|
||||
>>> albert_base_configuration = AlbertConfig(
|
||||
... hidden_size=768,
|
||||
... num_attention_heads=12,
|
||||
... intermediate_size=3072,
|
||||
... )
|
||||
>>> # Initializing an ALBERT-base style configuration
|
||||
>>> albert_base_configuration = AlbertConfig(
|
||||
... hidden_size=768,
|
||||
... num_attention_heads=12,
|
||||
... intermediate_size=3072,
|
||||
... )
|
||||
|
||||
>>> # Initializing a model from the ALBERT-base style configuration
|
||||
>>> model = AlbertModel(albert_xxlarge_configuration)
|
||||
>>> # Initializing a model from the ALBERT-base style configuration
|
||||
>>> model = AlbertModel(albert_xxlarge_configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
|
||||
model_type = "albert"
|
||||
|
|
|
@@ -73,43 +73,112 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(

CONFIG_MAPPING = OrderedDict(
[
("retribert", RetriBertConfig,),
("t5", T5Config,),
("mobilebert", MobileBertConfig,),
("distilbert", DistilBertConfig,),
("albert", AlbertConfig,),
("camembert", CamembertConfig,),
("xlm-roberta", XLMRobertaConfig,),
(
"retribert",
RetriBertConfig,
),
(
"t5",
T5Config,
),
(
"mobilebert",
MobileBertConfig,
),
(
"distilbert",
DistilBertConfig,
),
(
"albert",
AlbertConfig,
),
(
"camembert",
CamembertConfig,
),
(
"xlm-roberta",
XLMRobertaConfig,
),
("pegasus", PegasusConfig),
("marian", MarianConfig,),
("mbart", MBartConfig,),
("bart", BartConfig,),
("reformer", ReformerConfig,),
("longformer", LongformerConfig,),
("roberta", RobertaConfig,),
("flaubert", FlaubertConfig,),
("bert", BertConfig,),
("openai-gpt", OpenAIGPTConfig,),
("gpt2", GPT2Config,),
("transfo-xl", TransfoXLConfig,),
("xlnet", XLNetConfig,),
("xlm", XLMConfig,),
("ctrl", CTRLConfig,),
("electra", ElectraConfig,),
("encoder-decoder", EncoderDecoderConfig,),
(
"marian",
MarianConfig,
),
(
"mbart",
MBartConfig,
),
(
"bart",
BartConfig,
),
(
"reformer",
ReformerConfig,
),
(
"longformer",
LongformerConfig,
),
(
"roberta",
RobertaConfig,
),
(
"flaubert",
FlaubertConfig,
),
(
"bert",
BertConfig,
),
(
"openai-gpt",
OpenAIGPTConfig,
),
(
"gpt2",
GPT2Config,
),
(
"transfo-xl",
TransfoXLConfig,
),
(
"xlnet",
XLNetConfig,
),
(
"xlm",
XLMConfig,
),
(
"ctrl",
CTRLConfig,
),
(
"electra",
ElectraConfig,
),
(
"encoder-decoder",
EncoderDecoderConfig,
),
]
)


class AutoConfig:
r"""
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.

The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
"""

def __init__(self):
@@ -102,7 +102,7 @@ BART_CONFIG_ARGS_DOC = r"""
@add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
class BartConfig(PretrainedConfig):
r"""
Configuration class for Bart. Parameters are renamed from the fairseq implementation
Configuration class for Bart. Parameters are renamed from the fairseq implementation
"""
model_type = "bart"

@@ -141,14 +141,14 @@ class BartConfig(PretrainedConfig):
**common_kwargs
):
r"""
:class:`~transformers.BartConfig` is the configuration class for `BartModel`.
:class:`~transformers.BartConfig` is the configuration class for `BartModel`.

Examples::
Examples::

>>> from transformers import BartConfig, BartModel
>>> from transformers import BartConfig, BartModel

>>> config = BartConfig.from_pretrained('facebook/bart-large')
>>> model = BartModel(config)
>>> config = BartConfig.from_pretrained('facebook/bart-large')
>>> model = BartModel(config)

"""
if "hidden_size" in common_kwargs:
@ -50,59 +50,59 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||
|
||||
class BertConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
|
||||
It is used to instantiate an BERT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
|
||||
It is used to instantiate an BERT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 30522):
|
||||
Vocabulary size of the BERT model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
|
||||
hidden_size (:obj:`int`, optional, defaults to 768):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
num_hidden_layers (:obj:`int`, optional, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (:obj:`int`, optional, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
intermediate_size (:obj:`int`, optional, defaults to 3072):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
|
||||
The non-linear activation function (function or string) in the encoder and pooler.
|
||||
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size (:obj:`int`, optional, defaults to 2):
|
||||
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
gradient_checkpointing (:obj:`bool`, optional, defaults to False):
|
||||
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 30522):
|
||||
Vocabulary size of the BERT model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
|
||||
hidden_size (:obj:`int`, optional, defaults to 768):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
num_hidden_layers (:obj:`int`, optional, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (:obj:`int`, optional, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
intermediate_size (:obj:`int`, optional, defaults to 3072):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
|
||||
The non-linear activation function (function or string) in the encoder and pooler.
|
||||
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size (:obj:`int`, optional, defaults to 2):
|
||||
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
gradient_checkpointing (:obj:`bool`, optional, defaults to False):
|
||||
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||
|
||||
Example::
|
||||
Example::
|
||||
|
||||
>>> from transformers import BertModel, BertConfig
|
||||
>>> from transformers import BertModel, BertConfig
|
||||
|
||||
>>> # Initializing a BERT bert-base-uncased style configuration
|
||||
>>> configuration = BertConfig()
|
||||
>>> # Initializing a BERT bert-base-uncased style configuration
|
||||
>>> configuration = BertConfig()
|
||||
|
||||
>>> # Initializing a model from the bert-base-uncased style configuration
|
||||
>>> model = BertModel(configuration)
|
||||
>>> # Initializing a model from the bert-base-uncased style configuration
|
||||
>>> model = BertModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "bert"
|
||||
|
||||
|
|
|
@ -25,55 +25,55 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.h
|
|||
|
||||
class CTRLConfig(PretrainedConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
|
||||
It is used to instantiate an CTRL model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
|
||||
It is used to instantiate an CTRL model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 246534):
|
||||
Vocabulary size of the CTRL model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
|
||||
n_positions (:obj:`int`, optional, defaults to 256):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||
n_ctx (:obj:`int`, optional, defaults to 256):
|
||||
Dimensionality of the causal mask (usually same as n_positions).
|
||||
n_embd (:obj:`int`, optional, defaults to 1280):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
dff (:obj:`int`, optional, defaults to 8192):
|
||||
Dimensionality of the inner dimension of the FFN.
|
||||
n_layer (:obj:`int`, optional, defaults to 48):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_head (:obj:`int`, optional, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
|
||||
The dropout ratio for the embeddings.
|
||||
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for the attention.
|
||||
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
|
||||
The epsilon to use in the layer normalization layers
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 246534):
|
||||
Vocabulary size of the CTRL model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
|
||||
n_positions (:obj:`int`, optional, defaults to 256):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||
n_ctx (:obj:`int`, optional, defaults to 256):
|
||||
Dimensionality of the causal mask (usually same as n_positions).
|
||||
n_embd (:obj:`int`, optional, defaults to 1280):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
dff (:obj:`int`, optional, defaults to 8192):
|
||||
Dimensionality of the inner dimension of the FFN.
|
||||
n_layer (:obj:`int`, optional, defaults to 48):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_head (:obj:`int`, optional, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
|
||||
The dropout ratio for the embeddings.
|
||||
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for the attention.
|
||||
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
|
||||
The epsilon to use in the layer normalization layers
|
||||
initializer_range (:obj:`float`, optional, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
|
||||
Example::
|
||||
Example::
|
||||
|
||||
>>> from transformers import CTRLModel, CTRLConfig
|
||||
>>> from transformers import CTRLModel, CTRLConfig
|
||||
|
||||
>>> # Initializing a CTRL configuration
|
||||
>>> configuration = CTRLConfig()
|
||||
>>> # Initializing a CTRL configuration
|
||||
>>> configuration = CTRLConfig()
|
||||
|
||||
>>> # Initializing a model from the configuration
|
||||
>>> model = CTRLModel(configuration)
|
||||
>>> # Initializing a model from the configuration
|
||||
>>> model = CTRLModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
|
||||
model_type = "ctrl"
|
||||
|
|
|
@ -33,61 +33,61 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||
|
||||
class DistilBertConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
|
||||
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
|
||||
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
|
||||
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
|
||||
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
|
||||
for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, optional, defaults to 30522):
|
||||
Vocabulary size of the DistilBERT model. Defines the different tokens that
|
||||
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
|
||||
max_position_embeddings (:obj:`int`, optional, defaults to 512):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
|
||||
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
|
||||
Whether to use sinusoidal positional embeddings.
|
||||
n_layers (:obj:`int`, optional, defaults to 6):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_heads (:obj:`int`, optional, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
dim (:obj:`int`, optional, defaults to 768):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
hidden_dim (:obj:`int`, optional, defaults to 3072):
|
||||
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
dropout (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||
The dropout ratio for the attention probabilities.
|
||||
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
|
||||
The non-linear activation function (function or string) in the encoder and pooler.
|
||||
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||
    Args:
        vocab_size (:obj:`int`, optional, defaults to 30522):
            Vocabulary size of the DistilBERT model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
        max_position_embeddings (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
            Whether to use sinusoidal positional embeddings.
        n_layers (:obj:`int`, optional, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        n_heads (:obj:`int`, optional, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        dim (:obj:`int`, optional, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        hidden_dim (:obj:`int`, optional, defaults to 3072):
            The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        qa_dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probability used in the question answering model
            :class:`~transformers.DistilBertForQuestionAnswering`.
        seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
            The dropout probability used in the sequence classification and the multiple choice model
            :class:`~transformers.DistilBertForSequenceClassification`.

    Example::

        >>> from transformers import DistilBertModel, DistilBertConfig

        >>> # Initializing a DistilBERT configuration
        >>> configuration = DistilBertConfig()

        >>> # Initializing a model from the configuration
        >>> model = DistilBertModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "distilbert"

@@ -29,16 +29,16 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class DPRConfig(BertConfig):
    r"""
    :class:`~transformers.DPRConfig` is the configuration class to store the configuration of a `DPRModel`.

    This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
    It is used to instantiate the components of the DPR model.

    Args:
        projection_dim (:obj:`int`, optional, defaults to 0):
            Dimension of the projection for the context and question encoders.
            If it is set to zero (default), then no projection is done.
    """
    model_type = "dpr"
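Unlike the other configuration docstrings touched by this commit, `DPRConfig` carries no usage snippet. A minimal sketch in the same doctest style, assuming `DPRConfig` and `DPRContextEncoder` are exported from `transformers` at this revision (the `projection_dim=128` value is purely illustrative):

    >>> from transformers import DPRConfig, DPRContextEncoder

    >>> # Initializing a DPR configuration with a 128-dimensional projection head (0 disables the projection)
    >>> configuration = DPRConfig(projection_dim=128)

    >>> # Initializing a context encoder from the configuration
    >>> model = DPRContextEncoder(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
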
@@ -33,82 +33,82 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class ElectraConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
    It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
    architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 30522):
            Vocabulary size of the ELECTRA model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
        embedding_size (:obj:`int`, optional, defaults to 128):
            Dimensionality of the token embeddings.
        hidden_size (:obj:`int`, optional, defaults to 256):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, optional, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, optional, defaults to 4):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, optional, defaults to 1024):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, optional, defaults to 2):
            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        summary_type (:obj:`string`, optional, defaults to "first"):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.ElectraForMultipleChoice`.
            Is one of the following options:

            - 'last' => take the last token hidden state (like XLNet)
            - 'first' => take the first token hidden state (like Bert)
            - 'mean' => take the mean of all tokens hidden states
            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
            - 'attn' => Not implemented now, use multi-head attention
        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.ElectraForMultipleChoice`.
            Add a projection after the vector extraction.
        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.ElectraForMultipleChoice`.
            'gelu' => add a gelu activation to the output, Other => no activation.
        summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.ElectraForMultipleChoice`.
            Add a dropout after the projection and activation.

    Example::

        >>> from transformers import ElectraModel, ElectraConfig

        >>> # Initializing an ELECTRA electra-base-uncased style configuration
        >>> configuration = ElectraConfig()

        >>> # Initializing a model from the electra-base-uncased style configuration
        >>> model = ElectraModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "electra"

@@ -25,47 +25,47 @@ logger = logging.get_logger(__name__)

class EncoderDecoderConfig(PretrainedConfig):
    r"""
    :class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of an `EncoderDecoderModel`.

    It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
    and can be used to control the model outputs.
    See the documentation for :class:`~transformers.PretrainedConfig` for more information.

    Args:
        kwargs (`optional`):
            Remaining dictionary of keyword arguments. Notably:
            encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
                An instance of a configuration object that defines the encoder config.
            decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
                An instance of a configuration object that defines the decoder config.

    Example::

        >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

        >>> # Initializing a BERT bert-base-uncased style configuration
        >>> config_encoder = BertConfig()
        >>> config_decoder = BertConfig()

        >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

        >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
        >>> model = EncoderDecoderModel(config=config)

        >>> # Accessing the model configuration
        >>> config_encoder = model.config.encoder
        >>> config_decoder = model.config.decoder
        >>> # set decoder config to causal lm
        >>> config_decoder.is_decoder = True
        >>> config_decoder.add_cross_attention = True

        >>> # Saving the model, including its configuration
        >>> model.save_pretrained('my-model')

        >>> # loading model and config from pretrained folder
        >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
        >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
    """
    model_type = "encoder_decoder"

@@ -30,121 +30,120 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class FlaubertConfig(XLMConfig):
    """
    Configuration class to store the configuration of a `FlaubertModel`.
    This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
    It is used to instantiate an XLM model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to apply the layer normalization before or after the feed forward layer following the
            attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
        layerdrop (:obj:`float`, `optional`, defaults to 0.0):
            Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
            with Structured Dropout. ICLR 2020)
        vocab_size (:obj:`int`, optional, defaults to 30145):
            Vocabulary size of the Flaubert model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
        emb_dim (:obj:`int`, optional, defaults to 2048):
            Dimensionality of the encoder layers and the pooler layer.
        n_layer (:obj:`int`, optional, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (:obj:`int`, optional, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for the attention mechanism.
        gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
            The non-linear activation function (function or string) in the
            encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
        sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
            Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
        causal (:obj:`boolean`, optional, defaults to :obj:`False`):
            Set this to `True` for the model to behave in a causal manner.
            Causal models use a triangular attention mask in order to only attend to the left-side context instead
            of a bidirectional context.
        asm (:obj:`boolean`, optional, defaults to :obj:`False`):
            Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
            layer.
        n_langs (:obj:`int`, optional, defaults to 1):
            The number of languages the model handles. Set to 1 for monolingual models.
        use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether to use language embeddings. Some models use additional language embeddings, see
            `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
            for information on how to use them.
        max_position_embeddings (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might
            ever be used with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
            The standard deviation of the truncated_normal_initializer for
            initializing the embedding matrices.
        init_std (:obj:`int`, optional, defaults to 50257):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices except the embedding matrices.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        bos_index (:obj:`int`, optional, defaults to 0):
            The index of the beginning of sentence token in the vocabulary.
        eos_index (:obj:`int`, optional, defaults to 1):
            The index of the end of sentence token in the vocabulary.
        pad_index (:obj:`int`, optional, defaults to 2):
            The index of the padding token in the vocabulary.
        unk_index (:obj:`int`, optional, defaults to 3):
            The index of the unknown token in the vocabulary.
        mask_index (:obj:`int`, optional, defaults to 5):
            The index of the masking token in the vocabulary.
        is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
        summary_type (:obj:`string`, optional, defaults to "first"):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.XLMForSequenceClassification`.
            Is one of the following options:

            - 'last' => take the last token hidden state (like XLNet)
            - 'first' => take the first token hidden state (like Bert)
            - 'mean' => take the mean of all tokens hidden states
            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
            - 'attn' => Not implemented now, use multi-head attention
        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.XLMForSequenceClassification`.
            Add a projection after the vector extraction.
        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.XLMForSequenceClassification`.
            'tanh' => add a tanh activation to the output, Other => no activation.
        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.XLMForSequenceClassification`.
            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
        summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.XLMForSequenceClassification`.
            Add a dropout before the projection and activation.
        start_n_top (:obj:`int`, optional, defaults to 5):
            Used in the SQuAD evaluation script for XLM and XLNet.
        end_n_top (:obj:`int`, optional, defaults to 5):
            Used in the SQuAD evaluation script for XLM and XLNet.
        mask_token_id (:obj:`int`, optional, defaults to 0):
            Model agnostic parameter to identify masked tokens when generating text in an MLM context.
        lang_id (:obj:`int`, optional, defaults to 1):
            The ID of the language used by the model. This parameter is used when generating
            text in a given language.
    """

    model_type = "flaubert"

    def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
        """Constructs FlaubertConfig."""
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
        self.layerdrop = layerdrop
        self.pre_norm = pre_norm
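`FlaubertConfig` documents the two Flaubert-specific knobs (`pre_norm`, `layerdrop`) but ships without a doctest. A minimal sketch in the same style, assuming `FlaubertConfig` and `FlaubertModel` are exported from `transformers` at this revision (the values passed here are illustrative only):

    >>> from transformers import FlaubertConfig, FlaubertModel

    >>> # Initializing a Flaubert configuration with pre-norm layers and a 10% layerdrop
    >>> configuration = FlaubertConfig(pre_norm=True, layerdrop=0.1)

    >>> # Initializing a model from the configuration
    >>> model = FlaubertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
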
@@ -32,84 +32,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class GPT2Config(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
    It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 50257):
            Vocabulary size of the GPT-2 model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
        n_positions (:obj:`int`, optional, defaults to 1024):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        n_ctx (:obj:`int`, optional, defaults to 1024):
            Dimensionality of the causal mask (usually same as n_positions).
        n_embd (:obj:`int`, optional, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (:obj:`int`, optional, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (:obj:`int`, optional, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_inner (:obj:`int`, optional, defaults to None):
            Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd.
        activation_function (:obj:`str`, optional, defaults to 'gelu'):
            Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
        resid_pdrop (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (:obj:`int`, optional, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        summary_type (:obj:`string`, optional, defaults to "cls_index"):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.GPT2DoubleHeadsModel`.
            Is one of the following options:

            - 'last' => take the last token hidden state (like XLNet)
            - 'first' => take the first token hidden state (like Bert)
            - 'mean' => take the mean of all tokens hidden states
            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
            - 'attn' => Not implemented now, use multi-head attention
        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.GPT2DoubleHeadsModel`.
            Add a projection after the vector extraction.
        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.GPT2DoubleHeadsModel`.
            'tanh' => add a tanh activation to the output, Other => no activation.
        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.GPT2DoubleHeadsModel`.
            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
        summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.GPT2DoubleHeadsModel`.
            Add a dropout before the projection and activation.

    Example::

        >>> from transformers import GPT2Model, GPT2Config

        >>> # Initializing a GPT2 configuration
        >>> configuration = GPT2Config()

        >>> # Initializing a model from the configuration
        >>> model = GPT2Model(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "gpt2"
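According to the descriptions above, the `summary_*` arguments are only consumed by the multiple choice head in :class:`~transformers.GPT2DoubleHeadsModel`. A short sketch of how they might be set together, assuming `GPT2Config` accepts them as keyword arguments at this revision (the specific values are illustrative, not recommended settings):

    >>> from transformers import GPT2Config, GPT2DoubleHeadsModel

    >>> # A multiple-choice head that takes the hidden state at cls_index, projects it to num_labels, and applies a tanh
    >>> configuration = GPT2Config(summary_type="cls_index", summary_use_proj=True,
    ...                            summary_activation="tanh", summary_proj_to_labels=True, num_labels=1)

    >>> model = GPT2DoubleHeadsModel(configuration)
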
@@ -33,32 +33,32 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class LongformerConfig(RobertaConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
    It is used to instantiate a Longformer model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length of 4,096.

    The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
    It reuses the same defaults. Please check the parent class for more information.

    Args:
        attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
            Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
            To specify a different window size for each layer, use a :obj:`List[int]` where
            ``len(attention_window) == num_hidden_layers``.

    Example::

        >>> from transformers import LongformerConfig, LongformerModel

        >>> # Initializing a Longformer configuration
        >>> configuration = LongformerConfig()

        >>> # Initializing a model from the configuration
        >>> model = LongformerModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "longformer"
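`attention_window` is the one Longformer-specific argument, and the note above requires a per-layer list to satisfy ``len(attention_window) == num_hidden_layers``. A minimal sketch of that case in the same doctest style (the window sizes, and the explicit `num_hidden_layers=12` matching RoBERTa-base's depth, are illustrative only):

    >>> from transformers import LongformerConfig, LongformerModel

    >>> # One window size per layer: narrower windows in the lower half, wider ones in the upper half
    >>> configuration = LongformerConfig(attention_window=[64] * 6 + [256] * 6, num_hidden_layers=12)

    >>> # Initializing a model from the per-layer configuration
    >>> model = LongformerModel(configuration)
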
@@ -25,79 +25,79 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class MobileBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
    It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 30522):
            Vocabulary size of the MobileBERT model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
        hidden_size (:obj:`int`, optional, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, optional, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, optional, defaults to 4):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, optional, defaults to 512):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, optional, defaults to 2):
            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        pad_token_id (:obj:`int`, optional, defaults to 0):
            The ID of the token in the word embedding to use as padding.
        embedding_size (:obj:`int`, optional, defaults to 128):
            The dimension of the word embedding vectors.
        trigram_input (:obj:`bool`, optional, defaults to True):
            Use a convolution of trigrams as input.
        use_bottleneck (:obj:`bool`, optional, defaults to True):
            Whether to use a bottleneck in BERT.
        intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
            Size of the bottleneck layer output.
        use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
            Whether to use attention inputs from the bottleneck transformation.
        key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
            Whether to use the same linear transformation for query & key in the bottleneck.
        num_feedforward_networks (:obj:`int`, optional, defaults to 4):
            Number of FFNs in a block.
        normalization_type (:obj:`str`, optional, defaults to "no_norm"):
            The normalization type in BERT.

    Example:

        >>> from transformers import MobileBertModel, MobileBertConfig

        >>> # Initializing a MobileBERT configuration
        >>> configuration = MobileBertConfig()

        >>> # Initializing a model from the configuration above
        >>> model = MobileBertModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config

    Attributes:
        pretrained_config_archive_map (Dict[str, str]):
            A dictionary containing all the available pre-trained checkpoints.
    """
    pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "mobilebert"

@@ -28,84 +28,84 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class OpenAIGPTConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
    It is used to instantiate a GPT model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 40478):
            Vocabulary size of the GPT model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
        n_positions (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        n_ctx (:obj:`int`, optional, defaults to 512):
            Dimensionality of the causal mask (usually same as n_positions).
        n_embd (:obj:`int`, optional, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (:obj:`int`, optional, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (:obj:`int`, optional, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        resid_pdrop (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (:obj:`int`, optional, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether special tokens should be predicted when the model has a language modeling head.
        summary_type (:obj:`string`, optional, defaults to "cls_index"):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
            Is one of the following options:

            - 'last' => take the last token hidden state (like XLNet)
            - 'first' => take the first token hidden state (like Bert)
            - 'mean' => take the mean of all tokens hidden states
            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
            - 'attn' => Not implemented now, use multi-head attention
        summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
            Add a projection after the vector extraction.
        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
            'tanh' => add a tanh activation to the output, Other => no activation.
        summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
            If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
        summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
            Argument used when doing sequence summary. Used for the multiple choice head in
            :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
            Add a dropout before the projection and activation.
|
||||
Argument used when doing sequence summary. Used in for the multiple choice head in
|
||||
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
||||
Add a projection after the vector extraction
|
||||
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
|
||||
Argument used when doing sequence summary. Used in for the multiple choice head in
|
||||
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
||||
'tanh' => add a tanh activation to the output, Other => no activation.
|
||||
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
|
||||
Argument used when doing sequence summary. Used in for the multiple choice head in
|
||||
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
||||
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
|
||||
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
|
||||
Argument used when doing sequence summary. Used in for the multiple choice head in
|
||||
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
||||
Add a dropout before the projection and activation
|
||||
|
||||
Example::
|
||||
Example::
|
||||
|
||||
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
|
||||
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
|
||||
|
||||
>>> # Initializing a GPT configuration
|
||||
>>> configuration = OpenAIGPTConfig()
|
||||
>>> # Initializing a GPT configuration
|
||||
>>> configuration = OpenAIGPTConfig()
|
||||
|
||||
>>> # Initializing a model from the configuration
|
||||
>>> model = OpenAIGPTModel(configuration)
|
||||
>>> # Initializing a model from the configuration
|
||||
>>> model = OpenAIGPTModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
|
||||
model_type = "openai-gpt"
|
||||
|
|
|
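A minimal usage sketch for the sequence-summary options above (an editorial example, not part of this commit; the values shown simply restate the documented defaults)::

    from transformers import OpenAIGPTConfig, OpenAIGPTDoubleHeadsModel

    # Illustrative settings for the multiple choice head; they match the documented defaults.
    config = OpenAIGPTConfig(
        summary_type="cls_index",   # pool the hidden state at the classification token position
        summary_use_proj=True,      # add a projection after the vector extraction
        summary_first_dropout=0.1,  # dropout before the projection and activation
    )
    model = OpenAIGPTDoubleHeadsModel(config)
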
@@ -92,8 +92,8 @@ expected_alpha = {

@add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
class PegasusConfig(BartConfig):
    r"""
    :class:`~transformers.PegasusConfig` is the configuration class to store the configuration of a
    `PegasusModel`.
    """

    model_type = "pegasus"
    # The implementation of the config object is in BartConfig

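Since all of the fields come from :class:`~transformers.BartConfig`, a minimal sketch of instantiating it (illustrative, not part of this commit)::

    from transformers import PegasusConfig

    # PegasusConfig reuses BartConfig's fields; instantiating it with no arguments
    # gives the BartConfig defaults under the "pegasus" model type.
    config = PegasusConfig()
    print(config.model_type)  # "pegasus"
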
@@ -29,105 +29,105 @@ REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class ReformerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`.
    It is used to instantiate a Reformer model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        attention_head_size (:obj:`int`, optional, defaults to 64):
            Dimensionality of the projected key, query and value vectors.
        attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]):
            List of attention layer types in ascending order. Each entry can be chosen between an
            LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
            For more information on the LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__.
            For more information on the LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__.
        axial_pos_embds (:obj:`bool`, optional, defaults to True):
            If `True`, use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
        axial_norm_std (:obj:`float`, optional, defaults to 1.0):
            The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
        axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`):
            The position dims of the axial position encodings.
            During training the product of the position dims has to equal the sequence length.
            For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
        axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`):
            The embedding dims of the axial position encodings.
            The sum of the embedding dims has to equal the hidden size.
            For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
        chunk_size_lm_head (:obj:`int`, optional, defaults to 0):
            The chunk size of the final language model feed forward head layer.
            A chunk size of 0 means that the feed forward layer is not chunked.
            A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
            For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__.
        eos_token_id (:obj:`int`, optional, defaults to 2):
            The token id for the <EOS> token.
        feed_forward_size (:obj:`int`, optional, defaults to 512):
            Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block.
        hash_seed (:obj:`int`, optional, defaults to `None`):
            Seed that can be used to make locality sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposes. For evaluation and training purposes `hash_seed` should be left as `None` to ensure fully random rotations in the locality sensitive hashing scheme.
        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
            The non-linear activation function (function or string) in the feed forward layer in the residual attention block.
            If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported.
        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        hidden_size (:obj:`int`, optional, defaults to 256):
            Dimensionality of the output hidden states of the residual attention blocks.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        is_decoder (:obj:`bool`, optional, defaults to False):
            If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
            When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        local_chunk_length (:obj:`int`, optional, defaults to 64):
            Length of the chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
        local_num_chunks_before (:obj:`int`, optional, defaults to 1):
            Number of previous neighbouring chunks to attend to in the LocalSelfAttention layer in addition to itself.
        local_num_chunks_after (:obj:`int`, optional, defaults to 0):
            Number of following neighbouring chunks to attend to in the LocalSelfAttention layer in addition to itself.
        local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention probabilities in LocalSelfAttention.
        lsh_attn_chunk_length (:obj:`int`, optional, defaults to 64):
            Length of the chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
        lsh_num_chunks_before (:obj:`int`, optional, defaults to 1):
            Number of previous neighbouring chunks to attend to in the LSHSelfAttention layer in addition to itself.
        lsh_num_chunks_after (:obj:`int`, optional, defaults to 0):
            Number of following neighbouring chunks to attend to in the LSHSelfAttention layer in addition to itself.
        lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention probabilities in LSHSelfAttention.
        max_position_embeddings (:obj:`int`, optional, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        num_attention_heads (:obj:`int`, optional, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `None`):
            Number of buckets the key/query vectors can be "hashed into" using the locality sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`.
            The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors.
            The number of buckets (or the product of the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is set to `None`, a good value for `num_buckets` is calculated on the fly.
        num_hashes (:obj:`int`, optional, defaults to 1):
            Number of hashing rounds (e.g. number of random rotations) in the locality sensitive hashing scheme.
            The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes.
        pad_token_id (:obj:`int`, optional, defaults to 0):
            The token id for the <PAD> token.
        vocab_size (:obj:`int`, optional, defaults to 320):
            Vocabulary size of the Reformer model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ReformerModel`.

    Example::

        >>> from transformers import ReformerModel, ReformerConfig

        >>> # Initializing a Reformer configuration
        >>> configuration = ReformerConfig()

        >>> # Initializing a Reformer model
        >>> model = ReformerModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "reformer"

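To make the axial-position constraints above concrete, a small sketch (illustrative values mirroring the documented defaults, not part of this commit)::

    from transformers import ReformerConfig

    # The product of axial_pos_shape must equal the training sequence length,
    # and the axial_pos_embds_dim entries must sum to hidden_size.
    config = ReformerConfig(
        hidden_size=256,
        axial_pos_embds=True,
        axial_pos_shape=[64, 64],       # 64 * 64 == 4096 == max_position_embeddings
        axial_pos_embds_dim=[64, 192],  # 64 + 192 == 256 == hidden_size
        max_position_embeddings=4096,
        attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"],
    )
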
@@ -28,47 +28,47 @@ RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class RetriBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.RetriBertModel`.
    It is used to instantiate a RetriBertModel model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.


    Args:
        vocab_size (:obj:`int`, optional, defaults to 30522):
            Vocabulary size of the BERT model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
        hidden_size (:obj:`int`, optional, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, optional, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, optional, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, optional, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, optional, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, optional, defaults to 2):
            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
        initializer_range (:obj:`float`, optional, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        share_encoders (:obj:`bool`, optional, defaults to True):
            Whether to use the same BERT-type encoder for the queries and documents.
        projection_dim (:obj:`int`, optional, defaults to 128):
            Final dimension of the query and document representations after projection.

    """
    model_type = "retribert"

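A minimal sketch of the two RetriBert-specific knobs above (illustrative, not part of this commit)::

    from transformers import RetriBertConfig

    # A shared BERT-style encoder for queries and documents, projected down
    # to a 128-dimensional representation (the documented defaults).
    config = RetriBertConfig(share_encoders=True, projection_dim=128)
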
@@ -33,34 +33,33 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class RobertaConfig(BertConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
    It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    The :class:`~transformers.RobertaConfig` class directly inherits from :class:`~transformers.BertConfig`.
    It reuses the same defaults. Please check the parent class for more information.

    Example::

        >>> from transformers import RobertaConfig, RobertaModel

        >>> # Initializing a RoBERTa configuration
        >>> configuration = RobertaConfig()

        >>> # Initializing a model from the configuration
        >>> model = RobertaModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "roberta"

    def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
        """Constructs RobertaConfig."""
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

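The constructor above only overrides the special token ids relative to BERT; a quick check (illustrative, not part of this commit)::

    from transformers import RobertaConfig

    # RobertaConfig reuses BertConfig's defaults but sets RoBERTa's token ids,
    # exactly as in the constructor shown in the hunk above.
    config = RobertaConfig()
    print(config.pad_token_id, config.bos_token_id, config.eos_token_id)  # 1 0 2
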
@@ -31,33 +31,33 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class T5Config(PretrainedConfig):
    r"""
    :class:`~transformers.T5Config` is the configuration class to store the configuration of a
    `T5Model`.


    Arguments:
        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
        d_model: Size of the encoder layers and the pooler layer. `d_model` can also be accessed via the property `hidden_size`.
        num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`.
        d_kv: Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
        d_ff: Size of the intermediate feed forward layer in each `T5Block`.
        num_heads: Number of attention heads for each attention layer in
            the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`.
        intermediate_size: The size of the "intermediate" (i.e., feed-forward)
            layer in the Transformer encoder.
        hidden_act: The non-linear activation function (function or string) in the
            encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob: The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob: The dropout ratio for the attention
            probabilities.
        n_positions: The maximum sequence length that this model might
            ever be used with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings`.
        type_vocab_size: The vocabulary size of the `token_type_ids` passed into
            `T5Model`.
        initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
        layer_norm_eps: The epsilon used by LayerNorm.
    """
    model_type = "t5"

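A minimal sketch of building a model from this configuration (illustrative, not part of this commit; it mirrors the pattern used by the other configuration docstrings in this diff)::

    from transformers import T5Config, T5Model

    # Default configuration; note that d_kv should stay equal to
    # d_model // num_heads, as stated in the Arguments section above.
    configuration = T5Config()
    model = T5Model(configuration)
    configuration = model.config
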
@@ -80,7 +80,10 @@ class T5Config(PretrainedConfig):
        **kwargs
    ):
        super().__init__(
            pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, **kwargs,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.n_positions = n_positions

@@ -31,84 +31,84 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class TransfoXLConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
    It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 267735):
            Vocabulary size of the Transformer XL model. Defines the different tokens that
            can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
        cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
            Cutoffs for the adaptive softmax.
        d_model (:obj:`int`, optional, defaults to 1024):
            Dimensionality of the model's hidden states.
        d_embed (:obj:`int`, optional, defaults to 1024):
            Dimensionality of the embeddings.
        n_head (:obj:`int`, optional, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        d_head (:obj:`int`, optional, defaults to 64):
            Dimensionality of the model's heads.
        d_inner (:obj:`int`, optional, defaults to 4096):
            Inner dimension of the feed-forward layers.
        div_val (:obj:`int`, optional, defaults to 4):
            Divisor value for the adaptive input and softmax.
        pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
            Whether to apply LayerNorm to the input instead of the output.
        n_layer (:obj:`int`, optional, defaults to 18):
            Number of hidden layers in the Transformer encoder.
        tgt_len (:obj:`int`, optional, defaults to 128):
            Number of tokens to predict.
        ext_len (:obj:`int`, optional, defaults to 0):
            Length of the extended context.
        mem_len (:obj:`int`, optional, defaults to 1600):
            Length of the retained previous hidden states (the memory).
        clamp_len (:obj:`int`, optional, defaults to 1000):
            Use the same positional embeddings after clamp_len.
        same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether to use the same attention length for all tokens.
        proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
            True to share all but the first projections, False not to share.
        attn_type (:obj:`int`, optional, defaults to 0):
            Attention type. 0 for Transformer-XL, 1 for Shaw et al., 2 for Vaswani et al., 3 for Al Rfou et al.
        sample_softmax (:obj:`int`, optional, defaults to -1):
            Number of samples in the sampled softmax.
        adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether to use adaptive softmax.
        dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        dropatt (:obj:`float`, optional, defaults to 0):
            The dropout ratio for the attention probabilities.
        untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether to untie the relative position biases.
        init (:obj:`string`, optional, defaults to `normal`):
            Parameter initializer to use.
        init_range (:obj:`float`, optional, defaults to 0.01):
            Parameters initialized by U(-init_range, init_range).
        proj_init_std (:obj:`float`, optional, defaults to 0.01):
            Projection parameters initialized by N(0, proj_init_std).
        init_std (:obj:`float`, optional, defaults to 0.02):
            Parameters initialized by N(0, init_std).
        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.

    Example::

        >>> from transformers import TransfoXLConfig, TransfoXLModel

        >>> # Initializing a Transformer XL configuration
        >>> configuration = TransfoXLConfig()

        >>> # Initializing a model from the configuration
        >>> model = TransfoXLModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """

    model_type = "transfo-xl"

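A short sketch of the adaptive embedding/softmax knobs above (illustrative values taken from the documented defaults, not part of this commit)::

    from transformers import TransfoXLConfig

    # The cutoffs split the large vocabulary into frequency clusters; div_val
    # shrinks the embedding size for each successive cluster.
    config = TransfoXLConfig(cutoffs=[20000, 40000, 200000], div_val=4, adaptive=True)
    print(config.cutoffs, config.div_val)
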
@@ -29,116 +29,116 @@ logger = logging.get_logger(__name__)


class PretrainedConfig(object):
    r"""Base class for all configuration classes.
    Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving
    configurations.

    Note:
        A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
        initialize a model does **not** load the model weights.
        It only affects the model's configuration.

    Class attributes (overridden by derived classes)
        - **model_type** (:obj:`str`): An identifier for the model type, serialized into the JSON file, and used to
          recreate the correct object in :class:`~transformers.AutoConfig`.

    Args:
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the model should return all hidden-states.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the model should return all attentions.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a
            plain tuple.
        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether the model is used as an encoder/decoder or not.
        is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether the model is used as a decoder or not (in which case it's used as an encoder).
        add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether cross-attention layers should be added to the model. Note, this option is only relevant for models that can be used as decoder models within the :class:`~transformers.EncoderDecoderModel` class, which consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``.
        tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder and decoder model to have the exact same parameter names.
        prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`):
            Pruned heads of the model. The keys are the selected layer indices and the associated values, the list
            of heads to prune in said layer.

            For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer
            2.
        xla_device (:obj:`bool`, `optional`):
            A flag to indicate whether a TPU is available or not.
        chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`):
            The chunk size of all feed forward layers in the residual attention blocks.
            A chunk size of :obj:`0` means that the feed forward layer is not chunked.
            A chunk size of n means that the feed forward layer processes :obj:`n` < sequence_length embeddings at a time.
            For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__.

    Parameters for sequence generation
        - **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by
          default in the :obj:`generate` method of the model.
        - **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by
          default in the :obj:`generate` method of the model.
        - **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in
          the :obj:`generate` method of the model. Whether or not to use sampling; use greedy decoding otherwise.
        - **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by
          default in the :obj:`generate` method of the model. Whether to stop the beam search when at least
          ``num_beams`` sentences are finished per batch or not.
        - **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be
          used by default in the :obj:`generate` method of the model. 1 means no beam search.
        - **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to modulate the next token
          probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly
          positive.
        - **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to
          keep for top-k-filtering that will be used by default in the :obj:`generate` method of the model.
        - **top_p** (:obj:`float`, `optional`, defaults to 1) -- Value that will be used by default in the
          :obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens
          with probabilities that add up to ``top_p`` or higher are kept for generation.
        - **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty
          that will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty.
        - **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that
          will be used by default in the :obj:`generate` method of the model.
        - **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default
          in the :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of
          that size can only occur once.
        - **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be
          generated that will be used by default in the :obj:`generate` method of the model. In order to get the
          tokens of the words that should not appear in the generated text, use
          :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
        - **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed
          returned sequences for each element in the batch that will be used by default in the :obj:`generate`
          method of the model.

    Parameters for fine-tuning tasks
        - **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the
          model pretrained weights.
        - **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be
          used when converting from an original (TensorFlow or PyTorch) checkpoint.
        - **id2label** (:obj:`List[str]`, `optional`) -- A map from index (for instance prediction index, or target
          index) to label.
        - **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model.
        - **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model,
          typically for a classification task.
        - **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for
          the current task.

Parameters linked to the tokenizer
|
||||
- **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each
|
||||
text before calling the model.
|
||||
- **bos_token_id** (:obj:`int`, `optional`)) -- The id of the `beginning-of-stream` token.
|
||||
- **pad_token_id** (:obj:`int`, `optional`)) -- The id of the `padding` token.
|
||||
- **eos_token_id** (:obj:`int`, `optional`)) -- The id of the `end-of-stream` token.
|
||||
- **decoder_start_token_id** (:obj:`int`, `optional`)) -- If an encoder-decoder model starts decoding with
|
||||
a different token than `bos`, the id of that token.
|
||||
|
||||
PyTorch specific parameters
|
||||
- **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be
|
||||
used with Torchscript.
|
||||
- **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has a output word embedding layer.
|
||||
PyTorch specific parameters
|
||||
- **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be
|
||||
used with Torchscript.
|
||||
- **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has a output word embedding layer.
|
||||
|
||||
TensorFlow specific parameters
|
||||
- **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should
|
||||
use BFloat16 scalars (only used by some TensorFlow models).
|
||||
TensorFlow specific parameters
|
||||
- **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should
|
||||
use BFloat16 scalars (only used by some TensorFlow models).
|
||||
"""
|
||||
model_type: str = ""
|
||||
|
||||
|
|
|
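All of the ``generate`` defaults above are plain attributes on the configuration object, so they can be set once and are then picked up by every later ``generate`` call. A minimal sketch of that behaviour (not taken from this commit; the model shortcut and the chosen values are illustrative only)::

    >>> from transformers import GPT2LMHeadModel, GPT2Tokenizer
    >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    >>> model = GPT2LMHeadModel.from_pretrained("gpt2")
    >>> model.config.num_beams = 4                 # stored once on the config ...
    >>> model.config.max_length = 40
    >>> model.config.no_repeat_ngram_size = 2
    >>> input_ids = tokenizer.encode("Black reformats Python code", return_tensors="pt")
    >>> output_ids = model.generate(input_ids)     # ... and picked up here without passing arguments
    >>> print(tokenizer.decode(output_ids[0], skip_special_tokens=True))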
@@ -36,120 +36,120 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class XLMConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.

Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.

Args:
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the XLM model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
The non-linear activation function (function or string) in the
encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`int`, optional, defaults to 50257):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:

- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.

Example::

>>> from transformers import XLMConfig, XLMModel

>>> # Initializing a XLM configuration
>>> configuration = XLMConfig()

>>> # Initializing a model from the configuration
>>> model = XLMModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
"""

model_type = "xlm"

@@ -191,8 +191,7 @@ class XLMConfig(PretrainedConfig):
bos_token_id=0,
**kwargs
):
"""Constructs XLMConfig.
"""
"""Constructs XLMConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.vocab_size = vocab_size
self.emb_dim = emb_dim
@@ -31,104 +31,104 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {

class XLNetConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
It is used to instantiate an XLNet model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.

Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.

Args:
vocab_size (:obj:`int`, optional, defaults to 32000):
Vocabulary size of the XLNet model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_inner (:obj:`int`, optional, defaults to 4096):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
ff_activation (:obj:`string`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Untie relative position biases
attn_type (:obj:`string`, optional, defaults to "bi"):
The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens to cache. The key/value pairs that have already been pre-computed
in a previous forward pass won't be re-computed. See the
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
for more information.
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens in the current batch to be cached and reused in the future.
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use bidirectional input pipeline. Usually set to `True` during
pretraining and `False` during finetuning.
clamp_len (:obj:`int`, optional, defaults to -1):
Clamp all relative distances larger than clamp_len.
Setting this attribute to -1 means no clamping.
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use the same attention length for each token.
summary_type (:obj:`string`, optional, defaults to "last"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Is one of the following options:

- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a dropout after the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last pre-computed hidden states.

.. note::
This flag behaves differently from other models: it just controls the inference behavior, during
training the model always uses ``use_cache=True``.

Example::

>>> from transformers import XLNetConfig, XLNetModel

>>> # Initializing a XLNet configuration
>>> configuration = XLNetConfig()

>>> # Initializing a model from the configuration
>>> model = XLNetModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
"""

model_type = "xlnet"

@@ -162,8 +162,7 @@ class XLNetConfig(PretrainedConfig):
eos_token_id=2,
**kwargs
):
"""Constructs XLNetConfig.
"""
"""Constructs XLNetConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.d_model = d_model
@@ -27,5 +27,6 @@ if __name__ == "__main__":
checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
convert_dialogpt_checkpoint(
checkpoint_path, pytorch_dump_folder_path,
checkpoint_path,
pytorch_dump_folder_path,
)

@@ -38,24 +38,39 @@ class OnnxConverterArgumentParser(ArgumentParser):
super().__init__("ONNX Converter")

self.add_argument(
"--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction",
"--pipeline",
type=str,
choices=SUPPORTED_PIPELINES,
default="feature-extraction",
)
self.add_argument(
"--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)",
"--model",
type=str,
required=True,
help="Model's id or path (ex: bert-base-cased)",
)
self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)")
self.add_argument(
"--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model",
"--framework",
type=str,
choices=["pt", "tf"],
help="Framework for loading the model",
)
self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
self.add_argument(
"--check-loading", action="store_true", help="Check ONNX is able to load the model",
"--check-loading",
action="store_true",
help="Check ONNX is able to load the model",
)
self.add_argument(
"--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb",
"--use-external-format",
action="store_true",
help="Allow exporting model >= than 2Gb",
)
self.add_argument(
"--quantize", action="store_true", help="Quantize the neural network to be run with int8",
"--quantize",
action="store_true",
help="Quantize the neural network to be run with int8",
)
self.add_argument("output")

@@ -376,7 +391,10 @@ def quantize(onnx_model_path: Path) -> Path:
)

quantized_model = quantize(
model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True,
model=onnx_model,
quantization_mode=QuantizationMode.IntegerOps,
force_fusions=True,
symmetric_weight=True,
)

# Append "-quantized" at the end of the model's name

@@ -255,7 +255,11 @@ license: apache-2.0

def write_model_card(
hf_model_name: str, repo_root="OPUS-MT-train", save_dir=Path("marian_converted"), dry_run=False, extra_metadata={},
hf_model_name: str,
repo_root="OPUS-MT-train",
save_dir=Path("marian_converted"),
dry_run=False,
extra_metadata={},
) -> str:
"""Copy the most recent model's readme section from opus, and add metadata.
upload command: aws s3 sync model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun

@@ -604,7 +608,9 @@ class OpusState:

assert "hidden_size" not in cfg.to_dict()
load_layers_(
model.model.encoder.layers, state_dict, BART_CONVERTER,
model.model.encoder.layers,
state_dict,
BART_CONVERTER,
)
load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True)
@@ -108,7 +108,12 @@ if is_torch_available():
logging.set_verbosity_info()

MODEL_CLASSES = {
"bert": (BertConfig, TFBertForPreTraining, BertForPreTraining, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"bert": (
BertConfig,
TFBertForPreTraining,
BertForPreTraining,
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"bert-large-uncased-whole-word-masking-finetuned-squad": (
BertConfig,
TFBertForQuestionAnswering,

@@ -127,9 +132,24 @@ MODEL_CLASSES = {
BertForSequenceClassification,
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"gpt2": (GPT2Config, TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"xlnet": (XLNetConfig, TFXLNetLMHeadModel, XLNetLMHeadModel, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"xlm": (XLMConfig, TFXLMWithLMHeadModel, XLMWithLMHeadModel, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"gpt2": (
GPT2Config,
TFGPT2LMHeadModel,
GPT2LMHeadModel,
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"xlnet": (
XLNetConfig,
TFXLNetLMHeadModel,
XLNetLMHeadModel,
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"xlm": (
XLMConfig,
TFXLMWithLMHeadModel,
XLMWithLMHeadModel,
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"xlm-roberta": (
XLMRobertaConfig,
TFXLMRobertaForMaskedLM,

@@ -148,7 +168,12 @@ MODEL_CLASSES = {
OpenAIGPTLMHeadModel,
OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"roberta": (RobertaConfig, TFRobertaForMaskedLM, RobertaForMaskedLM, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"roberta": (
RobertaConfig,
TFRobertaForMaskedLM,
RobertaForMaskedLM,
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"roberta-large-mnli": (
RobertaConfig,
TFRobertaForSequenceClassification,

@@ -179,10 +204,30 @@ MODEL_CLASSES = {
DistilBertForQuestionAnswering,
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"ctrl": (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"albert": (AlbertConfig, TFAlbertForPreTraining, AlbertForPreTraining, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"t5": (T5Config, TFT5ForConditionalGeneration, T5ForConditionalGeneration, T5_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"electra": (ElectraConfig, TFElectraForPreTraining, ElectraForPreTraining, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"ctrl": (
CTRLConfig,
TFCTRLLMHeadModel,
CTRLLMHeadModel,
CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"albert": (
AlbertConfig,
TFAlbertForPreTraining,
AlbertForPreTraining,
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"t5": (
T5Config,
TFT5ForConditionalGeneration,
T5ForConditionalGeneration,
T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"electra": (
ElectraConfig,
TFElectraForPreTraining,
ElectraForPreTraining,
ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
}
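Each value in ``MODEL_CLASSES`` is a ``(config class, TF 2.0 model class, PyTorch model class, pretrained config archive map)`` 4-tuple keyed by a shortcut name, which the conversion script unpacks in one step. A minimal sketch of how such a registry is consumed (illustrative only; it assumes both TensorFlow 2 and PyTorch are installed)::

    >>> shortcut = "bert"
    >>> config_class, tf_model_class, pt_model_class, archive_map = MODEL_CLASSES[shortcut]
    >>> config = config_class()              # default BertConfig
    >>> pt_model = pt_model_class(config)    # randomly initialised PyTorch model to receive the weights
    >>> tf_model = tf_model_class(config)    # TF 2.0 counterpart used as the conversion source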
@@ -49,10 +49,12 @@ def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size):
torch.tensor(np_query_key).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.value,
torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
torch_layer.output.dense,
torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
)

@@ -64,16 +66,20 @@ def set_layer_weights_in_torch_local(weights, torch_layer, hidden_size):
np_dense = np.asarray(weights[3])

set_param(
torch_layer.self_attention.query, torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.query,
torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.self_attention.key, torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.key,
torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.value,
torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
torch_layer.output.dense,
torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
)

@@ -83,7 +89,9 @@ def set_block_weights_in_torch(weights, torch_block, hidden_size):
layer_norm_1_weight = np.asarray(layer_norm_1[0])
layer_norm_1_bias = np.asarray(layer_norm_1[1])
set_param(
torch_block.attention.layer_norm, torch.tensor(layer_norm_1_weight), torch.tensor(layer_norm_1_bias),
torch_block.attention.layer_norm,
torch.tensor(layer_norm_1_weight),
torch.tensor(layer_norm_1_bias),
)

# lsh weights + output

@@ -104,7 +112,9 @@ def set_block_weights_in_torch(weights, torch_block, hidden_size):
layer_norm_2_weight = np.asarray(intermediate_weights[0][0])
layer_norm_2_bias = np.asarray(intermediate_weights[0][1])
set_param(
torch_block.feed_forward.layer_norm, torch.tensor(layer_norm_2_weight), torch.tensor(layer_norm_2_bias),
torch_block.feed_forward.layer_norm,
torch.tensor(layer_norm_2_weight),
torch.tensor(layer_norm_2_bias),
)

# intermediate dense

@@ -133,7 +143,8 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):
# word embeds
word_embeddings = np.asarray(weights[1])
set_param(
torch_model_reformer.embeddings.word_embeddings, torch.tensor(word_embeddings),
torch_model_reformer.embeddings.word_embeddings,
torch.tensor(word_embeddings),
)

if isinstance(weights[3], tuple):
@@ -86,7 +86,10 @@ class GlueDataset(Dataset):
cached_features_file = os.path.join(
cache_dir if cache_dir is not None else args.data_dir,
"cached_{}_{}_{}_{}".format(
mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
mode.value,
tokenizer.__class__.__name__,
str(args.max_seq_length),
args.task_name,
),
)
label_list = self.processor.get_labels()

@@ -21,7 +21,11 @@ class TextDataset(Dataset):
"""

def __init__(
self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False,
self,
tokenizer: PreTrainedTokenizer,
file_path: str,
block_size: int,
overwrite_cache=False,
):
assert os.path.isfile(file_path), f"Input file path {file_path} not found"

@@ -29,7 +33,12 @@ class TextDataset(Dataset):

directory, filename = os.path.split(file_path)
cached_features_file = os.path.join(
directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
directory,
"cached_lm_{}_{}_{}".format(
tokenizer.__class__.__name__,
str(block_size),
filename,
),
)

# Make sure only the first process in distributed training processes the dataset,

@@ -119,7 +119,10 @@ class SquadDataset(Dataset):
cached_features_file = os.path.join(
cache_dir if cache_dir is not None else args.data_dir,
"cached_{}_{}_{}_{}".format(
mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), version_tag,
mode.value,
tokenizer.__class__.__name__,
str(args.max_seq_length),
version_tag,
),
)
@@ -589,10 +589,10 @@ def compute_predictions_log_probs(
tokenizer,
verbose_logging,
):
""" XLNet write prediction logic (more complex than Bert's).
Write final predictions to the json file and log-odds of null if needed.
"""XLNet write prediction logic (more complex than Bert's).
Write final predictions to the json file and log-odds of null if needed.

Requires utils_squad_evaluate.py
Requires utils_squad_evaluate.py
"""
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]

@@ -69,7 +69,10 @@ def glue_convert_examples_to_features(
if is_tf_available():

def _tf_glue_convert_examples_to_features(
examples: tf.data.Dataset, tokenizer: PreTrainedTokenizer, task=str, max_length: Optional[int] = None,
examples: tf.data.Dataset,
tokenizer: PreTrainedTokenizer,
task=str,
max_length: Optional[int] = None,
) -> tf.data.Dataset:
"""
Returns:

@@ -269,7 +269,9 @@ class SingleSentenceClassificationProcessor(DataProcessor):
logger.info("Tokenizing example %d", ex_index)

input_ids = tokenizer.encode(
example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len),
example.text_a,
add_special_tokens=True,
max_length=min(max_length, tokenizer.max_len),
)
all_input_ids.append(input_ids)
@@ -329,7 +329,13 @@ class TFGenerationMixin:
if self.config.is_encoder_decoder:

# create empty decoder_input_ids
input_ids = tf.ones((effective_batch_size * num_beams, 1), dtype=tf.int32,) * decoder_start_token_id
input_ids = (
tf.ones(
(effective_batch_size * num_beams, 1),
dtype=tf.int32,
)
* decoder_start_token_id
)
cur_len = 1

assert (

@@ -422,8 +428,8 @@ class TFGenerationMixin:
attention_mask,
use_cache,
):
""" Generate sequences for each example without beam search (num_beams == 1).
All returned sequences are generated independently.
"""Generate sequences for each example without beam search (num_beams == 1).
All returned sequences are generated independently.
"""

# length of generated sentences / unfinished sentences

@@ -587,8 +593,7 @@ class TFGenerationMixin:
attention_mask,
use_cache,
):
""" Generate sequences for each example with beam search.
"""
"""Generate sequences for each example with beam search."""

# generated hypotheses
generated_hyps = [
@@ -960,14 +965,14 @@ def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):


def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
""" Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
Make sure we keep at least min_tokens_to_keep per batch example in the output
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
Make sure we keep at least min_tokens_to_keep per batch example in the output
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""
logits_shape = shape_list(logits)
@@ -1001,7 +1006,8 @@ def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("In
# Shift the indices to the right to keep also the first token above the threshold
sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1)
sorted_indices_to_remove = tf.concat(
[tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1,
[tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]],
-1,
)
# scatter sorted tensors to original indexing
indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices)
@@ -1027,9 +1033,9 @@ def set_tensor_by_indices_to_value(tensor, indices, value):

def sample_without_replacement(logits, num_samples):
"""
categorical sampling without replacement is currently not implemented
the gumbel-max trick will do for now
see https://github.com/tensorflow/tensorflow/issues/9260 for more info
"""
z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
_, indices = tf.nn.top_k(logits + z, num_samples)
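For reference, the Gumbel-max trick mentioned in the docstring perturbs each logit with independent Gumbel noise and keeps the ``num_samples`` largest perturbed values; their indices then behave like categorical draws without replacement. A standalone sketch of the standard formulation (illustrative only, not the code from this commit)::

    >>> import tensorflow as tf
    >>> logits = tf.math.log(tf.constant([[0.1, 0.2, 0.3, 0.4]]))   # one unnormalized distribution
    >>> uniform = tf.random.uniform(tf.shape(logits), 0, 1)
    >>> gumbel = -tf.math.log(-tf.math.log(uniform))                # standard Gumbel(0, 1) noise
    >>> _, sampled_indices = tf.nn.top_k(logits + gumbel, k=2)      # two distinct samples per row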
@@ -83,7 +83,11 @@ class GenerationMixin:
# repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
if repetition_penalty != 1.0:
self.enforce_repetition_penalty_(
scores, batch_size, num_beams, input_ids, repetition_penalty,
scores,
batch_size,
num_beams,
input_ids,
repetition_penalty,
)

# set eos token prob to zero if min_length is not reached
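The CTRL-style penalty applied by ``enforce_repetition_penalty_`` rescales the score of every token already present in the generated prefix: negative scores are multiplied by the penalty and positive scores are divided by it, so a penalty above 1.0 always makes repetition less likely. A standalone sketch of that rule (illustrative only; not the library's own method)::

    >>> import torch
    >>> def apply_repetition_penalty(scores, prev_ids, penalty):
    ...     # scores: (batch, vocab_size); prev_ids: (batch, generated_len)
    ...     for b in range(scores.size(0)):
    ...         for token in set(prev_ids[b].tolist()):
    ...             if scores[b, token] < 0:
    ...                 scores[b, token] *= penalty   # push negative scores further down
    ...             else:
    ...                 scores[b, token] /= penalty   # shrink positive scores
    ...     return scores
    >>> scores = torch.tensor([[2.0, -1.0, 0.5]])
    >>> scores = apply_repetition_penalty(scores, torch.tensor([[1, 2]]), penalty=1.2)  # -1.0 -> -1.2, 0.5 -> ~0.42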
@@ -324,7 +328,10 @@ class GenerationMixin:
"or a `bos_token_id` (integer >= 0) as a first token to start the generation."
)
input_ids = torch.full(
(batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device,
(batch_size, 1),
bos_token_id,
dtype=torch.long,
device=next(self.parameters()).device,
)
else:
assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."

@@ -514,8 +521,8 @@ class GenerationMixin:
use_cache,
model_specific_kwargs,
):
""" Generate sequences for each example without beam search (num_beams == 1).
All returned sequences are generated independently.
"""Generate sequences for each example without beam search (num_beams == 1).
All returned sequences are generated independently.
"""
# length of generated sentences / unfinished sentences
unfinished_sents = input_ids.new(batch_size).fill_(1)

@@ -619,8 +626,7 @@ class GenerationMixin:
use_cache,
model_specific_kwargs,
):
""" Generate sequences for each example with beam search.
"""
"""Generate sequences for each example with beam search."""

# generated hypotheses
generated_hyps = [

@@ -749,7 +755,8 @@ class GenerationMixin:
if is_beam_token_worse_than_top_num_beams:
continue
generated_hyps[batch_idx].add(
input_ids[effective_beam_id].clone(), beam_token_score.item(),
input_ids[effective_beam_id].clone(),
beam_token_score.item(),
)
else:
# add next predicted token since it is not eos_token

@@ -806,7 +813,8 @@ class GenerationMixin:
assert torch.all(
next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx]
), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format(
next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx],
next_scores[:, :num_beams][batch_idx],
beam_scores.view(batch_size, num_beams)[batch_idx],
)

# need to add best num_beams hypotheses to generated hyps
@@ -916,7 +924,7 @@ def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iter


def set_scores_to_inf_for_banned_tokens(scores: torch.Tensor, banned_tokens: List[List[int]]) -> None:
""" Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be
"""Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be
a list of list of banned tokens to ban in the format [[batch index, vocabulary position],...]
Args:
scores: logits distribution of shape (batch size, vocabulary size)
@@ -946,14 +954,14 @@ def top_k_top_p_filtering(
filter_value: float = -float("Inf"),
min_tokens_to_keep: int = 1,
) -> Tensor:
""" Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
Make sure we keep at least min_tokens_to_keep per batch example in the output
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
Make sure we keep at least min_tokens_to_keep per batch example in the output
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""
if top_k > 0:
top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))  # Safety check
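Nucleus (top-p) filtering keeps the smallest set of tokens whose cumulative probability reaches ``top_p`` and replaces every other logit with ``filter_value``. A condensed standalone sketch of that procedure (illustrative only; the function above is the library's actual implementation)::

    >>> import torch
    >>> import torch.nn.functional as F
    >>> def nucleus_filter(logits, top_p=0.9, filter_value=-float("Inf")):
    ...     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    ...     cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    ...     sorted_mask = cumulative_probs > top_p
    ...     sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()   # keep the first token that crosses top_p
    ...     sorted_mask[..., 0] = False
    ...     mask = sorted_mask.scatter(-1, sorted_indices, sorted_mask)   # back to vocabulary order
    ...     return logits.masked_fill(mask, filter_value)
    >>> logits = torch.tensor([[4.0, 3.0, 2.0, 1.0, 0.0]])
    >>> filtered = nucleus_filter(logits, top_p=0.9)   # the two least likely tokens are masked to -inf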
@ -36,20 +36,20 @@ logger = logging.get_logger(__name__)
|
|||
|
||||
|
||||
class ModelCard:
|
||||
r""" Structured Model Card class.
|
||||
Store model card as well as methods for loading/downloading/saving model cards.
|
||||
r"""Structured Model Card class.
|
||||
Store model card as well as methods for loading/downloading/saving model cards.
|
||||
|
||||
Please read the following paper for details and explanation on the sections:
|
||||
"Model Cards for Model Reporting"
|
||||
by Margaret Mitchell, Simone Wu,
|
||||
Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
|
||||
Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
|
||||
Link: https://arxiv.org/abs/1810.03993
|
||||
Please read the following paper for details and explanation on the sections:
|
||||
"Model Cards for Model Reporting"
|
||||
by Margaret Mitchell, Simone Wu,
|
||||
Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
|
||||
Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
|
||||
Link: https://arxiv.org/abs/1810.03993
|
||||
|
||||
Note:
|
||||
A model card can be loaded and saved to disk.
|
||||
Note:
|
||||
A model card can be loaded and saved to disk.
|
||||
|
||||
Parameters:
|
||||
Parameters:
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
|
@ -73,8 +73,7 @@ class ModelCard:
|
|||
raise err
|
||||
|
||||
def save_pretrained(self, save_directory_or_file):
|
||||
""" Save a model card object to the directory or file `save_directory_or_file`.
|
||||
"""
|
||||
"""Save a model card object to the directory or file `save_directory_or_file`."""
|
||||
if os.path.isdir(save_directory_or_file):
|
||||
# If we save using the predefined names, we can load using `from_pretrained`
|
||||
output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME)
|
||||
|
@ -86,7 +85,7 @@ class ModelCard:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
||||
r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
|
||||
r"""Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
|
||||
|
||||
Parameters:
|
||||
pretrained_model_name_or_path: either:
|
||||
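
A hedged sketch of the save/load round trip for the ModelCard API touched above; the directory name and field values are invented for illustration:

import os
from transformers import ModelCard

card = ModelCard(model_details={"name": "my-finetuned-bert"},
                 intended_use={"primary_uses": "sentiment analysis"})

os.makedirs("./my_model_dir", exist_ok=True)
card.save_pretrained("./my_model_dir")                   # writes my_model_dir/modelcard.json
reloaded = ModelCard.from_pretrained("./my_model_dir")   # a directory or a json file path both work
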
@ -302,7 +302,10 @@ class AlbertLayer(nn.Module):
|
|||
attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
|
||||
|
||||
ffn_output = apply_chunking_to_forward(
|
||||
self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output[0],
|
||||
self.ff_chunk,
|
||||
self.chunk_size_feed_forward,
|
||||
self.seq_len_dim,
|
||||
attention_output[0],
|
||||
)
|
||||
hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])
|
||||
|
||||
|
@ -397,8 +400,8 @@ class AlbertTransformer(nn.Module):
|
|||
|
||||
|
||||
class AlbertPreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = AlbertConfig
|
||||
|
@ -406,8 +409,7 @@ class AlbertPreTrainedModel(PreTrainedModel):
|
|||
authorized_missing_keys = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
"""Initialize the weights."""
|
||||
if isinstance(module, (nn.Linear, nn.Embedding)):
|
||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||
|
@ -543,17 +545,17 @@ class AlbertModel(AlbertPreTrainedModel):
|
|||
return self.embeddings.word_embeddings
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
ALBERT has a different architecture in that its layers are shared across groups, which then have inner groups.
|
||||
If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
|
||||
is a total of 4 different layers.
|
||||
"""Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
ALBERT has a different architecture in that its layers are shared across groups, which then have inner groups.
|
||||
If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
|
||||
is a total of 4 different layers.
|
||||
|
||||
These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
|
||||
while [2,3] correspond to the two inner groups of the second hidden layer.
|
||||
These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
|
||||
while [2,3] correspond to the two inner groups of the second hidden layer.
|
||||
|
||||
Any layer with an index other than [0,1,2,3] will result in an error.
|
||||
See base class PreTrainedModel for more information about head pruning
|
||||
Any layer with an index other than [0,1,2,3] will result in an error.
|
||||
See base class PreTrainedModel for more information about head pruning
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
group_idx = int(layer / self.config.inner_group_num)
|
||||
|
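
The flattened-index convention described in the docstring above is what a caller passes to the public prune_heads entry point; a short hedged example (checkpoint name is illustrative):

from transformers import AlbertModel

model = AlbertModel.from_pretrained("albert-base-v2")
# Keys are flattened (group, inner-group) layer indices; values are the head indices to prune.
# albert-base-v2 shares all layers in a single group, so 0 is the only valid flattened index here.
model.prune_heads({0: [0, 1]})
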
@ -672,34 +674,34 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
|
|||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates original order (sequence A, then sequence B),
|
||||
``1`` indicates switched order (sequence B, then sequence A).
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates original order (sequence A, then sequence B),
|
||||
``1`` indicates switched order (sequence B, then sequence A).
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
>>> from transformers import AlbertTokenizer, AlbertForPreTraining
|
||||
>>> import torch
|
||||
>>> from transformers import AlbertTokenizer, AlbertForPreTraining
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
>>> model = AlbertForPreTraining.from_pretrained('albert-base-v2', return_dict=True)
|
||||
>>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
>>> model = AlbertForPreTraining.from_pretrained('albert-base-v2', return_dict=True)
|
||||
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
|
||||
>>> prediction_logits = outputs.prediction_logits
|
||||
>>> sop_logits = outputs.sop_logits
|
||||
>>> prediction_logits = outputs.prediction_logits
|
||||
>>> sop_logits = outputs.sop_logits
|
||||
|
||||
"""
|
||||
|
||||
|
@ -787,7 +789,8 @@ class AlbertSOPHead(nn.Module):
|
|||
|
||||
|
||||
@add_start_docstrings(
|
||||
"Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING,
|
||||
"Albert Model with a `language modeling` head on top.",
|
||||
ALBERT_START_DOCSTRING,
|
||||
)
|
||||
class AlbertForMaskedLM(AlbertPreTrainedModel):
|
||||
def __init__(self, config):
|
||||
|
@ -952,7 +955,10 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return SequenceClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
|
@ -1033,7 +1039,10 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return TokenClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
|
@ -1215,5 +1224,8 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return MultipleChoiceModelOutput(
|
||||
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=reshaped_logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
@ -374,12 +374,12 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
|
|||
|
||||
class AutoModel:
|
||||
r"""
|
||||
:class:`~transformers.AutoModel` is a generic model class
|
||||
that will be instantiated as one of the base model classes of the library
|
||||
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
||||
or the `AutoModel.from_config(config)` class methods.
|
||||
:class:`~transformers.AutoModel` is a generic model class
|
||||
that will be instantiated as one of the base model classes of the library
|
||||
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
||||
or the `AutoModel.from_config(config)` class methods.
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
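
The surrounding hunks only reflow the Auto* docstrings; as a brief usage reminder for these factory classes (the checkpoint name is just an example):

from transformers import AutoConfig, AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")      # picks BertModel from the config's model_type

config = AutoConfig.from_pretrained("bert-base-uncased")
untrained = AutoModel.from_config(config)                   # same architecture, randomly initialized

inputs = tokenizer("Hello world", return_tensors="pt")
outputs = model(**inputs)
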
@ -391,7 +391,7 @@ class AutoModel:
|
|||
|
||||
@classmethod
|
||||
def from_config(cls, config):
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
r"""Instantiates one of the base model classes of the library
|
||||
from a configuration.
|
||||
|
||||
Note:
|
||||
|
@ -433,7 +433,7 @@ class AutoModel:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
r"""Instantiates one of the base model classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||
|
@ -531,11 +531,11 @@ class AutoModel:
|
|||
|
||||
class AutoModelForPreTraining:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForPreTraining` is a generic model class
|
||||
that will be instantiated as one of the model classes of the library – with the architecture used for pretraining this model – when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
:class:`~transformers.AutoModelForPreTraining` is a generic model class
|
||||
that will be instantiated as one of the model classes of the library – with the architecture used for pretraining this model – when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -547,7 +547,7 @@ class AutoModelForPreTraining:
|
|||
|
||||
@classmethod
|
||||
def from_config(cls, config):
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
r"""Instantiates one of the base model classes of the library
|
||||
from a configuration.
|
||||
|
||||
Note:
|
||||
|
@ -589,7 +589,7 @@ class AutoModelForPreTraining:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.
|
||||
r"""Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.
|
||||
|
||||
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||
based on the `model_type` property of the config object, or when it's missing,
|
||||
|
@ -680,12 +680,12 @@ class AutoModelForPreTraining:
|
|||
|
||||
class AutoModelWithLMHead:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelWithLMHead` is a generic model class
|
||||
that will be instantiated as one of the language modeling model classes of the library
|
||||
when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
:class:`~transformers.AutoModelWithLMHead` is a generic model class
|
||||
that will be instantiated as one of the language modeling model classes of the library
|
||||
when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -697,7 +697,7 @@ class AutoModelWithLMHead:
|
|||
|
||||
@classmethod
|
||||
def from_config(cls, config):
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
r"""Instantiates one of the base model classes of the library
|
||||
from a configuration.
|
||||
|
||||
Note:
|
||||
|
@ -743,7 +743,7 @@ class AutoModelWithLMHead:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiates one of the language modeling model classes of the library
|
||||
r"""Instantiates one of the language modeling model classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||
|
@ -839,12 +839,12 @@ class AutoModelWithLMHead:
|
|||
|
||||
class AutoModelForCausalLM:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForCausalLM` is a generic model class
|
||||
that will be instantiated as one of the language modeling model classes of the library
|
||||
when created with the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
:class:`~transformers.AutoModelForCausalLM` is a generic model class
|
||||
that will be instantiated as one of the language modeling model classes of the library
|
||||
when created with the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -856,7 +856,7 @@ class AutoModelForCausalLM:
|
|||
|
||||
@classmethod
|
||||
def from_config(cls, config):
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
r"""Instantiates one of the base model classes of the library
|
||||
from a configuration.
|
||||
|
||||
Note:
|
||||
|
@ -893,7 +893,7 @@ class AutoModelForCausalLM:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiates one of the language modeling model classes of the library
|
||||
r"""Instantiates one of the language modeling model classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||
|
@ -976,12 +976,12 @@ class AutoModelForCausalLM:
|
|||
|
||||
class AutoModelForMaskedLM:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForMaskedLM` is a generic model class
|
||||
that will be instantiated as one of the language modeling model classes of the library
|
||||
when created with the `AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
:class:`~transformers.AutoModelForMaskedLM` is a generic model class
|
||||
that will be instantiated as one of the language modeling model classes of the library
|
||||
when created with the `AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -993,7 +993,7 @@ class AutoModelForMaskedLM:
|
|||
|
||||
@classmethod
|
||||
def from_config(cls, config):
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
r"""Instantiates one of the base model classes of the library
|
||||
from a configuration.
|
||||
|
||||
Note:
|
||||
|
@ -1033,7 +1033,7 @@ class AutoModelForMaskedLM:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiates one of the language modeling model classes of the library
|
||||
r"""Instantiates one of the language modeling model classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||
|
@ -1119,12 +1119,12 @@ class AutoModelForMaskedLM:
|
|||
|
||||
class AutoModelForSeq2SeqLM:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForSeq2SeqLM` is a generic model class
|
||||
that will be instantiated as one of the language modeling model classes of the library
|
||||
when created with the `AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
:class:`~transformers.AutoModelForSeq2SeqLM` is a generic model class
|
||||
that will be instantiated as one of the language modeling model classes of the library
|
||||
when created with the `AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -1136,7 +1136,7 @@ class AutoModelForSeq2SeqLM:
|
|||
|
||||
@classmethod
|
||||
def from_config(cls, config):
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
r"""Instantiates one of the base model classes of the library
|
||||
from a configuration.
|
||||
|
||||
Note:
|
||||
|
@ -1172,7 +1172,7 @@ class AutoModelForSeq2SeqLM:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiates one of the language modeling model classes of the library
|
||||
r"""Instantiates one of the language modeling model classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||
|
@ -1254,12 +1254,12 @@ class AutoModelForSeq2SeqLM:
|
|||
|
||||
class AutoModelForSequenceClassification:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForSequenceClassification` is a generic model class
|
||||
that will be instantiated as one of the sequence classification model classes of the library
|
||||
when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
:class:`~transformers.AutoModelForSequenceClassification` is a generic model class
|
||||
that will be instantiated as one of the sequence classification model classes of the library
|
||||
when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -1271,7 +1271,7 @@ class AutoModelForSequenceClassification:
|
|||
|
||||
@classmethod
|
||||
def from_config(cls, config):
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
r"""Instantiates one of the base model classes of the library
|
||||
from a configuration.
|
||||
|
||||
Note:
|
||||
|
@ -1313,7 +1313,7 @@ class AutoModelForSequenceClassification:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiates one of the sequence classification model classes of the library
|
||||
r"""Instantiates one of the sequence classification model classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||
|
@ -1406,12 +1406,12 @@ class AutoModelForSequenceClassification:
|
|||
|
||||
class AutoModelForQuestionAnswering:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
|
||||
that will be instantiated as one of the question answering model classes of the library
|
||||
when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
:class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
|
||||
that will be instantiated as one of the question answering model classes of the library
|
||||
when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -1423,7 +1423,7 @@ class AutoModelForQuestionAnswering:
|
|||
|
||||
@classmethod
|
||||
def from_config(cls, config):
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
r"""Instantiates one of the base model classes of the library
|
||||
from a configuration.
|
||||
|
||||
Note:
|
||||
|
@ -1462,7 +1462,7 @@ class AutoModelForQuestionAnswering:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiates one of the question answering model classes of the library
|
||||
r"""Instantiates one of the question answering model classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||
|
@ -1551,12 +1551,12 @@ class AutoModelForQuestionAnswering:
|
|||
|
||||
class AutoModelForTokenClassification:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForTokenClassification` is a generic model class
|
||||
that will be instantiated as one of the token classification model classes of the library
|
||||
when created with the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
:class:`~transformers.AutoModelForTokenClassification` is a generic model class
|
||||
that will be instantiated as one of the token classification model classes of the library
|
||||
when created with the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -1568,7 +1568,7 @@ class AutoModelForTokenClassification:
|
|||
|
||||
@classmethod
|
||||
def from_config(cls, config):
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
r"""Instantiates one of the base model classes of the library
|
||||
from a configuration.
|
||||
|
||||
Note:
|
||||
|
@ -1611,7 +1611,7 @@ class AutoModelForTokenClassification:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiates one of the question answering model classes of the library
|
||||
r"""Instantiates one of the question answering model classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||
|
@ -1703,12 +1703,12 @@ class AutoModelForTokenClassification:
|
|||
|
||||
class AutoModelForMultipleChoice:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForMultipleChoice` is a generic model class
|
||||
that will be instantiated as one of the multiple choice model classes of the library
|
||||
when created with the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
:class:`~transformers.AutoModelForMultipleChoice` is a generic model class
|
||||
that will be instantiated as one of the multiple choice model classes of the library
|
||||
when created with the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@ -223,7 +223,9 @@ class EncoderLayer(nn.Module):
|
|||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
self.self_attn = SelfAttention(
|
||||
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout,
|
||||
self.embed_dim,
|
||||
config.encoder_attention_heads,
|
||||
dropout=config.attention_dropout,
|
||||
)
|
||||
self.normalize_before = config.normalize_before
|
||||
self.self_attn_layer_norm = LayerNorm(self.embed_dim)
|
||||
|
@ -297,7 +299,10 @@ class BartEncoder(nn.Module):
|
|||
)
|
||||
else:
|
||||
self.embed_positions = LearnedPositionalEmbedding(
|
||||
config.max_position_embeddings, embed_dim, self.padding_idx, config.extra_pos_embeddings,
|
||||
config.max_position_embeddings,
|
||||
embed_dim,
|
||||
self.padding_idx,
|
||||
config.extra_pos_embeddings,
|
||||
)
|
||||
self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])
|
||||
self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity()
|
||||
|
@ -370,7 +375,9 @@ class DecoderLayer(nn.Module):
|
|||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
self.self_attn = SelfAttention(
|
||||
embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout,
|
||||
embed_dim=self.embed_dim,
|
||||
num_heads=config.decoder_attention_heads,
|
||||
dropout=config.attention_dropout,
|
||||
)
|
||||
self.dropout = config.dropout
|
||||
self.activation_fn = ACT2FN[config.activation_function]
|
||||
|
@ -477,7 +484,10 @@ class BartDecoder(nn.Module):
|
|||
)
|
||||
else:
|
||||
self.embed_positions = LearnedPositionalEmbedding(
|
||||
config.max_position_embeddings, config.d_model, self.padding_idx, config.extra_pos_embeddings,
|
||||
config.max_position_embeddings,
|
||||
config.d_model,
|
||||
self.padding_idx,
|
||||
config.extra_pos_embeddings,
|
||||
)
|
||||
self.layers = nn.ModuleList(
|
||||
[DecoderLayer(config) for _ in range(config.decoder_layers)]
|
||||
|
@ -695,7 +705,10 @@ class SelfAttention(nn.Module):
|
|||
# This is part of a workaround to get around fork/join parallelism not supporting Optional types.
|
||||
if key_padding_mask is not None and key_padding_mask.dim() == 0:
|
||||
key_padding_mask = None
|
||||
assert key_padding_mask is None or key_padding_mask.size()[:2] == (bsz, src_len,)
|
||||
assert key_padding_mask is None or key_padding_mask.size()[:2] == (
|
||||
bsz,
|
||||
src_len,
|
||||
)
|
||||
|
||||
if key_padding_mask is not None: # don't attend to padding symbols
|
||||
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
|
||||
|
@ -703,7 +716,11 @@ class SelfAttention(nn.Module):
|
|||
attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
|
||||
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
||||
attn_weights = F.softmax(attn_weights, dim=-1)
|
||||
attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training,)
|
||||
attn_probs = F.dropout(
|
||||
attn_weights,
|
||||
p=self.dropout,
|
||||
training=self.training,
|
||||
)
|
||||
|
||||
assert v is not None
|
||||
attn_output = torch.bmm(attn_probs, v)
|
||||
|
@ -754,7 +771,11 @@ class BartClassificationHead(nn.Module):
|
|||
# This can trivially be shared with RobertaClassificationHead
|
||||
|
||||
def __init__(
|
||||
self, input_dim, inner_dim, num_classes, pooler_dropout,
|
||||
self,
|
||||
input_dim,
|
||||
inner_dim,
|
||||
num_classes,
|
||||
pooler_dropout,
|
||||
):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(input_dim, inner_dim)
|
||||
|
@ -819,7 +840,8 @@ def _get_shape(t):
|
|||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare BART Model outputting raw hidden-states without any specific head on top.", BART_START_DOCSTRING,
|
||||
"The bare BART Model outputting raw hidden-states without any specific head on top.",
|
||||
BART_START_DOCSTRING,
|
||||
)
|
||||
class BartModel(PretrainedBartModel):
|
||||
def __init__(self, config: BartConfig):
|
||||
|
@ -981,31 +1003,31 @@ class BartForConditionalGeneration(PretrainedBartModel):
|
|||
**unused,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
|
||||
with labels in ``[0, ..., config.vocab_size]``.
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
|
||||
with labels in ``[0, ..., config.vocab_size]``.
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
|
||||
Conditional generation example::
|
||||
Conditional generation example::
|
||||
|
||||
# Mask filling only works for bart-large
|
||||
from transformers import BartTokenizer, BartForConditionalGeneration
|
||||
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
|
||||
TXT = "My friends are <mask> but they eat too many carbs."
|
||||
# Mask filling only works for bart-large
|
||||
from transformers import BartTokenizer, BartForConditionalGeneration
|
||||
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
|
||||
TXT = "My friends are <mask> but they eat too many carbs."
|
||||
|
||||
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
|
||||
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
|
||||
logits = model(input_ids).logits
|
||||
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
|
||||
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
|
||||
logits = model(input_ids).logits
|
||||
|
||||
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
|
||||
probs = logits[0, masked_index].softmax(dim=0)
|
||||
values, predictions = probs.topk(5)
|
||||
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
|
||||
probs = logits[0, masked_index].softmax(dim=0)
|
||||
values, predictions = probs.topk(5)
|
||||
|
||||
tokenizer.decode(predictions).split()
|
||||
# ['good', 'great', 'all', 'really', 'very']
|
||||
tokenizer.decode(predictions).split()
|
||||
# ['good', 'great', 'all', 'really', 'very']
|
||||
"""
|
||||
if "lm_labels" in unused:
|
||||
warnings.warn(
|
||||
|
@ -1116,7 +1138,10 @@ class BartForSequenceClassification(PretrainedBartModel):
|
|||
super().__init__(config, **kwargs)
|
||||
self.model = BartModel(config)
|
||||
self.classification_head = BartClassificationHead(
|
||||
config.d_model, config.d_model, config.num_labels, config.classif_dropout,
|
||||
config.d_model,
|
||||
config.d_model,
|
||||
config.num_labels,
|
||||
config.classif_dropout,
|
||||
)
|
||||
self.model._init_weights(self.classification_head.dense)
|
||||
self.model._init_weights(self.classification_head.out_proj)
|
||||
|
@ -1279,7 +1304,10 @@ class BartForQuestionAnswering(PretrainedBartModel):
|
|||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits,) + outputs[1:]
|
||||
output = (
|
||||
start_logits,
|
||||
end_logits,
|
||||
) + outputs[1:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
return Seq2SeqQuestionAnsweringModelOutput(
|
||||
|
@ -1307,7 +1335,7 @@ class SinusoidalPositionalEmbedding(nn.Embedding):
|
|||
@staticmethod
|
||||
def _init_weight(out: nn.Parameter):
|
||||
"""Identical to the XLM create_sinusoidal_embeddings except features are not interleaved.
|
||||
The cos features are in the 2nd half of the vector. [dim // 2:]
|
||||
The cos features are in the 2nd half of the vector. [dim // 2:]
|
||||
"""
|
||||
n_pos, dim = out.shape
|
||||
position_enc = np.array(
|
||||
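
The docstring above (features not interleaved; cos features occupy the second half of each vector) corresponds to a table like the following standalone sketch; sinusoidal_table is a made-up helper name for illustration:

import numpy as np
import torch

def sinusoidal_table(n_pos: int, dim: int) -> torch.Tensor:
    position_enc = np.array(
        [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
    )
    out = torch.zeros(n_pos, dim)
    half = dim // 2 if dim % 2 == 0 else dim // 2 + 1
    out[:, :half] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))  # sin features first
    out[:, half:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))  # cos features in the 2nd half
    return out
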
@ -89,8 +89,7 @@ BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
|||
|
||||
|
||||
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
||||
""" Load tf checkpoints in a pytorch model.
|
||||
"""
|
||||
"""Load tf checkpoints in a pytorch model."""
|
||||
try:
|
||||
import re
|
||||
|
||||
|
@ -174,8 +173,7 @@ BertLayerNorm = torch.nn.LayerNorm
|
|||
|
||||
|
||||
class BertEmbeddings(nn.Module):
|
||||
"""Construct the embeddings from word, position and token_type embeddings.
|
||||
"""
|
||||
"""Construct the embeddings from word, position and token_type embeddings."""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
|
@ -343,7 +341,12 @@ class BertAttention(nn.Module):
|
|||
output_attentions=False,
|
||||
):
|
||||
self_outputs = self.self(
|
||||
hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
attention_output = self.output(self_outputs[0], hidden_states)
|
||||
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
||||
|
@ -403,7 +406,10 @@ class BertLayer(nn.Module):
|
|||
output_attentions=False,
|
||||
):
|
||||
self_attention_outputs = self.attention(
|
||||
hidden_states, attention_mask, head_mask, output_attentions=output_attentions,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
head_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
attention_output = self_attention_outputs[0]
|
||||
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
|
||||
|
@ -582,8 +588,8 @@ class BertPreTrainingHeads(nn.Module):
|
|||
|
||||
|
||||
class BertPreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = BertConfig
|
||||
|
@ -733,9 +739,9 @@ class BertModel(BertPreTrainedModel):
|
|||
self.embeddings.word_embeddings = value
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
@ -877,34 +883,34 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||
**kwargs
|
||||
):
|
||||
r"""
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates sequence B is a continuation of sequence A,
|
||||
``1`` indicates sequence B is a random sequence.
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates sequence B is a continuation of sequence A,
|
||||
``1`` indicates sequence B is a random sequence.
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
>>> from transformers import BertTokenizer, BertForPreTraining
|
||||
>>> import torch
|
||||
>>> from transformers import BertTokenizer, BertForPreTraining
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)
|
||||
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
|
||||
>>> prediction_logits = outptus.prediction_logits
|
||||
>>> seq_relationship_logits = outputs.seq_relationship_logits
|
||||
>>> prediction_logits = outptus.prediction_logits
|
||||
>>> seq_relationship_logits = outputs.seq_relationship_logits
|
||||
"""
|
||||
if "masked_lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
|
@ -986,36 +992,36 @@ class BertLMHeadModel(BertPreTrainedModel):
|
|||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
|
||||
if the model is configured as a decoder.
|
||||
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Mask to avoid performing attention on the padding token indices of the encoder input. This mask
|
||||
is used in the cross-attention if the model is configured as a decoder.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the left-to-right language modeling loss (next word prediction).
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
|
||||
if the model is configured as a decoder.
|
||||
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Mask to avoid performing attention on the padding token indices of the encoder input. This mask
|
||||
is used in the cross-attention if the model is configured as a decoder.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the left-to-right language modeling loss (next word prediction).
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
|
||||
Example::
|
||||
Example::
|
||||
|
||||
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
|
||||
>>> import torch
|
||||
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
||||
>>> config = BertConfig.from_pretrained("bert-base-cased")
|
||||
>>> config.is_decoder = True
|
||||
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True)
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
||||
>>> config = BertConfig.from_pretrained("bert-base-cased")
|
||||
>>> config.is_decoder = True
|
||||
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True)
|
||||
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
|
||||
>>> prediction_logits = outputs.logits
|
||||
>>> prediction_logits = outputs.logits
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
|
@ -1049,7 +1055,10 @@ class BertLMHeadModel(BertPreTrainedModel):
|
|||
return ((lm_loss,) + output) if lm_loss is not None else output
|
||||
|
||||
return CausalLMOutput(
|
||||
loss=lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=lm_loss,
|
||||
logits=prediction_scores,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
|
||||
|
@ -1173,7 +1182,8 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
|
||||
"""Bert Model with a `next sentence prediction (classification)` head on top. """,
|
||||
BERT_START_DOCSTRING,
|
||||
)
|
||||
class BertForNextSentencePrediction(BertPreTrainedModel):
|
||||
def __init__(self, config):
|
||||
|
@ -1200,29 +1210,29 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
|||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates sequence B is a continuation of sequence A,
|
||||
``1`` indicates sequence B is a random sequence.
|
||||
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates sequence B is a continuation of sequence A,
|
||||
``1`` indicates sequence B is a random sequence.
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
|
||||
Example::
|
||||
Example::
|
||||
|
||||
>>> from transformers import BertTokenizer, BertForNextSentencePrediction
|
||||
>>> import torch
|
||||
>>> from transformers import BertTokenizer, BertForNextSentencePrediction
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)
|
||||
|
||||
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
||||
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
|
||||
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
|
||||
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
||||
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
|
||||
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
|
||||
|
||||
>>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
|
||||
>>> logits = outputs.logits
|
||||
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
|
||||
>>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
|
||||
>>> logits = outputs.logits
|
||||
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
|
@ -1336,7 +1346,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return SequenceClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
|
@ -1422,7 +1435,10 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return MultipleChoiceModelOutput(
|
||||
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=reshaped_logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
|
@ -1505,7 +1521,10 @@ class BertForTokenClassification(BertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return TokenClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@ -68,7 +68,8 @@ class CamembertModel(RobertaModel):
|
|||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING,
|
||||
"""CamemBERT Model with a `language modeling` head on top. """,
|
||||
CAMEMBERT_START_DOCSTRING,
|
||||
)
|
||||
class CamembertForMaskedLM(RobertaForMaskedLM):
|
||||
"""
|
||||
@ -212,16 +212,15 @@ class EncoderLayer(torch.nn.Module):
|
|||
|
||||
|
||||
class CTRLPreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = CTRLConfig
|
||||
base_model_prefix = "transformer"
|
||||
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
"""Initialize the weights."""
|
||||
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
|
||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||
|
@ -331,8 +330,8 @@ class CTRLModel(CTRLPreTrainedModel):
|
|||
self.w = new_embeddings
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
"""Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.h[layer].multi_head_attention.prune_heads(heads)
|
||||
@ -261,7 +261,12 @@ class TransformerBlock(nn.Module):
|
|||
"""
|
||||
# Self-Attention
|
||||
sa_output = self.attention(
|
||||
query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask, output_attentions=output_attentions,
|
||||
query=x,
|
||||
key=x,
|
||||
value=x,
|
||||
mask=attn_mask,
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
if output_attentions:
|
||||
sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
|
||||
|
@ -343,8 +348,8 @@ class Transformer(nn.Module):
|
|||
|
||||
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
|
||||
class DistilBertPreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = DistilBertConfig
|
||||
|
@ -352,8 +357,7 @@ class DistilBertPreTrainedModel(PreTrainedModel):
|
|||
base_model_prefix = "distilbert"
|
||||
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
"""Initialize the weights."""
|
||||
if isinstance(module, nn.Embedding):
|
||||
if module.weight.requires_grad:
|
||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||
|
@ -432,9 +436,9 @@ class DistilBertModel(DistilBertPreTrainedModel):
|
|||
self.embeddings.word_embeddings = new_embeddings
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.transformer.layer[layer].attention.prune_heads(heads)
|
||||
|
@ -493,7 +497,8 @@ class DistilBertModel(DistilBertPreTrainedModel):
|
|||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
|
||||
"""DistilBert Model with a `masked language modeling` head on top. """,
|
||||
DISTILBERT_START_DOCSTRING,
|
||||
)
|
||||
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
||||
def __init__(self, config):
|
||||
|
@ -829,7 +834,10 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return TokenClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
|
@ -863,32 +871,32 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
|
|||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
>>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
|
||||
>>> import torch
|
||||
>>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
|
||||
>>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased', return_dict=True)
|
||||
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
|
||||
>>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased', return_dict=True)
|
||||
|
||||
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
||||
>>> choice0 = "It is eaten with a fork and a knife."
|
||||
>>> choice1 = "It is eaten while held in the hand."
|
||||
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
|
||||
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
||||
>>> choice0 = "It is eaten with a fork and a knife."
|
||||
>>> choice1 = "It is eaten while held in the hand."
|
||||
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
|
||||
|
||||
>>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
|
||||
>>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
|
||||
>>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
|
||||
>>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
|
||||
|
||||
>>> # the linear classifier still needs to be trained
|
||||
>>> loss = outputs.loss
|
||||
>>> logits = outputs.logits
|
||||
>>> # the linear classifier still needs to be trained
|
||||
>>> loss = outputs.loss
|
||||
>>> logits = outputs.logits
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
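A minimal sketch of the shape handling this multiple-choice head relies on (toy tensors, values are illustrative only)::

    import torch

    batch_size, num_choices, seq_len = 2, 2, 8
    input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
    # choices are flattened into the batch dimension before running the encoder
    flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (batch_size * num_choices, seq_len)
    # per-choice scores are regrouped so the classification loss sees one row per example
    logits = torch.randn(batch_size * num_choices, 1)
    reshaped_logits = logits.view(-1, num_choices)  # (batch_size, num_choices)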
@ -930,5 +938,8 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return MultipleChoiceModelOutput(
|
||||
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=reshaped_logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
@ -265,8 +265,8 @@ class DPRSpanPredictor(PreTrainedModel):
|
|||
|
||||
|
||||
class DPRPretrainedContextEncoder(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = DPRConfig
|
||||
|
@ -278,8 +278,8 @@ class DPRPretrainedContextEncoder(PreTrainedModel):
|
|||
|
||||
|
||||
class DPRPretrainedQuestionEncoder(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = DPRConfig
|
||||
|
@ -291,8 +291,8 @@ class DPRPretrainedQuestionEncoder(PreTrainedModel):
|
|||
|
||||
|
||||
class DPRPretrainedReader(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = DPRConfig
|
||||
|
@ -421,15 +421,15 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
|
|||
return_dict=None,
|
||||
) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]:
|
||||
r"""
|
||||
Return:
|
||||
Return:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
|
||||
tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
|
||||
model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
|
||||
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
|
||||
embeddings = model(input_ids).pooler_output
|
||||
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
|
||||
tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
|
||||
model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
|
||||
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
|
||||
embeddings = model(input_ids).pooler_output
|
||||
"""
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
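The pooled context embeddings are typically scored against question embeddings with a plain dot product; a minimal retrieval sketch with stand-in tensors (the 768-dimensional size and passage count are illustrative only)::

    import torch

    q_embedding = torch.randn(1, 768)      # stand-in for a DPRQuestionEncoder pooler_output
    ctx_embeddings = torch.randn(10, 768)  # stand-ins for DPRContextEncoder pooler_outputs
    scores = q_embedding @ ctx_embeddings.T  # (1, 10) relevance scores
    best_passage = scores.argmax(dim=-1)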
@ -499,15 +499,15 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
|
|||
return_dict=None,
|
||||
) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]:
|
||||
r"""
|
||||
Return:
|
||||
Return:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
|
||||
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
|
||||
model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True)
|
||||
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
|
||||
embeddings = model(input_ids).pooler_output
|
||||
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
|
||||
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
|
||||
model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True)
|
||||
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
|
||||
embeddings = model(input_ids).pooler_output
|
||||
"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
|
@ -553,7 +553,8 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
|
|||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare DPRReader transformer outputting span predictions.", DPR_START_DOCSTRING,
|
||||
"The bare DPRReader transformer outputting span predictions.",
|
||||
DPR_START_DOCSTRING,
|
||||
)
|
||||
class DPRReader(DPRPretrainedReader):
|
||||
def __init__(self, config: DPRConfig):
|
||||
|
@ -574,23 +575,23 @@ class DPRReader(DPRPretrainedReader):
|
|||
return_dict=None,
|
||||
) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
|
||||
r"""
|
||||
Return:
|
||||
Return:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
from transformers import DPRReader, DPRReaderTokenizer
|
||||
tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
|
||||
model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True)
|
||||
encoded_inputs = tokenizer(
|
||||
questions=["What is love ?"],
|
||||
titles=["Haddaway"],
|
||||
texts=["'What Is Love' is a song recorded by the artist Haddaway"],
|
||||
return_tensors='pt'
|
||||
)
|
||||
outputs = model(**encoded_inputs)
|
||||
start_logits = outputs.start_logits
|
||||
end_logits = outputs.end_logits
|
||||
relevance_logits = outputs.relevance_logits
|
||||
from transformers import DPRReader, DPRReaderTokenizer
|
||||
tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
|
||||
model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True)
|
||||
encoded_inputs = tokenizer(
|
||||
questions=["What is love ?"],
|
||||
titles=["Haddaway"],
|
||||
texts=["'What Is Love' is a song recorded by the artist Haddaway"],
|
||||
return_tensors='pt'
|
||||
)
|
||||
outputs = model(**encoded_inputs)
|
||||
start_logits = outputs.start_logits
|
||||
end_logits = outputs.end_logits
|
||||
relevance_logits = outputs.relevance_logits
|
||||
|
||||
"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
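A simplistic sketch of turning the reader logits into an answer span for one passage (toy logits; this is not the library's span-decoding logic, just the basic argmax idea)::

    import torch

    start_logits = torch.randn(1, 20)  # (n_passages, sequence_length)
    end_logits = torch.randn(1, 20)
    start_index = start_logits[0].argmax().item()
    end_index = start_index + end_logits[0, start_index:].argmax().item()
    # tokens[start_index : end_index + 1] would then be decoded back to text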
@ -46,8 +46,7 @@ ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
|||
|
||||
|
||||
def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"):
|
||||
""" Load tf checkpoints in a pytorch model.
|
||||
"""
|
||||
"""Load tf checkpoints in a pytorch model."""
|
||||
try:
|
||||
import re
|
||||
|
||||
|
@ -179,8 +178,8 @@ class ElectraGeneratorPredictions(nn.Module):
|
|||
|
||||
|
||||
class ElectraPreTrainedModel(BertPreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = ElectraConfig
|
||||
|
@ -311,9 +310,9 @@ class ElectraModel(ElectraPreTrainedModel):
|
|||
self.embeddings.word_embeddings = value
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
@ -512,24 +511,24 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
|
|||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates the token is an original token,
|
||||
``1`` indicates the token was replaced.
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates the token is an original token,
|
||||
``1`` indicates the token was replaced.
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
>>> from transformers import ElectraTokenizer, ElectraForPreTraining
|
||||
>>> import torch
|
||||
>>> from transformers import ElectraTokenizer, ElectraForPreTraining
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
|
||||
>>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
|
||||
>>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
|
||||
>>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
|
||||
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> logits = model(input_ids).logits
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> logits = model(input_ids).logits
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
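A minimal sketch of how the 0/1 replaced-token-detection labels described above can be built (toy token ids, chosen purely for illustration)::

    import torch

    original_ids = torch.tensor([[101, 2054, 2003, 2023, 102]])
    corrupted_ids = torch.tensor([[101, 2054, 2001, 2023, 102]])  # one token replaced
    labels = (original_ids != corrupted_ids).long()  # 1 where a token was replaced, 0 elsewhere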
@ -836,7 +835,10 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
|
|||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits,) + discriminator_hidden_states[1:]
|
||||
output = (
|
||||
start_logits,
|
||||
end_logits,
|
||||
) + discriminator_hidden_states[1:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
return QuestionAnsweringModelOutput(
@ -28,11 +28,11 @@ logger = logging.get_logger(__name__)
|
|||
|
||||
class EncoderDecoderModel(PreTrainedModel):
|
||||
r"""
|
||||
:class:`~transformers.EncoderDecoder` is a generic model class that will be
|
||||
instantiated as a transformer architecture with one of the base model
|
||||
classes of the library as encoder and another one as
|
||||
decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method for the encoder and `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` class method for the decoder.
|
||||
:class:`~transformers.EncoderDecoder` is a generic model class that will be
|
||||
instantiated as a transformer architecture with one of the base model
|
||||
classes of the library as encoder and another one as
|
||||
decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method for the encoder and `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` class method for the decoder.
|
||||
"""
|
||||
config_class = EncoderDecoderConfig
|
||||
base_model_prefix = "encoder_decoder"
@ -103,7 +103,7 @@ class EncoderDecoderModel(PreTrainedModel):
|
|||
*model_args,
|
||||
**kwargs
|
||||
) -> PreTrainedModel:
|
||||
r""" Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints.
|
||||
r"""Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints.
|
||||
|
||||
|
||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
@ -240,7 +240,11 @@ class FlaubertModel(XLMModel):
|
|||
# self attention
|
||||
if not self.pre_norm:
|
||||
attn_outputs = self.attentions[i](
|
||||
tensor, attn_mask, cache=cache, head_mask=head_mask[i], output_attentions=output_attentions,
|
||||
tensor,
|
||||
attn_mask,
|
||||
cache=cache,
|
||||
head_mask=head_mask[i],
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
attn = attn_outputs[0]
|
||||
if output_attentions:
@ -61,8 +61,7 @@ GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
|||
|
||||
|
||||
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
||||
""" Load tf checkpoints in a pytorch model
|
||||
"""
|
||||
"""Load tf checkpoints in a pytorch model"""
|
||||
try:
|
||||
import re
|
||||
|
||||
|
@ -324,8 +323,8 @@ class Block(nn.Module):
|
|||
|
||||
|
||||
class GPT2PreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = GPT2Config
|
||||
|
@ -336,8 +335,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
|
|||
super().__init__(*inputs, **kwargs)
|
||||
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
"""Initialize the weights."""
|
||||
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
|
||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
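A minimal standalone sketch of the normal-distribution initialization the comment above refers to (toy module; 0.02 stands in for the configured initializer range)::

    import torch.nn as nn

    layer = nn.Linear(768, 768)
    layer.weight.data.normal_(mean=0.0, std=0.02)
    if layer.bias is not None:
        layer.bias.data.zero_()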
@ -483,8 +481,8 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||
self.wte = new_embeddings
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
"""Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.h[layer].attn.prune_heads(heads)
|
||||
|
@ -800,47 +798,47 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
|
||||
Index of the classification token in each input sequence.
|
||||
Selected in the range ``[0, input_ids.size(-1) - 1[``.
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
|
||||
Labels for language modeling.
|
||||
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
|
||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
||||
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
|
||||
Index of the classification token in each input sequence.
|
||||
Selected in the range ``[0, input_ids.size(-1) - 1[``.
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
|
||||
Labels for language modeling.
|
||||
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
|
||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
||||
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Return:
|
||||
Return:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
>>> import torch
|
||||
>>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
|
||||
>>> import torch
|
||||
>>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
|
||||
|
||||
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
>>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2', return_dict=True)
|
||||
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
>>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2', return_dict=True)
|
||||
|
||||
>>> # Add a [CLS] to the vocabulary (we should train it also!)
|
||||
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
|
||||
>>> # Add a [CLS] to the vocabulary (we should train it also!)
|
||||
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
|
||||
|
||||
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
|
||||
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
|
||||
|
||||
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
|
||||
>>> encoded_choices = [tokenizer.encode(s) for s in choices]
|
||||
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
|
||||
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
|
||||
>>> encoded_choices = [tokenizer.encode(s) for s in choices]
|
||||
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
|
||||
|
||||
>>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
|
||||
>>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
|
||||
>>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
|
||||
>>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
|
||||
|
||||
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
|
||||
>>> lm_logits = outputs.lm_logits
|
||||
>>> mc_logits = outputs.mc_logits
|
||||
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
|
||||
>>> lm_logits = outputs.lm_logits
|
||||
>>> mc_logits = outputs.mc_logits
|
||||
|
||||
"""
|
||||
if "lm_labels" in kwargs:
@ -66,7 +66,7 @@ LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
|||
|
||||
def _get_question_end_index(input_ids, sep_token_id):
|
||||
"""
|
||||
Computes the index of the first occurrence of `sep_token_id`.
|
||||
Computes the index of the first occurrence of `sep_token_id`.
|
||||
"""
|
||||
|
||||
sep_token_indices = (input_ids == sep_token_id).nonzero()
|
||||
|
@ -81,9 +81,9 @@ def _get_question_end_index(input_ids, sep_token_id):
|
|||
|
||||
def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True):
|
||||
"""
|
||||
Computes global attention mask by putting attention on all tokens
|
||||
before `sep_token_id` if `before_sep_token is True` else after
|
||||
`sep_token_id`.
|
||||
Computes global attention mask by putting attention on all tokens
|
||||
before `sep_token_id` if `before_sep_token is True` else after
|
||||
`sep_token_id`.
|
||||
"""
|
||||
question_end_index = _get_question_end_index(input_ids, sep_token_id)
|
||||
question_end_index = question_end_index.unsqueeze(dim=1) # size: batch_size x 1
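A minimal sketch of the masking idea described in the docstring (toy tensors; the separator id 2 is chosen purely for illustration)::

    import torch

    input_ids = torch.tensor([[5, 6, 7, 2, 8, 9, 2]])
    sep_token_id = 2
    question_end_index = (input_ids == sep_token_id).nonzero()[0, 1]
    positions = torch.arange(input_ids.shape[1]).expand_as(input_ids)
    # 1 (global attention) on tokens before the first separator, 0 elsewhere
    global_attention_mask = (positions < question_end_index).long()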
@ -135,7 +135,10 @@ class LongformerSelfAttention(nn.Module):
|
|||
self.one_sided_attn_window_size = attention_window // 2
|
||||
|
||||
def forward(
|
||||
self, hidden_states, attention_mask=None, output_attentions=False,
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
"""
|
||||
LongformerSelfAttention expects `len(hidden_states)` to be multiple of `attention_window`.
|
||||
|
@ -314,17 +317,17 @@ class LongformerSelfAttention(nn.Module):
|
|||
@staticmethod
|
||||
def _pad_and_diagonalize(chunked_hidden_states):
|
||||
"""shift every row 1 step right, converting columns into diagonals.
|
||||
Example:
|
||||
chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
|
||||
-1.8348, 0.7672, 0.2986, 0.0285,
|
||||
-0.7584, 0.4206, -0.0405, 0.1599,
|
||||
2.0514, -1.1600, 0.5372, 0.2629 ]
|
||||
window_overlap = num_rows = 4
|
||||
(pad & diagonalize) =>
|
||||
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
|
||||
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
|
||||
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
|
||||
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
|
||||
Example:
|
||||
chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
|
||||
-1.8348, 0.7672, 0.2986, 0.0285,
|
||||
-0.7584, 0.4206, -0.0405, 0.1599,
|
||||
2.0514, -1.1600, 0.5372, 0.2629 ]
|
||||
window_overlap = num_rows = 4
|
||||
(pad & diagonalize) =>
|
||||
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
|
||||
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
|
||||
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
|
||||
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
|
||||
"""
|
||||
total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
|
||||
chunked_hidden_states = F.pad(
|
||||
|
@ -442,7 +445,7 @@ class LongformerSelfAttention(nn.Module):
|
|||
self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int
|
||||
):
|
||||
"""Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors.
|
||||
Returned tensor will be of the same shape as `attn_probs`"""
|
||||
Returned tensor will be of the same shape as `attn_probs`"""
|
||||
batch_size, seq_len, num_heads, head_dim = value.size()
|
||||
|
||||
assert seq_len % (window_overlap * 2) == 0
|
||||
|
@ -622,7 +625,10 @@ class LongformerSelfAttention(nn.Module):
|
|||
is_local_index_no_global_attn_nonzero[0], :, is_local_index_no_global_attn_nonzero[1], :
|
||||
] = -10000.0
|
||||
|
||||
global_attn_scores = global_attn_scores.masked_fill(is_index_masked[:, None, None, :], -10000.0,)
|
||||
global_attn_scores = global_attn_scores.masked_fill(
|
||||
is_index_masked[:, None, None, :],
|
||||
-10000.0,
|
||||
)
|
||||
|
||||
global_attn_scores = global_attn_scores.view(batch_size * self.num_heads, max_num_global_attn_indices, seq_len)
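A short illustration of the masking trick applied above: filling masked positions with a large negative value before the softmax drives their attention weights to (numerically) zero::

    import torch

    scores = torch.tensor([[2.0, 1.0, 3.0]])
    mask = torch.tensor([[False, True, False]])  # True marks a position to ignore
    probs = scores.masked_fill(mask, -10000.0).softmax(dim=-1)
    # probs[0, 1] is ~0, so the masked position contributes nothing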
@ -676,9 +682,16 @@ class LongformerAttention(nn.Module):
|
|||
self.pruned_heads = self.pruned_heads.union(heads)
|
||||
|
||||
def forward(
|
||||
self, hidden_states, attention_mask=None, output_attentions=False,
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
self_outputs = self.self(hidden_states, attention_mask, output_attentions,)
|
||||
self_outputs = self.self(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
attn_output = self.output(self_outputs[0], hidden_states)
|
||||
outputs = (attn_output,) + self_outputs[1:] # add attentions if we output them
|
||||
return outputs
|
||||
|
@ -694,9 +707,16 @@ class LongformerLayer(nn.Module):
|
|||
self.seq_len_dim = 1
|
||||
|
||||
def forward(
|
||||
self, hidden_states, attention_mask=None, output_attentions=False,
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
self_attn_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions,)
|
||||
self_attn_outputs = self.attention(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
attn_output = self_attn_outputs[0]
|
||||
outputs = self_attn_outputs[1:] # add self attentions if we output attention weights
@ -741,10 +761,16 @@ class LongformerEncoder(nn.Module):
|
|||
return custom_forward
|
||||
|
||||
layer_outputs = torch.utils.checkpoint.checkpoint(
|
||||
create_custom_forward(layer_module), hidden_states, attention_mask,
|
||||
create_custom_forward(layer_module),
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(hidden_states, attention_mask, output_attentions,)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if output_attentions:
|
||||
|
@ -762,9 +788,9 @@ class LongformerEncoder(nn.Module):
|
|||
|
||||
|
||||
class LongformerPreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained
|
||||
models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained
|
||||
models.
|
||||
"""
|
||||
|
||||
config_class = LongformerConfig
|
||||
|
@ -896,9 +922,9 @@ class LongformerModel(LongformerPreTrainedModel):
|
|||
self.embeddings.word_embeddings = value
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
@ -938,7 +964,9 @@ class LongformerModel(LongformerPreTrainedModel):
|
|||
position_ids = F.pad(position_ids, (0, padding_len), value=pad_token_id)
|
||||
if inputs_embeds is not None:
|
||||
input_ids_padding = inputs_embeds.new_full(
|
||||
(batch_size, padding_len), self.config.pad_token_id, dtype=torch.long,
|
||||
(batch_size, padding_len),
|
||||
self.config.pad_token_id,
|
||||
dtype=torch.long,
|
||||
)
|
||||
inputs_embeds_padding = self.embeddings(input_ids_padding)
|
||||
inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2)
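For reference, a minimal sketch of how a padding length like the one used above is typically derived (toy numbers; variable names are illustrative)::

    seq_len, attention_window = 513, 512
    padding_len = (attention_window - seq_len % attention_window) % attention_window
    assert (seq_len + padding_len) % attention_window == 0  # 513 is padded up to 1024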
@ -976,28 +1004,28 @@ class LongformerModel(LongformerPreTrainedModel):
|
|||
):
|
||||
r"""
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
>>> import torch
|
||||
>>> from transformers import LongformerModel, LongformerTokenizer
|
||||
>>> import torch
|
||||
>>> from transformers import LongformerModel, LongformerTokenizer
|
||||
|
||||
>>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True)
|
||||
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
|
||||
>>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True)
|
||||
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
|
||||
|
||||
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
|
||||
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
|
||||
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
|
||||
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
|
||||
|
||||
>>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
|
||||
>>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
|
||||
>>> attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example,
|
||||
... # classification: the <s> token
|
||||
... # QA: question tokens
|
||||
... # LM: potentially on the beginning of sentences and paragraphs
|
||||
>>> outputs = model(input_ids, attention_mask=attention_mask)
|
||||
>>> sequence_output = outputs.last_hidden_state
|
||||
>>> pooled_output = outputs.pooler_output
|
||||
>>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
|
||||
>>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
|
||||
>>> attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example,
|
||||
... # classification: the <s> token
|
||||
... # QA: question tokens
|
||||
... # LM: potentially on the beginning of sentences and paragraphs
|
||||
>>> outputs = model(input_ids, attention_mask=attention_mask)
|
||||
>>> sequence_output = outputs.last_hidden_state
|
||||
>>> pooled_output = outputs.pooler_output
|
||||
"""
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
|
@ -1102,32 +1130,32 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
|
|||
**kwargs
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
>>> import torch
|
||||
>>> from transformers import LongformerForMaskedLM, LongformerTokenizer
|
||||
>>> import torch
|
||||
>>> from transformers import LongformerForMaskedLM, LongformerTokenizer
|
||||
|
||||
>>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096', return_dict=True)
|
||||
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
|
||||
>>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096', return_dict=True)
|
||||
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
|
||||
|
||||
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
|
||||
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
|
||||
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
|
||||
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
|
||||
|
||||
>>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
|
||||
... # check ``LongformerModel.forward`` for more details how to set `attention_mask`
|
||||
>>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
|
||||
>>> loss = outputs.loss
|
||||
>>> prediction_logits = outputs.logits
|
||||
>>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
|
||||
... # check ``LongformerModel.forward`` for more details how to set `attention_mask`
|
||||
>>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
|
||||
>>> loss = outputs.loss
|
||||
>>> prediction_logits = outputs.logits
|
||||
"""
|
||||
|
||||
if "masked_lm_labels" in kwargs:
|
||||
|
@ -1252,7 +1280,10 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return SequenceClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
@ -1310,39 +1341,39 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
|
|||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for position (index) of the start of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Position outside of the sequence are not taken into account for computing the loss.
|
||||
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for position (index) of the end of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Position outside of the sequence are not taken into account for computing the loss.
|
||||
Returns:
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for position (index) of the start of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Position outside of the sequence are not taken into account for computing the loss.
|
||||
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for position (index) of the end of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Position outside of the sequence are not taken into account for computing the loss.
|
||||
Returns:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
>>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
|
||||
>>> import torch
|
||||
>>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
|
||||
>>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True)
|
||||
>>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
|
||||
>>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True)
|
||||
|
||||
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
||||
>>> encoding = tokenizer(question, text, return_tensors="pt")
|
||||
>>> input_ids = encoding["input_ids"]
|
||||
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
||||
>>> encoding = tokenizer(question, text, return_tensors="pt")
|
||||
>>> input_ids = encoding["input_ids"]
|
||||
|
||||
>>> # default is local attention everywhere
|
||||
>>> # the forward method will automatically set global attention on question tokens
|
||||
>>> attention_mask = encoding["attention_mask"]
|
||||
>>> # default is local attention everywhere
|
||||
>>> # the forward method will automatically set global attention on question tokens
|
||||
>>> attention_mask = encoding["attention_mask"]
|
||||
|
||||
>>> outputs = model(input_ids, attention_mask=attention_mask)
|
||||
>>> start_logits = outputs.start_logits
|
||||
>>> end_logits = outputs.end_logits
|
||||
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
|
||||
>>> outputs = model(input_ids, attention_mask=attention_mask)
|
||||
>>> start_logits = outputs.start_logits
|
||||
>>> end_logits = outputs.end_logits
|
||||
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
|
||||
|
||||
>>> answer_tokens = all_tokens[torch.argmax(start_logits) :torch.argmax(end_logits)+1]
|
||||
>>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
|
||||
>>> answer_tokens = all_tokens[torch.argmax(start_logits) :torch.argmax(end_logits)+1]
|
||||
>>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
|
||||
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
@ -1487,7 +1518,10 @@ class LongformerForTokenClassification(BertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return TokenClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
@ -1592,5 +1626,8 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return MultipleChoiceModelOutput(
|
||||
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=reshaped_logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
@ -32,8 +32,7 @@ _CONFIG_FOR_DOC = "MMBTConfig"
|
|||
|
||||
|
||||
class ModalEmbeddings(nn.Module):
|
||||
"""Generic Modal Embeddings which takes in an encoder, and a transformer embedding.
|
||||
"""
|
||||
"""Generic Modal Embeddings which takes in an encoder, and a transformer embedding."""
|
||||
|
||||
def __init__(self, config, encoder, embeddings):
|
||||
super().__init__()
|
||||
|
@ -154,7 +153,8 @@ MMBT_INPUTS_DOCSTRING = r""" Inputs:
|
|||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare MMBT Model outputting raw hidden-states without any specific head on top.", MMBT_START_DOCSTRING,
|
||||
"The bare MMBT Model outputting raw hidden-states without any specific head on top.",
|
||||
MMBT_START_DOCSTRING,
|
||||
)
|
||||
class MMBTModel(nn.Module, ModuleUtilsMixin):
|
||||
def __init__(self, config, transformer, encoder):
|
||||
|
@ -288,34 +288,34 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
|
|||
)
|
||||
class MMBTForClassification(nn.Module):
|
||||
r"""
|
||||
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
|
||||
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
|
||||
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
|
||||
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
|
||||
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
**hidden_states**: (`optional`, returned when ``output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
**hidden_states**: (`optional`, returned when ``output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
# For example purposes. Not runnable.
|
||||
transformer = BertModel.from_pretrained('bert-base-uncased')
|
||||
encoder = ImageEncoder(args)
|
||||
model = MMBTForClassification(config, transformer, encoder)
|
||||
outputs = model(input_modal, input_ids, labels=labels)
|
||||
loss, logits = outputs[:2]
|
||||
"""
|
||||
# For example purposes. Not runnable.
|
||||
transformer = BertModel.from_pretrained('bert-base-uncased')
|
||||
encoder = ImageEncoder(args)
|
||||
model = MMBTForClassification(config, transformer, encoder)
|
||||
outputs = model(input_modal, input_ids, labels=labels)
|
||||
loss, logits = outputs[:2]
|
||||
"""
|
||||
|
||||
def __init__(self, config, transformer, encoder):
|
||||
super().__init__()
|
||||
|
@ -378,5 +378,8 @@ class MMBTForClassification(nn.Module):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return SequenceClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
@ -64,8 +64,7 @@ MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"]
|
|||
|
||||
|
||||
def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
|
||||
""" Load tf checkpoints in a pytorch model.
|
||||
"""
|
||||
"""Load tf checkpoints in a pytorch model."""
|
||||
try:
|
||||
import re
|
||||
|
||||
|
@ -161,8 +160,7 @@ NORM2FN = {"layer_norm": torch.nn.LayerNorm, "no_norm": NoNorm}
|
|||
|
||||
|
||||
class MobileBertEmbeddings(nn.Module):
|
||||
"""Construct the embeddings from word, position and token_type embeddings.
|
||||
"""
|
||||
"""Construct the embeddings from word, position and token_type embeddings."""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
|
@ -663,8 +661,8 @@ class MobileBertPreTrainingHeads(nn.Module):
|
|||
|
||||
|
||||
class MobileBertPreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = MobileBertConfig
|
||||
|
@ -788,7 +786,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
|
|||
)
|
||||
class MobileBertModel(MobileBertPreTrainedModel):
|
||||
"""
|
||||
https://arxiv.org/pdf/2004.02984.pdf
|
||||
https://arxiv.org/pdf/2004.02984.pdf
|
||||
"""
|
||||
|
||||
authorized_missing_keys = [r"position_ids"]
|
||||
|
@ -809,9 +807,9 @@ class MobileBertModel(MobileBertPreTrainedModel):
|
|||
self.embeddings.word_embeddings = value
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
@ -965,31 +963,31 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
|
|||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates sequence B is a continuation of sequence A,
|
||||
``1`` indicates sequence B is a random sequence.
|
||||
Returns:
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates sequence B is a continuation of sequence A,
|
||||
``1`` indicates sequence B is a random sequence.
|
||||
Returns:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
>>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
|
||||
>>> import torch
|
||||
>>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
|
||||
>>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased", return_dict=True)
|
||||
>>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
|
||||
>>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased", return_dict=True)
|
||||
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
|
||||
>>> prediction_logits = outputs.prediction_logits
|
||||
>>> seq_relationship_logits = outputs.seq_relationship_logits
|
||||
>>> prediction_logits = outputs.prediction_logits
|
||||
>>> seq_relationship_logits = outputs.seq_relationship_logits
|
||||
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
@ -1176,29 +1174,29 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
|
|||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates sequence B is a continuation of sequence A,
|
||||
``1`` indicates sequence B is a random sequence.
|
||||
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates sequence B is a continuation of sequence A,
|
||||
``1`` indicates sequence B is a random sequence.
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
|
||||
Examples::
|
||||
Examples::
|
||||
|
||||
>>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
|
||||
>>> import torch
|
||||
>>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
|
||||
>>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased', return_dict=True)
|
||||
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
|
||||
>>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased', return_dict=True)
|
||||
|
||||
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
||||
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
|
||||
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
|
||||
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
||||
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
|
||||
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
|
||||
|
||||
>>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
|
||||
>>> loss = outputs.loss
|
||||
>>> logits = outputs.logits
|
||||
>>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
|
||||
>>> loss = outputs.loss
|
||||
>>> logits = outputs.logits
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
|
@ -1308,7 +1306,10 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return SequenceClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
@ -1491,7 +1492,10 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return MultipleChoiceModelOutput(
|
||||
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=reshaped_logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
@ -1574,5 +1578,8 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
|
|||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return TokenClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
@ -59,8 +59,7 @@ OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
|||
|
||||
|
||||
def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
|
||||
""" Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
|
||||
"""
|
||||
"""Load tf pre-trained weights in a pytorch model (from NumPy arrays here)"""
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
|
@ -257,7 +256,10 @@ class Block(nn.Module):
|
|||
|
||||
def forward(self, x, attention_mask=None, head_mask=None, output_attentions=False):
|
||||
attn_outputs = self.attn(
|
||||
x, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions,
|
||||
x,
|
||||
attention_mask=attention_mask,
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
a = attn_outputs[0]
|
||||
|
||||
|
@ -270,8 +272,8 @@ class Block(nn.Module):
|
|||
|
||||
|
||||
class OpenAIGPTPreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = OpenAIGPTConfig
|
||||
|
@ -280,8 +282,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
|
|||
authorized_missing_keys = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
"""Initialize the weights."""
|
||||
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
|
||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||
|
@ -408,8 +409,8 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||
self.tokens_embed = new_embeddings
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
"""Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.h[layer].attn.prune_heads(heads)
|
||||
|
@ -506,7 +507,9 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
|
||||
|
||||
return BaseModelOutput(
|
||||
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions,
|
||||
last_hidden_state=hidden_states,
|
||||
hidden_states=all_hidden_states,
|
||||
attentions=all_attentions,
|
||||
)
@@ -632,41 +635,41 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
**kwargs
):
r"""
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1]``.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1]``.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.

Return:
Return:

Examples::
Examples::

from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
import torch
from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
import torch

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
model.resize_token_embeddings(len(tokenizer))
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
model.resize_token_embeddings(len(tokenizer))

choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0)  # Batch size 1
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0)  # Batch size 1

outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_logits = outputs.lm_logits
mc_logits = outputs.mc_logits
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_logits = outputs.lm_logits
mc_logits = outputs.mc_logits
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if "lm_labels" in kwargs:

@@ -78,7 +78,8 @@ ReformerBackwardOutput = namedtuple(
"ReformerBackwardOutput", ["attn_output", "hidden_states", "grad_attn_output", "grad_hidden_states"]
)
ReformerEncoderOutput = namedtuple(
"ReformerEncoderOutput", ["hidden_states", "all_hidden_states", "all_attentions", "past_buckets_states"],
"ReformerEncoderOutput",
["hidden_states", "all_hidden_states", "all_attentions", "past_buckets_states"],
)

@@ -192,7 +193,9 @@ class AxialPositionEmbeddings(nn.Module):
assert (
reduce(mul, self.axial_pos_shape) >= sequence_length
), "Make sure that config.axial_pos_shape factors: {} multiply at least to max(sequence_length, least_common_mult_chunk_length): max({}, {})".format(
self.axial_pos_shape, sequence_length, self.least_common_mult_chunk_length,
self.axial_pos_shape,
sequence_length,
self.least_common_mult_chunk_length,
)

# compute how many columns are needed

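The assertion above only checks that the product of the axial factors covers the (already padded) sequence length. A small stand-alone sketch of that arithmetic with made-up shapes:

from functools import reduce
from operator import mul

# Assumed example values, not taken from any real config.
axial_pos_shape = (64, 64)              # axial factors of the maximum sequence length
least_common_mult_chunk_length = 64
sequence_length = 4096                  # padded to a multiple of the chunk length

required = max(sequence_length, least_common_mult_chunk_length)
assert reduce(mul, axial_pos_shape) >= required, (
    "axial_pos_shape {} must multiply to at least {}".format(axial_pos_shape, required)
)
print(reduce(mul, axial_pos_shape))     # 4096
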
@@ -218,8 +221,7 @@ class AxialPositionEmbeddings(nn.Module):
class PositionEmbeddings(nn.Module):
"""Constructs conventional position embeddings of shape `[max_pos_embeddings, hidden_size]`.
"""
"""Constructs conventional position embeddings of shape `[max_pos_embeddings, hidden_size]`."""

def __init__(self, config):
super().__init__()

@@ -233,8 +235,7 @@ class PositionEmbeddings(nn.Module):
class ReformerEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
"""Construct the embeddings from word, position and token_type embeddings."""

def __init__(self, config):
super().__init__()

@@ -285,16 +286,16 @@ class EfficientAttentionMixin:
"""

def _look_adjacent(self, vectors, num_chunks_before, num_chunks_after):
""" Used to implement attention between consecutive chunks.
"""Used to implement attention between consecutive chunks.

Args:
vectors: array of shape [batch_size, num_attention_heads, n_chunks, chunk_len, ...]
num_chunks_before: chunks before current chunk to include in attention
num_chunks_after: chunks after current chunk to include in attention
Args:
vectors: array of shape [batch_size, num_attention_heads, n_chunks, chunk_len, ...]
num_chunks_before: chunks before current chunk to include in attention
num_chunks_after: chunks after current chunk to include in attention

Returns:
tensor of shape [num_chunks, N * chunk_length, ...], where
N = (1 + num_chunks_before + num_chunks_after).
Returns:
tensor of shape [num_chunks, N * chunk_length, ...], where
N = (1 + num_chunks_before + num_chunks_after).
"""
if num_chunks_before == 0 and num_chunks_after == 0:
return vectors

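A hedged re-implementation of the neighbour gathering that the docstring above describes (the idea, not the exact library method): roll the chunk dimension and concatenate, so every chunk also sees the chunks before and after it.

import torch


def look_adjacent(vectors, num_chunks_before, num_chunks_after):
    # vectors: [batch, heads, n_chunks, chunk_len, head_size]
    # returns: [..., n_chunks, N * chunk_len, head_size] with N = 1 + before + after
    if num_chunks_before == 0 and num_chunks_after == 0:
        return vectors
    slices = []
    for i in range(-num_chunks_before, num_chunks_after + 1):
        # torch.roll wraps around, matching the circular chunk layout
        slices.append(torch.roll(vectors, shifts=-i, dims=2))
    return torch.cat(slices, dim=3)


x = torch.randn(1, 2, 4, 8, 16)        # batch, heads, chunks, chunk_len, head_size
print(look_adjacent(x, 1, 1).shape)    # torch.Size([1, 2, 4, 24, 16])
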
@@ -309,7 +310,7 @@ class EfficientAttentionMixin:
def _split_hidden_size_dim(self, x, num_attn_heads, attn_head_size):
"""
splits hidden_size dim into attn_head_size and num_attn_heads
splits hidden_size dim into attn_head_size and num_attn_heads
"""
new_x_shape = x.size()[:-1] + (num_attn_heads, attn_head_size)
x = x.view(*new_x_shape)

@@ -317,14 +318,14 @@ class EfficientAttentionMixin:
def _merge_hidden_size_dims(self, x, num_attn_heads, attn_head_size):
"""
merges attn_head_size dim and num_attn_heads dim into hidden_size
merges attn_head_size dim and num_attn_heads dim into hidden_size
"""
x = x.permute(0, 2, 1, 3)
return torch.reshape(x, (x.size()[0], -1, num_attn_heads * attn_head_size))

def _split_seq_length_dim_to(self, vectors, dim_factor_1, dim_factor_2, num_attn_heads, attn_head_size=None):
"""
splits sequence length dim of vectors into `dim_factor_1` and `dim_factor_2` dims
splits sequence length dim of vectors into `dim_factor_1` and `dim_factor_2` dims
"""
batch_size = vectors.shape[0]
split_dim_shape = (batch_size, num_attn_heads, dim_factor_1, dim_factor_2)

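Stand-alone sketch of the two reshapes documented above: splitting hidden_size into (num_heads, head_size) and merging it back. This mirrors the idea, not the library methods themselves.

import torch

batch, seq_len, num_heads, head_size = 2, 10, 4, 16
hidden = torch.randn(batch, seq_len, num_heads * head_size)

# hidden_size -> (num_heads, head_size), then move the head dim before the sequence dim
split = hidden.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
print(split.shape)                     # torch.Size([2, 4, 10, 16])

# inverse: put heads back next to head_size, then flatten into hidden_size
merged = split.permute(0, 2, 1, 3).reshape(batch, seq_len, num_heads * head_size)
print(torch.equal(merged, hidden))     # True
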
@@ -418,10 +419,18 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
# split key & value vectors by num hashes to apply
# self attention on each separately
query_key_vectors = self._split_seq_length_dim_to(
query_key_vectors, num_hashes, -1, self.num_attention_heads, self.attention_head_size,
query_key_vectors,
num_hashes,
-1,
self.num_attention_heads,
self.attention_head_size,
)
value_vectors = self._split_seq_length_dim_to(
value_vectors, num_hashes, -1, self.num_attention_heads, self.attention_head_size,
value_vectors,
num_hashes,
-1,
self.num_attention_heads,
self.attention_head_size,
)
# repeat query vectors across hash dimension
query_vectors = query_vectors.unsqueeze(2).repeat(1, 1, num_hashes, 1, 1)

@@ -496,10 +505,18 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
query_key_vectors = self._gather_by_expansion(query_key_vectors, sorted_bucket_idx_per_hash, num_hashes)
value_vectors = self._gather_by_expansion(value_vectors, sorted_bucket_idx_per_hash, num_hashes)
query_key_vectors = self._split_seq_length_dim_to(
query_key_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
query_key_vectors,
-1,
self.chunk_length,
self.num_attention_heads,
self.attention_head_size,
)
value_vectors = self._split_seq_length_dim_to(
value_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
value_vectors,
-1,
self.chunk_length,
self.num_attention_heads,
self.attention_head_size,
)

if self.chunk_length is None:

@@ -548,10 +565,18 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
# sum up all hash rounds
if num_hashes > 1:
out_vectors = self._split_seq_length_dim_to(
out_vectors, num_hashes, sequence_length, self.num_attention_heads, self.attention_head_size,
out_vectors,
num_hashes,
sequence_length,
self.num_attention_heads,
self.attention_head_size,
)
logits = self._split_seq_length_dim_to(
logits, num_hashes, sequence_length, self.num_attention_heads, self.attention_head_size,
logits,
num_hashes,
sequence_length,
self.num_attention_heads,
self.attention_head_size,
).unsqueeze(-1)

probs_vectors = torch.exp(logits - torch.logsumexp(logits, dim=2, keepdim=True))

@@ -697,7 +722,8 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
# factorize `num_buckets` if `num_buckets` becomes too large
num_buckets_limit = 2 * max(
int((self.max_position_embeddings // self.chunk_length) ** (0.5)), self.chunk_length,
int((self.max_position_embeddings // self.chunk_length) ** (0.5)),
self.chunk_length,
)
if num_buckets > num_buckets_limit:
num_buckets = [2 ** (num_buckets_pow_2 // 2), 2 ** (num_buckets_pow_2 - num_buckets_pow_2 // 2)]

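The factorisation above splits an oversized power-of-two bucket count into two roughly equal power-of-two factors. A plain-Python sketch of the arithmetic with assumed example numbers:

# Assumed example values, not taken from a real config.
max_position_embeddings = 65536
chunk_length = 64
num_buckets_pow_2 = 11                 # i.e. num_buckets = 2 ** 11 = 2048
num_buckets = 2 ** num_buckets_pow_2

num_buckets_limit = 2 * max(
    int((max_position_embeddings // chunk_length) ** 0.5),
    chunk_length,
)
if num_buckets > num_buckets_limit:
    # 2**11 -> [2**5, 2**6]: two factors that multiply back to the original count
    num_buckets = [2 ** (num_buckets_pow_2 // 2), 2 ** (num_buckets_pow_2 - num_buckets_pow_2 // 2)]

print(num_buckets_limit, num_buckets)  # 128 [32, 64]
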
@@ -946,7 +972,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
def _len_and_dim_norm(self, vectors):
"""
length and attention head size dim normalization
length and attention head size dim normalization
"""
vectors = self._len_norm(vectors)
vectors = vectors * torch.rsqrt(

@@ -956,7 +982,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
def _len_norm(self, x, epsilon=1e-6):
"""
length normalization
length normalization
"""
variance = torch.mean(x ** 2, -1, keepdim=True)
norm_x = x * torch.rsqrt(variance + epsilon)

@@ -964,7 +990,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
def _gather_by_expansion(self, vectors, idxs, num_hashes):
"""
expand dims of idxs and vectors for all hashes and gather
expand dims of idxs and vectors for all hashes and gather
"""
expanded_idxs = idxs.unsqueeze(-1).expand(-1, -1, -1, self.attention_head_size)
vectors = vectors.repeat(1, 1, num_hashes, 1)

@@ -973,11 +999,11 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
class ReverseSort(Function):
"""
After chunked attention is applied which sorted clusters,
original ordering has to be restored.
Since customized backward function is used for Reformer,
the gradients of the output vectors have to be explicitly
sorted here.
After chunked attention is applied which sorted clusters,
original ordering has to be restored.
Since customized backward function is used for Reformer,
the gradients of the output vectors have to be explicitly
sorted here.
"""

@staticmethod

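The core of ReverseSort is undoing a sort with its inverse permutation so that outputs (and, in backward, gradients) line up with the original token order again. A forward-only sketch of that step, without the custom autograd Function:

import torch

scores = torch.tensor([0.3, 0.9, 0.1, 0.5])
values = torch.tensor([30.0, 90.0, 10.0, 50.0])

sorted_idx = torch.argsort(scores)     # layout used while attention runs on sorted buckets
processed = values[sorted_idx] * 2     # pretend chunked attention produced these outputs

# argsort of a permutation gives its inverse: sorted_idx[undo_idx] == arange(n)
undo_idx = torch.argsort(sorted_idx)
restored = processed[undo_idx]

print(restored)                        # tensor([ 60., 180.,  20., 100.])
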
@@ -1113,13 +1139,25 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
# chunk vectors
# B x Num_Attn_Head x Seq_Len // chunk_len x chunk_len x attn_head_size
query_vectors = self._split_seq_length_dim_to(
query_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
query_vectors,
-1,
self.chunk_length,
self.num_attention_heads,
self.attention_head_size,
)
key_vectors = self._split_seq_length_dim_to(
key_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
key_vectors,
-1,
self.chunk_length,
self.num_attention_heads,
self.attention_head_size,
)
value_vectors = self._split_seq_length_dim_to(
value_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
value_vectors,
-1,
self.chunk_length,
self.num_attention_heads,
self.attention_head_size,
)

# chunk indices

@@ -1179,7 +1217,12 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
if not do_standard_self_attention:
out_vectors = out_vectors.flatten(start_dim=2, end_dim=3)

assert out_vectors.shape == (batch_size, self.num_attention_heads, sequence_length, self.attention_head_size,)
assert out_vectors.shape == (
batch_size,
self.num_attention_heads,
sequence_length,
self.attention_head_size,
)

out_vectors = self._merge_hidden_size_dims(out_vectors, self.num_attention_heads, self.attention_head_size)

@@ -1321,7 +1364,9 @@ class ReformerAttention(nn.Module):
attention_output = self.output(self_attention_outputs.hidden_states)

return AttentionOutput(
hidden_states=attention_output, attention_probs=self_attention_outputs.attention_probs, buckets=buckets,
hidden_states=attention_output,
attention_probs=self_attention_outputs.attention_probs,
buckets=buckets,
)

@@ -1369,7 +1414,10 @@ class ChunkReformerFeedForward(nn.Module):
def forward(self, attention_output):
return apply_chunking_to_forward(
self.forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output,
self.forward_chunk,
self.chunk_size_feed_forward,
self.seq_len_dim,
attention_output,
)

def forward_chunk(self, hidden_states):

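apply_chunking_to_forward, used above, slices the input along a sequence dimension, applies the chunk function to each slice, and concatenates the results, which lowers peak memory for position-wise layers. A simplified, hedged equivalent (not the library implementation):

import torch


def chunked_forward(forward_fn, chunk_size, chunk_dim, tensor):
    # Position-wise functions give the same result whether or not the input is chunked.
    if chunk_size == 0:
        return forward_fn(tensor)
    assert tensor.shape[chunk_dim] % chunk_size == 0
    chunks = tensor.chunk(tensor.shape[chunk_dim] // chunk_size, dim=chunk_dim)
    return torch.cat([forward_fn(c) for c in chunks], dim=chunk_dim)


dense = torch.nn.Linear(8, 8)
x = torch.randn(2, 16, 8)              # batch, seq_len, hidden
full = dense(x)
chunked = chunked_forward(dense, chunk_size=4, chunk_dim=1, tensor=x)
print(torch.allclose(full, chunked, atol=1e-6))  # True
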
@@ -1391,11 +1439,11 @@ class ReformerLayer(nn.Module):
def _init_attention_seed(self):
"""
This function sets a new seed for the
attention layer to make dropout deterministic
for both forward calls: 1 normal forward
call and 1 forward call in backward
to recalculate activations.
This function sets a new seed for the
attention layer to make dropout deterministic
for both forward calls: 1 normal forward
call and 1 forward call in backward
to recalculate activations.
"""

# randomize seeds

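The seed bookkeeping above exists because the reversible backward pass recomputes the forward: re-seeding the generator before each of the two calls makes dropout draw the same mask twice. A minimal illustration of that idea (not the library's exact seeding logic):

import torch

dropout = torch.nn.Dropout(p=0.5)      # modules start in training mode, so dropout is active
x = torch.ones(1, 8)
seed = 1234                            # any stored integer works

torch.manual_seed(seed)
first = dropout(x)                     # the normal forward call

torch.manual_seed(seed)
second = dropout(x)                    # the recomputation during backward

print(torch.equal(first, second))      # True: identical dropout masks
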
@@ -1412,11 +1460,11 @@ class ReformerLayer(nn.Module):
def _init_feed_forward_seed(self):
"""
This function sets a new seed for the
feed forward layer to make dropout deterministic
for both forward calls: 1 normal forward
call and 1 forward call in backward
to recalculate activations.
This function sets a new seed for the
feed forward layer to make dropout deterministic
for both forward calls: 1 normal forward
call and 1 forward call in backward
to recalculate activations.
"""
# randomize seeds
# use cuda generator if available

@@ -1520,7 +1568,10 @@ class ReformerLayer(nn.Module):
# f(X_2)
# use cached buckets for backprop if buckets not None for LSHSelfAttention
output = self.attention(
hidden_states=hidden_states, head_mask=head_mask, attention_mask=attention_mask, buckets=buckets,
hidden_states=hidden_states,
head_mask=head_mask,
attention_mask=attention_mask,
buckets=buckets,
).hidden_states
output.backward(grad_attn_output, retain_graph=True)

@@ -1738,8 +1789,8 @@ class ReformerOnlyLMHead(nn.Module):
class ReformerPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""

config_class = ReformerConfig

@@ -1947,9 +1998,9 @@ class ReformerModel(ReformerPreTrainedModel):
self.embeddings.word_embeddings = value

def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)

@@ -2099,7 +2150,10 @@ class ReformerModel(ReformerPreTrainedModel):
)

padded_input_ids = torch.full(
(input_shape[0], padding_length), self.config.pad_token_id, device=device, dtype=torch.long,
(input_shape[0], padding_length),
self.config.pad_token_id,
device=device,
dtype=torch.long,
)

# Extend `attention_mask`

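The torch.full call above builds the block of pad tokens that extends input_ids to a multiple of the chunk length. A self-contained sketch with made-up sizes and pad id:

import torch

pad_token_id = 0                       # assumed pad id
least_common_mult_chunk_length = 64
input_ids = torch.randint(1, 100, (2, 100))   # batch of 2 sequences of length 100

padding_length = -input_ids.shape[1] % least_common_mult_chunk_length   # 28 here
padded_block = torch.full(
    (input_ids.shape[0], padding_length),
    pad_token_id,
    dtype=torch.long,
)
padded_input_ids = torch.cat([input_ids, padded_block], dim=-1)
print(padded_input_ids.shape)          # torch.Size([2, 128])
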
@@ -2369,11 +2423,11 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

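As the docstring above states, the classification head switches between a regression and a classification loss based on config.num_labels. A hedged sketch of that branch outside the model:

import torch
from torch.nn import CrossEntropyLoss, MSELoss

num_labels = 3                                  # set to 1 to exercise the regression path
logits = torch.randn(4, num_labels)             # (batch_size, num_labels)
labels = torch.randint(0, num_labels, (4,))     # indices in [0, num_labels - 1]

if num_labels == 1:
    # regression: Mean-Square loss on the single logit against a float target
    loss = MSELoss()(logits.view(-1), labels.float().view(-1))
else:
    # classification: Cross-Entropy over num_labels classes
    loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))

print(loss.item())
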
@@ -2407,7 +2461,10 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
return ((loss,) + output) if loss is not None else output

return SequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

Some files were not shown because too many files changed in this diff.