This commit is contained in:
Lysandre 2020-08-26 17:20:22 +02:00
Parent e78c110338
Commit a75c64d80c
191 changed files with 4807 additions and 3503 deletions

View file

@ -112,7 +112,10 @@ if is_torch_available():
cached_features_file = os.path.join(
data_dir,
"cached_{}_{}_{}_{}".format(
"dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
"dev" if evaluate else "train",
tokenizer.__class__.__name__,
str(max_seq_length),
task,
),
)
label_list = processor.get_labels()
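As a side note, the cache filename assembled above is just the four fields joined with underscores; a tiny, purely illustrative sketch (the directory, tokenizer class, sequence length and task name are assumed values, not taken from this commit):

# Hypothetical illustration of the cache filename built above (all values assumed).
import os

data_dir = "/tmp/hans"              # assumed path
evaluate = True
tokenizer_class_name = "BertTokenizer"  # assumed tokenizer class
max_seq_length = 128
task = "hans"

cached_features_file = os.path.join(
    data_dir,
    "cached_{}_{}_{}_{}".format(
        "dev" if evaluate else "train", tokenizer_class_name, str(max_seq_length), task
    ),
)
print(cached_features_file)  # /tmp/hans/cached_dev_BertTokenizer_128_hans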
@ -278,7 +281,10 @@ class HansProcessor(DataProcessor):
def hans_convert_examples_to_features(
examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
examples: List[InputExample],
label_list: List[str],
max_length: int,
tokenizer: PreTrainedTokenizer,
):
"""
Loads a data file into a list of ``InputFeatures``

View file

@ -20,7 +20,9 @@ class PlotArguments:
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
"""
csv_file: str = field(metadata={"help": "The csv file to plot."},)
csv_file: str = field(
metadata={"help": "The csv file to plot."},
)
plot_along_batch: bool = field(
default=False,
metadata={"help": "Whether to plot along batch size or sequence lengh. Defaults to sequence length."},
@ -30,7 +32,8 @@ class PlotArguments:
metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
)
no_log_scale: bool = field(
default=False, metadata={"help": "Disable logarithmic scale when plotting"},
default=False,
metadata={"help": "Disable logarithmic scale when plotting"},
)
is_train: bool = field(
default=False,
@ -39,7 +42,8 @@ class PlotArguments:
},
)
figure_png_file: Optional[str] = field(
default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
default=None,
metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
)
short_model_names: Optional[List[str]] = list_field(
default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}

View file

@ -101,30 +101,30 @@ class AlbertModelWithPabee(AlbertModel):
regression=False,
):
r"""
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during pre-training.
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during pre-training.
This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
if input_ids is not None and inputs_embeds is not None:
@ -157,7 +157,10 @@ class AlbertModelWithPabee(AlbertModel):
res = []
for i in range(self.config.num_hidden_layers):
encoder_outputs = self.encoder.adaptive_forward(
encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
encoder_outputs,
current_layer=i,
attention_mask=extended_attention_mask,
head_mask=head_mask,
)
pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@ -174,7 +177,10 @@ class AlbertModelWithPabee(AlbertModel):
for i in range(self.config.num_hidden_layers):
calculated_layer_num += 1
encoder_outputs = self.encoder.adaptive_forward(
encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
encoder_outputs,
current_layer=i,
attention_mask=extended_attention_mask,
head_mask=head_mask,
)
pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@ -236,42 +242,42 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
labels=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification (or regression if config.num_labels==1) loss.
logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification (or regression if config.num_labels==1) loss.
logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Examples::
from transformers import AlbertTokenizer
from pabee import AlbertForSequenceClassificationWithPabee
import torch
from transformers import AlbertTokenizer
from pabee import AlbertForSequenceClassificationWithPabee
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""

View file

@ -108,30 +108,30 @@ class BertModelWithPabee(BertModel):
regression=False,
):
r"""
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during pre-training.
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during pre-training.
This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
if input_ids is not None and inputs_embeds is not None:
@ -266,44 +266,44 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
labels=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
from pabee import BertForSequenceClassificationWithPabee
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from pabee import BertForSequenceClassificationWithPabee
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
loss, logits = outputs[:2]
"""

View file

@ -120,7 +120,10 @@ def train(args, train_dataset, model, tokenizer):
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
model,
device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True,
)
# Train!
@ -151,13 +154,17 @@ def train(args, train_dataset, model, tokenizer):
logger.info(" Continuing training from epoch %d", epochs_trained)
logger.info(" Continuing training from global step %d", global_step)
logger.info(
" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch,
" Will skip the first %d steps in the first epoch",
steps_trained_in_current_epoch,
)
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
epochs_trained,
int(args.num_train_epochs),
desc="Epoch",
disable=args.local_rank not in [-1, 0],
)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
@ -372,7 +379,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
)
features = convert_examples_to_features(
examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
examples,
tokenizer,
label_list=label_list,
max_length=args.max_seq_length,
output_mode=output_mode,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
@ -434,15 +445,24 @@ def main():
help="The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument(
"--patience", default="0", type=str, required=False,
"--patience",
default="0",
type=str,
required=False,
)
parser.add_argument(
"--regression_threshold", default=0, type=float, required=False,
"--regression_threshold",
default=0,
type=float,
required=False,
)
# Other parameters
parser.add_argument(
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
"--config_name",
default="",
type=str,
help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--tokenizer_name",
@ -466,17 +486,27 @@ def main():
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
"--evaluate_during_training",
action="store_true",
help="Run evaluation during training at each logging step.",
)
parser.add_argument(
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
"--do_lower_case",
action="store_true",
help="Set this flag if you are using an uncased model.",
)
parser.add_argument(
"--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
"--per_gpu_train_batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.",
)
parser.add_argument(
"--per_gpu_eval_batch_size", default=1, type=int, help="Batch size per GPU/CPU for evaluation.",
"--per_gpu_eval_batch_size",
default=1,
type=int,
help="Batch size per GPU/CPU for evaluation.",
)
parser.add_argument(
"--gradient_accumulation_steps",
@ -485,13 +515,19 @@ def main():
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
"--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",
"--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.",
)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
"--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.",
)
parser.add_argument(
"--max_steps",
@ -503,7 +539,10 @@ def main():
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
parser.add_argument(
"--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.",
"--save_steps",
type=int,
default=500,
help="Save checkpoint every X updates steps.",
)
parser.add_argument(
"--eval_all_checkpoints",
@ -512,10 +551,14 @@ def main():
)
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument(
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
"--overwrite_output_dir",
action="store_true",
help="Overwrite the content of the output directory",
)
parser.add_argument(
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
"--overwrite_cache",
action="store_true",
help="Overwrite the cached training and evaluation sets",
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
@ -532,7 +575,10 @@ def main():
"See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument(
"--local_rank", type=int, default=-1, help="For distributed training: local_rank",
"--local_rank",
type=int,
default=-1,
help="For distributed training: local_rank",
)
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
@ -634,7 +680,8 @@ def main():
print("Output Layers Parameters:", output_layers_param_num)
single_output_layer_param_num = sum(param.numel() for param in model.classifiers[0].parameters())
print(
"Added Output Layers Parameters:", output_layers_param_num - single_output_layer_param_num,
"Added Output Layers Parameters:",
output_layers_param_num - single_output_layer_param_num,
)
logger.info("Training/evaluation parameters %s", args)

View file

@ -66,9 +66,9 @@ def print_2d_tensor(tensor):
def compute_heads_importance(
args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
):
""" This method shows how to compute:
- head attention entropy
- head importance scores according to http://arxiv.org/abs/1905.10650
"""This method shows how to compute:
- head attention entropy
- head importance scores according to http://arxiv.org/abs/1905.10650
"""
# Prepare our tensors
n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads
@ -150,8 +150,8 @@ def compute_heads_importance(
def mask_heads(args, model, eval_dataloader):
""" This method shows how to mask head (set some heads to zero), to test the effect on the network,
based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
"""This method shows how to mask head (set some heads to zero), to test the effect on the network,
based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
"""
_, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
@ -201,8 +201,8 @@ def mask_heads(args, model, eval_dataloader):
def prune_heads(args, model, eval_dataloader, head_mask):
""" This method shows how to prune head (remove heads weights) based on
the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
"""This method shows how to prune head (remove heads weights) based on
the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
"""
# Try pruning and test time speedup
# Pruning is like masking but we actually remove the masked weights
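The masking and pruning helpers above differ only in whether heads are zeroed out via a mask or physically removed. As a rough sketch of the pruning side, using the generic prune_heads API that transformers models expose (the model name and the keep-half heuristic are assumptions for illustration):

# Minimal sketch (assumed model name): prune the least important half of the heads per layer.
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# Suppose head_importance is a (num_layers, num_heads) tensor computed as above;
# here it is random purely for illustration.
head_importance = torch.rand(model.config.num_hidden_layers, model.config.num_attention_heads)

heads_to_prune = {}
for layer, scores in enumerate(head_importance):
    n_prune = scores.numel() // 2
    heads_to_prune[layer] = scores.argsort()[:n_prune].tolist()  # lowest-scoring heads

model.prune_heads(heads_to_prune)  # physically removes the selected attention heads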
@ -395,7 +395,8 @@ def main():
cache_dir=args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
cache_dir=args.cache_dir,
)
model = AutoModelForSequenceClassification.from_pretrained(
args.model_name_or_path,

View file

@ -138,6 +138,9 @@ def get_image_transforms():
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
transforms.Normalize(
mean=[0.46777044, 0.44531429, 0.40661017],
std=[0.12221994, 0.12145835, 0.14380469],
),
]
)

View file

@ -30,7 +30,11 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
)
else:
topk_filled_outputs.append(
(masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
(
masked_input.replace(masked_token, predicted_token),
values[index].item(),
predicted_token,
)
)
return topk_filled_outputs

View file

@ -71,10 +71,10 @@ def load_rocstories_dataset(dataset_path):
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
""" Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
"""Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
"""
tensor_datasets = []
for dataset in encoded_datasets:
@ -83,7 +83,10 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
mc_labels = np.zeros((n_batch,), dtype=np.int64)
for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
for (
i,
(story, cont1, cont2, mc_label),
) in enumerate(dataset):
with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
input_ids[i, 0, : len(with_cont1)] = with_cont1
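To make the docstring's input layout concrete, a toy example with invented token ids (all values are assumptions, chosen only to show the [start] story [delimiter] continuation [clf] ordering):

# Toy illustration of the input construction above (token ids are invented).
import numpy as np

start_token, delimiter_token, clf_token = 1, 2, 3
story = [10, 11, 12]
cont1 = [20, 21]
cap_length = 64

with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
print(with_cont1)  # [1, 10, 11, 12, 2, 20, 21, 3]

input_len = 16
input_ids = np.zeros((1, 2, input_len), dtype=np.int64)
input_ids[0, 0, : len(with_cont1)] = with_cont1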

View file

@ -629,7 +629,9 @@ def main():
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
)
model = AutoModelForMultipleChoice.from_pretrained(
args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
)

View file

@ -358,7 +358,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
)
features = convert_examples_to_features(
examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
examples,
tokenizer,
label_list=label_list,
max_length=args.max_seq_length,
output_mode=output_mode,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)

View file

@ -14,8 +14,7 @@ from transformers.modeling_bert import (
def entropy(x):
""" Calculate entropy of a pre-softmax logit Tensor
"""
"""Calculate entropy of a pre-softmax logit Tensor"""
exp_x = torch.exp(x)
A = torch.sum(exp_x, dim=1) # sum of exp(x_i)
B = torch.sum(x * exp_x, dim=1) # sum of x_i * exp(x_i)
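The hunk truncates the function, but with A and B as defined above the entropy of softmax(x) has the closed form H = log(A) - B/A. A minimal check of that identity (shapes and values are arbitrary):

# Sketch verifying H(softmax(x)) == log(sum exp(x)) - sum(x * exp(x)) / sum(exp(x)).
import torch

x = torch.randn(4, 7)  # arbitrary pre-softmax logits
exp_x = torch.exp(x)
A = exp_x.sum(dim=1)
B = (x * exp_x).sum(dim=1)
closed_form = torch.log(A) - B / A

p = torch.softmax(x, dim=1)
direct = -(p * torch.log(p)).sum(dim=1)

print(torch.allclose(closed_form, direct, atol=1e-5))  # True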
@ -104,7 +103,8 @@ class DeeBertEncoder(nn.Module):
@add_start_docstrings(
"The Bert Model transformer with early exiting (DeeBERT). ", BERT_START_DOCSTRING,
"The Bert Model transformer with early exiting (DeeBERT). ",
BERT_START_DOCSTRING,
)
class DeeBertModel(BertPreTrainedModel):
def __init__(self, config):
@ -127,9 +127,9 @@ class DeeBertModel(BertPreTrainedModel):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@ -147,33 +147,33 @@ class DeeBertModel(BertPreTrainedModel):
encoder_attention_mask=None,
):
r"""
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during pre-training.
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during pre-training.
This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
Tuple of each early exit's results (total length: number of layers)
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
Tuple of each early exit's results (total length: number of layers)
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
"""
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@ -302,32 +302,32 @@ class DeeBertForSequenceClassification(BertPreTrainedModel):
train_highway=False,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
Tuple of each early exit's results (total length: number of layers)
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
Tuple of each early exit's results (total length: number of layers)
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
"""
exit_layer = self.num_layers

View file

@ -11,7 +11,8 @@ from .modeling_highway_bert import BertPreTrainedModel, DeeBertModel, HighwayExc
@add_start_docstrings(
"The RoBERTa Model transformer with early exiting (DeeRoBERTa). ", ROBERTA_START_DOCSTRING,
"The RoBERTa Model transformer with early exiting (DeeRoBERTa). ",
ROBERTA_START_DOCSTRING,
)
class DeeRobertaModel(DeeBertModel):
@ -58,32 +59,32 @@ class DeeRobertaForSequenceClassification(BertPreTrainedModel):
train_highway=False,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
Tuple of each early exit's results (total length: number of layers)
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
Tuple of each early exit's results (total length: number of layers)
Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
"""
exit_layer = self.num_layers

View file

@ -228,14 +228,20 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
assert end_logits_tea.size() == end_logits_stu.size()
loss_fct = nn.KLDivLoss(reduction="batchmean")
loss_start = loss_fct(
F.log_softmax(start_logits_stu / args.temperature, dim=-1),
F.softmax(start_logits_tea / args.temperature, dim=-1),
) * (args.temperature ** 2)
loss_end = loss_fct(
F.log_softmax(end_logits_stu / args.temperature, dim=-1),
F.softmax(end_logits_tea / args.temperature, dim=-1),
) * (args.temperature ** 2)
loss_start = (
loss_fct(
F.log_softmax(start_logits_stu / args.temperature, dim=-1),
F.softmax(start_logits_tea / args.temperature, dim=-1),
)
* (args.temperature ** 2)
)
loss_end = (
loss_fct(
F.log_softmax(end_logits_stu / args.temperature, dim=-1),
F.softmax(end_logits_tea / args.temperature, dim=-1),
)
* (args.temperature ** 2)
)
loss_ce = (loss_start + loss_end) / 2.0
loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
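The reformatted block computes the usual temperature-scaled KL distillation term between teacher and student logits. A self-contained sketch of the same pattern outside the training loop (the temperature value and tensor shapes are assumed):

# Minimal sketch of temperature-scaled KL distillation between teacher and student logits.
import torch
import torch.nn as nn
import torch.nn.functional as F

temperature = 2.0                       # assumed value
student_logits = torch.randn(8, 384)    # (batch, positions), illustrative shapes
teacher_logits = torch.randn(8, 384)

loss_fct = nn.KLDivLoss(reduction="batchmean")
loss_kd = loss_fct(
    F.log_softmax(student_logits / temperature, dim=-1),
    F.softmax(teacher_logits / temperature, dim=-1),
) * (temperature ** 2)  # rescale so the gradient magnitude stays comparable across temperatures

print(loss_kd.item())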

View file

@ -118,7 +118,8 @@ def init_gpu_params(params):
if params.multi_gpu:
logger.info("Initializing PyTorch distributed")
torch.distributed.init_process_group(
init_method="env://", backend="nccl",
init_method="env://",
backend="nccl",
)

View file

@ -233,7 +233,9 @@ def main():
eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
if config.model_type == "xlnet":
data_collator = DataCollatorForPermutationLanguageModeling(
tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length,
tokenizer=tokenizer,
plm_probability=data_args.plm_probability,
max_span_length=data_args.max_span_length,
)
else:
data_collator = DataCollatorForLanguageModeling(

View file

@ -226,10 +226,14 @@ class BaseTransformer(pl.LightningModule):
help="Decoder layer dropout probability (Optional). Goes into model.config",
)
parser.add_argument(
"--dropout", type=float, help="Dropout probability (Optional). Goes into model.config",
"--dropout",
type=float,
help="Dropout probability (Optional). Goes into model.config",
)
parser.add_argument(
"--attention_dropout", type=float, help="Attention dropout probability (Optional). Goes into model.config",
"--attention_dropout",
type=float,
help="Attention dropout probability (Optional). Goes into model.config",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument(

View file

@ -95,7 +95,10 @@ def make_support(question, source="wiki40b", method="dense", n_results=10):
)
else:
support_doc, hit_lst = query_es_index(
question, es_client, index_name="english_wiki40b_snippets_100w", n_results=n_results,
question,
es_client,
index_name="english_wiki40b_snippets_100w",
n_results=n_results,
)
support_list = [
(res["article_title"], res["section_title"].strip(), res["score"], res["passage_text"]) for res in hit_lst
@ -154,7 +157,8 @@ header_full = """
header_html,
)
st.sidebar.markdown(
header_full, unsafe_allow_html=True,
header_full,
unsafe_allow_html=True,
)
# Long Form QA with ELI5 and Wikipedia
@ -173,9 +177,17 @@ action_list = [
]
demo_options = st.sidebar.checkbox("Demo options")
if demo_options:
action_st = st.sidebar.selectbox("", action_list, index=3,)
action_st = st.sidebar.selectbox(
"",
action_list,
index=3,
)
action = action_list.index(action_st)
show_type = st.sidebar.selectbox("", ["Show full text of passages", "Show passage section titles"], index=0,)
show_type = st.sidebar.selectbox(
"",
["Show full text of passages", "Show passage section titles"],
index=0,
)
show_passages = show_type == "Show full text of passages"
else:
action = 3
@ -250,7 +262,9 @@ questions_list = [
"How does New Zealand have so many large bird predators?",
]
question_s = st.selectbox(
"What would you like to ask? ---- select <MY QUESTION> to enter a new query", questions_list, index=1,
"What would you like to ask? ---- select <MY QUESTION> to enter a new query",
questions_list,
index=1,
)
if question_s == "<MY QUESTION>":
question = st.text_input("Enter your question here:", "")

View file

@ -48,7 +48,11 @@ def make_es_index_snippets(es_client, passages_dset, index_name="english_wiki_ki
yield passage
# create the ES index
for ok, action in streaming_bulk(client=es_client, index=index_name, actions=passage_generator(),):
for ok, action in streaming_bulk(
client=es_client,
index=index_name,
actions=passage_generator(),
):
progress.update(1)
successes += ok
print("Indexed %d documents" % (successes,))
@ -137,7 +141,11 @@ class RetrievalQAEmbedder(torch.nn.Module):
# define function for checkpointing
def partial_encode(*inputs):
encoder_outputs = self.sent_encoder.encoder(inputs[0], attention_mask=inputs[1], head_mask=head_mask,)
encoder_outputs = self.sent_encoder.encoder(
inputs[0],
attention_mask=inputs[1],
head_mask=head_mask,
)
sequence_output = encoder_outputs[0]
pooled_output = self.sent_encoder.pooler(sequence_output)
return pooled_output
@ -234,7 +242,11 @@ def train_qa_retriever_epoch(model, dataset, tokenizer, optimizer, scheduler, ar
if step % args.print_freq == 0 or step == 1:
print(
"{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
e,
step,
len(dataset) // args.batch_size,
loc_loss / loc_steps,
time() - st_time,
)
)
loc_loss = 0
@ -273,7 +285,11 @@ def train_qa_retriever_joint_epoch(model, dataset_list, tokenizer, optimizer, sc
if step % args.print_freq == 0:
print(
"{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
e, step, len(dataset_list[0]) // args.batch_size, loc_loss / loc_steps, time() - st_time,
e,
step,
len(dataset_list[0]) // args.batch_size,
loc_loss / loc_steps,
time() - st_time,
)
)
loc_loss = 0
@ -354,7 +370,8 @@ class ELI5DatasetS2S(Dataset):
self.document_cache[q_id] = self.document_cache.get(q_id, self.make_doc_function(example["title"]))
document = self.document_cache[q_id]
in_st = "question: {} context: {}".format(
question.lower().replace(" --t--", "").strip(), document.lower().strip(),
question.lower().replace(" --t--", "").strip(),
document.lower().strip(),
)
out_st = answer
return (in_st, out_st)
@ -427,7 +444,11 @@ def train_qa_s2s_epoch(model, dataset, tokenizer, optimizer, scheduler, args, e=
if step % args.print_freq == 0 or step == 1:
print(
"{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
e,
step,
len(dataset) // args.batch_size,
loc_loss / loc_steps,
time() - st_time,
)
)
loc_loss = 0
@ -456,10 +477,18 @@ def eval_qa_s2s_epoch(model, dataset, tokenizer, args):
if step % args.print_freq == 0:
print(
"{:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
step,
len(dataset) // args.batch_size,
loc_loss / loc_steps,
time() - st_time,
)
)
print("Total \t L: {:.3f} \t -- {:.3f}".format(loc_loss / loc_steps, time() - st_time,))
print(
"Total \t L: {:.3f} \t -- {:.3f}".format(
loc_loss / loc_steps,
time() - st_time,
)
)
def train_qa_s2s(qa_s2s_model, qa_s2s_tokenizer, s2s_train_dset, s2s_valid_dset, s2s_args):
@ -506,7 +535,12 @@ def qa_s2s_generate(
max_input_length=512,
device="cuda:0",
):
model_inputs = make_qa_s2s_batch([(question_doc, "A")], qa_s2s_tokenizer, max_input_length, device=device,)
model_inputs = make_qa_s2s_batch(
[(question_doc, "A")],
qa_s2s_tokenizer,
max_input_length,
device=device,
)
n_beams = num_answers if num_beams is None else max(num_beams, num_answers)
generated_ids = qa_s2s_model.generate(
input_ids=model_inputs["input_ids"],

View file

@ -37,8 +37,7 @@ logger = logging.getLogger(__name__)
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
@ -385,8 +384,8 @@ class BertPooler(nn.Module):
class MaskedBertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = MaskedBertConfig
@ -492,9 +491,9 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@ -685,31 +684,31 @@ class MaskedBertForSequenceClassification(MaskedBertPreTrainedModel):
threshold=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`).
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`).
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.bert(
@ -770,32 +769,32 @@ class MaskedBertForMultipleChoice(MaskedBertPreTrainedModel):
threshold=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`).
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`).
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
num_choices = input_ids.shape[1]
@ -860,29 +859,29 @@ class MaskedBertForTokenClassification(MaskedBertPreTrainedModel):
threshold=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`).
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`).
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
Classification loss.
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
Classification loss.
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.bert(
@ -947,36 +946,36 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
threshold=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`).
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`).
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.bert(
@ -996,7 +995,10 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
outputs = (start_logits, end_logits,) + outputs[2:]
outputs = (
start_logits,
end_logits,
) + outputs[2:]
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1:
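The docstrings above all describe forward passes that take an extra `threshold` argument consumed by the `MaskedLinear` layers. As a purely illustrative sketch (the `emmental` import path follows the example package documented above; the checkpoint name and tensor values are placeholders), a single classification forward pass might look like:

import torch
from emmental import MaskedBertConfig, MaskedBertForSequenceClassification

# Hypothetical setup: load a standard BERT checkpoint into the masked variant.
config = MaskedBertConfig.from_pretrained("bert-base-uncased", num_labels=2)
model = MaskedBertForSequenceClassification.from_pretrained("bert-base-uncased", config=config)

input_ids = torch.tensor([[101, 7592, 2088, 102]])  # toy, already-tokenized input
labels = torch.tensor([1])

# `threshold` controls which weights the MaskedLinear layers keep for this pass.
loss, logits = model(input_ids=input_ids, labels=labels, threshold=0.5)[:2]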


@ -173,7 +173,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
model,
device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True,
)
# Train!
@ -217,7 +220,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
epochs_trained,
int(args.num_train_epochs),
desc="Epoch",
disable=args.local_rank not in [-1, 0],
)
set_seed(args)  # Added here for reproducibility
for _ in train_iterator:
@ -280,11 +286,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
attention_mask=inputs["attention_mask"],
)
loss_logits = F.kl_div(
input=F.log_softmax(logits_stu / args.temperature, dim=-1),
target=F.softmax(logits_tea / args.temperature, dim=-1),
reduction="batchmean",
) * (args.temperature ** 2)
loss_logits = (
F.kl_div(
input=F.log_softmax(logits_stu / args.temperature, dim=-1),
target=F.softmax(logits_tea / args.temperature, dim=-1),
reduction="batchmean",
)
* (args.temperature ** 2)
)
loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
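The reformatted block above is the usual temperature-scaled knowledge-distillation objective. A self-contained sketch of the same computation, with toy tensors standing in for the student and teacher logits and for the student's own task loss, is:

import torch
import torch.nn.functional as F

temperature = 2.0
alpha_distil, alpha_ce = 0.9, 0.1

logits_stu = torch.randn(4, 3)  # student logits: (batch, num_labels)
logits_tea = torch.randn(4, 3)  # teacher logits
loss_ce = torch.tensor(1.2)     # stand-in for the student's own task loss

# KL divergence between softened distributions, rescaled by temperature**2.
loss_logits = F.kl_div(
    input=F.log_softmax(logits_stu / temperature, dim=-1),
    target=F.softmax(logits_tea / temperature, dim=-1),
    reduction="batchmean",
) * (temperature ** 2)

loss = alpha_distil * loss_logits + alpha_ce * loss_ce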
@ -529,7 +538,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
)
features = convert_examples_to_features(
examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode,
examples,
tokenizer,
max_length=args.max_seq_length,
label_list=label_list,
output_mode=output_mode,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
@ -592,7 +605,10 @@ def main():
)
# Other parameters
parser.add_argument(
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
"--config_name",
default="",
type=str,
help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--tokenizer_name",
@ -616,17 +632,27 @@ def main():
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
"--evaluate_during_training",
action="store_true",
help="Run evaluation during training at each logging step.",
)
parser.add_argument(
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
"--do_lower_case",
action="store_true",
help="Set this flag if you are using an uncased model.",
)
parser.add_argument(
"--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
"--per_gpu_train_batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.",
)
parser.add_argument(
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
"--per_gpu_eval_batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for evaluation.",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
@ -723,7 +749,10 @@ def main():
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
"--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.",
)
parser.add_argument(
"--max_steps",
@ -742,10 +771,14 @@ def main():
)
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument(
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
"--overwrite_output_dir",
action="store_true",
help="Overwrite the content of the output directory",
)
parser.add_argument(
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
"--overwrite_cache",
action="store_true",
help="Overwrite the cached training and evaluation sets",
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")


@ -181,7 +181,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
model,
device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True,
)
# Train!
@ -304,16 +307,22 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
attention_mask=inputs["attention_mask"],
)
loss_start = F.kl_div(
input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
target=F.softmax(start_logits_tea / args.temperature, dim=-1),
reduction="batchmean",
) * (args.temperature ** 2)
loss_end = F.kl_div(
input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
target=F.softmax(end_logits_tea / args.temperature, dim=-1),
reduction="batchmean",
) * (args.temperature ** 2)
loss_start = (
F.kl_div(
input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
target=F.softmax(start_logits_tea / args.temperature, dim=-1),
reduction="batchmean",
)
* (args.temperature ** 2)
)
loss_end = (
F.kl_div(
input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
target=F.softmax(end_logits_tea / args.temperature, dim=-1),
reduction="batchmean",
)
* (args.temperature ** 2)
)
loss_logits = (loss_start + loss_end) / 2.0
loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
@ -859,7 +868,10 @@ def main():
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
"--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.",
)
parser.add_argument(
"--max_steps",


@ -100,7 +100,12 @@ if is_torch_available():
cached_features_file = os.path.join(
data_dir,
"cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,),
"cached_{}_{}_{}_{}".format(
mode.value,
tokenizer.__class__.__name__,
str(max_seq_length),
task,
),
)
# Make sure only the first process in distributed training processes the dataset,
@ -121,7 +126,12 @@ if is_torch_available():
else:
examples = processor.get_train_examples(data_dir)
logger.info("Training examples: %s", len(examples))
self.features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,)
self.features = convert_examples_to_features(
examples,
label_list,
max_seq_length,
tokenizer,
)
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(self.features, cached_features_file)
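The hunks above follow the usual load-or-build caching pattern for preprocessed features. A generic sketch of that pattern (the helper name and arguments are illustrative, not part of the library) is:

import os
from typing import Callable, List

import torch


def load_or_build_features(cache_path: str, build_fn: Callable[[], List], overwrite_cache: bool = False) -> List:
    """Return cached features if present, otherwise build them and save them to disk."""
    if os.path.exists(cache_path) and not overwrite_cache:
        return torch.load(cache_path)
    features = build_fn()
    torch.save(features, cache_path)
    return features


features = load_or_build_features("cached_train_features.pt", build_fn=lambda: [1, 2, 3])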
@ -164,7 +174,12 @@ if is_tf_available():
examples = processor.get_train_examples(data_dir)
logger.info("Training examples: %s", len(examples))
self.features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,)
self.features = convert_examples_to_features(
examples,
label_list,
max_seq_length,
tokenizer,
)
def gen():
for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
@ -491,7 +506,10 @@ class ArcProcessor(DataProcessor):
def convert_examples_to_features(
examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
examples: List[InputExample],
label_list: List[str],
max_length: int,
tokenizer: PreTrainedTokenizer,
) -> List[InputFeatures]:
"""
Loads a data file into a list of `InputFeatures`


@ -137,7 +137,12 @@ def main():
)
# Initialize our Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
)
# Training
if training_args.do_train:


@ -231,7 +231,12 @@ def main():
eval_dataset = eval_dataset.apply(tf.data.experimental.assert_cardinality(len(eval_examples)))
# Initialize our Trainer
trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
trainer = TFTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
)
# Training
if training_args.do_train:


@ -28,7 +28,7 @@ BERTABS_FINETUNED_CONFIG_MAP = {
class BertAbsConfig(PretrainedConfig):
r""" Class to store the configuration of the BertAbs model.
r"""Class to store the configuration of the BertAbs model.
Arguments:
vocab_size: int


@ -62,7 +62,7 @@ BertAbsConfig = namedtuple(
def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
""" Copy/paste and tweak the pre-trained weights provided by the creators
"""Copy/paste and tweak the pre-trained weights provided by the creators
of BertAbs for the internal architecture.
"""
@ -164,13 +164,22 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path to the official PyTorch dump.",
"--bertabs_checkpoint_path",
default=None,
type=str,
required=True,
help="Path to the official PyTorch dump.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.",
"--pytorch_dump_folder_path",
default=None,
type=str,
required=True,
help="Path to the output PyTorch model.",
)
args = parser.parse_args()
convert_bertabs_checkpoints(
args.bertabs_checkpoint_path, args.pytorch_dump_folder_path,
args.bertabs_checkpoint_path,
args.pytorch_dump_folder_path,
)


@ -105,10 +105,17 @@ class BertAbs(BertAbsPreTrainedModel):
p.data.zero_()
def forward(
self, encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask,
self,
encoder_input_ids,
decoder_input_ids,
token_type_ids,
encoder_attention_mask,
decoder_attention_mask,
):
encoder_output = self.bert(
input_ids=encoder_input_ids, token_type_ids=token_type_ids, attention_mask=encoder_attention_mask,
input_ids=encoder_input_ids,
token_type_ids=token_type_ids,
attention_mask=encoder_attention_mask,
)
encoder_hidden_states = encoder_output[0]
dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states)
@ -117,8 +124,7 @@ class BertAbs(BertAbsPreTrainedModel):
class Bert(nn.Module):
""" This class is not really necessary and should probably disappear.
"""
"""This class is not really necessary and should probably disappear."""
def __init__(self):
super().__init__()
@ -307,7 +313,14 @@ class TransformerDecoderLayer(nn.Module):
self.register_buffer("mask", mask)
def forward(
self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, previous_input=None, layer_cache=None, step=None,
self,
inputs,
memory_bank,
src_pad_mask,
tgt_pad_mask,
previous_input=None,
layer_cache=None,
step=None,
):
"""
Args:
@ -331,13 +344,25 @@ class TransformerDecoderLayer(nn.Module):
all_input = torch.cat((previous_input, input_norm), dim=1)
dec_mask = None
query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",)
query = self.self_attn(
all_input,
all_input,
input_norm,
mask=dec_mask,
layer_cache=layer_cache,
type="self",
)
query = self.drop(query) + inputs
query_norm = self.layer_norm_2(query)
mid = self.context_attn(
memory_bank, memory_bank, query_norm, mask=src_pad_mask, layer_cache=layer_cache, type="context",
memory_bank,
memory_bank,
query_norm,
mask=src_pad_mask,
layer_cache=layer_cache,
type="context",
)
output = self.feed_forward(self.drop(mid) + query)
@ -422,7 +447,14 @@ class MultiHeadedAttention(nn.Module):
self.final_linear = nn.Linear(model_dim, model_dim)
def forward(
self, key, value, query, mask=None, layer_cache=None, type=None, predefined_graph_1=None,
self,
key,
value,
query,
mask=None,
layer_cache=None,
type=None,
predefined_graph_1=None,
):
"""
Compute the context vector and the attention vectors.
@ -628,7 +660,7 @@ def gelu(x):
class PositionwiseFeedForward(nn.Module):
""" A two-layer Feed-Forward-Network with residual layer norm.
"""A two-layer Feed-Forward-Network with residual layer norm.
Args:
d_model (int): the size of input for the first-layer of the FFN.
@ -770,8 +802,7 @@ class Translator(object):
self.max_length = args.max_length
def translate(self, batch, step, attn_debug=False):
""" Generates summaries from one batch of data.
"""
"""Generates summaries from one batch of data."""
self.model.eval()
with torch.no_grad():
batch_data = self.translate_batch(batch)
@ -798,8 +829,7 @@ class Translator(object):
# Where the beam search lives
# I have no idea why it is being called from the method above
def _fast_translate_batch(self, batch, max_length, min_length=0):
""" Beam Search using the encoder inputs contained in `batch`.
"""
"""Beam Search using the encoder inputs contained in `batch`."""
# The batch object is funny
# Instead of just looking at the size of the arguments we encapsulate
@ -981,7 +1011,7 @@ def tile(x, count, dim=0):
class BertSumOptimizer(object):
""" Specific optimizer for BertSum.
"""Specific optimizer for BertSum.
As described in [1], the authors fine-tune BertSum for abstractive
summarization using two Adam Optimizers with different warm-up steps and
@ -999,10 +1029,16 @@ class BertSumOptimizer(object):
self.optimizers = {
"encoder": torch.optim.Adam(
model.encoder.parameters(), lr=lr["encoder"], betas=(beta_1, beta_2), eps=eps,
model.encoder.parameters(),
lr=lr["encoder"],
betas=(beta_1, beta_2),
eps=eps,
),
"decoder": torch.optim.Adam(
model.decoder.parameters(), lr=lr["decoder"], betas=(beta_1, beta_2), eps=eps,
model.decoder.parameters(),
lr=lr["decoder"],
betas=(beta_1, beta_2),
eps=eps,
),
}
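The dictionary above holds the two Adam optimizers with their separate hyper-parameters. As a rough sketch only (assuming an inverse-square-root warm-up schedule, one common choice for this setup; the exact schedule lives in the rest of the class), stepping both optimizers could look like:

import torch

encoder = torch.nn.Linear(8, 8)  # stand-ins for model.encoder / model.decoder
decoder = torch.nn.Linear(8, 8)

lr = {"encoder": 2e-3, "decoder": 0.1}
warmup = {"encoder": 20_000, "decoder": 10_000}
optimizers = {
    "encoder": torch.optim.Adam(encoder.parameters(), lr=lr["encoder"]),
    "decoder": torch.optim.Adam(decoder.parameters(), lr=lr["decoder"]),
}


def step(current_step: int) -> None:
    # Each optimizer follows its own warm-up (call with current_step >= 1):
    # linear ramp, then inverse square-root decay.
    for name, optimizer in optimizers.items():
        scale = min(current_step ** -0.5, current_step * warmup[name] ** -1.5)
        for group in optimizer.param_groups:
            group["lr"] = lr[name] * scale
        optimizer.step()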


@ -99,7 +99,7 @@ def evaluate(args):
def save_summaries(summaries, path, original_document_name):
""" Write the summaries in files that are prefixed by the original
"""Write the summaries in files that are prefixed by the original
files' names, with `_summary` appended.
Attributes:
@ -125,7 +125,7 @@ def save_summaries(summaries, path, original_document_name):
def format_summary(translation):
""" Transforms the output of the `from_batch` function
"""Transforms the output of the `from_batch` function
into nicely formatted summaries.
"""
raw_summary, _, _ = translation
@ -190,7 +190,12 @@ def build_data_iterator(args, tokenizer):
def collate_fn(data):
return collate(data, tokenizer, block_size=512, device=args.device)
iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)
iterator = DataLoader(
dataset,
sampler=sampler,
batch_size=args.batch_size,
collate_fn=collate_fn,
)
return iterator
@ -201,7 +206,7 @@ def load_and_cache_examples(args, tokenizer):
def collate(data, tokenizer, block_size, device):
""" Collate formats the data passed to the data loader.
"""Collate formats the data passed to the data loader.
In particular we tokenize the data batch after batch to avoid keeping them
all in memory. We output the data as a namedtuple to fit the original BertAbs's
@ -231,7 +236,7 @@ def collate(data, tokenizer, block_size, device):
def decode_summary(summary_tokens, tokenizer):
""" Decode the summary and return it in a format
"""Decode the summary and return it in a format
suitable for evaluation.
"""
summary_tokens = summary_tokens.to("cpu").numpy()
@ -242,8 +247,7 @@ def decode_summary(summary_tokens, tokenizer):
def main():
""" The main function defines the interface with the users.
"""
"""The main function defines the interface with the users."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--documents_dir",
@ -268,23 +272,41 @@ def main():
)
# EVALUATION options
parser.add_argument(
"--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.",
"--no_cuda",
default=False,
type=bool,
help="Whether to force the execution on CPU.",
)
parser.add_argument(
"--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
"--batch_size",
default=4,
type=int,
help="Batch size per GPU/CPU for training.",
)
# BEAM SEARCH arguments
parser.add_argument(
"--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
"--min_length",
default=50,
type=int,
help="Minimum number of tokens for the summaries.",
)
parser.add_argument(
"--max_length", default=200, type=int, help="Maximum number of tokens for the summaries.",
"--max_length",
default=200,
type=int,
help="Maximum number of tokens for the summaries.",
)
parser.add_argument(
"--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
"--beam_size",
default=5,
type=int,
help="The number of beams to start with for each example.",
)
parser.add_argument(
"--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
"--alpha",
default=0.95,
type=float,
help="The value of alpha for the length penalty in the beam search.",
)
parser.add_argument(
"--block_trigram",


@ -43,8 +43,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
def test_process_story_no_highlights(self):
""" Processing a story with no highlights returns an empty list for the summary.
"""
"""Processing a story with no highlights returns an empty list for the summary."""
raw_story = """It was the year of Our Lord one thousand seven hundred and
seventy-five.\n\nSpiritual revelations were conceded to England at that
favoured period, as at this."""
@ -52,8 +51,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
self.assertEqual(summary_lines, [])
def test_process_empty_story(self):
""" An empty story returns an empty collection of lines.
"""
"""An empty story returns an empty collection of lines."""
raw_story = ""
story_lines, summary_lines = process_story(raw_story)
self.assertEqual(story_lines, [])


@ -11,7 +11,7 @@ from torch.utils.data import Dataset
class CNNDMDataset(Dataset):
""" Abstracts the dataset used to train seq2seq models.
"""Abstracts the dataset used to train seq2seq models.
The class will process the documents that are located in the specified
folder. The preprocessing will work on any document that is reasonably
@ -31,7 +31,7 @@ class CNNDMDataset(Dataset):
"""
def __init__(self, path="", prefix="train"):
""" We initialize the class by listing all the documents to summarize.
"""We initialize the class by listing all the documents to summarize.
Files are not read into memory due to the size of some datasets (like CNN/DailyMail).
"""
assert os.path.isdir(path)
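The docstring above notes that files are only listed at construction time and read lazily. A generic sketch of that pattern (class name and details are illustrative, not the repository's CNNDMDataset) is:

import os

from torch.utils.data import Dataset


class LazyStoryDataset(Dataset):
    """Illustrative lazy dataset: list the files up front, read each one only on access."""

    def __init__(self, path: str):
        assert os.path.isdir(path)
        self.documents = [os.path.join(path, name) for name in sorted(os.listdir(path))]

    def __len__(self) -> int:
        return len(self.documents)

    def __getitem__(self, idx: int) -> str:
        with open(self.documents[idx], encoding="utf-8") as source:
            return source.read()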
@ -60,7 +60,7 @@ class CNNDMDataset(Dataset):
def process_story(raw_story):
""" Extract the story and summary from a story file.
"""Extract the story and summary from a story file.
Arguments:
raw_story (str): content of the story file as an utf-8 encoded string.
@ -108,7 +108,7 @@ def _add_missing_period(line):
def truncate_or_pad(sequence, block_size, pad_token_id):
""" Adapt the source and target sequences' lengths to the block size.
"""Adapt the source and target sequences' lengths to the block size.
If the sequence is shorter, we append padding tokens to the right of the sequence.
"""
if len(sequence) > block_size:
@ -119,8 +119,8 @@ def truncate_or_pad(sequence, block_size, pad_token_id):
def build_mask(sequence, pad_token_id):
""" Builds the mask. The attention mechanism will only attend to positions
with value 1. """
"""Builds the mask. The attention mechanism will only attend to positions
with value 1."""
mask = torch.ones_like(sequence)
idx_pad_tokens = sequence == pad_token_id
mask[idx_pad_tokens] = 0
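A tiny worked example of the two helpers described above (values are illustrative): pad or cut the sequence to the block size, then zero out the padded positions in the attention mask.

import torch

block_size, pad_token_id = 6, 0
sequence = [5, 8, 13]

# truncate_or_pad: right-pad (or truncate) the sequence to exactly block_size tokens.
padded = (sequence + [pad_token_id] * block_size)[:block_size]  # [5, 8, 13, 0, 0, 0]

# build_mask: attend only to real tokens (value 1), never to padding (value 0).
ids = torch.tensor(padded)
mask = torch.ones_like(ids)
mask[ids == pad_token_id] = 0  # tensor([1, 1, 1, 0, 0, 0])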
@ -128,7 +128,7 @@ def build_mask(sequence, pad_token_id):
def encode_for_summarization(story_lines, summary_lines, tokenizer):
""" Encode the story and summary lines, and join them
"""Encode the story and summary lines, and join them
as specified in [1] by using `[SEP] [CLS]` tokens to separate
sentences.
"""
@ -141,7 +141,7 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
def compute_token_type_ids(batch, separator_token_id):
""" Segment embeddings as described in [1]
"""Segment embeddings as described in [1]
The values {0,1} were found in the repository [2].


@ -97,4 +97,9 @@ def get_checkpoint_callback(output_dir, metric):
def get_early_stopping_callback(metric, patience):
return EarlyStopping(monitor=f"val_{metric}", mode="max", patience=patience, verbose=True,)
return EarlyStopping(
monitor=f"val_{metric}",
mode="max",
patience=patience,
verbose=True,
)


@ -348,7 +348,10 @@ class T5SummarizationDistiller(BartSummarizationDistiller):
if self.different_encoder:
with torch.no_grad():
teacher_enc_outputs, teacher_enc_hid = self.teacher.encoder(
source_ids, attention_mask=source_mask, output_hidden_states=True, use_cache=False,
source_ids,
attention_mask=source_mask,
output_hidden_states=True,
use_cache=False,
)
if self.hparams.alpha_encoder_loss > 0:
loss_encoder = self.calc_mse_loss(enc_outputs, teacher_enc_outputs, source_mask)


@ -117,7 +117,12 @@ class TestSummarizationDistiller(unittest.TestCase):
@require_multigpu
def test_multigpu(self):
updates = dict(no_teacher=True, freeze_encoder=True, gpus=2, sortish_sampler=False,)
updates = dict(
no_teacher=True,
freeze_encoder=True,
gpus=2,
sortish_sampler=False,
)
self._test_distiller_cli(updates)
def test_distill_no_teacher(self):
@ -261,7 +266,8 @@ def test_run_eval_bart(model):
@pytest.mark.parametrize(
["model"], [pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)],
["model"],
[pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)],
)
def test_finetune(model):
args_d: dict = CHEAP_ARGS.copy()
@ -329,7 +335,8 @@ def test_finetune_extra_model_args():
output_dir = tempfile.mkdtemp(prefix="output_1_")
args_d1 = args_d.copy()
args_d1.update(
model_name_or_path=model, output_dir=output_dir,
model_name_or_path=model,
output_dir=output_dir,
)
extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
for p in extra_model_params:
@ -344,7 +351,8 @@ def test_finetune_extra_model_args():
output_dir = tempfile.mkdtemp(prefix="output_2_")
args_d2 = args_d.copy()
args_d2.update(
model_name_or_path=model, output_dir=output_dir,
model_name_or_path=model,
output_dir=output_dir,
)
unsupported_param = "encoder_layerdrop"
args_d2[unsupported_param] = 0.5
@ -478,7 +486,11 @@ def test_summarization_dataset_truncation(tok):
max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
trunc_target = 4
train_dataset = Seq2SeqDataset(
tokenizer, data_dir=tmp_dir, type_path="train", max_source_length=20, max_target_length=trunc_target,
tokenizer,
data_dir=tmp_dir,
type_path="train",
max_source_length=20,
max_target_length=trunc_target,
)
dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
for batch in dataloader:


@ -63,7 +63,9 @@ def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
def trim_batch(
input_ids, pad_token_id, attention_mask=None,
input_ids,
pad_token_id,
attention_mask=None,
):
"""Remove columns that are populated exclusively by pad_token_id"""
keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
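A toy example of the column-trimming idea above (values are illustrative): columns that contain only the pad token are dropped from both the ids and the mask.

import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 6, 0, 0],
                          [7, 0, 0, 0]])
attention_mask = input_ids.ne(pad_token_id).long()

# Keep a column if at least one row has a non-pad token in it.
keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)  # [True, True, False, False]
trimmed_ids = input_ids[:, keep_column_mask]              # shape (2, 2)
trimmed_mask = attention_mask[:, keep_column_mask]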


@ -153,7 +153,11 @@ class GLUETransformer(BaseTransformer):
)
parser.add_argument(
"--task", default="", type=str, required=True, help="The GLUE task to run",
"--task",
default="",
type=str,
required=True,
help="The GLUE task to run",
)
parser.add_argument(
"--gpus",
@ -177,7 +181,10 @@ def main():
# If output_dir not provided, a folder will be generated in pwd
if args.output_dir is None:
args.output_dir = os.path.join("./results", f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",)
args.output_dir = os.path.join(
"./results",
f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
)
os.makedirs(args.output_dir)
model = GLUETransformer(args)


@ -328,7 +328,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
)
features = convert_examples_to_features(
examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode,
examples,
tokenizer,
max_length=args.max_seq_length,
label_list=label_list,
output_mode=output_mode,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)


@ -698,7 +698,9 @@ def run_pplm_example(
for word_id in pert_gen_tok_text.tolist()[0]:
if word_id in bow_word_ids:
pert_gen_text += "{}{}{}".format(
colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL,
colorama.Fore.RED,
tokenizer.decode([word_id]),
colorama.Style.RESET_ALL,
)
else:
pert_gen_text += tokenizer.decode([word_id])
@ -729,7 +731,10 @@ if __name__ == "__main__":
parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix")
parser.add_argument(
"--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents",
"--num_samples",
type=int,
default=1,
help="Number of samples to generate from the modified latents",
)
parser.add_argument(
"--bag_of_words",
@ -751,13 +756,22 @@ if __name__ == "__main__":
help="Discriminator to use",
)
parser.add_argument(
"--discrim_weights", type=str, default=None, help="Weights for the generic discriminator",
"--discrim_weights",
type=str,
default=None,
help="Weights for the generic discriminator",
)
parser.add_argument(
"--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator",
"--discrim_meta",
type=str,
default=None,
help="Meta information for the generic discriminator",
)
parser.add_argument(
"--class_label", type=int, default=-1, help="Class label used for the discriminator",
"--class_label",
type=int,
default=-1,
help="Class label used for the discriminator",
)
parser.add_argument("--length", type=int, default=100)
parser.add_argument("--stepsize", type=float, default=0.02)
@ -773,7 +787,10 @@ if __name__ == "__main__":
help="Length of past which is being optimized; 0 corresponds to infinite window length",
)
parser.add_argument(
"--horizon_length", type=int, default=1, help="Length of future to optimize over",
"--horizon_length",
type=int,
default=1,
help="Length of future to optimize over",
)
parser.add_argument("--decay", action="store_true", help="whether to decay or not")
parser.add_argument("--gamma", type=float, default=1.5)
@ -783,7 +800,10 @@ if __name__ == "__main__":
parser.add_argument("--no_cuda", action="store_true", help="no cuda")
parser.add_argument("--colorama", action="store_true", help="colors keywords")
parser.add_argument(
"--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
"--repetition_penalty",
type=float,
default=1.0,
help="Penalize repetition. More than 1.0 -> less repetition",
)
args = parser.parse_args()


@ -242,7 +242,12 @@ def train_discriminator(
text = torchtext_data.Field()
label = torchtext_data.Field(sequential=False)
train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,)
train_data, val_data, test_data = datasets.SST.splits(
text,
label,
fine_grained=True,
train_subtrees=True,
)
x = []
y = []


@ -41,7 +41,9 @@ from transformers import (
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger = logging.getLogger(__name__)
@ -197,7 +199,10 @@ def main():
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
logger.warning(
"device: %s, n_gpu: %s, 16-bits training: %s", args.device, args.n_gpu, args.fp16,
"device: %s, n_gpu: %s, 16-bits training: %s",
args.device,
args.n_gpu,
args.fp16,
)
set_seed(args)


@ -90,11 +90,11 @@ class TokenClassificationTask:
sequence_a_segment_id=0,
mask_padding_with_zero=True,
) -> List[InputFeatures]:
""" Loads a data file into a list of `InputFeatures`
`cls_token_at_end` defines the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
"""Loads a data file into a list of `InputFeatures`
`cls_token_at_end` defines the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
"""
# TODO clean up all this to leverage built-in features of tokenizers
@ -230,7 +230,8 @@ if is_torch_available():
):
# Load data features from cache or dataset file
cached_features_file = os.path.join(
data_dir, "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
data_dir,
"cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
)
# Make sure only the first process in distributed training processes the dataset,


@ -14,18 +14,18 @@ def swish(x):
def _gelu_python(x):
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
This is now written in C in torch.nn.functional
Also see https://arxiv.org/abs/1606.08415
"""Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
This is now written in C in torch.nn.functional
Also see https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
def gelu_new(x):
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
Also see https://arxiv.org/abs/1606.08415
"""Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
Also see https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
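A quick numerical check of the two formulations quoted above: the exact erf form versus the tanh approximation. Over a moderate input range the two agree closely (maximum absolute difference well below 1e-2):

import math

import torch

x = torch.linspace(-3.0, 3.0, steps=13)

gelu_exact = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
gelu_tanh = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

print(torch.max(torch.abs(gelu_exact - gelu_tanh)))  # small; the two curves nearly coincide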


@ -199,11 +199,17 @@ class PyTorchBenchmark(Benchmark):
# run an additional 5 times to stabilize compilation for tpu and torchscript
logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
timeit.repeat(
func, repeat=1, number=5,
func,
repeat=1,
number=5,
)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
runtimes = timeit.repeat(
func,
repeat=self.args.repeat,
number=10,
)
if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
import torch_xla.debug.metrics as met
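As the hunks above note, `timeit.repeat` results are summarized by taking the minimum over repeats and dividing by `number`, not by averaging. In isolation that convention looks like this (the timed statement is just a placeholder):

import timeit

# Each entry in `runtimes` is the total time for `number` calls of the statement.
runtimes = timeit.repeat(lambda: sum(range(10_000)), repeat=5, number=10)

# Report the best repeat, normalized per call, as the timeit docs recommend.
per_call_seconds = min(runtimes) / 10
print(per_call_seconds)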


@ -32,10 +32,12 @@ logger = logging.get_logger(__name__)
@dataclass
class TensorFlowBenchmarkArguments(BenchmarkArguments):
tpu_name: str = field(
default=None, metadata={"help": "Name of TPU"},
default=None,
metadata={"help": "Name of TPU"},
)
device_idx: int = field(
default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
default=0,
metadata={"help": "CPU / GPU device index. Defaults to 0."},
)
eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager model."})
use_xla: bool = field(


@ -219,7 +219,11 @@ class TensorFlowBenchmark(Benchmark):
timeit.repeat(func, repeat=1, number=5)
# as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
runtimes = timeit.repeat(
func,
repeat=self.args.repeat,
number=10,
)
return min(runtimes) / 10.0
except ResourceExhaustedError as e:


@ -63,15 +63,15 @@ BenchmarkOutput = namedtuple(
def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
"""
This function wraps another function into its own separate process.
In order to ensure accurate memory measurements it is important that the function
is executed in a separate process
This function wraps another function into its own separate process.
In order to ensure accurate memory measurements it is important that the function
is executed in a separate process
Args:
- `func`: (`callable`): function() -> ...
generic function which will be executed in its own separate process
- `do_multi_processing`: (`bool`)
Whether to run function on separate process or not
Args:
- `func`: (`callable`): function() -> ...
generic function which will be executed in its own separate process
- `do_multi_processing`: (`bool`)
Whether to run function on separate process or not
"""
def multi_process_func(*args, **kwargs):
@ -106,13 +106,13 @@ def is_memory_tracing_enabled():
class Frame(NamedTuple):
""" `Frame` is a NamedTuple used to gather the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""`Frame` is a NamedTuple used to gather the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""
filename: str
@ -123,10 +123,10 @@ class Frame(NamedTuple):
class UsedMemoryState(NamedTuple):
""" `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
"""`UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
"""
frame: Frame
@ -135,9 +135,9 @@ class UsedMemoryState(NamedTuple):
class Memory(NamedTuple):
""" `Memory` NamedTuple has a single field `bytes` and
you can get a human readable str of the number of mega bytes by calling `__repr__`
- `byte` (integer): number of bytes,
"""`Memory` NamedTuple has a single field `bytes` and
you can get a human readable str of the number of mega bytes by calling `__repr__`
- `byte` (integer): number of bytes,
"""
bytes: int
@ -147,11 +147,11 @@ class Memory(NamedTuple):
class MemoryState(NamedTuple):
""" `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
"""`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
"""
frame: Frame
@ -161,14 +161,14 @@ class MemoryState(NamedTuple):
class MemorySummary(NamedTuple):
""" `MemorySummary` namedtuple with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by subtracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
"""`MemorySummary` namedtuple with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by subtracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
"""
sequential: List[MemoryState]
@ -182,38 +182,38 @@ MemoryTrace = List[UsedMemoryState]
def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
"""
measures peak cpu memory consumption of a given `function`
running the function for at least interval seconds
and at most 20 * interval seconds.
This function is heavily inspired by: `memory_usage`
of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
measures peak cpu memory consumption of a given `function`
running the function for at least interval seconds
and at most 20 * interval seconds.
This function is heavily inspired by: `memory_usage`
of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
Args:
- `function`: (`callable`): function() -> ...
function without any arguments to measure for which to measure the peak memory
Args:
- `function`: (`callable`): function() -> ...
function without any arguments to measure for which to measure the peak memory
- `interval`: (`float`, `optional`, defaults to `0.5`)
interval in second for which to measure the memory usage
- `interval`: (`float`, `optional`, defaults to `0.5`)
interval in second for which to measure the memory usage
- `device_idx`: (`int`, `optional`, defaults to `None`)
device id for which to measure gpu usage
- `device_idx`: (`int`, `optional`, defaults to `None`)
device id for which to measure gpu usage
Returns:
- `max_memory`: (`int`)
consumed memory peak in Bytes
Returns:
- `max_memory`: (`int`)
consumed memory peak in Bytes
"""
def get_cpu_memory(process_id: int) -> int:
"""
measures current cpu memory usage of a given `process_id`
measures current cpu memory usage of a given `process_id`
Args:
- `process_id`: (`int`)
process_id for which to measure memory
Args:
- `process_id`: (`int`)
process_id for which to measure memory
Returns
- `memory`: (`int`)
consumed memory in Bytes
Returns
- `memory`: (`int`)
consumed memory in Bytes
"""
process = psutil.Process(process_id)
try:
@ -234,8 +234,8 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i
class MemoryMeasureProcess(Process):
"""
`MemoryMeasureProcess` inherits from `Process` and overwrites
its `run()` method. Used to measure the memory usage of a process
`MemoryMeasureProcess` inherits from `Process` and overwrites
its `run()` method. Used to measure the memory usage of a process
"""
def __init__(self, process_id: int, child_connection: Connection, interval: float):
@ -309,37 +309,37 @@ def start_memory_tracing(
events_to_trace: str = "line",
gpus_to_trace: Optional[List[int]] = None,
) -> MemoryTrace:
""" Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
See `./benchmark.py` for usage examples.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size” (the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
"""Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
See `./benchmark.py` for usage examples.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size” (the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
Args:
- `modules_to_trace`: (None, string, list/tuple of string)
if None, all events are recorded
if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
- `modules_not_to_trace`: (None, string, list/tuple of string)
if None, no module is avoided
if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
- `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
default to line
- `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
Args:
- `modules_to_trace`: (None, string, list/tuple of string)
if None, all events are recorded
if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
- `modules_not_to_trace`: (None, string, list/tuple of string)
if None, no module is avoided
if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
- `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
default to line
- `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
Return:
- `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
- `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
Return:
- `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
- `UsedMemoryState` are named tuples with the following fields:
- 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
- 'cpu_memory': CPU RSS memory state *before* executing the line
- 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
`Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
`Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
`Frame` has the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
"""
if is_psutil_available():
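Given the docstring above, a minimal usage sketch (assuming the functions are imported from the module being edited here, `transformers.benchmark.benchmark_utils`) might be:

from transformers.benchmark.benchmark_utils import start_memory_tracing, stop_memory_tracing

trace = start_memory_tracing("transformers")  # only record lines from transformers.* modules

# ... run the code whose line-by-line memory footprint should be recorded ...

summary = stop_memory_tracing(trace)
if summary is not None:
    print(summary.total)                  # total memory increase, as a Memory namedtuple
    for state in summary.cumulative[:5]:  # largest per-line cumulative increases
        print(state.frame.filename, state.frame.line_number, state.cpu_gpu)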
@ -371,8 +371,8 @@ def start_memory_tracing(
memory_trace = []
def traceit(frame, event, args):
""" Tracing method executed before running each line in a module or sub-module
Record memory allocated in a list with debugging information
"""Tracing method executed before running each line in a module or sub-module
Record memory allocated in a list with debugging information
"""
global _is_memory_tracing_enabled
@ -456,39 +456,39 @@ def start_memory_tracing(
def stop_memory_tracing(
memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
) -> Optional[MemorySummary]:
""" Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
"""Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
Args:
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
- `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory
Args:
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
- `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory
Return:
- None if `memory_trace` is None
- `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
Return:
- None if `memory_trace` is None
- `MemorySummary` namedtuple otherwise with the fields:
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
`Memory` named tuple have fields
- `byte` (integer): number of bytes,
- `string` (string): same as human readable string (ex: "3.5MB")
`Memory` named tuple have fields
- `byte` (integer): number of bytes,
- `string` (string): same as human readable string (ex: "3.5MB")
`Frame` are namedtuple used to list the current frame state and have the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
`Frame` are namedtuple used to list the current frame state and have the following fields:
- 'filename' (string): Name of the file currently executed
- 'module' (string): Name of the module currently executed
- 'line_number' (int): Number of the line currently executed
- 'event' (string): Event that triggered the tracing (default will be "line")
- 'line_text' (string): Text of the line in the python script
`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
`MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
- `frame` (`Frame`): the current frame (see above)
- `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
- `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
- `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
"""
global _is_memory_tracing_enabled
_is_memory_tracing_enabled = False
@ -499,15 +499,19 @@ def stop_memory_tracing(
cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip(
memory_trace[:-1], memory_trace[1:]
):
for (
(frame, cpu_mem, gpu_mem),
(next_frame, next_cpu_mem, next_gpu_mem),
) in zip(memory_trace[:-1], memory_trace[1:]):
cpu_mem_inc = next_cpu_mem - cpu_mem
gpu_mem_inc = next_gpu_mem - gpu_mem
cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
memory_diff_trace.append(
MemoryState(
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
frame=frame,
cpu=Memory(cpu_mem_inc),
gpu=Memory(gpu_mem_inc),
cpu_gpu=Memory(cpu_gpu_mem_inc),
)
)
@ -529,7 +533,10 @@ def stop_memory_tracing(
) # order by the total CPU + GPU memory increase
cumulative_memory = list(
MemoryState(
frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
frame=frame,
cpu=Memory(cpu_mem_inc),
gpu=Memory(gpu_mem_inc),
cpu_gpu=Memory(cpu_gpu_mem_inc),
)
for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
)
@ -544,15 +551,17 @@ def stop_memory_tracing(
total_memory = Memory(total_memory)
return MemorySummary(
sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory,
sequential=memory_diff_trace,
cumulative=cumulative_memory,
current=memory_curr_trace,
total=total_memory,
)
return None
def bytes_to_mega_bytes(memory_amount: int) -> int:
""" Utility to convert a number of bytes (int) into a number of mega bytes (int)
"""
"""Utility to convert a number of bytes (int) into a number of mega bytes (int)"""
return memory_amount >> 20
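A minimal usage sketch of the two helpers documented above. The import path is an assumption based on where these benchmark utilities live in the library; the attribute names follow the `MemorySummary`, `MemoryState`, `Frame` and `Memory` namedtuples described in the docstrings::

>>> from transformers.benchmark.benchmark_utils import start_memory_tracing, stop_memory_tracing
>>> # only record events coming from the transformers module
>>> memory_trace = start_memory_tracing("transformers")
>>> # ... run the code to profile here, e.g. a forward pass ...
>>> summary = stop_memory_tracing(memory_trace)
>>> # total memory increase over the whole trace, as a human readable string
>>> print(summary.total.string)
>>> # the five lines with the largest cumulative CPU + GPU increase
>>> for state in summary.cumulative[:5]:
...     print(state.frame.filename, state.frame.line_number, state.cpu_gpu.string)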


@ -32,71 +32,71 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel`.
It is used to instantiate an ALBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30000):
Vocabulary size of the ALBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 4096):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_hidden_groups (:obj:`int`, optional, defaults to 1):
Number of groups for the hidden layers, parameters in the same group are shared.
num_attention_heads (:obj:`int`, optional, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 16384):
The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
inner_group_num (:obj:`int`, optional, defaults to 1):
The number of inner repetitions of attention and FFN.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something
large (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for attached classifiers.
Example::
>>> from transformers import AlbertConfig, AlbertModel
>>> # Initializing an ALBERT-xxlarge style configuration
>>> albert_xxlarge_configuration = AlbertConfig()
>>> # Initializing an ALBERT-base style configuration
>>> albert_base_configuration = AlbertConfig(
...     hidden_size=768,
...     num_attention_heads=12,
...     intermediate_size=3072,
... )
>>> # Initializing a model from the ALBERT-xxlarge style configuration
>>> model = AlbertModel(albert_xxlarge_configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "albert"


@ -73,43 +73,112 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
CONFIG_MAPPING = OrderedDict(
[
("retribert", RetriBertConfig,),
("t5", T5Config,),
("mobilebert", MobileBertConfig,),
("distilbert", DistilBertConfig,),
("albert", AlbertConfig,),
("camembert", CamembertConfig,),
("xlm-roberta", XLMRobertaConfig,),
(
"retribert",
RetriBertConfig,
),
(
"t5",
T5Config,
),
(
"mobilebert",
MobileBertConfig,
),
(
"distilbert",
DistilBertConfig,
),
(
"albert",
AlbertConfig,
),
(
"camembert",
CamembertConfig,
),
(
"xlm-roberta",
XLMRobertaConfig,
),
("pegasus", PegasusConfig),
("marian", MarianConfig,),
("mbart", MBartConfig,),
("bart", BartConfig,),
("reformer", ReformerConfig,),
("longformer", LongformerConfig,),
("roberta", RobertaConfig,),
("flaubert", FlaubertConfig,),
("bert", BertConfig,),
("openai-gpt", OpenAIGPTConfig,),
("gpt2", GPT2Config,),
("transfo-xl", TransfoXLConfig,),
("xlnet", XLNetConfig,),
("xlm", XLMConfig,),
("ctrl", CTRLConfig,),
("electra", ElectraConfig,),
("encoder-decoder", EncoderDecoderConfig,),
(
"marian",
MarianConfig,
),
(
"mbart",
MBartConfig,
),
(
"bart",
BartConfig,
),
(
"reformer",
ReformerConfig,
),
(
"longformer",
LongformerConfig,
),
(
"roberta",
RobertaConfig,
),
(
"flaubert",
FlaubertConfig,
),
(
"bert",
BertConfig,
),
(
"openai-gpt",
OpenAIGPTConfig,
),
(
"gpt2",
GPT2Config,
),
(
"transfo-xl",
TransfoXLConfig,
),
(
"xlnet",
XLNetConfig,
),
(
"xlm",
XLMConfig,
),
(
"ctrl",
CTRLConfig,
),
(
"electra",
ElectraConfig,
),
(
"encoder-decoder",
EncoderDecoderConfig,
),
]
)
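The ordering of this mapping matters for the filename fallback described in the :class:`AutoConfig` docstring below: keys that contain other keys as substrings (e.g. ``xlm-roberta`` vs. ``roberta``) have to come first. A simplified sketch of that fallback, assuming it simply returns the first pattern found in the model identifier::

>>> def guess_config_class(pretrained_model_name_or_path):
...     # the first pattern that occurs in the string wins, hence the ordering above
...     for pattern, config_class in CONFIG_MAPPING.items():
...         if pattern in pretrained_model_name_or_path:
...             return config_class
...     raise ValueError("Unrecognized model identifier: {}".format(pretrained_model_name_or_path))
>>> guess_config_class("xlm-roberta-base").__name__
'XLMRobertaConfig'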
class AutoConfig:
r"""
:class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct configuration class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string.
"""
def __init__(self):
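A short usage sketch of the behaviour described above (the checkpoint name is only an example of a published model identifier)::

>>> from transformers import AutoConfig
>>> # resolved via the `model_type` field stored in the checkpoint's config.json
>>> config = AutoConfig.from_pretrained("bert-base-uncased")
>>> type(config).__name__
'BertConfig'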


@ -102,7 +102,7 @@ BART_CONFIG_ARGS_DOC = r"""
@add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
class BartConfig(PretrainedConfig):
r"""
Configuration class for Bart. Parameters are renamed from the fairseq implementation.
"""
model_type = "bart"
@ -141,14 +141,14 @@ class BartConfig(PretrainedConfig):
**common_kwargs
):
r"""
:class:`~transformers.BartConfig` is the configuration class for `BartModel`.
Examples::
>>> from transformers import BartConfig, BartModel
>>> config = BartConfig.from_pretrained('facebook/bart-large')
>>> model = BartModel(config)
"""
if "hidden_size" in common_kwargs:


@ -50,59 +50,59 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
gradient_checkpointing (:obj:`bool`, optional, defaults to False):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
Example::
>>> from transformers import BertModel, BertConfig
>>> # Initializing a BERT bert-base-uncased style configuration
>>> configuration = BertConfig()
>>> # Initializing a model from the bert-base-uncased style configuration
>>> model = BertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "bert"


@ -25,55 +25,55 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.h
class CTRLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
It is used to instantiate a CTRL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 246534):
Vocabulary size of the CTRL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
n_positions (:obj:`int`, optional, defaults to 256):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 256):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 1280):
Dimensionality of the embeddings and hidden states.
dff (:obj:`int`, optional, defaults to 8192):
Dimensionality of the inner dimension of the FFN.
n_layer (:obj:`int`, optional, defaults to 48):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
The epsilon to use in the layer normalization layers.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
Example::
>>> from transformers import CTRLModel, CTRLConfig
>>> # Initializing a CTRL configuration
>>> configuration = CTRLConfig()
>>> # Initializing a model from the configuration
>>> model = CTRLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "ctrl"


@ -33,61 +33,61 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DistilBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings.
n_layers (:obj:`int`, optional, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
hidden_dim (:obj:`int`, optional, defaults to 3072):
The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
qa_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probabilities used in the question answering model
:class:`~transformers.DistilBertForQuestionAnswering`.
seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
The dropout probabilities used in the sequence classification and the multiple choice model
:class:`~transformers.DistilBertForSequenceClassification`.
Example::
>>> from transformers import DistilBertModel, DistilBertConfig
>>> # Initializing a DistilBERT configuration
>>> configuration = DistilBertConfig()
>>> # Initializing a model from the configuration
>>> model = DistilBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "distilbert"


@ -29,16 +29,16 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class DPRConfig(BertConfig):
r"""
:class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
`DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
It is used to instantiate the components of the DPR model.
Args:
projection_dim (:obj:`int`, optional, defaults to 0):
Dimension of the projection for the context and question encoders.
If it is set to zero (default), then no projection is done.
"""
model_type = "dpr"


@ -33,82 +33,82 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ElectraConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the ELECTRA model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
embedding_size (:obj:`int`, optional, defaults to 128):
Dimensionality of vocabulary embeddings.
hidden_size (:obj:`int`, optional, defaults to 256):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 1024):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
'gelu' => add a gelu activation to the output, Other => no activation.
summary_last_dropout (:obj:`float`, optional, defaults to 0.0):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a dropout after the projection and activation
Example::
>>> from transformers import ElectraModel, ElectraConfig
>>> # Initializing an ELECTRA electra-base-uncased style configuration
>>> configuration = ElectraConfig()
>>> # Initializing a model from the electra-base-uncased style configuration
>>> model = ElectraModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "electra"


@ -25,47 +25,47 @@ logger = logging.get_logger(__name__)
class EncoderDecoderConfig(PretrainedConfig):
r"""
:class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of an `EncoderDecoderModel`.
It is used to instantiate an Encoder Decoder model according to the specified arguments, defining the encoder and decoder configs.
Configuration objects inherit from :class:`~transformers.PretrainedConfig`
and can be used to control the model outputs.
See the documentation for :class:`~transformers.PretrainedConfig` for more information.
Args:
kwargs (`optional`):
Remaining dictionary of keyword arguments. Notably:
encoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the encoder config.
decoder (:class:`PretrainedConfig`, optional, defaults to `None`):
An instance of a configuration object that defines the decoder config.
Example::
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
>>> # Initializing a BERT bert-base-uncased style configuration
>>> config_encoder = BertConfig()
>>> config_decoder = BertConfig()
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
>>> model = EncoderDecoderModel(config=config)
>>> # Accessing the model configuration
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
>>> # set decoder config to causal lm
>>> config_decoder.is_decoder = True
>>> config_decoder.add_cross_attention = True
>>> # Saving the model, including its configuration
>>> model.save_pretrained('my-model')
>>> # loading model and config from pretrained folder
>>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
>>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
"""
model_type = "encoder_decoder"


@ -30,121 +30,120 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class FlaubertConfig(XLMConfig):
"""
Configuration class to store the configuration of a `FlaubertModel`.
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration class to store the configuration of a `FlaubertModel`.
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply the layer normalization before or after the feed forward layer following the
attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
layerdrop (:obj:`float`, `optional`, defaults to 0.0):
Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
with Structured Dropout. ICLR 2020)
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the Flaubert model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
The non-linear activation function (function or string) in the
encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
if a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`int`, optional, defaults to 50257):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
Args:
pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply the layer normalization before or after the feed forward layer following the
attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
layerdrop (:obj:`float`, `optional`, defaults to 0.0):
Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
with Structured Dropout. ICLR 2020)
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the Flaubert model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FlaubertModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
The non-linear activation function (function or string) in the
encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
if a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`int`, optional, defaults to 50257):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction.
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation.
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
"""
model_type = "flaubert"
def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
"""Constructs FlaubertConfig.
"""
"""Constructs FlaubertConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.layerdrop = layerdrop
self.pre_norm = pre_norm
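For reference, a minimal sketch of how the two options handled by this constructor are typically set; the values below are illustrative, not recommended defaults:

from transformers import FlaubertConfig, FlaubertModel

# Illustrative values: stochastically drop layers during training and use pre-norm residual blocks.
configuration = FlaubertConfig(layerdrop=0.1, pre_norm=True)
model = FlaubertModel(configuration)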

View file

@ -32,84 +32,84 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class GPT2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 50257):
Vocabulary size of the GPT-2 model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
n_positions (:obj:`int`, optional, defaults to 1024):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 1024):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
n_inner (:obj:`int`, optional, defaults to None):
Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
activation_function (:obj:`str`, optional, defaults to 'gelu'):
Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction.
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a dropout before the projection and activation.
Example::
>>> from transformers import GPT2Model, GPT2Config
>>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config()
>>> # Initializing a model from the configuration
>>> model = GPT2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "gpt2"

View file

@ -33,32 +33,32 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class LongformerConfig(RobertaConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`.
It is used to instantiate a Longformer model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the RoBERTa `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length of 4,096.
The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`.
It reuses the same defaults. Please check the parent class for more information.
Args:
attention_window (:obj:`int` or :obj:`List[int]`, optional, defaults to 512):
Size of an attention window around each token. If :obj:`int`, use the same size for all layers.
To specify a different window size for each layer, use a :obj:`List[int]` where
``len(attention_window) == num_hidden_layers``.
Example::
>>> from transformers import LongformerConfig, LongformerModel
>>> # Initializing a Longformer configuration
>>> configuration = LongformerConfig()
>>> # Initializing a model from the configuration
>>> model = LongformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "longformer"

View file

@ -25,79 +25,79 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class MobileBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel`.
It is used to instantiate a MobileBERT model according to the specified arguments, defining the model
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the MobileBERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.MobileBertModel`.
hidden_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 4):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.MobileBertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
pad_token_id (:obj:`int`, optional, defaults to 0):
The ID of the token in the word embedding to use as padding.
embedding_size (:obj:`int`, optional, defaults to 128):
The dimension of the word embedding vectors.
trigram_input (:obj:`bool`, optional, defaults to True):
Use a convolution of trigram as input.
use_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use bottleneck in BERT.
intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
Size of bottleneck layer output.
use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
Whether to use attention inputs from the bottleneck transformation.
key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
Whether to use the same linear transformation for query&key in the bottleneck.
num_feedforward_networks (:obj:`int`, optional, defaults to 4):
Number of FFNs in a block.
normalization_type (:obj:`str`, optional, defaults to "no_norm"):
The normalization type in BERT.
Example:
>>> from transformers import MobileBertModel, MobileBertConfig
>>> # Initializing a MobileBERT configuration
>>> configuration = MobileBertConfig()
>>> # Initializing a model from the configuration above
>>> model = MobileBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "mobilebert"

View file

@ -28,84 +28,84 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class OpenAIGPTConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel`.
It is used to instantiate a GPT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 40478):
Vocabulary size of the GPT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
n_positions (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
n_ctx (:obj:`int`, optional, defaults to 512):
Dimensionality of the causal mask (usually same as n_positions).
n_embd (:obj:`int`, optional, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
resid_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (:obj:`int`, optional, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether special tokens should be predicted when the model has a language modeling head.
summary_type (:obj:`string`, optional, defaults to "cls_index"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction.
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a dropout before the projection and activation.
Example::
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
>>> # Initializing a GPT configuration
>>> configuration = OpenAIGPTConfig()
>>> # Initializing a model from the configuration
>>> model = OpenAIGPTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "openai-gpt"

View file

@ -92,8 +92,8 @@ expected_alpha = {
@add_start_docstrings_to_callable(BART_CONFIG_ARGS_DOC)
class PegasusConfig(BartConfig):
r"""
:class:`~transformers.PegasusConfig` is the configuration class to store the configuration of a
`PegasusModel`.
:class:`~transformers.PegasusConfig` is the configuration class to store the configuration of a
`PegasusModel`.
"""
model_type = "pegasus"
# The implementation of the config object is in BartConfig
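Because the implementation lives in BartConfig, a PegasusConfig is built with BART-style arguments; a hedged sketch with illustrative sizes:

from transformers import PegasusConfig

# Illustrative sizes only; any BartConfig argument is accepted here.
configuration = PegasusConfig(d_model=1024, encoder_layers=16, decoder_layers=16)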

View file

@ -29,105 +29,105 @@ REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class ReformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`.
It is used to instantiate a Reformer model according to the specified arguments, defining the model
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
attention_head_size (:obj:`int`, optional, defaults to 64):
Dimensionality of the projected key, query and value vectors
attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]):
List of attention layer types in ascending order. It can be chosen between a
LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
For more information on LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
For more information on LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
axial_pos_embds (:obj:`bool`, optional, defaults to True):
If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__
axial_norm_std (:obj:`float`, optional, defaults to 1.0):
The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`):
The position dims of the axial position encodings.
During training the product of the position dims has to equal the sequence length.
For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`):
The embedding dims of the axial position encodings.
The sum of the embedding dims has to equal the hidden size.
For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
chunk_size_lm_head (:obj:`int`, optional, defaults to 0):
The chunk size of the final language model feed forward head layer.
A chunk size of 0 means that the feed forward layer is not chunked.
A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
eos_token_id (:obj:`int`, optional, defaults to 2):
The token id for the <EOS> token.
feed_forward_size (:obj:`int`, optional, defaults to 512):
Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block.
hash_seed (:obj:`int`, optional, defaults to `None`):
Seed that can be used to make locality-sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposes. For evaluation and training purposes `hash_seed` should be set to `None` to ensure fully random rotations in the locality-sensitive hashing scheme.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
The non-linear activation function (function or string) in the feed forward layer in the residual attention block.
If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
hidden_size (:obj:`int`, optional, defaults to 256):
Dimensionality of the output hidden states of the residual attention blocks.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
is_decoder (:obj:`bool`, optional, defaults to False):
If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
local_chunk_length (:obj:`int`, optional, defaults to 64):
Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
local_num_chunks_before (:obj:`int`, optional, defaults to 1):
Number of previous neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
local_num_chunks_after (:obj:`int`, optional, defaults to 0):
Number of following neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities in LocalSelfAttention.
lsh_attn_chunk_length (:obj:`int`, optional, defaults to 64):
Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
lsh_num_chunks_before (:obj:`int`, optional, defaults to 1):
Number of previous neighbouring chunks to attend to in LSHSelfAttention layer in addition to itself.
lsh_num_chunks_after (:obj:`int`, optional, defaults to 0):
Number of following neighbouring chunks to attend to in LSHSelfAttention layer in addition to itself.
lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities in LSHSelfAttention.
max_position_embeddings (:obj:`int`, optional, defaults to 4096):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `None`):
Number of buckets the key query vectors can be "hashed into" using the locality-sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`.
The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors.
The number of buckets (or the product of the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is set to `None`, a good value for `num_buckets` is calculated on the fly.
num_hashes (:obj:`int`, optional, defaults to 1):
Number of hashing rounds (e.g. number of random rotations) in the locality-sensitive hashing scheme.
The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes.
pad_token_id (:obj:`int`, optional, defaults to 0):
The token id for the <PAD> token.
vocab_size (:obj:`int`, optional, defaults to 320):
Vocabulary size of the Reformer model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ReformerModel`.
Example::
>>> from transformers import ReformerModel, ReformerConfig
>>> # Initializing a Reformer configuration
>>> configuration = ReformerConfig()
>>> # Initializing a Reformer model
>>> model = ReformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "reformer"

View file

@ -28,47 +28,47 @@ RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RetriBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.RetriBertModel`.
It is used to instantiate a RetriBert model according to the specified arguments, defining the model
architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
share_encoders (:obj:`bool`, optional, defaults to True):
Whether to use the same Bert-type encoder for the queries and the documents
projection_dim (:obj:`int`, optional, defaults to 128):
Final dimension of the query and document representation after projection
"""
model_type = "retribert"

View file

@ -33,34 +33,33 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class RobertaConfig(BertConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
It reuses the same defaults. Please check the parent class for more information.
Example::
>>> from transformers import RobertaConfig, RobertaModel
>>> # Initializing a RoBERTa configuration
>>> configuration = RobertaConfig()
>>> # Initializing a model from the configuration
>>> model = RobertaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "roberta"
def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
"""Constructs RobertaConfig.
"""
"""Constructs RobertaConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

View file

@ -31,33 +31,33 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class T5Config(PretrainedConfig):
r"""
:class:`~transformers.T5Config` is the configuration class to store the configuration of a
`T5Model`.
:class:`~transformers.T5Config` is the configuration class to store the configuration of a
`T5Model`.
Arguments:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
d_model: Size of the encoder layers and the pooler layer. `d_model` can also be accessed via the property `hidden_size`.
num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`.
d_kv: Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
d_ff: Size of the intermediate feed forward layer in each `T5Block`.
num_heads: Number of attention heads for each attention layer in
the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
n_positions: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings`.
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`T5Model`.
initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
layer_norm_eps: The epsilon used by LayerNorm.
Arguments:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
d_model: Size of the encoder layers and the pooler layer. `d_model` can also be accessed via the property `hidden_size`.
num_layers: Number of hidden layers in the Transformer encoder. `num_layers` can also be accessed via the property `num_hidden_layers`.
d_kv: Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
d_ff: Size of the intermediate feed forward layer in each `T5Block`.
num_heads: Number of attention heads for each attention layer in
the Transformer encoder. `num_heads` can also be accessed via the property `num_attention_heads`.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
n_positions: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048). `n_positions` can also be accessed via the property `max_position_embeddings`.
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`T5Model`.
initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
layer_norm_eps: The epsilon used by LayerNorm.
"""
model_type = "t5"
@ -80,7 +80,10 @@ class T5Config(PretrainedConfig):
**kwargs
):
super().__init__(
pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, **kwargs,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
**kwargs,
)
self.vocab_size = vocab_size
self.n_positions = n_positions
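A short sketch of the constraint spelled out above, namely that `d_kv` has to equal `d_model // num_heads`. The concrete sizes are assumptions for illustration only::

    from transformers import T5Config

    d_model, num_heads = 512, 8
    config = T5Config(
        d_model=d_model,
        num_heads=num_heads,
        d_kv=d_model // num_heads,  # 64: per-head size of the key/query/value projections
        d_ff=2048,                  # intermediate feed-forward size in each T5Block
        num_layers=6,
    )
    assert config.d_kv * config.num_heads == config.d_model
    # the properties mentioned above alias the T5-specific names
    print(config.hidden_size, config.num_attention_heads, config.num_hidden_layers)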

View file

@ -31,84 +31,84 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class TransfoXLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 267735):
Vocabulary size of the Transformer XL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
Cutoffs for the adaptive softmax
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the model's hidden states.
d_embed (:obj:`int`, optional, defaults to 1024):
Dimensionality of the embeddings
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_head (:obj:`int`, optional, defaults to 64):
Dimensionality of the model's heads.
d_inner (:obj:`int`, optional, defaults to 4096):
Inner dimension in FF
div_val (:obj:`int`, optional, defaults to 4):
Dividend value for adaptive input and softmax
pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
Apply LayerNorm to the input instead of the output
n_layer (:obj:`int`, optional, defaults to 18):
Number of hidden layers in the Transformer encoder.
tgt_len (:obj:`int`, optional, defaults to 128):
Number of tokens to predict
ext_len (:obj:`int`, optional, defaults to 0):
Length of the extended context
mem_len (:obj:`int`, optional, defaults to 1600):
Length of the retained previous heads
clamp_len (:obj:`int`, optional, defaults to 1000):
use the same pos embeddings after clamp_len
same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
Use the same attn length for all tokens
proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
True to share all but first projs, False not to share.
attn_type (:obj:`int`, optional, defaults to 0):
Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
sample_softmax (:obj:`int`, optional, defaults to -1):
number of samples in sampled softmax
adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
use adaptive softmax
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
dropatt (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Untie relative position biases
init (:obj:`string`, optional, defaults to `normal`):
Parameter initializer to use
init_range (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by U(-init_range, init_range).
proj_init_std (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by N(0, init_std)
init_std (:obj:`float`, optional, defaults to 0.02):
Parameters initialized by N(0, init_std)
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
Args:
vocab_size (:obj:`int`, optional, defaults to 267735):
Vocabulary size of the Transformer XL model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
Cutoffs for the adaptive softmax
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the model's hidden states.
d_embed (:obj:`int`, optional, defaults to 1024):
Dimensionality of the embeddings
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_head (:obj:`int`, optional, defaults to 64):
Dimensionality of the model's heads.
d_inner (:obj:`int`, optional, defaults to 4096):
Inner dimension in FF
div_val (:obj:`int`, optional, defaults to 4):
Dividend value for adaptive input and softmax
pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
Apply LayerNorm to the input instead of the output
n_layer (:obj:`int`, optional, defaults to 18):
Number of hidden layers in the Transformer encoder.
tgt_len (:obj:`int`, optional, defaults to 128):
Number of tokens to predict
ext_len (:obj:`int`, optional, defaults to 0):
Length of the extended context
mem_len (:obj:`int`, optional, defaults to 1600):
Length of the retained previous heads
clamp_len (:obj:`int`, optional, defaults to 1000):
use the same pos embeddings after clamp_len
same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
Use the same attn length for all tokens
proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
True to share all but first projs, False not to share.
attn_type (:obj:`int`, optional, defaults to 0):
Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
sample_softmax (:obj:`int`, optional, defaults to -1):
number of samples in sampled softmax
adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
use adaptive softmax
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
dropatt (:obj:`float`, optional, defaults to 0):
The dropout ratio for the attention probabilities.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Untie relative position biases
init (:obj:`string`, optional, defaults to `normal`):
Parameter initializer to use
init_range (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by U(-init_range, init_range).
proj_init_std (:obj:`float`, optional, defaults to 0.01):
Parameters initialized by N(0, init_std)
init_std (:obj:`float`, optional, defaults to 0.02):
Parameters initialized by N(0, init_std)
layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
The epsilon to use in the layer normalization layers
Example::
Example::
>>> from transformers import TransfoXLConfig, TransfoXLModel
>>> from transformers import TransfoXLConfig, TransfoXLModel
>>> # Initializing a Transformer XL configuration
>>> configuration = TransfoXLConfig()
>>> # Initializing a Transformer XL configuration
>>> configuration = TransfoXLConfig()
>>> # Initializing a model from the configuration
>>> model = TransfoXLModel(configuration)
>>> # Initializing a model from the configuration
>>> model = TransfoXLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "transfo-xl"
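A minimal sketch of the memory and adaptive-softmax arguments documented above, assuming `TransfoXLConfig` is importable from `transformers`; the values are illustrative, not tuned settings::

    from transformers import TransfoXLConfig

    config = TransfoXLConfig(
        cutoffs=[20000, 40000, 200000],  # vocabulary cutoffs for the adaptive softmax
        div_val=4,                       # dividend for the adaptive input and softmax
        adaptive=True,                   # use the adaptive softmax
        mem_len=1600,                    # length of the retained previous hidden states
        clamp_len=1000,                  # reuse the same positional embeddings past this distance
        same_length=True,                # same attention length for all tokens
    )
    print(config.mem_len, config.cutoffs)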

View file

@ -29,116 +29,116 @@ logger = logging.get_logger(__name__)
class PretrainedConfig(object):
r""" Base class for all configuration classes.
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving
configurations.
r"""Base class for all configuration classes.
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving
configurations.
Note:
A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
initialize a model does **not** load the model weights.
It only affects the model's configuration.
Note:
A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
initialize a model does **not** load the model weights.
It only affects the model's configuration.
Class attributes (overridden by derived classes)
- **model_type** (:obj:`str`): An identifier for the model type, serialized into the JSON file, and used to
recreate the correct object in :class:`~transformers.AutoConfig`.
Class attributes (overridden by derived classes)
- **model_type** (:obj:`str`): An identifier for the model type, serialized into the JSON file, and used to
recreate the correct object in :class:`~transformers.AutoConfig`.
Args:
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return all hidden-states.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return all attentions.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values attentions (not used by all models).
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as an encoder/decoder or not.
is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as decoder or not (in which case it's used as an encoder).
add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether cross-attention layers should be added to the model. Note, this option is only relevant for models that can be used as decoder models within the `:class:~transformers.EncoderDecoderModel` class, which consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``.
tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder and decoder model to have the exact same parameter names.
prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`):
Pruned heads of the model. The keys are the selected layer indices and the associated values, the list
of heads to prune in said layer.
Args:
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return all hidden-states.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return all attentions.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values attentions (not used by all models).
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as an encoder/decoder or not.
is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as decoder or not (in which case it's used as an encoder).
add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether cross-attention layers should be added to the model. Note, this option is only relevant for models that can be used as decoder models within the `:class:~transformers.EncoderDecoderModel` class, which consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``.
tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder and decoder model to have the exact same parameter names.
prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`):
Pruned heads of the model. The keys are the selected layer indices and the associated values, the list
of heads to prune in said layer.
For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer
2.
xla_device (:obj:`bool`, `optional`):
A flag to indicate whether a TPU is available or not.
chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`):
The chunk size of all feed forward layers in the residual attention blocks.
A chunk size of :obj:`0` means that the feed forward layer is not chunked.
A chunk size of n means that the feed forward layer processes :obj:`n` < sequence_length embeddings at a time.
For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer
2.
xla_device (:obj:`bool`, `optional`):
A flag to indicate whether a TPU is available or not.
chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`):
The chunk size of all feed forward layers in the residual attention blocks.
A chunk size of :obj:`0` means that the feed forward layer is not chunked.
A chunk size of n means that the feed forward layer processes :obj:`n` < sequence_length embeddings at a time.
For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
Parameters for sequence generation
- **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by
default in the :obj:`generate` method of the model.
- **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by
default in the :obj:`generate` method of the model.
- **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in
the :obj:`generate` method of the model. Whether or not to use sampling; use greedy decoding otherwise.
- **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by
default in the :obj:`generate` method of the model. Whether to stop the beam search when at least
``num_beams`` sentences are finished per batch or not.
- **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be
used by default in the :obj:`generate` method of the model. 1 means no beam search.
- **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to modulate the next token
probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly
positive.
- **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to
keep for top-k-filtering that will be used by default in the :obj:`generate` method of the model.
- **top_p** (:obj:`float`, `optional`, defaults to 1) -- Value that will be used by default in the
:obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens
with probabilities that add up to ``top_p`` or higher are kept for generation.
- **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty
that will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty.
- **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that
will be used by default in the :obj:`generate` method of the model.
- **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default
in the :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of
that size can only occur once.
- **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be
generated that will be used by default in the :obj:`generate` method of the model. In order to get the
tokens of the words that should not appear in the generated text, use
:obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
- **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed
returned sequences for each element in the batch that will be used by default in the :obj:`generate`
method of the model.
Parameters for sequence generation
- **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by
default in the :obj:`generate` method of the model.
- **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by
default in the :obj:`generate` method of the model.
- **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in
the :obj:`generate` method of the model. Whether or not to use sampling; use greedy decoding otherwise.
- **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by
default in the :obj:`generate` method of the model. Whether to stop the beam search when at least
``num_beams`` sentences are finished per batch or not.
- **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be
used by default in the :obj:`generate` method of the model. 1 means no beam search.
- **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to modulate the next token
probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly
positive.
- **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to
keep for top-k-filtering that will be used by default in the :obj:`generate` method of the model.
- **top_p** (:obj:`float`, `optional`, defaults to 1) -- Value that will be used by default in the
:obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens
with probabilities that add up to ``top_p`` or higher are kept for generation.
- **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty
that will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty.
- **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that
will be used by default in the :obj:`generate` method of the model.
- **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default
in the :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of
that size can only occur once.
- **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be
generated that will be used by default in the :obj:`generate` method of the model. In order to get the
tokens of the words that should not appear in the generated text, use
:obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
- **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed
returned sequences for each element in the batch that will be used by default in the :obj:`generate`
method of the model.
Parameters for fine-tuning tasks
- **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the
model pretrained weights.
- **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be
used when converting from an original (TensorFlow or PyTorch) checkpoint.
- **id2label** (:obj:`List[str]`, `optional`) -- A map from index (for instance prediction index, or target
index) to label.
- **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model.
- **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model,
typically for a classification task.
- **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for
the current task.
Parameters for fine-tuning tasks
- **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the
model pretrained weights.
- **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be
used when converting from an original (TensorFlow or PyTorch) checkpoint.
- **id2label** (:obj:`List[str]`, `optional`) -- A map from index (for instance prediction index, or target
index) to label.
- **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model.
- **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model,
typically for a classification task.
- **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for
the current task.
Parameters linked to the tokenizer
- **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each
text before calling the model.
- **bos_token_id** (:obj:`int`, `optional`) -- The id of the `beginning-of-stream` token.
- **pad_token_id** (:obj:`int`, `optional`) -- The id of the `padding` token.
- **eos_token_id** (:obj:`int`, `optional`) -- The id of the `end-of-stream` token.
- **decoder_start_token_id** (:obj:`int`, `optional`) -- If an encoder-decoder model starts decoding with
a different token than `bos`, the id of that token.
Parameters linked to the tokenizer
- **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each
text before calling the model.
- **bos_token_id** (:obj:`int`, `optional`) -- The id of the `beginning-of-stream` token.
- **pad_token_id** (:obj:`int`, `optional`) -- The id of the `padding` token.
- **eos_token_id** (:obj:`int`, `optional`) -- The id of the `end-of-stream` token.
- **decoder_start_token_id** (:obj:`int`, `optional`) -- If an encoder-decoder model starts decoding with
a different token than `bos`, the id of that token.
PyTorch specific parameters
- **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be
used with Torchscript.
- **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has an output word embedding layer.
PyTorch specific parameters
- **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be
used with Torchscript.
- **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the model has an output word embedding layer.
TensorFlow specific parameters
- **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should
use BFloat16 scalars (only used by some TensorFlow models).
TensorFlow specific parameters
- **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should
use BFloat16 scalars (only used by some TensorFlow models).
"""
model_type: str = ""
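Because every model configuration derives from this base class, the generation defaults listed above can be stored directly on a concrete config. A minimal sketch using :class:`~transformers.GPT2Config` as an arbitrary subclass; the values are illustrative assumptions::

    from transformers import GPT2Config

    config = GPT2Config(
        max_length=40,              # default picked up by generate() when no max_length is passed
        num_beams=4,                # default number of beams for generate()
        no_repeat_ngram_size=3,     # by default, no 3-gram may repeat during generation
        output_hidden_states=True,  # ask the model to return all hidden states
    )
    print(config.num_beams, config.output_hidden_states)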

View file

@ -36,120 +36,120 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLMConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
It is used to instantiate an XLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the XLM model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
The non-linear activation function (function or string) in the
encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`int`, optional, defaults to 50257):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
Args:
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the XLM model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
emb_dim (:obj:`int`, optional, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for the attention mechanism
gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
The non-linear activation function (function or string) in the
encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
causal (:obj:`boolean`, optional, defaults to :obj:`False`):
Set this to `True` for the model to behave in a causal manner.
Causal models use a triangular attention mask in order to only attend to the left-side context instead
of a bidirectional context.
asm (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
layer.
n_langs (:obj:`int`, optional, defaults to 1):
The number of languages the model handles. Set to 1 for monolingual models.
use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
Whether to use language embeddings. Some models use additional language embeddings, see
`the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
for information on how to use them.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
The standard deviation of the truncated_normal_initializer for
initializing the embedding matrices.
init_std (:obj:`int`, optional, defaults to 50257):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices except the embedding matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
bos_index (:obj:`int`, optional, defaults to 0):
The index of the beginning of sentence token in the vocabulary.
eos_index (:obj:`int`, optional, defaults to 1):
The index of the end of sentence token in the vocabulary.
pad_index (:obj:`int`, optional, defaults to 2):
The index of the padding token in the vocabulary.
unk_index (:obj:`int`, optional, defaults to 3):
The index of the unknown token in the vocabulary.
mask_index (:obj:`int`, optional, defaults to 5):
The index of the masking token in the vocabulary.
is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
summary_type (:obj:`string`, optional, defaults to "first"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a dropout before the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
mask_token_id (:obj:`int`, optional, defaults to 0):
Model agnostic parameter to identify masked tokens when generating text in an MLM context.
lang_id (:obj:`int`, optional, defaults to 1):
The ID of the language used by the model. This parameter is used when generating
text in a given language.
Example::
Example::
>>> from transformers import XLMConfig, XLMModel
>>> from transformers import XLMConfig, XLMModel
>>> # Initializing a XLM configuration
>>> configuration = XLMConfig()
>>> # Initializing a XLM configuration
>>> configuration = XLMConfig()
>>> # Initializing a model from the configuration
>>> model = XLMModel(configuration)
>>> # Initializing a model from the configuration
>>> model = XLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlm"
@ -191,8 +191,7 @@ class XLMConfig(PretrainedConfig):
bos_token_id=0,
**kwargs
):
"""Constructs XLMConfig.
"""
"""Constructs XLMConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.vocab_size = vocab_size
self.emb_dim = emb_dim
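A short sketch of the language-embedding arguments described above, assuming `XLMConfig` is importable from `transformers`; the values are illustrative only::

    from transformers import XLMConfig

    config = XLMConfig(
        n_langs=2,          # the model handles two languages
        use_lang_emb=True,  # add language embeddings on top of the token embeddings
        causal=False,       # bidirectional attention rather than a triangular causal mask
        lang_id=1,          # language id used when generating text
    )
    print(config.n_langs, config.use_lang_emb)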

View file

@ -31,104 +31,104 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class XLNetConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
It is used to instantiate an XLNet model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
It is used to instantiate an XLNet model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 32000):
Vocabulary size of the XLNet model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_inner (:obj:`int`, optional, defaults to 4096):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
ff_activation (:obj:`string`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Untie relative position biases
attn_type (:obj:`string`, optional, defaults to "bi"):
The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens to cache. The key/value pairs that have already been pre-computed
in a previous forward pass won't be re-computed. See the
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
for more information.
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens in the current batch to be cached and reused in the future.
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use bidirectional input pipeline. Usually set to `True` during
pretraining and `False` during finetuning.
clamp_len (:obj:`int`, optional, defaults to -1):
Clamp all relative distances larger than clamp_len.
Setting this attribute to -1 means no clamping.
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use the same attention length for each token.
summary_type (:obj:`string`, optional, defaults to "last"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Is one of the following options:
Args:
vocab_size (:obj:`int`, optional, defaults to 32000):
Vocabulary size of the XLNet model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
d_model (:obj:`int`, optional, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
n_layer (:obj:`int`, optional, defaults to 24):
Number of hidden layers in the Transformer encoder.
n_head (:obj:`int`, optional, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_inner (:obj:`int`, optional, defaults to 4096):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
ff_activation (:obj:`string`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
Untie relative position biases
attn_type (:obj:`string`, optional, defaults to "bi"):
The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
dropout (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens to cache. The key/value pairs that have already been pre-computed
in a previous forward pass won't be re-computed. See the
`quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
for more information.
reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
The number of tokens in the current batch to be cached and reused in the future.
bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use bidirectional input pipeline. Usually set to `True` during
pretraining and `False` during finetuning.
clamp_len (:obj:`int`, optional, defaults to -1):
Clamp all relative distances larger than clamp_len.
Setting this attribute to -1 means no clamping.
same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
Whether to use the same attention length for each token.
summary_type (:obj:`string`, optional, defaults to "last"):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Is one of the following options:
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a dropout after the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last pre-computed hidden states.
- 'last' => take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a projection after the vector extraction
summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
'tanh' => add a tanh activation to the output, Other => no activation.
summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
Argument used when doing sequence summary. Used for the multiple choice head in
:class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
Add a dropout after the projection and activation
start_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
end_n_top (:obj:`int`, optional, defaults to 5):
Used in the SQuAD evaluation script for XLM and XLNet.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last pre-computed hidden states.
.. note::
This flag behaves differently from other models: it just controls the inference behavior; during
training the model always uses ``use_cache=True``.
.. note::
This flag behaves differently from other models: it just controls the inference behavior; during
training the model always uses ``use_cache=True``.
Example::
Example::
>>> from transformers import XLNetConfig, XLNetModel
>>> from transformers import XLNetConfig, XLNetModel
>>> # Initializing a XLNet configuration
>>> configuration = XLNetConfig()
>>> # Initializing a XLNet configuration
>>> configuration = XLNetConfig()
>>> # Initializing a model from the configuration
>>> model = XLNetModel(configuration)
>>> # Initializing a model from the configuration
>>> model = XLNetModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlnet"
@ -162,8 +162,7 @@ class XLNetConfig(PretrainedConfig):
eos_token_id=2,
**kwargs
):
"""Constructs XLNetConfig.
"""
"""Constructs XLNetConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.d_model = d_model
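A minimal sketch of the caching arguments documented above, assuming `XLNetConfig` is importable from `transformers`; the concrete values are assumptions for illustration::

    from transformers import XLNetConfig

    config = XLNetConfig(
        mem_len=1024,    # cache up to 1024 tokens of pre-computed key/value pairs
        clamp_len=-1,    # -1 disables clamping of relative distances
        bi_data=False,   # unidirectional input pipeline, as typically used for fine-tuning
        attn_type="bi",  # 'bi' for XLNet, 'uni' for Transformer-XL style attention
    )
    print(config.mem_len, config.attn_type)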

View file

@ -27,5 +27,6 @@ if __name__ == "__main__":
checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
convert_dialogpt_checkpoint(
checkpoint_path, pytorch_dump_folder_path,
checkpoint_path,
pytorch_dump_folder_path,
)
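For reference, the path construction above expands as follows for one of the DialoGPT sizes; the `MODEL` value and checkpoint directory are hypothetical::

    import os

    MODEL = "medium"                          # assumed size: "small", "medium" or "large"
    dialogpt_path = "./dialogpt_checkpoints"  # assumed location of the original .pkl files

    checkpoint_path = os.path.join(dialogpt_path, f"{MODEL}_ft.pkl")
    pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
    print(checkpoint_path, pytorch_dump_folder_path)
    # ./dialogpt_checkpoints/medium_ft.pkl ./DialoGPT-medium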

View file

@ -38,24 +38,39 @@ class OnnxConverterArgumentParser(ArgumentParser):
super().__init__("ONNX Converter")
self.add_argument(
"--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction",
"--pipeline",
type=str,
choices=SUPPORTED_PIPELINES,
default="feature-extraction",
)
self.add_argument(
"--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)",
"--model",
type=str,
required=True,
help="Model's id or path (ex: bert-base-cased)",
)
self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)")
self.add_argument(
"--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model",
"--framework",
type=str,
choices=["pt", "tf"],
help="Framework for loading the model",
)
self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
self.add_argument(
"--check-loading", action="store_true", help="Check ONNX is able to load the model",
"--check-loading",
action="store_true",
help="Check ONNX is able to load the model",
)
self.add_argument(
"--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb",
"--use-external-format",
action="store_true",
help="Allow exporting model >= than 2Gb",
)
self.add_argument(
"--quantize", action="store_true", help="Quantize the neural network to be run with int8",
"--quantize",
action="store_true",
help="Quantize the neural network to be run with int8",
)
self.add_argument("output")
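A hedged sketch of driving the parser defined above; the model name, opset and output path are placeholder assumptions, and the snippet presumes `OnnxConverterArgumentParser` from this file is in scope::

    parser = OnnxConverterArgumentParser()
    args = parser.parse_args([
        "--pipeline", "feature-extraction",
        "--model", "bert-base-cased",
        "--framework", "pt",          # load the PyTorch implementation of the model
        "--opset", "11",              # ONNX opset version to target
        "--quantize",                 # also emit an int8-quantized copy of the graph
        "onnx/bert-base-cased.onnx",  # the positional "output" argument
    ])
    print(args.model, args.opset, args.quantize)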
@ -376,7 +391,10 @@ def quantize(onnx_model_path: Path) -> Path:
)
quantized_model = quantize(
model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True,
model=onnx_model,
quantization_mode=QuantizationMode.IntegerOps,
force_fusions=True,
symmetric_weight=True,
)
# Append "-quantized" at the end of the model's name

View file

@ -255,7 +255,11 @@ license: apache-2.0
def write_model_card(
hf_model_name: str, repo_root="OPUS-MT-train", save_dir=Path("marian_converted"), dry_run=False, extra_metadata={},
hf_model_name: str,
repo_root="OPUS-MT-train",
save_dir=Path("marian_converted"),
dry_run=False,
extra_metadata={},
) -> str:
"""Copy the most recent model's readme section from opus, and add metadata.
upload command: aws s3 sync model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
@ -604,7 +608,9 @@ class OpusState:
assert "hidden_size" not in cfg.to_dict()
load_layers_(
model.model.encoder.layers, state_dict, BART_CONVERTER,
model.model.encoder.layers,
state_dict,
BART_CONVERTER,
)
load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True)
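A hedged usage sketch for the `write_model_card` signature shown earlier in this file; the model id is a placeholder, and `dry_run=True` is assumed to skip writing the card to disk::

    from pathlib import Path

    card_text = write_model_card(
        hf_model_name="Helsinki-NLP/opus-mt-en-de",  # hypothetical converted model id
        repo_root="OPUS-MT-train",                   # local clone holding the OPUS readmes
        save_dir=Path("marian_converted"),
        dry_run=True,
        extra_metadata={"license": "apache-2.0"},
    )
    print(card_text[:200])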

View file

@ -108,7 +108,12 @@ if is_torch_available():
logging.set_verbosity_info()
MODEL_CLASSES = {
"bert": (BertConfig, TFBertForPreTraining, BertForPreTraining, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"bert": (
BertConfig,
TFBertForPreTraining,
BertForPreTraining,
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"bert-large-uncased-whole-word-masking-finetuned-squad": (
BertConfig,
TFBertForQuestionAnswering,
@ -127,9 +132,24 @@ MODEL_CLASSES = {
BertForSequenceClassification,
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"gpt2": (GPT2Config, TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"xlnet": (XLNetConfig, TFXLNetLMHeadModel, XLNetLMHeadModel, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"xlm": (XLMConfig, TFXLMWithLMHeadModel, XLMWithLMHeadModel, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"gpt2": (
GPT2Config,
TFGPT2LMHeadModel,
GPT2LMHeadModel,
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"xlnet": (
XLNetConfig,
TFXLNetLMHeadModel,
XLNetLMHeadModel,
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"xlm": (
XLMConfig,
TFXLMWithLMHeadModel,
XLMWithLMHeadModel,
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"xlm-roberta": (
XLMRobertaConfig,
TFXLMRobertaForMaskedLM,
@ -148,7 +168,12 @@ MODEL_CLASSES = {
OpenAIGPTLMHeadModel,
OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"roberta": (RobertaConfig, TFRobertaForMaskedLM, RobertaForMaskedLM, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"roberta": (
RobertaConfig,
TFRobertaForMaskedLM,
RobertaForMaskedLM,
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"roberta-large-mnli": (
RobertaConfig,
TFRobertaForSequenceClassification,
@ -179,10 +204,30 @@ MODEL_CLASSES = {
DistilBertForQuestionAnswering,
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"ctrl": (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"albert": (AlbertConfig, TFAlbertForPreTraining, AlbertForPreTraining, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"t5": (T5Config, TFT5ForConditionalGeneration, T5ForConditionalGeneration, T5_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"electra": (ElectraConfig, TFElectraForPreTraining, ElectraForPreTraining, ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,),
"ctrl": (
CTRLConfig,
TFCTRLLMHeadModel,
CTRLLMHeadModel,
CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"albert": (
AlbertConfig,
TFAlbertForPreTraining,
AlbertForPreTraining,
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"t5": (
T5Config,
TFT5ForConditionalGeneration,
T5ForConditionalGeneration,
T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
"electra": (
ElectraConfig,
TFElectraForPreTraining,
ElectraForPreTraining,
ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
),
}
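The MODEL_CLASSES registry reformatted above maps a shortcut name to a (config class, TF class, PyTorch class, pretrained-config archive map) tuple. A hedged sketch of how such a registry entry is typically consumed inside the conversion script; the "bert" key and checkpoint id are illustrative only.
# Illustrative only: unpack one entry of the MODEL_CLASSES dict defined above.
config_class, tf_model_class, pt_model_class, archive_map = MODEL_CLASSES["bert"]
config = config_class.from_pretrained("bert-base-cased")
pt_model = pt_model_class.from_pretrained("bert-base-cased", config=config)
tf_model = tf_model_class.from_pretrained("bert-base-cased", config=config, from_pt=True)  # load PT weights into the TF class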


@ -49,10 +49,12 @@ def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size):
torch.tensor(np_query_key).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.value,
torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
torch_layer.output.dense,
torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
)
@ -64,16 +66,20 @@ def set_layer_weights_in_torch_local(weights, torch_layer, hidden_size):
np_dense = np.asarray(weights[3])
set_param(
torch_layer.self_attention.query, torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.query,
torch.tensor(np_query).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.self_attention.key, torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.key,
torch.tensor(np_key).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.self_attention.value, torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
torch_layer.self_attention.value,
torch.tensor(np_value).transpose(1, 2).contiguous().view(-1, hidden_size),
)
set_param(
torch_layer.output.dense, torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
torch_layer.output.dense,
torch.tensor(np_dense).view(-1, hidden_size).contiguous().transpose(0, 1),
)
@ -83,7 +89,9 @@ def set_block_weights_in_torch(weights, torch_block, hidden_size):
layer_norm_1_weight = np.asarray(layer_norm_1[0])
layer_norm_1_bias = np.asarray(layer_norm_1[1])
set_param(
torch_block.attention.layer_norm, torch.tensor(layer_norm_1_weight), torch.tensor(layer_norm_1_bias),
torch_block.attention.layer_norm,
torch.tensor(layer_norm_1_weight),
torch.tensor(layer_norm_1_bias),
)
# lsh weights + output
@ -104,7 +112,9 @@ def set_block_weights_in_torch(weights, torch_block, hidden_size):
layer_norm_2_weight = np.asarray(intermediate_weights[0][0])
layer_norm_2_bias = np.asarray(intermediate_weights[0][1])
set_param(
torch_block.feed_forward.layer_norm, torch.tensor(layer_norm_2_weight), torch.tensor(layer_norm_2_bias),
torch_block.feed_forward.layer_norm,
torch.tensor(layer_norm_2_weight),
torch.tensor(layer_norm_2_bias),
)
# intermediate dense
@ -133,7 +143,8 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):
# word embeds
word_embeddings = np.asarray(weights[1])
set_param(
torch_model_reformer.embeddings.word_embeddings, torch.tensor(word_embeddings),
torch_model_reformer.embeddings.word_embeddings,
torch.tensor(word_embeddings),
)
if isinstance(weights[3], tuple):


@ -86,7 +86,10 @@ class GlueDataset(Dataset):
cached_features_file = os.path.join(
cache_dir if cache_dir is not None else args.data_dir,
"cached_{}_{}_{}_{}".format(
mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
mode.value,
tokenizer.__class__.__name__,
str(args.max_seq_length),
args.task_name,
),
)
label_list = self.processor.get_labels()


@ -21,7 +21,11 @@ class TextDataset(Dataset):
"""
def __init__(
self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False,
self,
tokenizer: PreTrainedTokenizer,
file_path: str,
block_size: int,
overwrite_cache=False,
):
assert os.path.isfile(file_path), f"Input file path {file_path} not found"
@ -29,7 +33,12 @@ class TextDataset(Dataset):
directory, filename = os.path.split(file_path)
cached_features_file = os.path.join(
directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
directory,
"cached_lm_{}_{}_{}".format(
tokenizer.__class__.__name__,
str(block_size),
filename,
),
)
# Make sure only the first process in distributed training processes the dataset,


@ -119,7 +119,10 @@ class SquadDataset(Dataset):
cached_features_file = os.path.join(
cache_dir if cache_dir is not None else args.data_dir,
"cached_{}_{}_{}_{}".format(
mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), version_tag,
mode.value,
tokenizer.__class__.__name__,
str(args.max_seq_length),
version_tag,
),
)


@ -589,10 +589,10 @@ def compute_predictions_log_probs(
tokenizer,
verbose_logging,
):
""" XLNet write prediction logic (more complex than Bert's).
Write final predictions to the json file and log-odds of null if needed.
"""XLNet write prediction logic (more complex than Bert's).
Write final predictions to the json file and log-odds of null if needed.
Requires utils_squad_evaluate.py
Requires utils_squad_evaluate.py
"""
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]


@ -69,7 +69,10 @@ def glue_convert_examples_to_features(
if is_tf_available():
def _tf_glue_convert_examples_to_features(
examples: tf.data.Dataset, tokenizer: PreTrainedTokenizer, task=str, max_length: Optional[int] = None,
examples: tf.data.Dataset,
tokenizer: PreTrainedTokenizer,
task=str,
max_length: Optional[int] = None,
) -> tf.data.Dataset:
"""
Returns:


@ -269,7 +269,9 @@ class SingleSentenceClassificationProcessor(DataProcessor):
logger.info("Tokenizing example %d", ex_index)
input_ids = tokenizer.encode(
example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len),
example.text_a,
add_special_tokens=True,
max_length=min(max_length, tokenizer.max_len),
)
all_input_ids.append(input_ids)


@ -329,7 +329,13 @@ class TFGenerationMixin:
if self.config.is_encoder_decoder:
# create empty decoder_input_ids
input_ids = tf.ones((effective_batch_size * num_beams, 1), dtype=tf.int32,) * decoder_start_token_id
input_ids = (
tf.ones(
(effective_batch_size * num_beams, 1),
dtype=tf.int32,
)
* decoder_start_token_id
)
cur_len = 1
assert (
@ -422,8 +428,8 @@ class TFGenerationMixin:
attention_mask,
use_cache,
):
""" Generate sequences for each example without beam search (num_beams == 1).
All returned sequences are generated independently.
"""Generate sequences for each example without beam search (num_beams == 1).
All returned sequences are generated independently.
"""
# length of generated sentences / unfinished sentences
@ -587,8 +593,7 @@ class TFGenerationMixin:
attention_mask,
use_cache,
):
""" Generate sequences for each example with beam search.
"""
"""Generate sequences for each example with beam search."""
# generated hypotheses
generated_hyps = [
@ -960,14 +965,14 @@ def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
""" Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
Make sure we keep at least min_tokens_to_keep per batch example in the output
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
Make sure we keep at least min_tokens_to_keep per batch example in the output
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""
logits_shape = shape_list(logits)
@ -1001,7 +1006,8 @@ def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("In
# Shift the indices to the right to keep also the first token above the threshold
sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1)
sorted_indices_to_remove = tf.concat(
[tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1,
[tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]],
-1,
)
# scatter sorted tensors to original indexing
indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices)
@ -1027,9 +1033,9 @@ def set_tensor_by_indices_to_value(tensor, indices, value):
def sample_without_replacement(logits, num_samples):
"""
categorical sampling without replacement is currently not implemented
the gumbel-max trick will do for now
see https://github.com/tensorflow/tensorflow/issues/9260 for more info
categorical sampling without replacement is currently not implemented
the gumbel-max trick will do for now
see https://github.com/tensorflow/tensorflow/issues/9260 for more info
"""
z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
_, indices = tf.nn.top_k(logits + z, num_samples)
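The helper above draws several distinct indices in one shot by perturbing the logits with random noise and taking a top-k, as discussed in the linked TensorFlow issue. As a toy illustration of the underlying Gumbel-max idea (plain NumPy, not the library's implementation):
# Toy Gumbel-max illustration: two distinct samples, drawn proportionally to the logits, no replacement.
import numpy as np
rng = np.random.default_rng(0)
logits = np.log(np.array([0.5, 0.3, 0.2]))
gumbel = -np.log(-np.log(rng.uniform(size=logits.shape)))
picked = np.argsort(logits + gumbel)[::-1][:2]  # indices of the two "winners"
print(picked)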


@ -83,7 +83,11 @@ class GenerationMixin:
# repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
if repetition_penalty != 1.0:
self.enforce_repetition_penalty_(
scores, batch_size, num_beams, input_ids, repetition_penalty,
scores,
batch_size,
num_beams,
input_ids,
repetition_penalty,
)
# set eos token prob to zero if min_length is not reached
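The repetition-penalty branch above, together with the minimum-length and banned-token handling around it, is driven entirely by keyword arguments of generate(). A hedged end-to-end sketch, assuming a GPT-2 checkpoint can be downloaded; the prompt and settings are placeholders.
# Hedged sketch: exercise the repetition_penalty / min_length branches through generate().
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
input_ids = tokenizer("The quick brown fox", return_tensors="pt")["input_ids"]
output_ids = model.generate(
    input_ids, max_length=30, min_length=10, num_beams=4, repetition_penalty=1.2, early_stopping=True
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))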
@ -324,7 +328,10 @@ class GenerationMixin:
"or a `bos_token_id` (integer >= 0) as a first token to start the generation."
)
input_ids = torch.full(
(batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device,
(batch_size, 1),
bos_token_id,
dtype=torch.long,
device=next(self.parameters()).device,
)
else:
assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."
@ -514,8 +521,8 @@ class GenerationMixin:
use_cache,
model_specific_kwargs,
):
""" Generate sequences for each example without beam search (num_beams == 1).
All returned sequences are generated independently.
"""Generate sequences for each example without beam search (num_beams == 1).
All returned sequences are generated independently.
"""
# length of generated sentences / unfinished sentences
unfinished_sents = input_ids.new(batch_size).fill_(1)
@ -619,8 +626,7 @@ class GenerationMixin:
use_cache,
model_specific_kwargs,
):
""" Generate sequences for each example with beam search.
"""
"""Generate sequences for each example with beam search."""
# generated hypotheses
generated_hyps = [
@ -749,7 +755,8 @@ class GenerationMixin:
if is_beam_token_worse_than_top_num_beams:
continue
generated_hyps[batch_idx].add(
input_ids[effective_beam_id].clone(), beam_token_score.item(),
input_ids[effective_beam_id].clone(),
beam_token_score.item(),
)
else:
# add next predicted token since it is not eos_token
@ -806,7 +813,8 @@ class GenerationMixin:
assert torch.all(
next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx]
), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format(
next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx],
next_scores[:, :num_beams][batch_idx],
beam_scores.view(batch_size, num_beams)[batch_idx],
)
# need to add best num_beams hypotheses to generated hyps
@ -916,7 +924,7 @@ def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iter
def set_scores_to_inf_for_banned_tokens(scores: torch.Tensor, banned_tokens: List[List[int]]) -> None:
""" Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be
"""Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be
a list of lists of banned tokens to ban in the format [[batch index, vocabulary position],...]
Args:
scores: logits distribution of shape (batch size, vocabulary size)
@ -946,14 +954,14 @@ def top_k_top_p_filtering(
filter_value: float = -float("Inf"),
min_tokens_to_keep: int = 1,
) -> Tensor:
""" Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
Make sure we keep at least min_tokens_to_keep per batch example in the output
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
Make sure we keep at least min_tokens_to_keep per batch example in the output
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""
if top_k > 0:
top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check
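To make the filtering rules in this docstring concrete, here is a minimal self-contained PyTorch sketch of top-k plus nucleus (top-p) filtering followed by sampling. It mirrors the behaviour described above but is not the library function itself.
# Minimal sketch of top-k / top-p filtering; illustrative, not the library implementation.
import torch
def filter_logits(logits, top_k=0, top_p=1.0, filter_value=float("-inf")):
    if top_k > 0:  # keep only the top_k highest-scoring tokens
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits = logits.masked_fill(logits < kth_best, filter_value)
    if top_p < 1.0:  # keep the smallest sorted prefix whose cumulative probability exceeds top_p
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()  # shift right so the first token above the threshold is kept
        remove[..., 0] = False
        logits = logits.masked_fill(remove.scatter(-1, sorted_idx, remove), filter_value)
    return logits
probs = torch.softmax(filter_logits(torch.randn(1, 100), top_k=10, top_p=0.9), dim=-1)
next_token = torch.multinomial(probs, num_samples=1)  # sample from the filtered distribution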


@ -36,20 +36,20 @@ logger = logging.get_logger(__name__)
class ModelCard:
r""" Structured Model Card class.
Store model card as well as methods for loading/downloading/saving model cards.
r"""Structured Model Card class.
Store model card as well as methods for loading/downloading/saving model cards.
Please read the following paper for details and explanation on the sections:
"Model Cards for Model Reporting"
by Margaret Mitchell, Simone Wu,
Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
Link: https://arxiv.org/abs/1810.03993
Please read the following paper for details and explanation on the sections:
"Model Cards for Model Reporting"
by Margaret Mitchell, Simone Wu,
Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
Link: https://arxiv.org/abs/1810.03993
Note:
A model card can be loaded and saved to disk.
Note:
A model card can be loaded and saved to disk.
Parameters:
Parameters:
"""
def __init__(self, **kwargs):
@ -73,8 +73,7 @@ class ModelCard:
raise err
def save_pretrained(self, save_directory_or_file):
""" Save a model card object to the directory or file `save_directory_or_file`.
"""
"""Save a model card object to the directory or file `save_directory_or_file`."""
if os.path.isdir(save_directory_or_file):
# If we save using the predefined names, we can load using `from_pretrained`
output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME)
@ -86,7 +85,7 @@ class ModelCard:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
r"""Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
Parameters:
pretrained_model_name_or_path: either:
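A hedged sketch of the round trip this class supports; the constructor fields and the directory name below are arbitrary examples, not fields the library requires.
# Sketch: create, save and reload a model card; field names and path are illustrative only.
from transformers import ModelCard
card = ModelCard(model_details={"language": "en"}, intended_use="demo purposes")
card.save_pretrained("model_card_dir")  # writes the default model card file inside the directory (or to this path if it is not a directory)
reloaded = ModelCard.from_pretrained("model_card_dir")
print(reloaded.intended_use)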


@ -302,7 +302,10 @@ class AlbertLayer(nn.Module):
attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
ffn_output = apply_chunking_to_forward(
self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output[0],
self.ff_chunk,
self.chunk_size_feed_forward,
self.seq_len_dim,
attention_output[0],
)
hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])
@ -397,8 +400,8 @@ class AlbertTransformer(nn.Module):
class AlbertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = AlbertConfig
@ -406,8 +409,7 @@ class AlbertPreTrainedModel(PreTrainedModel):
authorized_missing_keys = [r"position_ids"]
def _init_weights(self, module):
""" Initialize the weights.
"""
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
@ -543,17 +545,17 @@ class AlbertModel(AlbertPreTrainedModel):
return self.embeddings.word_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups.
If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
is a total of 4 different layers.
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups.
If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
is a total of 4 different layers.
These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
while [2,3] correspond to the two inner groups of the second hidden layer.
These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
while [2,3] correspond to the two inner groups of the second hidden layer.
Any layer with in index other than [0,1,2,3] will result in an error.
See base class PreTrainedModel for more information about head pruning
Any layer with in index other than [0,1,2,3] will result in an error.
See base class PreTrainedModel for more information about head pruning
"""
for layer, heads in heads_to_prune.items():
group_idx = int(layer / self.config.inner_group_num)
@ -672,34 +674,34 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
**kwargs,
):
r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates original order (sequence A, then sequence B),
``1`` indicates switched order (sequence B, then sequence A).
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates original order (sequence A, then sequence B),
``1`` indicates switched order (sequence B, then sequence A).
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:
Returns:
Examples::
Examples::
>>> from transformers import AlbertTokenizer, AlbertForPreTraining
>>> import torch
>>> from transformers import AlbertTokenizer, AlbertForPreTraining
>>> import torch
>>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
>>> model = AlbertForPreTraining.from_pretrained('albert-base-v2', return_dict=True)
>>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
>>> model = AlbertForPreTraining.from_pretrained('albert-base-v2', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_logits = outputs.prediction_logits
>>> sop_logits = outputs.sop_logits
>>> prediction_logits = outputs.prediction_logits
>>> sop_logits = outputs.sop_logits
"""
@ -787,7 +789,8 @@ class AlbertSOPHead(nn.Module):
@add_start_docstrings(
"Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING,
"Albert Model with a `language modeling` head on top.",
ALBERT_START_DOCSTRING,
)
class AlbertForMaskedLM(AlbertPreTrainedModel):
def __init__(self, config):
@ -952,7 +955,10 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@ -1033,7 +1039,10 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@ -1215,5 +1224,8 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
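The head-pruning scheme spelled out in the _prune_heads docstring above (flattened group/inner-group indices) can be exercised directly. A hedged sketch with a deliberately tiny, illustrative configuration:
# Sketch: toy ALBERT with 2 hidden groups x 2 inner groups, i.e. flattened layer indices [0, 1, 2, 3].
from transformers import AlbertConfig, AlbertModel
config = AlbertConfig(
    hidden_size=256, num_attention_heads=4, intermediate_size=512, embedding_size=64,
    num_hidden_layers=12, num_hidden_groups=2, inner_group_num=2,
)
model = AlbertModel(config)
model.prune_heads({0: [0], 3: [1, 2]})  # any index outside [0, 1, 2, 3] would be an error, per the docstring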


@ -374,12 +374,12 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
class AutoModel:
r"""
:class:`~transformers.AutoModel` is a generic model class
that will be instantiated as one of the base model classes of the library
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
or the `AutoModel.from_config(config)` class methods.
:class:`~transformers.AutoModel` is a generic model class
that will be instantiated as one of the base model classes of the library
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
or the `AutoModel.from_config(config)` class methods.
This class cannot be instantiated using `__init__()` (throws an error).
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
@ -391,7 +391,7 @@ class AutoModel:
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
r"""Instantiates one of the base model classes of the library
from a configuration.
Note:
@ -433,7 +433,7 @@ class AutoModel:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the base model classes of the library
r"""Instantiates one of the base model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
@ -531,11 +531,11 @@ class AutoModel:
class AutoModelForPreTraining:
r"""
:class:`~transformers.AutoModelForPreTraining` is a generic model class
that will be instantiated as one of the model classes of the library (with the architecture used for pretraining this model) when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
class method.
:class:`~transformers.AutoModelForPreTraining` is a generic model class
that will be instantiated as one of the model classes of the library (with the architecture used for pretraining this model) when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
@ -547,7 +547,7 @@ class AutoModelForPreTraining:
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
r"""Instantiates one of the base model classes of the library
from a configuration.
Note:
@ -589,7 +589,7 @@ class AutoModelForPreTraining:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.
r"""Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
@ -680,12 +680,12 @@ class AutoModelForPreTraining:
class AutoModelWithLMHead:
r"""
:class:`~transformers.AutoModelWithLMHead` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
class method.
:class:`~transformers.AutoModelWithLMHead` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
@ -697,7 +697,7 @@ class AutoModelWithLMHead:
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
r"""Instantiates one of the base model classes of the library
from a configuration.
Note:
@ -743,7 +743,7 @@ class AutoModelWithLMHead:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the language modeling model classes of the library
r"""Instantiates one of the language modeling model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
@ -839,12 +839,12 @@ class AutoModelWithLMHead:
class AutoModelForCausalLM:
r"""
:class:`~transformers.AutoModelForCausalLM` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
class method.
:class:`~transformers.AutoModelForCausalLM` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
@ -856,7 +856,7 @@ class AutoModelForCausalLM:
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
r"""Instantiates one of the base model classes of the library
from a configuration.
Note:
@ -893,7 +893,7 @@ class AutoModelForCausalLM:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the language modeling model classes of the library
r"""Instantiates one of the language modeling model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
@ -976,12 +976,12 @@ class AutoModelForCausalLM:
class AutoModelForMaskedLM:
r"""
:class:`~transformers.AutoModelForMaskedLM` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
class method.
:class:`~transformers.AutoModelForMaskedLM` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
@ -993,7 +993,7 @@ class AutoModelForMaskedLM:
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
r"""Instantiates one of the base model classes of the library
from a configuration.
Note:
@ -1033,7 +1033,7 @@ class AutoModelForMaskedLM:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the language modeling model classes of the library
r"""Instantiates one of the language modeling model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
@ -1119,12 +1119,12 @@ class AutoModelForMaskedLM:
class AutoModelForSeq2SeqLM:
r"""
:class:`~transformers.AutoModelForSeq2SeqLM` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
class method.
:class:`~transformers.AutoModelForSeq2SeqLM` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
@ -1136,7 +1136,7 @@ class AutoModelForSeq2SeqLM:
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
r"""Instantiates one of the base model classes of the library
from a configuration.
Note:
@ -1172,7 +1172,7 @@ class AutoModelForSeq2SeqLM:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the language modeling model classes of the library
r"""Instantiates one of the language modeling model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
@ -1254,12 +1254,12 @@ class AutoModelForSeq2SeqLM:
class AutoModelForSequenceClassification:
r"""
:class:`~transformers.AutoModelForSequenceClassification` is a generic model class
that will be instantiated as one of the sequence classification model classes of the library
when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
class method.
:class:`~transformers.AutoModelForSequenceClassification` is a generic model class
that will be instantiated as one of the sequence classification model classes of the library
when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
@ -1271,7 +1271,7 @@ class AutoModelForSequenceClassification:
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
r"""Instantiates one of the base model classes of the library
from a configuration.
Note:
@ -1313,7 +1313,7 @@ class AutoModelForSequenceClassification:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the sequence classification model classes of the library
r"""Instantiates one of the sequence classification model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
@ -1406,12 +1406,12 @@ class AutoModelForSequenceClassification:
class AutoModelForQuestionAnswering:
r"""
:class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
that will be instantiated as one of the question answering model classes of the library
when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
class method.
:class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
that will be instantiated as one of the question answering model classes of the library
when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
@ -1423,7 +1423,7 @@ class AutoModelForQuestionAnswering:
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
r"""Instantiates one of the base model classes of the library
from a configuration.
Note:
@ -1462,7 +1462,7 @@ class AutoModelForQuestionAnswering:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the question answering model classes of the library
r"""Instantiates one of the question answering model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
@ -1551,12 +1551,12 @@ class AutoModelForQuestionAnswering:
class AutoModelForTokenClassification:
r"""
:class:`~transformers.AutoModelForTokenClassification` is a generic model class
that will be instantiated as one of the token classification model classes of the library
when created with the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)`
class method.
:class:`~transformers.AutoModelForTokenClassification` is a generic model class
that will be instantiated as one of the token classification model classes of the library
when created with the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
@ -1568,7 +1568,7 @@ class AutoModelForTokenClassification:
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
r"""Instantiates one of the base model classes of the library
from a configuration.
Note:
@ -1611,7 +1611,7 @@ class AutoModelForTokenClassification:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the question answering model classes of the library
r"""Instantiates one of the question answering model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
@ -1703,12 +1703,12 @@ class AutoModelForTokenClassification:
class AutoModelForMultipleChoice:
r"""
:class:`~transformers.AutoModelForMultipleChoice` is a generic model class
that will be instantiated as one of the multiple choice model classes of the library
when created with the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
class method.
:class:`~transformers.AutoModelForMultipleChoice` is a generic model class
that will be instantiated as one of the multiple choice model classes of the library
when created with the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):


@ -223,7 +223,9 @@ class EncoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = SelfAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout,
self.embed_dim,
config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.normalize_before = config.normalize_before
self.self_attn_layer_norm = LayerNorm(self.embed_dim)
@ -297,7 +299,10 @@ class BartEncoder(nn.Module):
)
else:
self.embed_positions = LearnedPositionalEmbedding(
config.max_position_embeddings, embed_dim, self.padding_idx, config.extra_pos_embeddings,
config.max_position_embeddings,
embed_dim,
self.padding_idx,
config.extra_pos_embeddings,
)
self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity()
@ -370,7 +375,9 @@ class DecoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = SelfAttention(
embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout,
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
@ -477,7 +484,10 @@ class BartDecoder(nn.Module):
)
else:
self.embed_positions = LearnedPositionalEmbedding(
config.max_position_embeddings, config.d_model, self.padding_idx, config.extra_pos_embeddings,
config.max_position_embeddings,
config.d_model,
self.padding_idx,
config.extra_pos_embeddings,
)
self.layers = nn.ModuleList(
[DecoderLayer(config) for _ in range(config.decoder_layers)]
@ -695,7 +705,10 @@ class SelfAttention(nn.Module):
# This is part of a workaround to get around fork/join parallelism not supporting Optional types.
if key_padding_mask is not None and key_padding_mask.dim() == 0:
key_padding_mask = None
assert key_padding_mask is None or key_padding_mask.size()[:2] == (bsz, src_len,)
assert key_padding_mask is None or key_padding_mask.size()[:2] == (
bsz,
src_len,
)
if key_padding_mask is not None: # don't attend to padding symbols
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
@ -703,7 +716,11 @@ class SelfAttention(nn.Module):
attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
attn_weights = F.softmax(attn_weights, dim=-1)
attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training,)
attn_probs = F.dropout(
attn_weights,
p=self.dropout,
training=self.training,
)
assert v is not None
attn_output = torch.bmm(attn_probs, v)
@ -754,7 +771,11 @@ class BartClassificationHead(nn.Module):
# This can trivially be shared with RobertaClassificationHead
def __init__(
self, input_dim, inner_dim, num_classes, pooler_dropout,
self,
input_dim,
inner_dim,
num_classes,
pooler_dropout,
):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
@ -819,7 +840,8 @@ def _get_shape(t):
@add_start_docstrings(
"The bare BART Model outputting raw hidden-states without any specific head on top.", BART_START_DOCSTRING,
"The bare BART Model outputting raw hidden-states without any specific head on top.",
BART_START_DOCSTRING,
)
class BartModel(PretrainedBartModel):
def __init__(self, config: BartConfig):
@ -981,31 +1003,31 @@ class BartForConditionalGeneration(PretrainedBartModel):
**unused,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
with labels in ``[0, ..., config.vocab_size]``.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
with labels in ``[0, ..., config.vocab_size]``.
Returns:
Returns:
Conditional generation example::
Conditional generation example::
# Mask filling only works for bart-large
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
TXT = "My friends are <mask> but they eat too many carbs."
# Mask filling only works for bart-large
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
TXT = "My friends are <mask> but they eat too many carbs."
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
logits = model(input_ids).logits
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
logits = model(input_ids).logits
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tokenizer.decode(predictions).split()
# ['good', 'great', 'all', 'really', 'very']
tokenizer.decode(predictions).split()
# ['good', 'great', 'all', 'really', 'very']
"""
if "lm_labels" in unused:
warnings.warn(
@ -1116,7 +1138,10 @@ class BartForSequenceClassification(PretrainedBartModel):
super().__init__(config, **kwargs)
self.model = BartModel(config)
self.classification_head = BartClassificationHead(
config.d_model, config.d_model, config.num_labels, config.classif_dropout,
config.d_model,
config.d_model,
config.num_labels,
config.classif_dropout,
)
self.model._init_weights(self.classification_head.dense)
self.model._init_weights(self.classification_head.out_proj)
@ -1279,7 +1304,10 @@ class BartForQuestionAnswering(PretrainedBartModel):
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits,) + outputs[1:]
output = (
start_logits,
end_logits,
) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return Seq2SeqQuestionAnsweringModelOutput(
@ -1307,7 +1335,7 @@ class SinusoidalPositionalEmbedding(nn.Embedding):
@staticmethod
def _init_weight(out: nn.Parameter):
"""Identical to the XLM create_sinusoidal_embeddings except features are not interleaved.
The cos features are in the 2nd half of the vector. [dim // 2:]
The cos features are in the 2nd half of the vector. [dim // 2:]
"""
n_pos, dim = out.shape
position_enc = np.array(


@ -89,8 +89,7 @@ BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
""" Load tf checkpoints in a pytorch model.
"""
"""Load tf checkpoints in a pytorch model."""
try:
import re
@ -174,8 +173,7 @@ BertLayerNorm = torch.nn.LayerNorm
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
@ -343,7 +341,12 @@ class BertAttention(nn.Module):
output_attentions=False,
):
self_outputs = self.self(
hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions,
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
@ -403,7 +406,10 @@ class BertLayer(nn.Module):
output_attentions=False,
):
self_attention_outputs = self.attention(
hidden_states, attention_mask, head_mask, output_attentions=output_attentions,
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
@ -582,8 +588,8 @@ class BertPreTrainingHeads(nn.Module):
class BertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = BertConfig
@ -733,9 +739,9 @@ class BertModel(BertPreTrainedModel):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@ -877,34 +883,34 @@ class BertForPreTraining(BertPreTrainedModel):
**kwargs
):
r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:
Returns:
Examples::
Examples::
>>> from transformers import BertTokenizer, BertForPreTraining
>>> import torch
>>> from transformers import BertTokenizer, BertForPreTraining
>>> import torch
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> prediction_logits = outputs.prediction_logits
>>> seq_relationship_logits = outputs.seq_relationship_logits
>>> prediction_logits = outputs.prediction_logits
>>> seq_relationship_logits = outputs.seq_relationship_logits
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
@ -986,36 +992,36 @@ class BertLMHeadModel(BertPreTrainedModel):
return_dict=None,
):
r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask
is used in the cross-attention if the model is configured as a decoder.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the left-to-right language modeling loss (next word prediction).
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask
is used in the cross-attention if the model is configured as a decoder.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the left-to-right language modeling loss (next word prediction).
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Returns:
Returns:
Example::
Example::
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
>>> import torch
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
>>> import torch
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
>>> config = BertConfig.from_pretrained("bert-base-cased")
>>> config.is_decoder = True
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True)
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
>>> config = BertConfig.from_pretrained("bert-base-cased")
>>> config.is_decoder = True
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> prediction_logits = outputs.logits
>>> prediction_logits = outputs.logits
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1049,7 +1055,10 @@ class BertLMHeadModel(BertPreTrainedModel):
return ((lm_loss,) + output) if lm_loss is not None else output
return CausalLMOutput(
loss=lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
@ -1173,7 +1182,8 @@ class BertForMaskedLM(BertPreTrainedModel):
@add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
"""Bert Model with a `next sentence prediction (classification)` head on top. """,
BERT_START_DOCSTRING,
)
class BertForNextSentencePrediction(BertPreTrainedModel):
def __init__(self, config):
@ -1200,29 +1210,29 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
return_dict=None,
):
r"""
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
Returns:
Returns:
Example::
Example::
>>> from transformers import BertTokenizer, BertForNextSentencePrediction
>>> import torch
>>> from transformers import BertTokenizer, BertForNextSentencePrediction
>>> import torch
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
>>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
>>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1336,7 +1346,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@ -1422,7 +1435,10 @@ class BertForMultipleChoice(BertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@ -1505,7 +1521,10 @@ class BertForTokenClassification(BertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

Просмотреть файл

@ -68,7 +68,8 @@ class CamembertModel(RobertaModel):
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING,
"""CamemBERT Model with a `language modeling` head on top. """,
CAMEMBERT_START_DOCSTRING,
)
class CamembertForMaskedLM(RobertaForMaskedLM):
"""

Просмотреть файл

@ -212,16 +212,15 @@ class EncoderLayer(torch.nn.Module):
class CTRLPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = CTRLConfig
base_model_prefix = "transformer"
def _init_weights(self, module):
""" Initialize the weights.
"""
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
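The truncated-normal comment above is followed, in the full source, by the usual normal-distribution initialization. A standalone sketch of that idea (the 0.02 standard deviation is an assumed default, not taken from this diff):

    import torch.nn as nn

    def init_module(module, initializer_range=0.02):
        # Linear/Embedding weights ~ N(0, initializer_range^2); biases zeroed, LayerNorm reset
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=initializer_range)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)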
@ -331,8 +330,8 @@ class CTRLModel(CTRLPreTrainedModel):
self.w = new_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
for layer, heads in heads_to_prune.items():
self.h[layer].multi_head_attention.prune_heads(heads)
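The `heads_to_prune` argument documented above maps a layer index to the head indices to drop in that layer. A minimal usage sketch (checkpoint name and head choices are illustrative):

    from transformers import CTRLModel

    model = CTRLModel.from_pretrained("ctrl")
    # prune head 0 of layer 2 and heads 1 and 3 of layer 5
    model.prune_heads({2: [0], 5: [1, 3]})

Pruning shrinks the corresponding attention projections in place, so it only affects the loaded instance.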

Просмотреть файл

@ -261,7 +261,12 @@ class TransformerBlock(nn.Module):
"""
# Self-Attention
sa_output = self.attention(
query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask, output_attentions=output_attentions,
query=x,
key=x,
value=x,
mask=attn_mask,
head_mask=head_mask,
output_attentions=output_attentions,
)
if output_attentions:
sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
@ -343,8 +348,8 @@ class Transformer(nn.Module):
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class DistilBertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = DistilBertConfig
@ -352,8 +357,7 @@ class DistilBertPreTrainedModel(PreTrainedModel):
base_model_prefix = "distilbert"
def _init_weights(self, module):
""" Initialize the weights.
"""
"""Initialize the weights."""
if isinstance(module, nn.Embedding):
if module.weight.requires_grad:
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
@ -432,9 +436,9 @@ class DistilBertModel(DistilBertPreTrainedModel):
self.embeddings.word_embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.transformer.layer[layer].attention.prune_heads(heads)
@ -493,7 +497,8 @@ class DistilBertModel(DistilBertPreTrainedModel):
@add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
"""DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING,
)
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
def __init__(self, config):
@ -829,7 +834,10 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@ -863,32 +871,32 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Returns:
Returns:
Examples::
Examples::
>>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
>>> import torch
>>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
>>> import torch
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
>>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased', return_dict=True)
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
>>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased', return_dict=True)
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
>>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
>>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
>>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> loss = outputs.loss
>>> logits = outputs.logits
>>> # the linear classifier still needs to be trained
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@ -930,5 +938,8 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

Просмотреть файл

@ -265,8 +265,8 @@ class DPRSpanPredictor(PreTrainedModel):
class DPRPretrainedContextEncoder(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = DPRConfig
@ -278,8 +278,8 @@ class DPRPretrainedContextEncoder(PreTrainedModel):
class DPRPretrainedQuestionEncoder(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = DPRConfig
@ -291,8 +291,8 @@ class DPRPretrainedQuestionEncoder(PreTrainedModel):
class DPRPretrainedReader(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = DPRConfig
@ -421,15 +421,15 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
return_dict=None,
) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]:
r"""
Return:
Return:
Examples::
Examples::
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
embeddings = model(input_ids).pooler_output
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
embeddings = model(input_ids).pooler_output
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@ -499,15 +499,15 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
return_dict=None,
) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]:
r"""
Return:
Return:
Examples::
Examples::
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True)
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
embeddings = model(input_ids).pooler_output
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True)
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
embeddings = model(input_ids).pooler_output
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@ -553,7 +553,8 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
@add_start_docstrings(
"The bare DPRReader transformer outputting span predictions.", DPR_START_DOCSTRING,
"The bare DPRReader transformer outputting span predictions.",
DPR_START_DOCSTRING,
)
class DPRReader(DPRPretrainedReader):
def __init__(self, config: DPRConfig):
@ -574,23 +575,23 @@ class DPRReader(DPRPretrainedReader):
return_dict=None,
) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
r"""
Return:
Return:
Examples::
Examples::
from transformers import DPRReader, DPRReaderTokenizer
tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True)
encoded_inputs = tokenizer(
questions=["What is love ?"],
titles=["Haddaway"],
texts=["'What Is Love' is a song recorded by the artist Haddaway"],
return_tensors='pt'
)
outputs = model(**encoded_inputs)
start_logits = outputs.start_logits
end_logits = outputs.end_logits
relevance_logits = outputs.relevance_logits
from transformers import DPRReader, DPRReaderTokenizer
tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True)
encoded_inputs = tokenizer(
questions=["What is love ?"],
titles=["Haddaway"],
texts=["'What Is Love' is a song recorded by the artist Haddaway"],
return_tensors='pt'
)
outputs = model(**encoded_inputs)
start_logits = outputs.start_logits
end_logits = outputs.end_logits
relevance_logits = outputs.relevance_logits
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

Просмотреть файл

@ -46,8 +46,7 @@ ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"):
""" Load tf checkpoints in a pytorch model.
"""
"""Load tf checkpoints in a pytorch model."""
try:
import re
@ -179,8 +178,8 @@ class ElectraGeneratorPredictions(nn.Module):
class ElectraPreTrainedModel(BertPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = ElectraConfig
@ -311,9 +310,9 @@ class ElectraModel(ElectraPreTrainedModel):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@ -512,24 +511,24 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
return_dict=None,
):
r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates the token is an original token,
``1`` indicates the token was replaced.
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates the token is an original token,
``1`` indicates the token was replaced.
Returns:
Returns:
Examples::
Examples::
>>> from transformers import ElectraTokenizer, ElectraForPreTraining
>>> import torch
>>> from transformers import ElectraTokenizer, ElectraForPreTraining
>>> import torch
>>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
>>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator', return_dict=True)
>>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
>>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> logits = model(input_ids).logits
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> logits = model(input_ids).logits
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -836,7 +835,10 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits,) + discriminator_hidden_states[1:]
output = (
start_logits,
end_logits,
) + discriminator_hidden_states[1:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(

Просмотреть файл

@ -28,11 +28,11 @@ logger = logging.get_logger(__name__)
class EncoderDecoderModel(PreTrainedModel):
r"""
:class:`~transformers.EncoderDecoder` is a generic model class that will be
instantiated as a transformer architecture with one of the base model
classes of the library as encoder and another one as
decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
class method for the encoder and `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` class method for the decoder.
:class:`~transformers.EncoderDecoder` is a generic model class that will be
instantiated as a transformer architecture with one of the base model
classes of the library as encoder and another one as
decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
class method for the encoder and `AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` class method for the decoder.
"""
config_class = EncoderDecoderConfig
base_model_prefix = "encoder_decoder"
@ -103,7 +103,7 @@ class EncoderDecoderModel(PreTrainedModel):
*model_args,
**kwargs
) -> PreTrainedModel:
r""" Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints.
r"""Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints.
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
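A minimal sketch of the pattern this class method enables, pairing two BERT checkpoints as encoder and decoder (the checkpoint choice is illustrative):

    from transformers import BertTokenizer, EncoderDecoderModel

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    # the encoder is loaded as-is; the decoder copy is configured as a decoder with cross-attention
    model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=inputs["input_ids"])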

Просмотреть файл

@ -240,7 +240,11 @@ class FlaubertModel(XLMModel):
# self attention
if not self.pre_norm:
attn_outputs = self.attentions[i](
tensor, attn_mask, cache=cache, head_mask=head_mask[i], output_attentions=output_attentions,
tensor,
attn_mask,
cache=cache,
head_mask=head_mask[i],
output_attentions=output_attentions,
)
attn = attn_outputs[0]
if output_attentions:

Просмотреть файл

@ -61,8 +61,7 @@ GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
""" Load tf checkpoints in a pytorch model
"""
"""Load tf checkpoints in a pytorch model"""
try:
import re
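A hedged sketch of how such a conversion helper is typically driven (the checkpoint path is hypothetical, and the import location of the helper may differ between library versions):

    from transformers import GPT2Config, GPT2Model
    from transformers.modeling_gpt2 import load_tf_weights_in_gpt2

    config = GPT2Config()
    model = GPT2Model(config)
    # copy the TensorFlow checkpoint variables into the freshly built PyTorch module
    load_tf_weights_in_gpt2(model, config, "/path/to/tf_checkpoint")
    model.save_pretrained("./gpt2-converted")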
@ -324,8 +323,8 @@ class Block(nn.Module):
class GPT2PreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = GPT2Config
@ -336,8 +335,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
super().__init__(*inputs, **kwargs)
def _init_weights(self, module):
""" Initialize the weights.
"""
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
@ -483,8 +481,8 @@ class GPT2Model(GPT2PreTrainedModel):
self.wte = new_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
@ -800,47 +798,47 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
**kwargs,
):
r"""
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to index of the last token of the input)
Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1]``.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to index of the last token of the input)
Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1]``.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Return:
Return:
Examples::
Examples::
>>> import torch
>>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
>>> import torch
>>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2', return_dict=True)
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2', return_dict=True)
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoded_choices = [tokenizer.encode(s) for s in choices]
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoded_choices = [tokenizer.encode(s) for s in choices]
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
>>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
>>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
>>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
>>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_logits = outputs.logits
>>> mc_logits = outputs.mc_logits
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_logits = outputs.logits
>>> mc_logits = outputs.mc_logits
"""
if "lm_labels" in kwargs:

Просмотреть файл

@ -66,7 +66,7 @@ LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
def _get_question_end_index(input_ids, sep_token_id):
"""
Computes the index of the first occurrence of `sep_token_id`.
Computes the index of the first occurrence of `sep_token_id`.
"""
sep_token_indices = (input_ids == sep_token_id).nonzero()
@ -81,9 +81,9 @@ def _get_question_end_index(input_ids, sep_token_id):
def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True):
"""
Computes global attention mask by putting attention on all tokens
before `sep_token_id` if `before_sep_token is True` else after
`sep_token_id`.
Computes global attention mask by putting attention on all tokens
before `sep_token_id` if `before_sep_token is True` else after
`sep_token_id`.
"""
question_end_index = _get_question_end_index(input_ids, sep_token_id)
question_end_index = question_end_index.unsqueeze(dim=1) # size: batch_size x 1
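A small standalone sketch of the idea described in the docstring above, marking every token before the first separator for global attention (toy ids; the real helper also covers the `before_sep_token=False` branch):

    import torch

    def global_attention_before_sep(input_ids, sep_token_id):
        # position of the first separator token in each row
        question_end = (input_ids == sep_token_id).float().argmax(dim=1, keepdim=True)
        positions = torch.arange(input_ids.shape[1]).expand_as(input_ids)
        # 1 = global attention, 0 = local attention
        return (positions < question_end).long()

    input_ids = torch.tensor([[101, 2054, 2003, 102, 2023, 2003, 102]])
    print(global_attention_before_sep(input_ids, sep_token_id=102))
    # tensor([[1, 1, 1, 0, 0, 0, 0]])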
@ -135,7 +135,10 @@ class LongformerSelfAttention(nn.Module):
self.one_sided_attn_window_size = attention_window // 2
def forward(
self, hidden_states, attention_mask=None, output_attentions=False,
self,
hidden_states,
attention_mask=None,
output_attentions=False,
):
"""
LongformerSelfAttention expects `len(hidden_states)` to be a multiple of `attention_window`.
@ -314,17 +317,17 @@ class LongformerSelfAttention(nn.Module):
@staticmethod
def _pad_and_diagonalize(chunked_hidden_states):
"""shift every row 1 step right, converting columns into diagonals.
Example:
chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
-1.8348, 0.7672, 0.2986, 0.0285,
-0.7584, 0.4206, -0.0405, 0.1599,
2.0514, -1.1600, 0.5372, 0.2629 ]
window_overlap = num_rows = 4
(pad & diagonalize) =>
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
Example:
chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
-1.8348, 0.7672, 0.2986, 0.0285,
-0.7584, 0.4206, -0.0405, 0.1599,
2.0514, -1.1600, 0.5372, 0.2629 ]
window_overlap = num_rows = 4
(pad & diagonalize) =>
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
"""
total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
chunked_hidden_states = F.pad(
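The numbers in the docstring above can be reproduced with a short standalone snippet (single toy chunk; the library helper applies the same padding-and-reshaping per attention head and chunk):

    import torch
    import torch.nn.functional as F

    chunk = torch.tensor([[0.4983, 2.6918, -0.0071, 1.0492],
                          [-1.8348, 0.7672, 0.2986, 0.0285],
                          [-0.7584, 0.4206, -0.0405, 0.1599],
                          [2.0514, -1.1600, 0.5372, 0.2629]])
    window_overlap = chunk.size(0)  # 4

    # pad each row, flatten, drop the tail and reshape: every row lands one step
    # further to the right than the previous one, turning columns into diagonals
    padded = F.pad(chunk, (0, window_overlap + 1))                        # (4, 9)
    shifted = padded.view(-1)[:-window_overlap].view(window_overlap, -1)  # (4, 8)
    print(shifted[:, :-1])  # matches the 4 x 7 matrix shown in the docstring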
@ -442,7 +445,7 @@ class LongformerSelfAttention(nn.Module):
self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int
):
"""Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors.
Returned tensor will be of the same shape as `attn_probs`"""
Returned tensor will be of the same shape as `attn_probs`"""
batch_size, seq_len, num_heads, head_dim = value.size()
assert seq_len % (window_overlap * 2) == 0
@ -622,7 +625,10 @@ class LongformerSelfAttention(nn.Module):
is_local_index_no_global_attn_nonzero[0], :, is_local_index_no_global_attn_nonzero[1], :
] = -10000.0
global_attn_scores = global_attn_scores.masked_fill(is_index_masked[:, None, None, :], -10000.0,)
global_attn_scores = global_attn_scores.masked_fill(
is_index_masked[:, None, None, :],
-10000.0,
)
global_attn_scores = global_attn_scores.view(batch_size * self.num_heads, max_num_global_attn_indices, seq_len)
@ -676,9 +682,16 @@ class LongformerAttention(nn.Module):
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self, hidden_states, attention_mask=None, output_attentions=False,
self,
hidden_states,
attention_mask=None,
output_attentions=False,
):
self_outputs = self.self(hidden_states, attention_mask, output_attentions,)
self_outputs = self.self(
hidden_states,
attention_mask,
output_attentions,
)
attn_output = self.output(self_outputs[0], hidden_states)
outputs = (attn_output,) + self_outputs[1:] # add attentions if we output them
return outputs
@ -694,9 +707,16 @@ class LongformerLayer(nn.Module):
self.seq_len_dim = 1
def forward(
self, hidden_states, attention_mask=None, output_attentions=False,
self,
hidden_states,
attention_mask=None,
output_attentions=False,
):
self_attn_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions,)
self_attn_outputs = self.attention(
hidden_states,
attention_mask,
output_attentions=output_attentions,
)
attn_output = self_attn_outputs[0]
outputs = self_attn_outputs[1:] # add self attentions if we output attention weights
@ -741,10 +761,16 @@ class LongformerEncoder(nn.Module):
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(layer_module), hidden_states, attention_mask,
create_custom_forward(layer_module),
hidden_states,
attention_mask,
)
else:
layer_outputs = layer_module(hidden_states, attention_mask, output_attentions,)
layer_outputs = layer_module(
hidden_states,
attention_mask,
output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
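The `create_custom_forward` closure in the hunk above is the standard way to hand a submodule to `torch.utils.checkpoint`, which recomputes activations in the backward pass instead of storing them. A self-contained sketch of the pattern (layer and shapes are illustrative):

    import torch
    import torch.nn as nn
    from torch.utils.checkpoint import checkpoint

    layer = nn.Linear(8, 8)

    def create_custom_forward(module):
        def custom_forward(*inputs):
            return module(*inputs)
        return custom_forward

    x = torch.randn(2, 8, requires_grad=True)
    out = checkpoint(create_custom_forward(layer), x)  # activations recomputed on backward
    out.sum().backward()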
@ -762,9 +788,9 @@ class LongformerEncoder(nn.Module):
class LongformerPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained
models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained
models.
"""
config_class = LongformerConfig
@ -896,9 +922,9 @@ class LongformerModel(LongformerPreTrainedModel):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@ -938,7 +964,9 @@ class LongformerModel(LongformerPreTrainedModel):
position_ids = F.pad(position_ids, (0, padding_len), value=pad_token_id)
if inputs_embeds is not None:
input_ids_padding = inputs_embeds.new_full(
(batch_size, padding_len), self.config.pad_token_id, dtype=torch.long,
(batch_size, padding_len),
self.config.pad_token_id,
dtype=torch.long,
)
inputs_embeds_padding = self.embeddings(input_ids_padding)
inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2)
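The padding applied above brings the sequence length up to a multiple of the attention window; a quick illustration with made-up numbers:

    seq_len, attention_window = 1000, 512
    padding_len = (attention_window - seq_len % attention_window) % attention_window
    print(padding_len)                                   # 24
    print((seq_len + padding_len) % attention_window)    # 0, i.e. a whole number of windows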
@ -976,28 +1004,28 @@ class LongformerModel(LongformerPreTrainedModel):
):
r"""
Returns:
Returns:
Examples::
Examples::
>>> import torch
>>> from transformers import LongformerModel, LongformerTokenizer
>>> import torch
>>> from transformers import LongformerModel, LongformerTokenizer
>>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True)
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
>>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True)
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
>>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
>>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
>>> attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example,
... # classification: the <s> token
... # QA: question tokens
... # LM: potentially on the beginning of sentences and paragraphs
>>> outputs = model(input_ids, attention_mask=attention_mask)
>>> sequence_output = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
>>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
>>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
>>> attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example,
... # classification: the <s> token
... # QA: question tokens
... # LM: potentially on the beginning of sentences and paragraphs
>>> outputs = model(input_ids, attention_mask=attention_mask)
>>> sequence_output = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@ -1102,32 +1130,32 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
**kwargs
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns:
Returns:
Examples::
Examples::
>>> import torch
>>> from transformers import LongformerForMaskedLM, LongformerTokenizer
>>> import torch
>>> from transformers import LongformerForMaskedLM, LongformerTokenizer
>>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096', return_dict=True)
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
>>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096', return_dict=True)
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
>>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
... # check ``LongformerModel.forward`` for more details on how to set `attention_mask`
>>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
>>> loss = outputs.loss
>>> prediction_logits = outputs.logits
>>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
... # check ``LongformerModel.forward`` for more details on how to set `attention_mask`
>>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
>>> loss = outputs.loss
>>> prediction_logits = outputs.logits
"""
if "masked_lm_labels" in kwargs:
@ -1252,7 +1280,10 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@ -1310,39 +1341,39 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Returns:
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Returns:
Examples::
Examples::
>>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
>>> import torch
>>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
>>> import torch
>>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
>>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True)
>>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
>>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True)
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> encoding = tokenizer(question, text, return_tensors="pt")
>>> input_ids = encoding["input_ids"]
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> encoding = tokenizer(question, text, return_tensors="pt")
>>> input_ids = encoding["input_ids"]
>>> # default is local attention everywhere
>>> # the forward method will automatically set global attention on question tokens
>>> attention_mask = encoding["attention_mask"]
>>> # default is local attention everywhere
>>> # the forward method will automatically set global attention on question tokens
>>> attention_mask = encoding["attention_mask"]
>>> outputs = model(input_ids, attention_mask=attention_mask)
>>> start_logits = outputs.start_logits
>>> end_logits = outputs.end_logits
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
>>> outputs = model(input_ids, attention_mask=attention_mask)
>>> start_logits = outputs.start_logits
>>> end_logits = outputs.end_logits
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
>>> answer_tokens = all_tokens[torch.argmax(start_logits) :torch.argmax(end_logits)+1]
>>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
>>> answer_tokens = all_tokens[torch.argmax(start_logits) :torch.argmax(end_logits)+1]
>>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1487,7 +1518,10 @@ class LongformerForTokenClassification(BertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@ -1592,5 +1626,8 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

Просмотреть файл

@ -32,8 +32,7 @@ _CONFIG_FOR_DOC = "MMBTConfig"
class ModalEmbeddings(nn.Module):
"""Generic Modal Embeddings which takes in an encoder, and a transformer embedding.
"""
"""Generic Modal Embeddings which takes in an encoder, and a transformer embedding."""
def __init__(self, config, encoder, embeddings):
super().__init__()
@ -154,7 +153,8 @@ MMBT_INPUTS_DOCSTRING = r""" Inputs:
@add_start_docstrings(
"The bare MMBT Model outputting raw hidden-states without any specific head on top.", MMBT_START_DOCSTRING,
"The bare MMBT Model outputting raw hidden-states without any specific head on top.",
MMBT_START_DOCSTRING,
)
class MMBTModel(nn.Module, ModuleUtilsMixin):
def __init__(self, config, transformer, encoder):
@ -288,34 +288,34 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
)
class MMBTForClassification(nn.Module):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification (or regression if config.num_labels==1) loss.
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification (or regression if config.num_labels==1) loss.
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
Examples::
# For example purposes. Not runnable.
transformer = BertModel.from_pretrained('bert-base-uncased')
encoder = ImageEncoder(args)
model = MMBTForClassification(config, transformer, encoder)
outputs = model(input_modal, input_ids, labels=labels)
loss, logits = outputs[:2]
"""
# For example purposes. Not runnable.
transformer = BertModel.from_pretrained('bert-base-uncased')
encoder = ImageEncoder(args)
model = MMBTForClassification(config, transformer, encoder)
outputs = model(input_modal, input_ids, labels=labels)
loss, logits = outputs[:2]
"""
def __init__(self, config, transformer, encoder):
super().__init__()
@ -378,5 +378,8 @@ class MMBTForClassification(nn.Module):
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

Просмотреть файл

@ -64,8 +64,7 @@ MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"]
def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
""" Load tf checkpoints in a pytorch model.
"""
"""Load tf checkpoints in a pytorch model."""
try:
import re
@ -161,8 +160,7 @@ NORM2FN = {"layer_norm": torch.nn.LayerNorm, "no_norm": NoNorm}
class MobileBertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
@ -663,8 +661,8 @@ class MobileBertPreTrainingHeads(nn.Module):
class MobileBertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = MobileBertConfig
@ -788,7 +786,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
)
class MobileBertModel(MobileBertPreTrainedModel):
"""
https://arxiv.org/pdf/2004.02984.pdf
https://arxiv.org/pdf/2004.02984.pdf
"""
authorized_missing_keys = [r"position_ids"]
@ -809,9 +807,9 @@ class MobileBertModel(MobileBertPreTrainedModel):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@ -965,31 +963,31 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
return_dict=None,
):
r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
Returns:
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
Returns:
Examples::
Examples::
>>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
>>> import torch
>>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
>>> import torch
>>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
>>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased", return_dict=True)
>>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
>>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased", return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_logits = outputs.prediction_logits
>>> seq_relationship_logits = outputs.seq_relationship_logits
>>> prediction_logits = outputs.prediction_logits
>>> seq_relationship_logits = outputs.seq_relationship_logits
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1176,29 +1174,29 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
return_dict=None,
):
r"""
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence.
Returns:
Returns:
Examples::
Examples::
>>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
>>> import torch
>>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
>>> import torch
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased', return_dict=True)
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased', return_dict=True)
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
>>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
>>> loss = outputs.loss
>>> logits = outputs.logits
>>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1]))
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1308,7 +1306,10 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@ -1491,7 +1492,10 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@ -1574,5 +1578,8 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

Просмотреть файл

@ -59,8 +59,7 @@ OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
""" Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
"""
"""Load tf pre-trained weights in a pytorch model (from NumPy arrays here)"""
import re
import numpy as np
@ -257,7 +256,10 @@ class Block(nn.Module):
def forward(self, x, attention_mask=None, head_mask=None, output_attentions=False):
attn_outputs = self.attn(
x, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions,
x,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
)
a = attn_outputs[0]
@ -270,8 +272,8 @@ class Block(nn.Module):
class OpenAIGPTPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = OpenAIGPTConfig
@ -280,8 +282,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
authorized_missing_keys = [r"position_ids"]
def _init_weights(self, module):
""" Initialize the weights.
"""
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
@ -408,8 +409,8 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
self.tokens_embed = new_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
@ -506,7 +507,9 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions,
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_attentions,
)
@ -632,41 +635,41 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
**kwargs
):
r"""
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to index of the last token of the input)
Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1]``.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1]``.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Return:
Return:
Examples::
Examples::
from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
import torch
from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
import torch
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
model.resize_token_embeddings(len(tokenizer))
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
model.resize_token_embeddings(len(tokenizer))
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0) # Batch size 1
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_logits = outputs.lm_logits
mc_logits = outputs.mc_logits
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_logits = outputs.lm_logits
mc_logits = outputs.mc_logits
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if "lm_labels" in kwargs:

View file

@ -78,7 +78,8 @@ ReformerBackwardOutput = namedtuple(
"ReformerBackwardOutput", ["attn_output", "hidden_states", "grad_attn_output", "grad_hidden_states"]
)
ReformerEncoderOutput = namedtuple(
"ReformerEncoderOutput", ["hidden_states", "all_hidden_states", "all_attentions", "past_buckets_states"],
"ReformerEncoderOutput",
["hidden_states", "all_hidden_states", "all_attentions", "past_buckets_states"],
)
@ -192,7 +193,9 @@ class AxialPositionEmbeddings(nn.Module):
assert (
reduce(mul, self.axial_pos_shape) >= sequence_length
), "Make sure that config.axial_pos_shape factors: {} multiply at least to max(sequence_length, least_common_mult_chunk_length): max({}, {})".format(
self.axial_pos_shape, sequence_length, self.least_common_mult_chunk_length,
self.axial_pos_shape,
sequence_length,
self.least_common_mult_chunk_length,
)
# compute how many columns are needed
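A worked example of the assertion above, with an assumed configuration: the factors of config.axial_pos_shape must multiply to at least the padded sequence length.

from functools import reduce
from operator import mul

axial_pos_shape = (64, 64)     # assumed config.axial_pos_shape
sequence_length = 4096         # assumed padded sequence length
assert reduce(mul, axial_pos_shape) >= sequence_length   # 64 * 64 = 4096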
@ -218,8 +221,7 @@ class AxialPositionEmbeddings(nn.Module):
class PositionEmbeddings(nn.Module):
"""Constructs conventional position embeddings of shape `[max_pos_embeddings, hidden_size]`.
"""
"""Constructs conventional position embeddings of shape `[max_pos_embeddings, hidden_size]`."""
def __init__(self, config):
super().__init__()
@ -233,8 +235,7 @@ class PositionEmbeddings(nn.Module):
class ReformerEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
@ -285,16 +286,16 @@ class EfficientAttentionMixin:
"""
def _look_adjacent(self, vectors, num_chunks_before, num_chunks_after):
""" Used to implement attention between consecutive chunks.
"""Used to implement attention between consecutive chunks.
Args:
vectors: array of shape [batch_size, num_attention_heads, n_chunks, chunk_len, ...]
num_chunks_before: chunks before current chunk to include in attention
num_chunks_after: chunks after current chunk to include in attention
Args:
vectors: array of shape [batch_size, num_attention_heads, n_chunks, chunk_len, ...]
num_chunks_before: chunks before current chunk to include in attention
num_chunks_after: chunks after current chunk to include in attention
Returns:
tensor of shape [num_chunks, N * chunk_length, ...], where
N = (1 + num_chunks_before + num_chunks_after).
Returns:
tensor of shape [num_chunks, N * chunk_length, ...], where
N = (1 + num_chunks_before + num_chunks_after).
"""
if num_chunks_before == 0 and num_chunks_after == 0:
return vectors
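A toy sketch of the adjacent-chunk idea documented above, assuming num_chunks_before=1, num_chunks_after=0 and made-up shapes:

import torch

vectors = torch.arange(2 * 1 * 4 * 3).reshape(2, 1, 4, 3).float()      # [batch, heads, n_chunks, chunk_len]
previous = torch.cat([vectors[:, :, -1:], vectors[:, :, :-1]], dim=2)  # each chunk's predecessor (wrapping)
adjacent = torch.cat([previous, vectors], dim=3)                       # [batch, heads, n_chunks, 2 * chunk_len]
print(adjacent.shape)                                                  # torch.Size([2, 1, 4, 6])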
@ -309,7 +310,7 @@ class EfficientAttentionMixin:
def _split_hidden_size_dim(self, x, num_attn_heads, attn_head_size):
"""
splits hidden_size dim into attn_head_size and num_attn_heads
splits hidden_size dim into attn_head_size and num_attn_heads
"""
new_x_shape = x.size()[:-1] + (num_attn_heads, attn_head_size)
x = x.view(*new_x_shape)
@ -317,14 +318,14 @@ class EfficientAttentionMixin:
def _merge_hidden_size_dims(self, x, num_attn_heads, attn_head_size):
"""
merges attn_head_size dim and num_attn_heads dim into hidden_size
merges attn_head_size dim and num_attn_heads dim into hidden_size
"""
x = x.permute(0, 2, 1, 3)
return torch.reshape(x, (x.size()[0], -1, num_attn_heads * attn_head_size))
def _split_seq_length_dim_to(self, vectors, dim_factor_1, dim_factor_2, num_attn_heads, attn_head_size=None):
"""
splits sequence length dim of vectors into `dim_factor_1` and `dim_factor_2` dims
splits sequence length dim of vectors into `dim_factor_1` and `dim_factor_2` dims
"""
batch_size = vectors.shape[0]
split_dim_shape = (batch_size, num_attn_heads, dim_factor_1, dim_factor_2)
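A toy round-trip sketch of the split/merge helpers above, with assumed sizes: hidden_size is split into (num_attn_heads, attn_head_size) and merged back unchanged.

import torch

batch, seq, heads, head_size = 2, 8, 4, 4
x = torch.randn(batch, seq, heads * head_size)
split = x.view(batch, seq, heads, head_size).permute(0, 2, 1, 3)           # [batch, heads, seq, head_size]
merged = split.permute(0, 2, 1, 3).reshape(batch, seq, heads * head_size)  # back to [batch, seq, hidden]
assert torch.equal(x, merged)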
@ -418,10 +419,18 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
# split key & value vectors by num hashes to apply
# self attention on each separately
query_key_vectors = self._split_seq_length_dim_to(
query_key_vectors, num_hashes, -1, self.num_attention_heads, self.attention_head_size,
query_key_vectors,
num_hashes,
-1,
self.num_attention_heads,
self.attention_head_size,
)
value_vectors = self._split_seq_length_dim_to(
value_vectors, num_hashes, -1, self.num_attention_heads, self.attention_head_size,
value_vectors,
num_hashes,
-1,
self.num_attention_heads,
self.attention_head_size,
)
# repeat query vectors across hash dimension
query_vectors = query_vectors.unsqueeze(2).repeat(1, 1, num_hashes, 1, 1)
@ -496,10 +505,18 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
query_key_vectors = self._gather_by_expansion(query_key_vectors, sorted_bucket_idx_per_hash, num_hashes)
value_vectors = self._gather_by_expansion(value_vectors, sorted_bucket_idx_per_hash, num_hashes)
query_key_vectors = self._split_seq_length_dim_to(
query_key_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
query_key_vectors,
-1,
self.chunk_length,
self.num_attention_heads,
self.attention_head_size,
)
value_vectors = self._split_seq_length_dim_to(
value_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
value_vectors,
-1,
self.chunk_length,
self.num_attention_heads,
self.attention_head_size,
)
if self.chunk_length is None:
@ -548,10 +565,18 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
# sum up all hash rounds
if num_hashes > 1:
out_vectors = self._split_seq_length_dim_to(
out_vectors, num_hashes, sequence_length, self.num_attention_heads, self.attention_head_size,
out_vectors,
num_hashes,
sequence_length,
self.num_attention_heads,
self.attention_head_size,
)
logits = self._split_seq_length_dim_to(
logits, num_hashes, sequence_length, self.num_attention_heads, self.attention_head_size,
logits,
num_hashes,
sequence_length,
self.num_attention_heads,
self.attention_head_size,
).unsqueeze(-1)
probs_vectors = torch.exp(logits - torch.logsumexp(logits, dim=2, keepdim=True))
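A small check of the formula on the line above with toy logits: exp(logits - logsumexp(logits)) is a numerically stable softmax over the hash-round dimension (dim=2 in the real tensor, dim=0 here).

import torch

logits = torch.tensor([1.0, 2.0, 3.0])
probs = torch.exp(logits - torch.logsumexp(logits, dim=0, keepdim=True))
assert torch.allclose(probs, torch.softmax(logits, dim=0))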
@ -697,7 +722,8 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
# factorize `num_buckets` if `num_buckets` becomes too large
num_buckets_limit = 2 * max(
int((self.max_position_embeddings // self.chunk_length) ** (0.5)), self.chunk_length,
int((self.max_position_embeddings // self.chunk_length) ** (0.5)),
self.chunk_length,
)
if num_buckets > num_buckets_limit:
num_buckets = [2 ** (num_buckets_pow_2 // 2), 2 ** (num_buckets_pow_2 - num_buckets_pow_2 // 2)]
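A worked example of the factorization above, assuming a bucket count of 2 ** 9 = 512 that exceeded num_buckets_limit:

num_buckets_pow_2 = 9
num_buckets = [2 ** (num_buckets_pow_2 // 2), 2 ** (num_buckets_pow_2 - num_buckets_pow_2 // 2)]
print(num_buckets)   # [16, 32]; 16 * 32 = 512 buckets, now factorized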
@ -946,7 +972,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
def _len_and_dim_norm(self, vectors):
"""
length and attention head size dim normalization
length and attention head size dim normalization
"""
vectors = self._len_norm(vectors)
vectors = vectors * torch.rsqrt(
@ -956,7 +982,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
def _len_norm(self, x, epsilon=1e-6):
"""
length normalization
length normalization
"""
variance = torch.mean(x ** 2, -1, keepdim=True)
norm_x = x * torch.rsqrt(variance + epsilon)
@ -964,7 +990,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
def _gather_by_expansion(self, vectors, idxs, num_hashes):
"""
expand dims of idxs and vectors for all hashes and gather
expand dims of idxs and vectors for all hashes and gather
"""
expanded_idxs = idxs.unsqueeze(-1).expand(-1, -1, -1, self.attention_head_size)
vectors = vectors.repeat(1, 1, num_hashes, 1)
@ -973,11 +999,11 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
class ReverseSort(Function):
"""
After chunked attention is applied which sorted clusters,
original ordering has to be restored.
Since customized backward function is used for Reformer,
        the gradients of the output vectors have to be explicitly
sorted here.
After chunked attention is applied which sorted clusters,
original ordering has to be restored.
Since customized backward function is used for Reformer,
    the gradients of the output vectors have to be explicitly
sorted here.
"""
@staticmethod
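A toy sketch of the reordering the docstring above describes: indexing with the inverse permutation of the sorting indices restores the original order (values assumed).

import torch

sorted_idx = torch.tensor([2, 0, 3, 1])             # original positions, in processing order
undo_idx = torch.argsort(sorted_idx)                # inverse permutation
outputs_sorted = torch.tensor([20., 0., 30., 10.])  # outputs produced in sorted order
outputs_original = outputs_sorted[undo_idx]
print(outputs_original)                             # tensor([ 0., 10., 20., 30.])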
@ -1113,13 +1139,25 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
# chunk vectors
# B x Num_Attn_Head x Seq_Len // chunk_len x chunk_len x attn_head_size
query_vectors = self._split_seq_length_dim_to(
query_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
query_vectors,
-1,
self.chunk_length,
self.num_attention_heads,
self.attention_head_size,
)
key_vectors = self._split_seq_length_dim_to(
key_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
key_vectors,
-1,
self.chunk_length,
self.num_attention_heads,
self.attention_head_size,
)
value_vectors = self._split_seq_length_dim_to(
value_vectors, -1, self.chunk_length, self.num_attention_heads, self.attention_head_size,
value_vectors,
-1,
self.chunk_length,
self.num_attention_heads,
self.attention_head_size,
)
# chunk indices
@ -1179,7 +1217,12 @@ class LocalSelfAttention(nn.Module, EfficientAttentionMixin):
if not do_standard_self_attention:
out_vectors = out_vectors.flatten(start_dim=2, end_dim=3)
assert out_vectors.shape == (batch_size, self.num_attention_heads, sequence_length, self.attention_head_size,)
assert out_vectors.shape == (
batch_size,
self.num_attention_heads,
sequence_length,
self.attention_head_size,
)
out_vectors = self._merge_hidden_size_dims(out_vectors, self.num_attention_heads, self.attention_head_size)
@ -1321,7 +1364,9 @@ class ReformerAttention(nn.Module):
attention_output = self.output(self_attention_outputs.hidden_states)
return AttentionOutput(
hidden_states=attention_output, attention_probs=self_attention_outputs.attention_probs, buckets=buckets,
hidden_states=attention_output,
attention_probs=self_attention_outputs.attention_probs,
buckets=buckets,
)
@ -1369,7 +1414,10 @@ class ChunkReformerFeedForward(nn.Module):
def forward(self, attention_output):
return apply_chunking_to_forward(
self.forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output,
self.forward_chunk,
self.chunk_size_feed_forward,
self.seq_len_dim,
attention_output,
)
def forward_chunk(self, hidden_states):
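A minimal sketch of the chunking idea behind the call above, done by hand with toy sizes rather than with the library helper: the sequence dimension is processed in chunks and the pieces are concatenated, lowering peak memory without changing the result.

import torch

def feed_forward(hidden_states):
    return hidden_states * 2.0            # stand-in for the dense + activation block

hidden_states = torch.randn(1, 8, 4)
chunk_size, seq_dim = 2, 1
chunks = hidden_states.chunk(hidden_states.shape[seq_dim] // chunk_size, dim=seq_dim)
chunked_out = torch.cat([feed_forward(c) for c in chunks], dim=seq_dim)
assert torch.allclose(chunked_out, feed_forward(hidden_states))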
@ -1391,11 +1439,11 @@ class ReformerLayer(nn.Module):
def _init_attention_seed(self):
"""
This function sets a new seed for the
attention layer to make dropout deterministic
for both forward calls: 1 normal forward
call and 1 forward call in backward
to recalculate activations.
This function sets a new seed for the
attention layer to make dropout deterministic
for both forward calls: 1 normal forward
call and 1 forward call in backward
to recalculate activations.
"""
# randomize seeds
@ -1412,11 +1460,11 @@ class ReformerLayer(nn.Module):
def _init_feed_forward_seed(self):
"""
This function sets a new seed for the
feed forward layer to make dropout deterministic
for both forward calls: 1 normal forward
call and 1 forward call in backward
to recalculate activations.
This function sets a new seed for the
feed forward layer to make dropout deterministic
for both forward calls: 1 normal forward
call and 1 forward call in backward
to recalculate activations.
"""
# randomize seeds
# use cuda generator if available
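A sketch of the re-seeding trick both seed helpers above rely on, with an assumed fixed seed: restoring the seed before the recomputation makes dropout draw the same mask twice.

import torch
import torch.nn.functional as F

seed = 1234                                             # assumed; the layer draws and stores one per forward
torch.manual_seed(seed)
first = F.dropout(torch.ones(6), p=0.5, training=True)
torch.manual_seed(seed)                                 # restore before recomputing in backward
second = F.dropout(torch.ones(6), p=0.5, training=True)
assert torch.equal(first, second)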
@ -1520,7 +1568,10 @@ class ReformerLayer(nn.Module):
# f(X_2)
            # use cached buckets for backprop if buckets not None for LSHSelfAttention
output = self.attention(
hidden_states=hidden_states, head_mask=head_mask, attention_mask=attention_mask, buckets=buckets,
hidden_states=hidden_states,
head_mask=head_mask,
attention_mask=attention_mask,
buckets=buckets,
).hidden_states
output.backward(grad_attn_output, retain_graph=True)
@ -1738,8 +1789,8 @@ class ReformerOnlyLMHead(nn.Module):
class ReformerPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = ReformerConfig
@ -1947,9 +1998,9 @@ class ReformerModel(ReformerPreTrainedModel):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@ -2099,7 +2150,10 @@ class ReformerModel(ReformerPreTrainedModel):
)
padded_input_ids = torch.full(
(input_shape[0], padding_length), self.config.pad_token_id, device=device, dtype=torch.long,
(input_shape[0], padding_length),
self.config.pad_token_id,
device=device,
dtype=torch.long,
)
# Extend `attention_mask`
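A sketch of the padding step above with assumed values: input_ids are extended with pad_token_id so the length becomes a multiple of the chunk length.

import torch

pad_token_id, padding_length = 0, 3
input_ids = torch.tensor([[5, 6, 7, 8, 9]])
padded_input_ids = torch.full((input_ids.shape[0], padding_length), pad_token_id, dtype=torch.long)
input_ids = torch.cat([input_ids, padded_input_ids], dim=-1)   # now shape [1, 8]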
@ -2369,11 +2423,11 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
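A sketch of the loss selection the docstring above describes, with assumed toy tensors: a single label means MSE regression, more than one means cross-entropy classification.

import torch

num_labels = 1
logits = torch.tensor([[0.3]])
labels = torch.tensor([0.5])
if num_labels == 1:
    loss = torch.nn.MSELoss()(logits.view(-1), labels.view(-1))
else:
    loss = torch.nn.CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1).long())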
@ -2407,7 +2461,10 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

Some files were not shown because too many files changed in this diff.