Replace strided slice with tf.expand_dims (#10078)

* Replace tf.newaxis -> tf.expand_dims

* Fix tests

* Fix tests

* Use reshape when a tensor needs a double expand

* Fix GPT2

* Fix GPT2
This commit is contained in:
Julien Plu 2021-02-09 17:48:28 +01:00 committed by GitHub
Parent e7381c4596
Commit b82fe7d258
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
17 changed files: 58 additions and 47 deletions
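
For reference, the rewrite rests on two equivalences (a minimal sketch, not part of the diff; the tensors below are made up for illustration): indexing with tf.newaxis inserts a size-1 broadcast axis exactly like tf.expand_dims, and where two axes are needed at once, as for the [batch_size, 1, 1, seq_length] attention masks, a single tf.reshape over explicit dimensions does the same job without a strided slice.

import tensorflow as tf

# Illustrative tensors only; the names are not taken from the PR.
attention_mask = tf.ones((2, 5))  # [batch_size, seq_length]
position_range = tf.range(5)

# Single broadcast axis: strided slice with tf.newaxis vs. tf.expand_dims.
a = position_range[tf.newaxis, :]           # shape (1, 5)
b = tf.expand_dims(position_range, axis=0)  # shape (1, 5)
assert bool(tf.reduce_all(a == b))

# Double expand for a broadcastable attention mask: the old strided slice
# vs. one reshape to [batch_size, 1, 1, seq_length].
old = attention_mask[:, tf.newaxis, tf.newaxis, :]
batch_size, seq_length = attention_mask.shape
new = tf.reshape(attention_mask, (batch_size, 1, 1, seq_length))
assert old.shape == new.shape == (2, 1, 1, 5)

In graph mode the batch and sequence dimensions may be dynamic, which is why the hunks below read them through shape_list (or attention_mask_shape) rather than hard-coding them.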

View File

@@ -1631,7 +1631,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
) # A tensor full of shape [batch] or [batch, num choices] full of sequence length
cls_shape = shape_list(cls_index)
if len(cls_shape) <= len(hidden_shape) - 2:
- cls_index = cls_index[..., tf.newaxis]
+ cls_index = tf.expand_dims(cls_index, axis=-1)
# else:
# cls_index = cls_index[..., tf.newaxis]
# cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))

View File

@@ -138,7 +138,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
- position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :]
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
@@ -543,7 +543,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for

View File

@@ -192,7 +192,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
- position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :]
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
@@ -655,7 +655,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for

View File

@@ -128,7 +128,7 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
- position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :]
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
@@ -541,7 +541,7 @@ class TFConvBertMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
+ extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for

View File

@@ -312,9 +312,9 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
else:
past_length = shape_list(inputs["past"][0][0])[-2]
if inputs["position_ids"] is None:
- inputs["position_ids"] = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[
- tf.newaxis, :
- ]
+ inputs["position_ids"] = tf.expand_dims(
+ tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32), axis=0
+ )
inputs["position_ids"] = tf.tile(inputs["position_ids"], [input_shape[0], 1])
# Attention mask.
@@ -324,7 +324,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- inputs["attention_mask"] = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ inputs["attention_mask"] = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for

View File

@@ -113,7 +113,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
input_shape = shape_list(inputs_embeds)[:-1]
if position_ids is None:
- position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :]
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))

View File

@@ -415,7 +415,7 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
- position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :]
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
@@ -510,7 +510,7 @@ class TFElectraMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
+ extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for

View File

@@ -181,12 +181,12 @@ def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32):
else:
# assert lengths.max().item() <= slen
alen = tf.range(slen)
- mask = tf.math.less(alen, lengths[:, tf.newaxis])
+ mask = tf.math.less(alen, tf.expand_dims(lengths, axis=1))
# attention mask is the same as mask, or triangular inferior attention (causal)
if causal:
attn_mask = tf.less_equal(
- tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), alen[tf.newaxis, :, tf.newaxis]
+ tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1))
)
else:
attn_mask = mask
@@ -612,7 +612,7 @@ class TFFlaubertMainLayer(tf.keras.layers.Layer):
tensor = self.layer_norm_emb(tensor)
tensor = self.dropout(tensor, training=inputs["training"])
- tensor = tensor * mask[..., tf.newaxis]
+ tensor = tensor * tf.expand_dims(mask, axis=-1)
# hidden_states and attentions cannot be None in graph mode.
hidden_states = () if inputs["output_hidden_states"] else None
@@ -682,7 +682,7 @@ class TFFlaubertMainLayer(tf.keras.layers.Layer):
tensor_normalized = self.layer_norm2[i](tensor)
tensor = tensor + self.ffns[i](tensor_normalized)
- tensor = tensor * mask[..., tf.newaxis]
+ tensor = tensor * tf.expand_dims(mask, axis=-1)
# Add last hidden state
if inputs["output_hidden_states"]:

View File

@@ -302,9 +302,9 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
past_length = shape_list(inputs["past"][0][0])[-2]
if inputs["position_ids"] is None:
- inputs["position_ids"] = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[
- tf.newaxis, :
- ]
+ inputs["position_ids"] = tf.expand_dims(
+ tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32), axis=0
+ )
if inputs["attention_mask"] is not None:
# We create a 3D attention mask from a 2D tensor mask.
@@ -312,7 +312,10 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- inputs["attention_mask"] = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ attention_mask_shape = shape_list(inputs["attention_mask"])
+ inputs["attention_mask"] = tf.reshape(
+ inputs["attention_mask"], (attention_mask_shape[0], 1, 1, attention_mask_shape[1])
+ )
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for

View File

@@ -547,9 +547,9 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
else:
- position_ids = tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1)[
- tf.newaxis, :
- ]
+ position_ids = tf.expand_dims(
+ tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
+ )
position_ids = tf.tile(input=position_ids, multiples=(input_shape[0], 1))
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
@@ -1661,7 +1661,10 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = inputs["attention_mask"][:, :, tf.newaxis, tf.newaxis]
+ attention_mask_shape = shape_list(inputs["attention_mask"])
+ extended_attention_mask = tf.reshape(
+ inputs["attention_mask"], (attention_mask_shape[0], attention_mask_shape[1], 1, 1)
+ )
# Since attention_mask is 1.0 for positions we want to locally attend and 0.0 for
# masked and global attn positions, this operation will create a tensor which is 0.0 for

View File

@@ -233,7 +233,7 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer):
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
- position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :]
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
@@ -726,7 +726,7 @@ class TFLxmertMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
@@ -738,7 +738,12 @@ class TFLxmertMainLayer(tf.keras.layers.Layer):
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
if inputs["visual_attention_mask"] is not None:
- extended_visual_attention_mask = inputs["visual_attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ extended_visual_attention_mask = tf.reshape(
+ inputs["visual_attention_mask"], (input_shape[0], 1, 1, input_shape[1])
+ )
+ extended_visual_attention_mask = tf.expand_dims(
+ tf.expand_dims(inputs["visual_attention_mask"], axis=1), axis=1
+ )
extended_visual_attention_mask = tf.cast(extended_visual_attention_mask, tf.float32)
extended_visual_attention_mask = (1.0 - extended_visual_attention_mask) * -10000.0

View File

@@ -192,7 +192,7 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
inputs_embeds = self.embedding_transformation(inputs_embeds)
if position_ids is None:
- position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :]
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
@@ -731,7 +731,7 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for

View File

@@ -151,9 +151,9 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
else:
- position_ids = tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1)[
- tf.newaxis, :
- ]
+ position_ids = tf.expand_dims(
+ tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
+ )
position_ids = tf.tile(input=position_ids, multiples=(input_shape[0], 1))
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
@@ -533,7 +533,7 @@ class TFMPNetMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for

View File

@@ -268,7 +268,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs["position_ids"] is None:
- inputs["position_ids"] = tf.range(input_shape[-1], dtype=tf.int32)[tf.newaxis, :]
+ inputs["position_ids"] = tf.expand_dims(tf.range(input_shape[-1], dtype=tf.int32), axis=0)
if inputs["attention_mask"] is not None:
# We create a 3D attention mask from a 2D tensor mask.
@@ -276,7 +276,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- inputs["attention_mask"] = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ inputs["attention_mask"] = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for

View File

@@ -147,9 +147,9 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
else:
- position_ids = tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1)[
- tf.newaxis, :
- ]
+ position_ids = tf.expand_dims(
+ tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
+ )
position_ids = tf.tile(input=position_ids, multiples=(input_shape[0], 1))
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
@@ -533,7 +533,7 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for

View File

@@ -92,12 +92,12 @@ def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32):
else:
# assert lengths.max().item() <= slen
alen = tf.range(slen)
- mask = tf.math.less(alen, lengths[:, tf.newaxis])
+ mask = tf.math.less(alen, tf.expand_dims(lengths, axis=1))
# attention mask is the same as mask, or triangular inferior attention (causal)
if causal:
attn_mask = tf.less_equal(
- tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), alen[tf.newaxis, :, tf.newaxis]
+ tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1))
)
else:
attn_mask = mask
@@ -463,7 +463,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
tensor = self.layer_norm_emb(tensor)
tensor = self.dropout(tensor, training=inputs["training"])
- tensor = tensor * mask[..., tf.newaxis]
+ tensor = tensor * tf.expand_dims(mask, axis=-1)
# transformer layers
hidden_states = () if inputs["output_hidden_states"] else None
@@ -502,7 +502,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
# FFN
tensor = tensor + self.ffns[i](tensor)
tensor = self.layer_norm2[i](tensor)
- tensor = tensor * mask[..., tf.newaxis]
+ tensor = tensor * tf.expand_dims(mask, axis=-1)
# Add last hidden state
if inputs["output_hidden_states"]:

View File

@@ -134,7 +134,7 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer):
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
- position_ids = tf.range(start=0, limit=input_shape[-1])[tf.newaxis, :]
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
@@ -570,7 +570,7 @@ class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer):
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+ extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for