From 39f8eafc1b6f00769240f714e2df5b2c5f111c32 Mon Sep 17 00:00:00 2001 From: Pavel Belevich Date: Tue, 3 May 2022 11:06:11 -0400 Subject: [PATCH] Remove device parameter from create_extended_attention_mask_for_decoder (#16894) --- .../longform-qa/eli5_utils.py | 2 +- src/transformers/modeling_utils.py | 20 +++++++++++++++---- src/transformers/models/bert/modeling_bert.py | 2 +- .../modeling_bert_generation.py | 4 +--- .../models/big_bird/modeling_big_bird.py | 4 +--- .../models/canine/modeling_canine.py | 4 ++-- .../models/convbert/modeling_convbert.py | 2 +- .../models/data2vec/modeling_data2vec_text.py | 2 +- .../models/electra/modeling_electra.py | 2 +- .../models/ibert/modeling_ibert.py | 2 +- .../models/longformer/modeling_longformer.py | 2 +- .../megatron_bert/modeling_megatron_bert.py | 2 +- src/transformers/models/mmbt/modeling_mmbt.py | 2 +- .../models/mobilebert/modeling_mobilebert.py | 4 +--- .../models/mpnet/modeling_mpnet.py | 2 +- .../nystromformer/modeling_nystromformer.py | 2 +- .../models/qdqbert/modeling_qdqbert.py | 2 +- .../models/realm/modeling_realm.py | 2 +- .../models/rembert/modeling_rembert.py | 2 +- .../models/retribert/modeling_retribert.py | 2 +- .../models/roberta/modeling_roberta.py | 2 +- .../models/roformer/modeling_roformer.py | 2 +- .../models/splinter/modeling_splinter.py | 2 +- .../squeezebert/modeling_squeezebert.py | 2 +- src/transformers/models/t5/modeling_t5.py | 2 +- .../models/tapas/modeling_tapas.py | 2 +- src/transformers/models/vilt/modeling_vilt.py | 2 +- .../visual_bert/modeling_visual_bert.py | 4 ++-- .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 2 +- src/transformers/models/yoso/modeling_yoso.py | 2 +- ...ng_{{cookiecutter.lowercase_modelname}}.py | 2 +- 31 files changed, 48 insertions(+), 42 deletions(-) diff --git a/examples/research_projects/longform-qa/eli5_utils.py b/examples/research_projects/longform-qa/eli5_utils.py index ff72a16bfd235b..c14210bd5e584f 100644 --- a/examples/research_projects/longform-qa/eli5_utils.py +++ b/examples/research_projects/longform-qa/eli5_utils.py @@ -137,7 +137,7 @@ def embed_sentences_checkpointed(self, input_ids, attention_mask, checkpoint_bat token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) head_mask = [None] * self.sent_encoder.config.num_hidden_layers extended_attention_mask: torch.Tensor = self.sent_encoder.get_extended_attention_mask( - attention_mask, input_shape, device + attention_mask, input_shape ) # define function for checkpointing diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 05777d7111de0c..81db2ff4a2e7b9 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -651,7 +651,13 @@ def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: return encoder_extended_attention_mask @staticmethod - def create_extended_attention_mask_for_decoder(input_shape, attention_mask, device): + def create_extended_attention_mask_for_decoder(input_shape, attention_mask, device=None): + if device is not None: + warnings.warn( + "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) + else: + device = attention_mask.device batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length, device=device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] @@ -672,7 +678,9 @@ def create_extended_attention_mask_for_decoder(input_shape, attention_mask, devi extended_attention_mask = causal_mask[:, 
None, :, :] * attention_mask[:, None, None, :] return extended_attention_mask - def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device) -> Tensor: + def get_extended_attention_mask( + self, attention_mask: Tensor, input_shape: Tuple[int], device: device = None + ) -> Tensor: """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. @@ -681,12 +689,16 @@ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple Mask with ones indicating tokens to attend to, zeros for tokens to ignore. input_shape (`Tuple[int]`): The shape of the input to the model. - device: (`torch.device`): - The device of the input to the model. Returns: `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. """ + if not (attention_mask.dim() == 2 and self.config.is_decoder): + # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` + if device is not None: + warnings.warn( + "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. if attention_mask.dim() == 3: diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index eb9c35e9788719..91cb92b02ee009 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -982,7 +982,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 49a9e96d7c2b1c..33dfb530b9941e 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -364,9 +364,7 @@ def forward( # ourselves in which case we just need to make it broadcastable to all heads. 
extended_attention_mask = None if not use_cache: - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device - ) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 563e8631902a50..a2ea03c17aeb37 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -2112,9 +2112,7 @@ def forward( to_mask = None # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device - ) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) else: raise ValueError( f"attention_type can either be original_full or block_sparse, but is {self.attention_type}" diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index 2d903109ac0386..e9cad1d15b8624 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -1130,12 +1130,12 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) molecule_attention_mask = self._downsample_attention_mask( attention_mask, downsampling_rate=self.config.downsampling_rate ) extended_molecule_attention_mask: torch.Tensor = self.get_extended_attention_mask( - molecule_attention_mask, (batch_size, molecule_attention_mask.shape[-1]), device + molecule_attention_mask, (batch_size, molecule_attention_mask.shape[-1]) ) # Prepare head mask if needed diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index cf2240b79c01e0..d7216b879eed44 100755 --- a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -833,7 +833,7 @@ def forward( else: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) hidden_states = self.embeddings( diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 9168281eb8447a..5962a4447a236b 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -820,7 +820,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it 
broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index c79ec763ac4801..a310ec0965523c 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -882,7 +882,7 @@ def forward( else: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index 420e8b27404704..421dbcae0b16d2 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -814,7 +814,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 6d18e11a2b4025..20a2e9d239e24a 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1692,7 +1692,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)[ + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)[ :, 0, 0, : ] diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index b64a0d41b939ec..882798f68f074f 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -940,7 +940,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/mmbt/modeling_mmbt.py b/src/transformers/models/mmbt/modeling_mmbt.py index 5e284c1b699657..8819dc4d5178c0 100644 --- a/src/transformers/models/mmbt/modeling_mmbt.py +++ b/src/transformers/models/mmbt/modeling_mmbt.py @@ -268,7 +268,7 @@ def forward( [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1 ) - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, self.device) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index 1a2156ed31d034..87db30757db964 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -875,9 +875,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, self.device - ) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index 89b68544a1efdb..e7977561fe2b1a 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -547,7 +547,7 @@ def forward( if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds) diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py index 636e5df108ce18..b5813af781b72f 100755 --- a/src/transformers/models/nystromformer/modeling_nystromformer.py +++ b/src/transformers/models/nystromformer/modeling_nystromformer.py @@ -624,7 +624,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py index ecd9fe73a95d49..5e28f4314fca23 100755 --- a/src/transformers/models/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/qdqbert/modeling_qdqbert.py @@ -952,7 +952,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index f8849a7878dbe2..b2cf47e9faa370 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1078,7 +1078,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index dc6f88f886ad69..c7b8da35a27209 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -857,7 +857,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/retribert/modeling_retribert.py b/src/transformers/models/retribert/modeling_retribert.py index 8470aea7ae59d2..7d0a42b5fb132a 100644 --- a/src/transformers/models/retribert/modeling_retribert.py +++ b/src/transformers/models/retribert/modeling_retribert.py @@ -117,7 +117,7 @@ def embed_sentences_checkpointed( token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) head_mask = [None] * sent_encoder.config.num_hidden_layers extended_attention_mask: torch.Tensor = sent_encoder.get_extended_attention_mask( - attention_mask, input_shape, device + attention_mask, input_shape ) # define function for checkpointing diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index e7aac773fae768..5c226615d2bbc4 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -817,7 +817,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index fe746971504697..738df511192245 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -900,7 +900,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 4d695b3137e1f0..96b9998bd08b1e 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -710,7 +710,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index b8cdfe16a9f4bd..210531772984a2 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -612,7 +612,7 @@ def forward( if token_type_ids is None: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 68b40c5fd45e74..630e9dd17aa549 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -957,7 +957,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index ceb250f5161fff..b0c3786ca05a45 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -954,7 +954,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index f29057addecc03..74759be4047742 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -843,7 +843,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) encoder_outputs = self.encoder( embedding_output, diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index 506b0c749aee09..643411ee7f32c9 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -794,12 +794,12 @@ def forward( if visual_embeds is not None: combined_attention_mask = torch.cat((attention_mask, visual_attention_mask), dim=-1) extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - combined_attention_mask, [batch_size, input_shape + visual_input_shape], device + combined_attention_mask, (batch_size, input_shape + visual_input_shape) ) else: extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, [batch_size, input_shape], device + attention_mask, (batch_size, input_shape) ) # Prepare head mask if needed diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index e6c3ac3ec8c79a..974a3e7b3e5322 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -788,7 +788,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index bcd9c516cc8baa..50013ca03209e9 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -816,7 +816,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index bde5eaa2e3b95f..938cbea65c63a3 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -876,7 +876,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
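
For reference, a minimal sketch of how downstream code interacts with the two touched helpers after this change. The checkpoint name `bert-base-uncased` is used purely for illustration; any model built on `PreTrainedModel` exposes the same mixin methods.

    from transformers import BertModel, BertTokenizer
    from transformers.modeling_utils import ModuleUtilsMixin

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    inputs = tokenizer("Hello world", return_tensors="pt")
    attention_mask = inputs["attention_mask"]
    input_shape = inputs["input_ids"].size()

    # New call pattern: the device is inferred from `attention_mask.device`.
    extended_mask = model.get_extended_attention_mask(attention_mask, input_shape)

    # Passing a device still works for now, but emits a FutureWarning and the
    # argument will be removed in v5 of Transformers.
    extended_mask = model.get_extended_attention_mask(attention_mask, input_shape, model.device)

    # The decoder helper is a staticmethod and follows the same convention:
    # omit `device` and it falls back to `attention_mask.device`.
    causal_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
        input_shape, attention_mask
    )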