diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py
index a0cf8be82f8c30..5af80fc4503d32 100644
--- a/src/transformers/models/dinov2/modeling_dinov2.py
+++ b/src/transformers/models/dinov2/modeling_dinov2.py
@@ -583,7 +583,7 @@ def _set_gradient_checkpointing(self, module: Dinov2Encoder, value: bool = False
     DINOV2_START_DOCSTRING,
 )
 class Dinov2Model(Dinov2PreTrainedModel):
-    def __init__(self, config: Dinov2Config, add_pooling_layer: bool = True):
+    def __init__(self, config: Dinov2Config):
         super().__init__(config)
         self.config = config
 
@@ -591,7 +591,6 @@ def __init__(self, config: Dinov2Config, add_pooling_layer: bool = True):
         self.encoder = Dinov2Encoder(config)
 
         self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.pooler = Dinov2Pooler(config) if add_pooling_layer else None
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -651,10 +650,10 @@ def forward(
         )
         sequence_output = encoder_outputs[0]
         sequence_output = self.layernorm(sequence_output)
-        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+        pooled_output = sequence_output[:, 0, :]
 
         if not return_dict:
-            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
+            head_outputs = (sequence_output, pooled_output)
             return head_outputs + encoder_outputs[1:]
 
         return BaseModelOutputWithPooling(
@@ -665,22 +664,6 @@ def forward(
         )
 
 
-# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->Dinov2
-class Dinov2Pooler(nn.Module):
-    def __init__(self, config: Dinov2Config):
-        super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.activation = nn.Tanh()
-
-    def forward(self, hidden_states):
-        # We "pool" the model by simply taking the hidden state corresponding
-        # to the first token.
-        first_token_tensor = hidden_states[:, 0]
-        pooled_output = self.dense(first_token_tensor)
-        pooled_output = self.activation(pooled_output)
-        return pooled_output
-
-
 @add_start_docstrings(
     """
     Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
@@ -693,7 +676,7 @@ def __init__(self, config: Dinov2Config) -> None:
         super().__init__(config)
 
         self.num_labels = config.num_labels
-        self.dinov2 = Dinov2Model(config, add_pooling_layer=False)
+        self.dinov2 = Dinov2Model(config)
 
         # Classifier head
         self.classifier = (
@@ -770,7 +753,7 @@ def forward(
             loss = loss_fct(logits, labels)
 
         if not return_dict:
-            output = (logits,) + outputs[1:]
+            output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
 
         return ImageClassifierOutput(
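
Illustration (not part of the patch): with Dinov2Pooler removed, the pooled
output is simply the first (CLS) token of the layernormed hidden states rather
than a learned Linear+Tanh projection, and the return_dict=False tuple from
Dinov2Model now always starts with (sequence_output, pooled_output), which is
why the classification head slices outputs[2:] instead of outputs[1:]. A
minimal sketch of the new behavior, assuming this patch is applied and using
the facebook/dinov2-base checkpoint with dummy pixel values:

    import torch
    from transformers import Dinov2Model

    model = Dinov2Model.from_pretrained("facebook/dinov2-base")
    model.eval()
    pixel_values = torch.randn(1, 3, 224, 224)  # dummy batch; 224 is a multiple of the patch size (14)

    with torch.no_grad():
        outputs = model(pixel_values)

    # pooled_output is now exactly the CLS token of the layernormed sequence,
    # not the Linear+Tanh projection the removed Dinov2Pooler produced
    assert torch.equal(outputs.pooler_output, outputs.last_hidden_state[:, 0, :])

    # with return_dict=False the tuple always begins with both outputs, so
    # downstream code (e.g. the classification head) skips two items, not one
    sequence_output, pooled_output = model(pixel_values, return_dict=False)[:2]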