
Commit 958a242

Merge branch 'main' into update-electra-model-card
2 parents 2792fa3 + 9fd9476

134 files changed (+4194, -1028 lines)

.github/ISSUE_TEMPLATE/bug-report.yml

Lines changed: 2 additions & 2 deletions
@@ -48,11 +48,11 @@ body:
  - pipelines: @Rocketknight1
  - tensorflow: @gante and @Rocketknight1
  - tokenizers: @ArthurZucker and @itazap
- - trainer: @muellerzr @SunMarc
+ - trainer: @zach-huggingface @SunMarc

  Integrations:

- - deepspeed: HF Trainer/Accelerate: @muellerzr
+ - deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface
  - ray/raytune: @richardliaw, @amogkam
  - Big Model Inference: @SunMarc
  - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 2 additions & 2 deletions
@@ -51,12 +51,12 @@ Library:
  - pipelines: @Rocketknight1
  - tensorflow: @gante and @Rocketknight1
  - tokenizers: @ArthurZucker
- - trainer: @muellerzr and @SunMarc
+ - trainer: @zach-huggingface and @SunMarc
  - chat templates: @Rocketknight1

  Integrations:

- - deepspeed: HF Trainer/Accelerate: @muellerzr
+ - deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface
  - ray/raytune: @richardliaw, @amogkam
  - Big Model Inference: @SunMarc
  - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber

.github/scripts/codeowners_for_review_action

Lines changed: 3 additions & 3 deletions
@@ -14,7 +14,7 @@ docs/ @stevhliu
  # Owners of subsections of the library
  /src/transformers/generation/ @gante
  /src/transformers/pipeline/ @Rocketknight1 @yonigozlan
- /src/transformers/integrations/ @SunMarc @MekkCyber @muellerzr
+ /src/transformers/integrations/ @SunMarc @MekkCyber @zach-huggingface
  /src/transformers/quantizers/ @SunMarc @MekkCyber
  tests/ @ydshieh
  tests/generation/ @gante
@@ -27,8 +27,8 @@ tests/generation/ @gante
  # Specific files come after the sections/globs, so they take priority
  /.circleci/config.yml @ArthurZucker @ydshieh
  /utils/tests_fetcher.py @ydshieh
- trainer.py @muellerzr @SunMarc
- trainer_utils.py @muellerzr @SunMarc
+ trainer.py @zach-huggingface @SunMarc
+ trainer_utils.py @zach-huggingface @SunMarc
  /utils/modular_model_converter.py @Cyrilvallez @ArthurZucker

  # Owners of individual models are specific / high priority, and so they come last

benchmark/README.md

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

  ## Writing metrics to the database

- `MetricRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.
+ `MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.

  cf [`llama.py`](./llama.py) to see an example of this in practice.
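The changed line above is about using `MetricsRecorder` from a background thread. As a rough, self-contained sketch of that pattern (the helper names below are placeholders, not the actual `MetricsRecorder` API; see `llama.py` for the real usage):

```python
import threading
import time

# Placeholder stand-ins for the real benchmark pieces; the actual MetricsRecorder API may differ.
measurements = []
stop_event = threading.Event()

def read_device_measurement():
    return 0.0  # e.g. a GPU utilization/memory reading

def collect_device_measurements(interval_s=0.1):
    # Background reader: poll device metrics until the main thread signals stop.
    while not stop_event.is_set():
        measurements.append(read_device_measurement())
        time.sleep(interval_s)

reader = threading.Thread(target=collect_device_measurements, daemon=True)
reader.start()

# Main thread: run the model measurements without being blocked by the reader.
time.sleep(1.0)  # stands in for the actual model benchmark loop

stop_event.set()
reader.join()
print(f"collected {len(measurements)} device readings")
```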
benchmark/benchmarks_entrypoint.py

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@
  import logging
  import os
  from typing import Dict
- import psycopg2
  import sys

  from psycopg2.extras import Json
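For context on the import that stays (`from psycopg2.extras import Json`): `Json` adapts a Python dict so it can be written to a PostgreSQL `json`/`jsonb` column. A minimal, hypothetical sketch of that usage — the connection settings and table name are placeholders, not the benchmark's actual schema:

```python
import psycopg2
from psycopg2.extras import Json

# Placeholder connection settings; the real entrypoint gets these from its own config.
conn = psycopg2.connect(host="localhost", dbname="benchmarks", user="bench", password="bench")
with conn, conn.cursor() as cur:
    # Json(...) wraps a Python dict so psycopg2 can store it in a json/jsonb column.
    cur.execute(
        "INSERT INTO model_measurements (metrics) VALUES (%s)",
        [Json({"time_to_first_token": 0.42, "tokens_per_second": 58.0})],
    )
conn.close()
```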

benchmark/llama.py

Lines changed: 4 additions & 4 deletions
@@ -215,7 +215,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
  torch.cuda.synchronize()
  end = perf_counter()
  time_to_second_token = end - start
- logger.info(f"completed second compile generation in: {time_to_first_token}s")
+ logger.info(f"completed second compile generation in: {time_to_second_token}s")
  cache_position += 1
  all_generated_tokens += next_token.clone().detach().cpu().tolist()

@@ -227,7 +227,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
  torch.cuda.synchronize()
  end = perf_counter()
  time_to_third_token = end - start
- logger.info(f"completed third compile forward in: {time_to_first_token}s")
+ logger.info(f"completed third compile forward in: {time_to_third_token}s")
  cache_position += 1
  all_generated_tokens += next_token.clone().detach().cpu().tolist()

@@ -298,7 +298,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
  output = model.generate(**inputs, past_key_values=past_key_values)
  end = perf_counter()
  third_compile_generate_time = end - start
- logger.info(f"completed second compile generation in: {third_compile_generate_time}s")
+ logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
  logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

  past_key_values = StaticCache(
@@ -313,7 +313,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
  output = model.generate(**inputs, past_key_values=past_key_values)
  end = perf_counter()
  fourth_compile_generate_time = end - start
- logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s")
+ logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
  logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

  metrics_recorder.collect_model_measurements(
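The corrected log lines above all follow the same timing pattern: call `torch.cuda.synchronize()` so queued GPU work finishes before reading `perf_counter()`, then log the elapsed interval under the matching variable name. A minimal, standalone sketch of that pattern (a toy matmul stands in for the benchmark's generate/forward calls):

```python
import torch
from time import perf_counter

# Synchronize before reading the clock so the measured interval covers the queued
# GPU work, not just the kernel launch.
device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(1024, 1024, device=device)

start = perf_counter()
y = x @ x  # stands in for a generate()/forward() call
if device == "cuda":
    torch.cuda.synchronize()
end = perf_counter()
time_to_result = end - start
print(f"completed matmul in: {time_to_result}s")
```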

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
@@ -415,6 +415,8 @@
    title: DeBERTa
  - local: model_doc/deberta-v2
    title: DeBERTa-v2
+ - local: model_doc/deepseek_v3
+   title: DeepSeek-V3
  - local: model_doc/dialogpt
    title: DialoGPT
  - local: model_doc/diffllama

docs/source/en/attention_interface.md

Lines changed: 30 additions & 8 deletions
@@ -23,13 +23,13 @@ supported models.
  Most recent models can now switch from one attention function used in the Attention layer to the other, thanks to a simple mapping.
  By default, we provide the implementation for [`sdpa`](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html),
  [`flash_attention_2`](https://github.com/Dao-AILab/flash-attention) and [`flex_attention`](https://pytorch.org/docs/stable/nn.attention.flex_attention.html#module-torch.nn.attention.flex_attention)
- as well as `eager`, which is simple matrix multiplication without any optimization on top.
+ as well as `eager`, which is a simple matrix multiplication without any optimization on top.
  This is the setting you can usually choose when instantiating a model:

  ```python
  from transformers import AutoModelForCausalLM

- model_id = "meta-llama/Llama-3.2-1B
+ model_id = "meta-llama/Llama-3.2-1B"

  # Here, using flash attention as an example
  model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="flash_attention_2")
@@ -43,7 +43,7 @@ from transformers import AutoModelForCausalLM, AttentionInterface
  from transformers.integrations.sdpa_attention import sdpa_attention_forward
  import torch

- model_id = "meta-llama/Llama-3.2-1B
+ model_id = "meta-llama/Llama-3.2-1B"

  def my_new_sdpa(*args, **kwargs):
      print("I just entered the attention computation")
@@ -56,7 +56,7 @@ model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="my_n
  model(torch.ones(1, 5, dtype=int))
  ```

- You will see it prints "I just entered the attention computation" as many times as there are layers in the model (with this example, 16 times.
+ You will see it prints "I just entered the attention computation" as many times as there are layers in the model (with this example, 16 times).

  ## Dynamically switching attention function

@@ -70,12 +70,12 @@ model(torch.ones(1, 5, dtype=int))
  ```

  and it will stop printing the statements, as it now uses the `sdpa` attention.
- This allows to quickly change attention function, without needing to reload the model!
+ This allows to quickly change an attention function, without needing to reload the model!

- ## What about new args needed in my custom function?
+ ## What about new args needed in my custom attention function?

  But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
- `AttentionInterface` propagates kwargs all the way to the Attention layers, and to the attention function used. That way,
+ `AttentionInterface` propagate kwargs all the way to the Attention layers, and to the used attention function. That way,
  you can simply pass the arg (as a kwargs, i.e. you need to qualify the name of the arg) in the model's forward, and it will be correctly used in the attention. However, custom attention functions have some limitations. In particular, it must follow the signature and return format of other attention functions, i.e.

  ```python
@@ -103,4 +103,26 @@ model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="cust
  model(torch.ones(1, 5, dtype=int), a_new_kwargs=..., another_new_kwargs=...)
  ```

- If in doubt about what args/kwargs a given model sends to the attention function, simply check that model's modeling code on [GitHub](https://github.com/huggingface/transformers/tree/main/src/transformers/models)!
+ If in doubt about what args/kwargs a given model sends to the attention function, simply check that model's modeling code on [GitHub](https://github.com/huggingface/transformers/tree/main/src/transformers/models)!
+
+ ## Accessing current available implementations
+
+ Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
+ and/or perform a few checks, the prefered way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
+ would expect from a usual Python dictionary:
+
+ ```python
+ >>> from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+ >>> list(ALL_ATTENTION_FUNCTIONS.keys())
+ >>> ['flash_attention_2', 'flex_attention', 'sdpa']
+
+ >>> ALL_ATTENTION_FUNCTIONS["sdpa"]
+ >>> <function transformers.integrations.sdpa_attention.sdpa_attention_forward>
+
+ >>> ALL_ATTENTION_FUNCTIONS.get("sdpa", None)
+ >>> <function transformers.integrations.sdpa_attention.sdpa_attention_forward>
+
+ # You can also globally `register` a new function directly on it
+ >>> ALL_ATTENTION_FUNCTIONS.register("new_func", new_func)
+ ```
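Tying the snippets in this diff together, here is a small sketch that wraps the stock SDPA kernel, registers it globally via `ALL_ATTENTION_FUNCTIONS.register`, and then selects it by name. It only uses calls shown in the doc above, but `logged_sdpa` and the gated `meta-llama/Llama-3.2-1B` checkpoint are illustrative choices rather than anything this commit prescribes:

```python
import torch
from transformers import AutoModelForCausalLM
from transformers.integrations.sdpa_attention import sdpa_attention_forward
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

# Wrap the stock SDPA kernel so every attention call is visible.
def logged_sdpa(*args, **kwargs):
    print("I just entered the attention computation")
    return sdpa_attention_forward(*args, **kwargs)

# Register the wrapper globally (as in the new section above), then select it
# like any built-in implementation.
ALL_ATTENTION_FUNCTIONS.register("logged_sdpa", logged_sdpa)

model_id = "meta-llama/Llama-3.2-1B"  # any model supporting the interface should work
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="logged_sdpa")
model(torch.ones(1, 5, dtype=int))
```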
docs/source/en/model_doc/deepseek_v3.md

Lines changed: 184 additions & 0 deletions (new file)
@@ -0,0 +1,184 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# DeepSeek-V3

## Overview

The DeepSeek-V3 model was proposed in [DeepSeek-V3 Technical Report](https://arxiv.org/abs/2412.19437) by the DeepSeek-AI Team.

The abstract from the paper is the following:
We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance, DeepSeek-V3 requires only 2.788M H800 GPU hours for its full training. In addition, its training process is remarkably stable. Throughout the entire training process, we did not experience any irrecoverable loss spikes or perform any rollbacks. The model checkpoints are available at https://github.com/deepseek-ai/DeepSeek-V3.

## Limitations and call for contribution!

We are super happy to make this code community-powered, and would love to see how you can best optimize the following:

- current implementation uses the "naive" attention computation (so not really MLA)
- current implementation loops through the experts. This should be replaced. Pointers to use `get_packed_weights` from `integrations/tensor_parallel`.
- current implementation uses the EleutherAI formula for RoPE, using the original one would be more efficient! (should still follow our API)
- static cache is not supported (this should be just a generation config issue / config shape issues)

### Usage tips
The model uses Multi-head Latent Attention (MLA) and DeepSeekMoE architectures for efficient inference and cost-effective training. It employs an auxiliary-loss-free strategy for load balancing and a multi-token prediction training objective. The model can be used for various language tasks after being pre-trained on 14.8 trillion tokens and going through Supervised Fine-Tuning and Reinforcement Learning stages.

You can run the model in `FP8` automatically; using 2 nodes of 8 H100s should be more than enough!

```python
# `run_deepseek_v1.py`
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
torch.manual_seed(30)

tokenizer = AutoTokenizer.from_pretrained("deepseek-r1")

chat = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
    {"role": "user", "content": "I'd like to show off how chat templating works!"},
]


model = AutoModelForCausalLM.from_pretrained("deepseek-r1", device_map="auto", torch_dtype=torch.bfloat16)
inputs = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
import time
start = time.time()
outputs = model.generate(inputs, max_new_tokens=50)
print(tokenizer.batch_decode(outputs))
print(time.time()-start)
```
This generated:

``````
<|Assistant|><think>
Okay, the user wants to demonstrate how chat templating works. Let me break down what that means. Chat templating is about structuring the conversation data, especially for models that need specific input formats. Maybe they're referring to something like how messages are formatted with roles (user, assistant, system) in APIs like OpenAI.

First, I should explain what chat templating is. It's the process of formatting conversation data into a structured format that the model can understand. This usually includes roles and content. For example, user messages, assistant responses, and system messages each have their own role tags.

They might want an example. Let me think of a simple conversation. The user says "Hello, how are you?" and the assistant responds "I'm doing great. How can I help you today?" Then the user follows up with wanting to show off chat templating. So the example should include the history and the new message.

In some frameworks, like Hugging Face's Transformers, chat templates are applied using Jinja2 templates. The template might look something like combining system messages, then looping through user and assistant messages with appropriate tags. For instance, using {% for message in messages %} and assigning roles like <|user|>, <|assistant|>, etc.

I should structure the example with the messages array, showing each role and content. Then apply a hypothetical template to convert that into a formatted string the model uses. Also, mention that different models have different templating requirements, like using special tokens or varying role labels.

Wait, the user mentioned "chat templating" in the context of showing off. Maybe they want a practical example they can present. So providing a code snippet or a structured data example would be helpful. Let me outline a typical messages array and then the templated output.

Also, it's important to note that proper templating ensures the model knows the conversation flow, which is crucial for generating coherent responses. Maybe include a note about why it's important, like maintaining context and role-specific processing.

Let me check if there are any common mistakes or things to avoid. For example, not closing tags properly, or mismatching roles. But maybe that's too detailed unless the user asks. Focus on the positive example first.

Putting it all together, the response should have an example messages array, the applied template, and the final formatted string. Maybe use angle brackets or special tokens as placeholders. Also, mention that this helps in training or fine-tuning models with structured data.

I think that's a solid approach. Let me structure it step by step to make it clear.
</think>

Chat templating is a way to structure conversation data (e.g., user/assistant interactions) into a format that language models understand. This is especially important for models trained to handle multi-turn dialogues, where the input must explicitly separate roles (user, assistant, system, etc.) and messages. Let’s break this down with an example!

---

### **Step 1: Raw Conversation History**
Suppose we have this conversation:
- **User**: "Hello, how are you?"
- **Assistant**: "I'm doing great. How can I help you today?"
- **User**: "I'd like to show off how chat templating works!"

---

### **Step 2: Structured Messages**
In frameworks like Hugging Face Transformers or OpenAI, conversations are often formatted as a list of dictionaries with `role` and `content`:
```python
messages = [
{"role": "user", "content": "Hello, how are you?"},
{"role": "assistant", "content": "I'm doing great. How can I help you today?"},
{"role": "user", "content": "I'd like to show off how chat templating works!"},
]
```

---

### **Step 3: Apply a Chat Template**
A **chat template** converts this structured data into a single string formatted for the model. For example, using a Jinja-style template (common in Hugging Face):

```jinja
{% for message in messages %}
{% if message['role'] == 'user' %}
<|user|>{{ message['content'] }}<|end|>
{% elif message['role'] == 'assistant' %}
<|assistant|>{{ message['content'] }}<|end|>
{% endif %}
{% endfor %}
<|assistant|>
```

---

### **Step 4: Final Templated Output**
Applying the template to our `messages` list would produce:
```text
<|user|>Hello, how are you?<|end|>
<|assistant|>I'm doing great. How can I help you today?<|end|>
<|user|>I'd like to show off how chat templating works!<|end|>
<|assistant|>
```

This tells the model:
1. The conversation history (user/assistant turns).
2. The model’s turn to generate a response (`<|assistant|>` at the end).

---

### **Key Notes**:
- **Role Separation**: Tags like `<|user|>` and `<|assistant|>` help the model distinguish speakers.
- **Special Tokens**: Models often use unique tokens (e.g., `<|end|>`) to mark message boundaries.
- **Flexibility**: Templates vary by model (e.g., OpenAI uses `{"role": "user", "content": "..."}` instead of tags).

---

### **Why This Matters**:
- **Consistency**: Ensures the model understands dialogue structure.
- **Context Preservation**: Maintains the flow of multi-turn conversations.
- **Alignment**: Matches the format the model was trained on for better performance.

Want to dive deeper or see a specific framework’s implementation (e.g., OpenAI, Llama, Mistral)? Let me know! 😊<|end▁of▁sentence|>
``````
Use the following to run it
```bash
torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0|1 --rdzv-id an_id --rdzv-backend c10d --rdzv-endpoint master_addr:master_port run_deepseek_r1.py
```

If you have:
```bash
[rank0]: ncclInternalError: Internal check failed.
[rank0]: Last error:
[rank0]: Bootstrap : no socket interface found
```
error, it means NCCL was probably not loaded.
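A hedged first step for that error (`Bootstrap : no socket interface found` typically means NCCL could not pick a usable network interface) is to set NCCL's standard environment variables before launching; these are generic NCCL settings rather than anything the model card above specifies, and `eth0` is a placeholder for your actual NIC:

```python
import os

# Generic NCCL debugging defaults (assumed useful here; adjust for your cluster).
os.environ.setdefault("NCCL_SOCKET_IFNAME", "eth0")  # which network interface NCCL bootstraps on
os.environ.setdefault("NCCL_DEBUG", "INFO")          # make NCCL print its own diagnostics
```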
## DeepseekV3Config

[[autodoc]] DeepseekV3Config

## DeepseekV3Model

[[autodoc]] DeepseekV3Model
    - forward

## DeepseekV3ForCausalLM

[[autodoc]] DeepseekV3ForCausalLM
    - forward
