
Commit 04addbc

Merge branch 'main' into remove-constrained-bs
2 parents e800c78 + becab2c commit 04addbc

216 files changed, +3779 −2126 lines changed


.github/workflows/collated-reports.yml

Lines changed: 0 additions & 6 deletions
@@ -41,9 +41,3 @@ jobs:
           --job ${{ inputs.job }} \
           --report-repo-id ${{ inputs.report_repo_id }} \
           --gpu-name ${{ inputs.gpu_name }}
-
-      - name: Upload collated reports
-        uses: actions/upload-artifact@v4
-        with:
-          name: collated_reports_${{ env.CI_SHA }}.json
-          path: collated_reports_${{ env.CI_SHA }}.json

.github/workflows/push-important-models.yml

Lines changed: 1 addition & 0 deletions
@@ -145,6 +145,7 @@ jobs:
     name: Model CI
     uses: ./.github/workflows/self-scheduled.yml
     needs: get_modified_models
+    if: needs.get_modified_models.outputs.matrix != '' && needs.get_modified_models.outputs.matrix != '[]'
     with:
       job: run_models_gpu
       slack_report_channel: "#transformers-ci-push"

.github/workflows/self-scheduled.yml

Lines changed: 1 addition & 1 deletion
@@ -515,7 +515,7 @@ jobs:
       run_quantization_torch_gpu,
       run_extract_warnings
     ]
-    if: ${{ always() }}
+    if: always() && !cancelled()
     uses: ./.github/workflows/slack-report.yml
     with:
       job: ${{ inputs.job }}

.github/workflows/slack-report.yml

Lines changed: 1 addition & 1 deletion
@@ -93,7 +93,7 @@ jobs:
             python utils/notification_service.py "${{ inputs.quantization_matrix }}"
           else
             python utils/notification_service.py "${{ inputs.folder_slices }}"
-        fi
+          fi
 
         # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
         - name: Failure table artifacts

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
@@ -373,6 +373,8 @@
   - sections:
     - local: model_doc/albert
       title: ALBERT
+    - local: model_doc/apertus
+      title: Apertus
     - local: model_doc/arcee
       title: Arcee
     - local: model_doc/bamba

docs/source/en/cache_explanation.md

Lines changed: 3 additions & 2 deletions
@@ -15,6 +15,7 @@ rendered properly in your Markdown viewer.
 -->
 
 # Caching
+
 Imagine you're having a conversation with someone, and instead of remembering what they previously said, they have to start from scratch every time you respond. This would be slow and inefficient, right?
 
 You can extend this analogy to transformer models. Autoregressive model generation can be slow because it makes a prediction one token at a time. Each new prediction is dependent on all the previous context.

@@ -107,7 +108,7 @@ model_id = "meta-llama/Llama-2-7b-chat-hf"
 model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map=device)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-past_key_values = DynamicCache()
+past_key_values = DynamicCache(config=model.config)
 messages = [{"role": "user", "content": "Hello, what's your name."}]
 inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
 

@@ -138,7 +139,7 @@ The cache position tracks where to insert new tokens in the attention cache. It
 Cache position is used internally for two purposes:
 
 1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven’t been cached yet are passed to the model's `forward`.
-2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, like [`StaticCache`], that pre-allocates a specific cache length.
+2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, that pre-allocates a specific cache length.
 
 The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots.
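To make the two doc changes above concrete, here is a minimal sketch (not part of the commit) of a custom generation loop that initializes the cache with `DynamicCache(config=model.config)` and advances an explicit `cache_position`. The loop length, greedy token selection, and variable names are illustrative assumptions.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "meta-llama/Llama-2-7b-chat-hf"  # same checkpoint as the doc example
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Hello, what's your name?", return_tensors="pt").to(model.device)
generated = inputs.input_ids

# Cache is now initialized from the model config, as in the updated example above.
past_key_values = DynamicCache(config=model.config)
# cache_position marks which slots in the cache the current tokens occupy.
cache_position = torch.arange(generated.shape[1], device=model.device)
next_input = generated

with torch.no_grad():
    for _ in range(20):  # illustrative fixed budget of new tokens
        outputs = model(
            input_ids=next_input,
            past_key_values=past_key_values,
            cache_position=cache_position,
            use_cache=True,
        )
        next_token = outputs.logits[:, -1].argmax(dim=-1, keepdim=True)  # greedy pick
        generated = torch.cat([generated, next_token], dim=-1)
        next_input = next_token                    # only the uncached token is fed next
        cache_position = cache_position[-1:] + 1   # advance to the next cache slot

print(tokenizer.decode(generated[0], skip_special_tokens=True))
```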

docs/source/en/gguf.md

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ Add the `gguf_file` parameter to [`~PreTrainedModel.from_pretrained`] to specify
 
 ```py
 # pip install gguf
+import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
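For context, a minimal sketch (not from this commit) of how the snippet above typically continues, loading both the tokenizer and the model from a GGUF file; the exact `.gguf` filename here is a placeholder assumption.

```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
gguf_file = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # placeholder quantized file name

# `gguf_file` tells from_pretrained which GGUF file inside the repo to dequantize and load.
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=gguf_file, dtype=torch.bfloat16)
```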

docs/source/en/kv_cache.md

Lines changed: 1 addition & 1 deletion
@@ -227,7 +227,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 user_prompts = ["Hello, what's your name?", "Btw, yesterday I was on a rock concert."]
 
-past_key_values = DynamicCache()
+past_key_values = DynamicCache(config=model.config)
 
 messages = []
 for prompt in user_prompts:
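The hunk ends at the loop header, so here is a hedged sketch of how such a multi-turn loop commonly continues, reusing the single `DynamicCache` across turns via `generate(past_key_values=...)`. The loop body shown is an illustration, not the file's actual code.

```py
for prompt in user_prompts:
    messages.append({"role": "user", "content": prompt})
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
    ).to(model.device)
    # Reuse the same cache across turns so earlier turns are not re-processed.
    outputs = model.generate(**inputs, past_key_values=past_key_values, max_new_tokens=64)
    reply = tokenizer.decode(outputs[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    messages.append({"role": "assistant", "content": reply})
```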
docs/source/en/model_doc/apertus.md

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+<!--Copyright 2025 The HuggingFace Team and the Swiss AI Initiative. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
+    </div>
+</div>
+
+# Apertus
+
+[Apertus](https://www.swiss-ai.org) is a family of large language models from the Swiss AI Initiative.
+
+> [!TIP]
+> Coming soon
+
+The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`], and from the command line.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="text-generation",
+    model="swiss-ai/Apertus-8B",
+    dtype=torch.bfloat16,
+    device=0
+)
+pipeline("Plants create energy through a process known as")
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "swiss-ai/Apertus-8B",
+)
+model = AutoModelForCausalLM.from_pretrained(
+    "swiss-ai/Apertus-8B",
+    dtype=torch.bfloat16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+input_ids = tokenizer("Plants create energy through a process known as", return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model swiss-ai/Apertus-8B --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+## ApertusConfig
+
+[[autodoc]] ApertusConfig
+
+## ApertusModel
+
+[[autodoc]] ApertusModel
+    - forward
+
+## ApertusForCausalLM
+
+[[autodoc]] ApertusForCausalLM
+    - forward
+
+## ApertusForTokenClassification
+
+[[autodoc]] ApertusForTokenClassification
+    - forward

docs/source/en/model_doc/efficientloftr.md

Lines changed: 4 additions & 3 deletions
@@ -45,7 +45,7 @@ results = keypoint_matcher([url_0, url_1], threshold=0.9)
 print(results[0])
 # {'keypoint_image_0': {'x': ..., 'y': ...}, 'keypoint_image_1': {'x': ..., 'y': ...}, 'score': ...}
 ```
-<hfoption id="AutoModel">
+</hfoption>
 <hfoption id="AutoModel">
 
 ```py

@@ -65,7 +65,7 @@ processor = AutoImageProcessor.from_pretrained("zju-community/efficientloftr")
 model = AutoModelForKeypointMatching.from_pretrained("zju-community/efficientloftr")
 
 inputs = processor(images, return_tensors="pt")
-with torch.no_grad():
+with torch.inference_mode():
     outputs = model(**inputs)
 
 # Post-process to get keypoints and matches

@@ -92,7 +92,8 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
 # EfficientLoFTR requires pairs of images
 images = [image1, image2]
 inputs = processor(images, return_tensors="pt")
-outputs = model(**inputs)
+with torch.inference_mode():
+    outputs = model(**inputs)
 
 # Extract matching information
 keypoints = outputs.keypoints # Keypoints in both images
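As a follow-up to the `torch.inference_mode()` changes above, here is a hedged sketch of the post-processing step referenced in the surrounding context lines; the `image_sizes` format and the output keys are assumptions about the keypoint-matching API, not part of this diff.

```py
import torch

# Assumes `processor`, `model`, and a PIL image pair `images = [image1, image2]` from the doc example.
inputs = processor(images, return_tensors="pt")
with torch.inference_mode():
    outputs = model(**inputs)

# Assumed format: one (height, width) tuple per image in each pair.
image_sizes = [[(img.height, img.width) for img in images]]
processed = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
for pair in processed:
    # Assumed keys: matched keypoints in each image plus per-match confidence scores.
    print(pair["keypoints0"].shape, pair["keypoints1"].shape, pair["matching_scores"].shape)
```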
