4 changes: 2 additions & 2 deletions README.md
@@ -127,7 +127,7 @@ Did not find what you need ? You can always make your custom model API by follow
Here's a **quick command** to evaluate a model via *Inference Providers*:

```shell
lighteval eval "hf-inference-providers/openai/gpt-oss-20b" "lighteval|gpqa:diamond|0"
lighteval eval "hf-inference-providers/openai/gpt-oss-20b" gpqa:diamond
```

Or use the **Python API** to run a model *already loaded in memory*!
@@ -141,7 +141,7 @@ from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters


MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
BENCHMARKS = "lighteval|gsm8k|0"
BENCHMARKS = "gsm8k"

evaluation_tracker = EvaluationTracker(output_dir="./results")
pipeline_params = PipelineParameters(
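# --- Sketch, not part of this PR: a typical continuation of the truncated snippet above.
# --- Argument and method names are assumptions and may differ across lighteval versions.
    launcher_type=ParallelismManager.ACCELERATE,
    max_samples=10,            # optional: cap samples for a quick test run
)

pipeline = Pipeline(
    tasks=BENCHMARKS,
    pipeline_parameters=pipeline_params,
    evaluation_tracker=evaluation_tracker,
    model=model,               # the model object already loaded in memory (defined in the elided part)
)

pipeline.evaluate()
pipeline.show_results()
pipeline.save_and_push_results()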
14 changes: 3 additions & 11 deletions docs/source/adding-a-custom-task.mdx
@@ -78,7 +78,6 @@ from lighteval.tasks.lighteval_task import LightevalTaskConfig
task = LightevalTaskConfig(
name="myothertask",
prompt_function=prompt_fn, # Must be defined in the file or imported
suite=["community"],
hf_repo="your_dataset_repo_on_hf",
hf_subset="default",
hf_avail_splits=["train", "test"],
@@ -115,7 +114,6 @@ class CustomSubsetTask(LightevalTaskConfig):
evaluation_splits=["test"],
few_shots_split="train",
few_shots_select="random_sampling_from_train",
suite=["lighteval"],
generation_size=256,
stop_sequence=["\n", "Question:"],
)
@@ -149,22 +147,16 @@ Once your file is created, you can run the evaluation with the following command
```bash
lighteval accelerate \
"model_name=HuggingFaceH4/zephyr-7b-beta" \
"lighteval|{task}|{fewshots}" \
{task} \
--custom-tasks {path_to_your_custom_task_file}
```

### Example Usage

```bash
# Run a custom task with zero-shot evaluation
# Run a custom task with 3-shot evaluation
lighteval accelerate \
"model_name=openai-community/gpt2" \
"lighteval|myothertask|0" \
--custom-tasks community_tasks/my_custom_task.py

# Run a custom task with few-shot evaluation
lighteval accelerate \
"model_name=openai-community/gpt2" \
"lighteval|myothertask|3" \
"myothertask|3" \
--custom-tasks community_tasks/my_custom_task.py
```
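For orientation, here is a minimal sketch of what a `--custom-tasks` file such as `community_tasks/my_custom_task.py` might contain once the pieces above are assembled; the dataset field names in `prompt_fn` and the omitted metric setup are assumptions, not part of this PR:

```python
# community_tasks/my_custom_task.py — illustrative sketch, adapt to your dataset.
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def prompt_fn(line, task_name: str = None):
    # Turn one raw dataset row into a Doc that lighteval can score.
    return Doc(
        task_name=task_name,
        query=line["question"],    # assumed column name
        choices=[line["answer"]],  # assumed column name
        gold_index=0,
    )


task = LightevalTaskConfig(
    name="myothertask",
    prompt_function=prompt_fn,
    hf_repo="your_dataset_repo_on_hf",
    hf_subset="default",
    evaluation_splits=["test"],
    generation_size=256,
    stop_sequence=["\n", "Question:"],
    # ...plus the metric configuration described in the full guide
)

# lighteval discovers custom tasks through a module-level TASKS_TABLE list.
TASKS_TABLE = [task]
```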
2 changes: 1 addition & 1 deletion docs/source/adding-a-new-metric.mdx
@@ -146,7 +146,7 @@ path_to_your_file` when launching it after adding it to the task config.
```bash
lighteval accelerate \
"model_name=openai-community/gpt2" \
"leaderboard|truthfulqa:mc|0" \
"truthfulqa:mc" \
--custom-tasks path_to_your_metric_file.py
```

2 changes: 1 addition & 1 deletion docs/source/available-tasks.mdx
@@ -26,5 +26,5 @@ lighteval tasks inspect <task_name>

Example:
```bash
lighteval tasks inspect "lighteval|truthfulqa:mc|0"
lighteval tasks inspect truthfulqa:mc
```
4 changes: 2 additions & 2 deletions docs/source/evaluating-a-custom-model.mdx
@@ -59,7 +59,7 @@ You can evaluate your custom model using either the command-line interface or th
lighteval custom \
"google-translate" \
"examples/custom_models/google_translate_model.py" \
"lighteval|wmt20:fr-de|0" \
"wmt20:fr-de \
--max-samples 10
```

Expand Down Expand Up @@ -94,7 +94,7 @@ model_config = CustomModelConfig(

# Create and run the pipeline
pipeline = Pipeline(
tasks="leaderboard|truthfulqa:mc|0",
tasks="truthfulqa:mc",
pipeline_parameters=pipeline_params,
evaluation_tracker=evaluation_tracker,
model_config=model_config
2 changes: 1 addition & 1 deletion docs/source/index.mdx
@@ -59,7 +59,7 @@ pip install lighteval

```bash
lighteval eval "hf-inference-providers/openai/gpt-oss-20b" \
"lighteval|gpqa:diamond|0" \
gpqa:diamond \
--bundle-dir gpt-oss-bundle \
--repo-id OpenEvals/evals
```
22 changes: 11 additions & 11 deletions docs/source/inspect-ai.mdx
@@ -21,13 +21,13 @@ Once you've chosen a benchmark, run it with `lighteval eval`. Below are examples
1. Evaluate a model via Hugging Face Inference Providers.

```bash
lighteval eval "hf-inference-providers/openai/gpt-oss-20b" "lighteval|gpqa:diamond|0"
lighteval eval "hf-inference-providers/openai/gpt-oss-20b" gpqa:diamond
```

2. Run multiple evals at the same time.

```bash
lighteval eval "hf-inference-providers/openai/gpt-oss-20b" "lighteval|gpqa:diamond|0,lighteval|aime25|0"
lighteval eval "hf-inference-providers/openai/gpt-oss-20b" gpqa:diamond,aime25
```

3. Compare providers for the same model.
@@ -37,25 +37,25 @@ lighteval eval \
hf-inference-providers/openai/gpt-oss-20b:fireworks-ai \
hf-inference-providers/openai/gpt-oss-20b:together \
hf-inference-providers/openai/gpt-oss-20b:nebius \
"lighteval|gpqa:diamond|0"
gpqa:diamond
```

4. Evaluate a vLLM or SGLang model.

```bash
lighteval eval vllm/HuggingFaceTB/SmolLM-135M-Instruct "lighteval|gpqa:diamond|0"
lighteval eval vllm/HuggingFaceTB/SmolLM-135M-Instruct gpqa:diamond
```

5. See the impact of few-shot on your model.

```bash
lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|gsm8k|0,lighteval|gsm8k|5"
lighteval eval hf-inference-providers/openai/gpt-oss-20b "gsm8k|0,gsm8k|5"
```

6. Optimize custom server connections.

```bash
lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|gsm8k|0" \
lighteval eval hf-inference-providers/openai/gpt-oss-20b gsm8k \
--max-connections 50 \
--timeout 30 \
--retry-on-error 1 \
@@ -66,13 +66,13 @@ lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|gsm8k|0" \
7. Use multiple epochs for more reliable results.

```bash
lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|aime25|0" --epochs 16 --epochs-reducer "pass_at_4"
lighteval eval hf-inference-providers/openai/gpt-oss-20b aime25 --epochs 16 --epochs-reducer "pass_at_4"
```

8. Push to the Hub to share results.

```bash
lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|hle|0" \
lighteval eval hf-inference-providers/openai/gpt-oss-20b hle \
--bundle-dir gpt-oss-bundle \
--repo-id OpenEvals/evals \
--max-samples 100
@@ -92,17 +92,17 @@ Resulting Space:
You can use any argument defined in inspect-ai's API.

```bash
lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|aime25|0" --temperature 0.1
lighteval eval hf-inference-providers/openai/gpt-oss-20b aime25 --temperature 0.1
```

10. Use model-args to pass any inference-provider-specific argument.

```bash
lighteval eval google/gemini-2.5-pro "lighteval|aime25|0" --model-args location=us-east5
lighteval eval google/gemini-2.5-pro aime25 --model-args location=us-east5
```

```bash
lighteval eval openai/gpt-4o "lighteval|gpqa:diamond|0" --model-args service_tier=flex,client_timeout=1200
lighteval eval openai/gpt-4o gpqa:diamond --model-args service_tier=flex,client_timeout=1200
```


9 changes: 4 additions & 5 deletions docs/source/quicktour.mdx
@@ -21,6 +21,7 @@ Lighteval can be used with several different commands, each optimized for differ

### Evaluation Backends

- `lighteval eval`: Use [inspect-ai](https://inspect.aisi.org.uk/) as the backend to evaluate and inspect your models! (preferred way)
- `lighteval accelerate`: Evaluate models on CPU or one or more GPUs using [🤗
Accelerate](https://github.com/huggingface/accelerate)
- `lighteval nanotron`: Evaluate models in distributed settings using [⚡️
@@ -54,11 +55,9 @@ To evaluate `GPT-2` on the Truthful QA benchmark with [🤗
```bash
lighteval accelerate \
"model_name=openai-community/gpt2" \
"leaderboard|truthfulqa:mc|0"
truthfulqa:mc
```

Here, we first choose a backend (either `accelerate`, `nanotron`, `endpoint`, or `vllm`), and then specify the model and task(s) to run.

### Task Specification

The syntax for the task specification might be a bit hard to grasp at first. The format is as follows:
@@ -96,7 +95,7 @@ When specifying a path to a file, it should start with `./`.
lighteval accelerate \
"model_name=openai-community/gpt2" \
./path/to/lighteval/examples/tasks/recommended_set.txt
# or, e.g., "leaderboard|truthfulqa:mc|0,leaderboard|gsm8k|3"
# or, e.g., "truthfulqa:mc|0,gsm8k|3"
```

## Backend Configuration
@@ -120,7 +119,7 @@ thinking tokens:
```bash
lighteval vllm \
"model_name=mistralai/Magistral-Small-2507,dtype=float16,data_parallel_size=4" \
"lighteval|aime24|0" \
aime24 \
--remove-reasoning-tags \
--reasoning-tags="[('[THINK]','[/THINK]')]"
```
10 changes: 5 additions & 5 deletions docs/source/saving-and-reading-results.mdx
@@ -69,7 +69,7 @@ import glob
output_dir = "evals_doc"
model_name = "HuggingFaceH4/zephyr-7b-beta"
timestamp = "latest"
task = "lighteval|gsm8k|0"
task = "gsm8k"

if timestamp == "latest":
    path = f"{output_dir}/details/{model_name}/*/"
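    # Sketch (not part of this PR) of how the truncated example typically
    # continues: resolve "latest" to the newest timestamped results folder.
    timestamps = glob.glob(path)
    timestamp = sorted(timestamps)[-1].split("/")[-2]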
@@ -94,7 +94,7 @@ from datasets import load_dataset
results_org = "SaylorTwift"
model_name = "HuggingFaceH4/zephyr-7b-beta"
sanitized_model_name = model_name.replace("/", "__")
task = "lighteval|gsm8k|0"
task = "gsm8k"
public_run = False

dataset_path = f"{results_org}/details_{sanitized_model_name}{'_private' if not public_run else ''}"
@@ -192,7 +192,7 @@ The main results file contains several sections:
"model_size": "476.2 MB"
},
"results": {
"lighteval|gsm8k|0": {
"gsm8k|0": {
"em": 0.0,
"em_stderr": 0.0,
"maj@8": 0.0,
@@ -206,7 +206,7 @@ The main results file contains several sections:
}
},
"versions": {
"lighteval|gsm8k|0": 0
"gsm8k|0": 0
},
"config_tasks": {
"lighteval|gsm8k": {
@@ -257,7 +257,7 @@ The main results file contains several sections:
}
},
"summary_tasks": {
"lighteval|gsm8k|0": {
"gsm8k|0": {
"hashes": {
"hash_examples": "8517d5bf7e880086",
"hash_full_prompts": "8517d5bf7e880086",
@@ -98,15 +98,15 @@ model_parameters:
```bash
lighteval endpoint inference-endpoint \
"configs/endpoint_model.yaml" \
"lighteval|gsm8k|0"
gsm8k
```

### Using an Existing TGI Server

```bash
lighteval endpoint tgi \
"configs/tgi_server.yaml" \
"lighteval|gsm8k|0"
gsm8k
```

### Reusing an Existing Endpoint
4 changes: 2 additions & 2 deletions docs/source/use-inference-providers-as-backend.mdx
@@ -11,7 +11,7 @@ Lighteval allows you to use Hugging Face's Inference Providers to evaluate LLMs
```bash
lighteval endpoint inference-providers \
"model_name=deepseek-ai/DeepSeek-R1,provider=hf-inference" \
"lighteval|gsm8k|0"
gsm8k
```

## Using a Configuration File
@@ -21,7 +21,7 @@ You can use configuration files to define the model and the provider to use.
```bash
lighteval endpoint inference-providers \
examples/model_configs/inference_providers.yaml \
"lighteval|gsm8k|0"
gsm8k
```

With the following configuration file:
2 changes: 1 addition & 1 deletion docs/source/use-litellm-as-backend.mdx
@@ -12,7 +12,7 @@ OpenAI, Groq, and many others.
```bash
lighteval endpoint litellm \
"provider=openai,model_name=gpt-3.5-turbo" \
"lighteval|gsm8k|0"
gsm8k
```

## Using a Configuration File
8 changes: 4 additions & 4 deletions docs/source/use-sglang-as-backend.mdx
@@ -8,7 +8,7 @@ To use SGLang, simply change the `model_args` to reflect the arguments you want
```bash
lighteval sglang \
"model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
"leaderboard|truthfulqa:mc|0"
truthfulqa:mc
```

## Parallelism Options
@@ -23,7 +23,7 @@ For example, if you have 4 GPUs, you can split the model across them using tenso
```bash
lighteval sglang \
"model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tp_size=4" \
"leaderboard|truthfulqa:mc|0"
truthfulqa:mc
```

### Data Parallelism
@@ -33,7 +33,7 @@ If your model fits on a single GPU, you can use data parallelism with `dp_size`
```bash
lighteval sglang \
"model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,dp_size=4" \
"leaderboard|truthfulqa:mc|0"
truthfulqa:mc
```

## Using a Configuration File
@@ -44,7 +44,7 @@ An example configuration file is shown below and can be found at `examples/model
```bash
lighteval sglang \
"examples/model_configs/sglang_model_config.yaml" \
"leaderboard|truthfulqa:mc|0"
truthfulqa:mc
```

> [!TIP]
8 changes: 4 additions & 4 deletions docs/source/use-vllm-as-backend.mdx
@@ -11,7 +11,7 @@ To use VLLM, simply change the `model_args` to reflect the arguments you want to
```bash
lighteval vllm \
"model_name=HuggingFaceH4/zephyr-7b-beta" \
"extended|ifeval|0"
ifeval
```

## Parallelism Options
@@ -26,7 +26,7 @@ For example, if you have 4 GPUs, you can split the model across them using tenso
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \
"model_name=HuggingFaceH4/zephyr-7b-beta,tensor_parallel_size=4" \
"extended|ifeval|0"
ifeval
```

### Data Parallelism
@@ -36,7 +36,7 @@ If your model fits on a single GPU, you can use data parallelism to speed up the
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \
"model_name=HuggingFaceH4/zephyr-7b-beta,data_parallel_size=4" \
"extended|ifeval|0"
ifeval
```

## Using a Configuration File
@@ -47,7 +47,7 @@ An example configuration file is shown below and can be found at `examples/model
```bash
lighteval vllm \
"examples/model_configs/vllm_model_config.yaml" \
"extended|ifeval|0"
ifeval
```

```yaml