
update for CB #714

Open · wants to merge 6 commits into base: main

1 change: 1 addition & 0 deletions community_tasks/arabic_evals.py
@@ -26,6 +26,7 @@

This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
"""

import random
import re
from typing import Any, Dict, List, Optional, Union
1 change: 1 addition & 0 deletions examples/nanotron/custom_evaluation_tasks.py
@@ -26,6 +26,7 @@
This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
"""

import re
from dataclasses import asdict
from typing import Dict, List, Tuple
2 changes: 1 addition & 1 deletion src/lighteval/logging/evaluation_tracker.py
@@ -591,7 +591,7 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901
f"To load the details from a run, you can for instance do the following:\n"
f'```python\nfrom datasets import load_dataset\ndata = load_dataset("{repo_id}",\n\t"{sanitized_task}",\n\tsplit="train")\n```\n\n'
f"## Latest results\n\n"
f'These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace("/resolve/", "/blob/")})'
f"These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace('/resolve/', '/blob/')})"
f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
f'You find each in the results and the "latest" split for each eval):\n\n'
f"```python\n{results_string}\n```",
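The evaluation_tracker.py hunk above only changes the quoting style of one f-string. A short, hedged illustration of the constraint behind the swap (the `path` value below is made up, not from lighteval): before Python 3.12, an expression inside an f-string cannot reuse the f-string's own quote character, so normalizing the outer quotes to double quotes forces the inner `.replace()` arguments to single quotes.

```python
path = "org/repo/resolve/main/results.json"  # hypothetical value for illustration

# Valid on all supported Python versions: outer double quotes, inner single quotes.
link = f"These are the [latest results]({path.replace('/resolve/', '/blob/')})"

# SyntaxError before Python 3.12: the inner double quote would close the f-string early.
# link = f"These are the [latest results]({path.replace("/resolve/", "/blob/")})"

print(link)  # These are the [latest results](org/repo/blob/main/results.json)
```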
1 change: 1 addition & 0 deletions src/lighteval/metrics/imports/bert_scorer.py
@@ -22,6 +22,7 @@
# SOFTWARE.

"""Simplified version of the BertScorer lib - we only import what we need."""

import logging
import os
import time
16 changes: 8 additions & 8 deletions src/lighteval/metrics/metrics.py
@@ -616,16 +616,16 @@ class Metrics(Enum):
sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
)
g_pass_at_8_16 = SampleLevelMetricGrouping(
metric_name="G-Pass@8-16:48_samples",
sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
)
g_pass_at_16_expr_gold = SampleLevelMetricGrouping(
metric_name="G-Pass@16:48_samples",
@@ -645,8 +645,8 @@ class Metrics(Enum):
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
)
g_pass_at_16_latex_gold = SampleLevelMetricGrouping(
metric_name="G-Pass@16:48_samples",
@@ -666,8 +666,8 @@ class Metrics(Enum):
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
)
perfect_exact_match = SampleLevelMetric(
metric_name="perfect_em",
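The metrics.py hunks above all apply the same refactor: a dict comprehension with a constant value becomes a single `dict.fromkeys` call. A minimal equivalence sketch, with placeholder metric names standing in for `GPassAtK(k=16, n=48, strip_strings=True).all_metrics`:

```python
import numpy as np

# Placeholder metric names; the real list comes from GPassAtK(...).all_metrics.
all_metrics = ["G-Pass@16_0.25", "G-Pass@16_0.5", "mG-Pass@16"]

# Old style: comprehension with a constant value per key.
corpus_level_fn_old = {metric: np.mean for metric in all_metrics}
higher_is_better_old = {metric: True for metric in all_metrics}

# New style: dict.fromkeys builds the same mapping in one call.
corpus_level_fn_new = dict.fromkeys(all_metrics, np.mean)
higher_is_better_new = dict.fromkeys(all_metrics, True)

assert corpus_level_fn_old == corpus_level_fn_new
assert higher_is_better_old == higher_is_better_new
```

One caveat to keep in mind: `dict.fromkeys` reuses a single value object for every key, so it is only a drop-in replacement when that value is immutable or intentionally shared, as with `np.mean` and `True` here.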
1 change: 1 addition & 0 deletions src/lighteval/metrics/metrics_corpus.py
@@ -24,6 +24,7 @@
Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus.
A number of these aggregations come from the EleutherAIHarness
"""

import logging
import math
from typing import Literal
2 changes: 1 addition & 1 deletion src/lighteval/models/endpoints/openai_model.py
@@ -259,7 +259,7 @@ def _loglikelihood_tokens(
new_tokens == 1 for new_tokens in max_new_tokens
), "Only single token continuations are supported when using openai API."

logit_biases = [{tok: 100 for tok in sample.tokenized_continuation} for sample in split]
logit_biases = [dict.fromkeys(sample.tokenized_continuation, 100) for sample in split]

outputs = self.__call_api_parallel(
inputs, return_logits=True, max_new_tokens=max_new_tokens, num_samples=1, logit_bias=logit_biases
161 changes: 154 additions & 7 deletions src/lighteval/models/transformers/transformers_model.py
@@ -22,7 +22,7 @@

import logging
import os
from typing import Optional, Tuple, Union
from typing import Dict, Optional, Tuple, Union

import torch
import torch.nn.functional as F
@@ -254,14 +254,15 @@ def from_model(

# Instantiate the object without using __init__
self = cls.__new__(cls)
self.config = config
self.transformers_config = model.config
self.generation_config_dict = config.generation_parameters.to_transformers_dict()
self.config = config if config is not None else TransformersModelConfig(model_name=model.config.name_or_path)
if config is not None:
self.generation_config_dict = config.generation_parameters.to_transformers_dict()
self._max_length = self._init_max_length()
self._tokenizer = self._create_auto_tokenizer()
self.batch_size = config.batch_size
self.batch_size = getattr(config, "batch_size", None)
self.model_name = _simplify_name(model.name_or_path)
self.model_sha = config.get_model_sha()
self.model_sha = self.config.get_model_sha()

# If model_parallel is not set we compare the number of processes with the number of GPUs
self.model = model
@@ -508,7 +509,114 @@ def greedy_until_multi_turn( # noqa: C901
) -> GenerativeMultiturnResponse:
raise NotImplementedError("This method is not implemented for this model")

def greedy_until(
def _continious_greedy_until(
self,
requests: list[GreedyUntilRequest],
) -> list[GenerativeResponse]:
"""
Generates responses using a greedy decoding strategy until certain ending conditions are met.

Args:
requests (list[GreedyUntilRequest]): list of requests containing the context and ending conditions.

Returns:
list[GenerativeResponse]: list of generated responses.
"""
for request in requests:
request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token]
request.tokenized_context = self.tok_encode(request.context)

dataset = GenerativeTaskDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS)
results = []

for split in tqdm(
dataset.splits_iterator(),
total=dataset.num_dataset_splits,
desc="Splits",
position=0,
disable=False, # self.disable_tqdm,
):
# For chat models, generation stops with EOS token, so we don't need to specify stop tokens
if self.use_chat_template:
stop_tokens = []
else:
# NOTE: we are assuming all items in a batch behave similarly (same
# stop_tokens and max_tokens generated) which is not necessarily
# the case! Because of that we only use batch size of 1
stop_tokens = split[0].stop_sequence

max_new_tokens = self.config.generation_parameters.max_new_tokens or split[0].generation_size
returns_logits = split[0].use_logits
num_samples = split[0].num_samples

context = [sample.context for sample in split]
tokenized = self.tokenizer(context, add_special_tokens=self.add_special_tokens)

# The main question for this step is the following:
# Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk
# of losing some meaning, or have some generations that are exceedingly short?
# The choice we go for here is to avoid truncating the prompt if we can, since it
# should have been managed by the prompt creator/few shot manager if requested by the user.
inputs = tokenized["input_ids"]
context_size = len(inputs[0])

# left truncate the inputs to the maximum length
if max_new_tokens is not None:
if context_size + max_new_tokens > self.max_length:
logger.warning(
f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens."
)
context_size = self.max_length - max_new_tokens
if context_size < 0:
logger.critical(
f"{context_size=} is less than 0, either reduce the max_new_tokens or increase model max length."
)
raise ValueError("Context size is less than 0.")
inputs = [input[-context_size:] for input in inputs]
else:
if context_size > self.max_length:
logger.warning(
f"{context_size=} which is greater than {self.max_length=}. Truncating context to {self.max_length} tokens."
)
context_size = self.max_length
inputs = [input[-context_size:] for input in inputs]

_outputs = self._generate(
inputs=inputs,
max_new_tokens=max_new_tokens,
stop_tokens=stop_tokens,
returns_logits=returns_logits,
num_samples=num_samples,
)

for req_id, _output in _outputs.items():
output_token_ids = []
logprobs_raw = []
result = []

# for output in _output.outputs:
output_token_ids.append(_output.static_outputs)
# logprobs_raw.append(output.logprobs)
result.append(self.tokenizer.decode(_output.static_outputs))

if logprobs_raw and output_token_ids and False:
logprobs = [logprobs_raw[0][token_id].logprob for token_id in output_token_ids[0]]
else:
logprobs = []

input_token_ids = _output.full_prompt_ids
cur_response = GenerativeResponse(
result=result,
logits=logprobs,
generated_tokens=output_token_ids,
input_tokens=input_token_ids,
)
results.append(cur_response)

return dataset.get_original_order(results)

def _padded_greedy_until(
self,
requests: list[GreedyUntilRequest],
) -> list[GenerativeResponse]:
@@ -625,12 +733,41 @@ def greedy_until(
returns_logits=returns_logits,
num_samples=num_samples,
do_sample=do_sample,
use_fast=False,
)
results.extend(cur_reponses)

return dataset.get_original_order(results)

def _generate(
def greedy_until(
self,
requests: list[GreedyUntilRequest],
use_fast: bool = True,
) -> list[GenerativeResponse]:
if use_fast:
return self._continious_greedy_until(requests)
else:
return self._padded_greedy_until(requests)

def _generate_fast(
self,
inputs: list[list[int]],
max_new_tokens: Optional[int] = None,
stop_tokens: Optional[list[str]] = None,
returns_logits: Optional[bool] = False,
num_samples: int = 1,
generate: bool = True,
) -> Dict[str, GenerativeResponse]:
# Compute model generation
batch_outputs = self.model.generate_batch(
inputs=inputs,
generation_config=self.model.generation_config,
# You can pass request-specific overrides here, e.g., max_new_tokens=100
)

return batch_outputs

def _generate_padded(
self,
batch: Batch,
max_new_tokens: int,
@@ -711,6 +848,16 @@ def _generate(

return all_responses

def _generate(
self,
use_fast: bool = True,
**kwargs,
) -> list[GenerativeResponse]:
if use_fast:
return self._generate_fast(**kwargs)
else:
return self._generate_padded(**kwargs)

def loglikelihood(
self,
requests: list[LoglikelihoodRequest],
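For reference, a standalone sketch of the left-truncation rule that the new `_continious_greedy_until` path applies before calling `_generate` (a toy function under assumed inputs, not the actual lighteval API):

```python
from typing import Optional


def left_truncate(inputs: list[list[int]], max_length: int, max_new_tokens: Optional[int] = None) -> list[list[int]]:
    """Left-truncate tokenized prompts so that prompt + generation fits in max_length."""
    context_size = len(inputs[0])  # assumes prompts in the split have similar lengths
    if max_new_tokens is not None:
        if context_size + max_new_tokens > max_length:
            context_size = max_length - max_new_tokens
            if context_size < 0:
                raise ValueError("Context size is less than 0.")
            inputs = [seq[-context_size:] for seq in inputs]
    elif context_size > max_length:
        inputs = [seq[-max_length:] for seq in inputs]
    return inputs


# Example: a 10-token prompt with a 6-token generation budget and an 8-token window
# keeps only the last 2 prompt tokens.
print(left_truncate([list(range(10))], max_length=8, max_new_tokens=6))  # [[8, 9]]
```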
2 changes: 1 addition & 1 deletion src/lighteval/pipeline.py
@@ -383,7 +383,7 @@ def tensor_replacer(match):
try:
return ast.literal_eval(processed)
except Exception as e:
raise ValueError(f"Failed to parse after preprocessing. " f"Processed string:\n{processed}\n\nError: {e}")
raise ValueError(f"Failed to parse after preprocessing. Processed string:\n{processed}\n\nError: {e}")

def _load_responses_from_details(self):
logger.info("--- LOADING RESPONSES FROM DETAILS ---")
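The pipeline.py change above only merges two adjacent f-string literals into one. A quick sketch of why the two spellings are interchangeable (the values are made up): Python concatenates adjacent string literals at compile time, so the split form and the joined form produce the identical message.

```python
processed = "{'weights': [0.1, 0.9]}"   # hypothetical preprocessed string
err = ValueError("malformed literal")   # hypothetical parse error

split_form = f"Failed to parse after preprocessing. " f"Processed string:\n{processed}\n\nError: {err}"
joined_form = f"Failed to parse after preprocessing. Processed string:\n{processed}\n\nError: {err}"

assert split_form == joined_form
```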
18 changes: 7 additions & 11 deletions src/lighteval/tasks/default_prompts.py
@@ -749,7 +749,7 @@ def ethics_commonsense(line, task_name: str = None):
def ethics_deontology(line, task_name: str = None):
return Doc(
task_name=task_name,
query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']} {line['excuse']}\"\nAnswer:",
query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]} {line["excuse"]}"\nAnswer:',
choices=[" unreasonable", " reasonable"],
gold_index=int(line["label"]),
instruction="Question: Would most people believe this reasonable or unreasonable to say?",
@@ -759,7 +759,7 @@ def ethics_justice(line, task_name: str = None):
def ethics_justice(line, task_name: str = None):
return Doc(
task_name=task_name,
query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']}\"\nAnswer:",
query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]}"\nAnswer:',
choices=[" unreasonable", " reasonable"],
gold_index=int(line["label"]),
instruction="Question: Would most people believe this reasonable or unreasonable to say?",
@@ -782,7 +782,7 @@ def ethics_utilitarianism(line, task_name: str = None):
def ethics_virtue(line, task_name: str = None):
return Doc(
task_name=task_name,
query=f"Sentence: {line['scenario']}\nQuestion: Does the character in this sentence exhibit the trait \"{line['trait']}\"?\nAnswer:",
query=f'Sentence: {line["scenario"]}\nQuestion: Does the character in this sentence exhibit the trait "{line["trait"]}"?\nAnswer:',
choices=[" no", " yes"],
gold_index=int(line["label"]),
)
@@ -1159,33 +1159,29 @@ def lextreme_covid19_emergency_event(line, task_name: str = None):

def lextreme_multi_eurlex_level_1(line, task_name: str = None):
instruction = (
"In this task, you are given a document from an EU law. "
"Predict the level 1 concept in the EUROVOC taxonomy."
"In this task, you are given a document from an EU law. Predict the level 1 concept in the EUROVOC taxonomy."
)
return lextreme(line, instruction, task_name)


def lextreme_multi_eurlex_level_2(line, task_name: str = None):
instruction = (
"In this task, you are given a document from an EU law. "
"Predict the level 2 concept in the EUROVOC taxonomy."
"In this task, you are given a document from an EU law. Predict the level 2 concept in the EUROVOC taxonomy."
)
return lextreme(line, instruction, task_name)


def lextreme_multi_eurlex_level_3(line, task_name: str = None):
instruction = (
"In this task, you are given a document from an EU law. "
"Predict the level 3 concept in the EUROVOC taxonomy."
"In this task, you are given a document from an EU law. Predict the level 3 concept in the EUROVOC taxonomy."
)

return lextreme(line, instruction, task_name)


def lextreme_greek_legal_ner(line, task_name: str = None):
instruction = (
"In this task, you are given a sentence from Greek legislation. "
"Predict the named entity type for each token."
"In this task, you are given a sentence from Greek legislation. Predict the named entity type for each token."
)
return lextreme(line, instruction, task_name)

2 changes: 1 addition & 1 deletion src/lighteval/tasks/extended/hle/main.py
@@ -208,7 +208,7 @@ def hle_text_only(line, task_name: str = None):

hle_metrics = CorpusLevelMetricGrouping(
metric_name=["accuracy", "confidence_half_width", "calibration_error"],
higher_is_better={n: True for n in ["accuracy", "confidence_half_width", "calibration_error"]},
higher_is_better=dict.fromkeys(["accuracy", "confidence_half_width", "calibration_error"], True),
category=MetricCategory.LLM_AS_JUDGE,
use_case=MetricUseCase.ACCURACY,
sample_level_fn=JudgeLLMHLE().compute,