Bump ruff version #774

Merged
2 commits merged on May 22, 2025
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -34,7 +34,7 @@ repos:

- repo: https://github.com/charliermarsh/ruff-pre-commit
# Ruff version.
rev: 'v0.2.2'
rev: 'v0.11.10'
hooks:
- id: ruff
args: ['--fix']
1 change: 1 addition & 0 deletions community_tasks/arabic_evals.py
@@ -26,6 +26,7 @@

This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
"""

import random
import re
from typing import Any, Dict, List, Optional, Union
1 change: 1 addition & 0 deletions examples/nanotron/custom_evaluation_tasks.py
@@ -26,6 +26,7 @@
This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
"""

import re
from dataclasses import asdict
from typing import Dict, List, Tuple
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -95,7 +95,7 @@ nanotron = [
]
tensorboardX = ["tensorboardX"]
vllm = ["vllm>=0.7.0", "ray", "more_itertools"]
quality = ["ruff==v0.2.2","pre-commit"]
quality = ["ruff>=v0.11.0","pre-commit"]
tests = ["pytest==7.4.0","deepdiff"]
dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
docs = ["hf-doc-builder", "watchdog"]
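Note on applying the bump locally (not part of the diff): the pre-commit pin above and this `quality` constraint both move to the 0.11 line, and the formatting-only changes in the rest of the PR look like the output of re-running ruff. A rough sketch of how one might reproduce that locally, assuming ruff is installed from the updated extra:

```python
# Hedged sketch: re-run ruff the way the repo's tooling would after the bump.
# The exact hook set lives in .pre-commit-config.yaml; the calls below are the
# plain CLI equivalents and are illustrative, not taken from this PR.
import subprocess

# Confirm the installed version satisfies the new ">=0.11" constraint.
version = subprocess.run(["ruff", "--version"], capture_output=True, text=True, check=True)
print(version.stdout.strip())  # e.g. "ruff 0.11.10"

# Apply autofixes (matches the `--fix` arg of the pre-commit hook) and reformat.
subprocess.run(["ruff", "check", "--fix", "."], check=True)
subprocess.run(["ruff", "format", "."], check=True)
```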
2 changes: 1 addition & 1 deletion src/lighteval/logging/evaluation_tracker.py
@@ -603,7 +603,7 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901
f"To load the details from a run, you can for instance do the following:\n"
f'```python\nfrom datasets import load_dataset\ndata = load_dataset("{repo_id}",\n\t"{sanitized_task}",\n\tsplit="train")\n```\n\n'
f"## Latest results\n\n"
f'These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace("/resolve/", "/blob/")})'
f"These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace('/resolve/', '/blob/')})"
f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
f'You find each in the results and the "latest" split for each eval):\n\n'
f"```python\n{results_string}\n```",
2 changes: 1 addition & 1 deletion src/lighteval/logging/info_loggers.py
@@ -556,7 +556,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
if len(list_of_subtasks) > 1:
metrics = list(self.metric_aggregated[list_of_subtasks[0]].keys())
self.metric_aggregated[average_task] = {
metric: sum([self.metric_aggregated[k][metric] for k in list_of_subtasks]) / len(list_of_subtasks)
metric: sum(self.metric_aggregated[k][metric] for k in list_of_subtasks) / len(list_of_subtasks)
for metric in metrics
}

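Quick note on the `sum([...])` → `sum(...)` rewrites (here and in metrics/stderr.py below): newer ruff drops the intermediate list so `sum` consumes a lazy generator instead of allocating a list first; the result is identical. A tiny illustration with made-up numbers:

```python
metric_values = [0.25, 0.50, 0.75, 1.00]  # illustrative per-subtask scores

avg_old = sum([v for v in metric_values]) / len(metric_values)  # builds a list first
avg_new = sum(v for v in metric_values) / len(metric_values)    # generator, no list

assert avg_old == avg_new == 0.625
```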
1 change: 1 addition & 0 deletions src/lighteval/metrics/imports/bert_scorer.py
@@ -22,6 +22,7 @@
# SOFTWARE.

"""Simplified version of the BertScorer lib - we only import what we need."""

import logging
import os
import time
2 changes: 1 addition & 1 deletion src/lighteval/metrics/llm_as_judge.py
@@ -127,7 +127,7 @@ def __init__(
if self.backend == "inference-providers" and self.hf_provider is None:
raise ValueError("When using 'inference-providers' as backend, you must specify an 'hf_provider'")

def __lazy_load_client(self):
def __lazy_load_client(self): # noqa: C901
match self.backend:
# Both "openai" and "tgi" backends use the OpenAI-compatible API
# They are handled separately to allow for backend-specific validation and setup
16 changes: 8 additions & 8 deletions src/lighteval/metrics/metrics.py
@@ -624,16 +624,16 @@ class Metrics(Enum):
sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
)
g_pass_at_8_16 = SampleLevelMetricGrouping(
metric_name="G-Pass@8-16:48_samples",
sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
)
g_pass_at_16_expr_gold = SampleLevelMetricGrouping(
metric_name="G-Pass@16:48_samples",
@@ -653,8 +653,8 @@ class Metrics(Enum):
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
)
g_pass_at_16_latex_gold = SampleLevelMetricGrouping(
metric_name="G-Pass@16:48_samples",
@@ -674,8 +674,8 @@ class Metrics(Enum):
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
)
perfect_exact_match = SampleLevelMetric(
metric_name="perfect_em",
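Aside on the recurring `dict.fromkeys` rewrite in this file (and in openai_model.py and hle/main.py below): newer ruff flags dict comprehensions that map every key to the same value and suggests `dict.fromkeys`, which is equivalent here because the shared values (`np.mean`, `True`) are never mutated. A minimal illustration with made-up metric names:

```python
import numpy as np

all_metrics = ["g_pass@16", "g_pass@16_sigma"]  # hypothetical metric names

old_style = {metric: np.mean for metric in all_metrics}   # what ruff flags
new_style = dict.fromkeys(all_metrics, np.mean)           # suggested rewrite

assert old_style == new_style
# Caveat: dict.fromkeys shares a single value object across all keys, so this
# rewrite is only safe when that value is immutable or never mutated in place.
```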
1 change: 1 addition & 0 deletions src/lighteval/metrics/metrics_corpus.py
@@ -24,6 +24,7 @@
Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus.
A number of these aggregations come from the EleutherAIHarness
"""

import logging
import math
from typing import Literal
2 changes: 1 addition & 1 deletion src/lighteval/metrics/stderr.py
@@ -42,7 +42,7 @@

def _stddev(arr):
mu = np.mean(arr)
return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
return math.sqrt(sum((x - mu) ** 2 for x in arr) / (len(arr) - 1))


def mean_stderr(arr):
2 changes: 1 addition & 1 deletion src/lighteval/metrics/utils/math_comparison.py
@@ -374,7 +374,7 @@ def are_flipped_inequalities_equal(a: Relational, b: Relational) -> bool:

# Same type of relation (e.g. both <= or both >=)
try:
if type(gold) == type(pred) and sympy_expr_eq(gold.lhs - gold.rhs, pred.lhs - pred.rhs, precision): # type: ignore
if type(gold) is type(pred) and sympy_expr_eq(gold.lhs - gold.rhs, pred.lhs - pred.rhs, precision): # type: ignore
return True
except TimeoutError:
raise
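The `type(gold) == type(pred)` → `type(gold) is type(pred)` change is pycodestyle's E721 (surfaced by the newer ruff): class objects should be compared by identity, and `is` also makes explicit that subclasses are intentionally not accepted, unlike `isinstance`. A toy example with hypothetical stand-in classes:

```python
class Relation:            # hypothetical stand-ins, not sympy's real classes
    pass


class LessThan(Relation):
    pass


gold, pred = LessThan(), LessThan()

assert type(gold) is type(pred)        # identity comparison, as E721 prefers
assert isinstance(gold, Relation)      # isinstance would also accept the parent class
assert type(gold) is not Relation      # the strict type check does not
```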
6 changes: 3 additions & 3 deletions src/lighteval/models/endpoints/inference_providers_model.py
@@ -157,9 +157,9 @@ async def __call_api_parallel(
results = []

num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples
assert len(prompts) == len(
num_sampless
), f"Length of prompts and max_new_tokenss should be the same but are {len(prompts)}, {len(num_sampless)}"
assert len(prompts) == len(num_sampless), (
f"Length of prompts and max_new_tokenss should be the same but are {len(prompts)}, {len(num_sampless)}"
)

async def bounded_api_call(prompt, num_samples):
async with self.semaphore:
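The assert changes in this file and in the other model backends below (openai, litellm, nanotron) are pure output of the newer ruff formatter: it now keeps the condition on one line and parenthesizes the long message, instead of splitting the condition across lines. Behaviorally nothing changes; roughly, with illustrative variables:

```python
prompts = ["prompt a", "prompt b"]      # illustrative
num_sampless = [1, 1]

# Old formatter layout (condition wrapped, message trailing):
# assert len(prompts) == len(
#     num_sampless
# ), f"Lengths differ: {len(prompts)}, {len(num_sampless)}"

# New formatter layout (condition intact, message parenthesized):
assert len(prompts) == len(num_sampless), (
    f"Lengths differ: {len(prompts)}, {len(num_sampless)}"
)
```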
14 changes: 7 additions & 7 deletions src/lighteval/models/endpoints/openai_model.py
@@ -147,9 +147,9 @@ def __call_api_parallel(
num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples
logit_biass = [logit_bias for _ in prompts] if logit_bias is None else logit_bias

assert (
len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(logit_biass)
), "Length of prompts, return_logitss, max_new_tokenss, num_sampless, logit_biass should be same"
assert len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(logit_biass), (
"Length of prompts, return_logitss, max_new_tokenss, num_sampless, logit_biass should be same"
)

with ThreadPoolExecutor(self.CONCURENT_CALLS) as executor:
for entry in tqdm(
@@ -255,11 +255,11 @@ def _loglikelihood_tokens(
inputs = [sample.context for sample in split]
max_new_tokens = [len(sample.tokenized_continuation) for sample in split]

assert all(
new_tokens == 1 for new_tokens in max_new_tokens
), "Only single token continuations are supported when using openai API."
assert all(new_tokens == 1 for new_tokens in max_new_tokens), (
"Only single token continuations are supported when using openai API."
)

logit_biases = [{tok: 100 for tok in sample.tokenized_continuation} for sample in split]
logit_biases = [dict.fromkeys(sample.tokenized_continuation, 100) for sample in split]

outputs = self.__call_api_parallel(
inputs, return_logits=True, max_new_tokens=max_new_tokens, num_samples=1, logit_bias=logit_biases
4 changes: 3 additions & 1 deletion src/lighteval/models/litellm_model.py
@@ -185,7 +185,7 @@ def __call_api_parallel(
stop_sequencess = [stop_sequence for _ in prompts]
assert (
len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(stop_sequencess)
), f"Length of prompts, return_logitss, max_new_tokenss, num_sampless, stop_sequences, system_prompts should be the same but are {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, {len(num_sampless)}, {len(stop_sequencess)}"
), (
f"Length of prompts, return_logitss, max_new_tokenss, num_sampless, stop_sequences, system_prompts should be the same but are {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, {len(num_sampless)}, {len(stop_sequencess)}"
)

with ThreadPoolExecutor(self.CONCURENT_CALLS) as executor:
for entry in tqdm(
24 changes: 12 additions & 12 deletions src/lighteval/models/nanotron/nanotron_model.py
@@ -486,9 +486,9 @@ def prepare_batch(
We truncate to keep only at most `max_context` tokens
We pad to `padding_length` tokens
"""
assert (
full_attention_masks is False
), "full_attention_masks=True means we would be doing attention of padding tokens, which would affect negatively the results."
assert full_attention_masks is False, (
"full_attention_masks=True means we would be doing attention of padding tokens, which would affect negatively the results."
)
assert pad_on_left is False, "pad_on_left=True not supported yet, see TODOs below"
current_pp_rank = dist.get_rank(self.parallel_context.pp_pg)

@@ -505,9 +505,9 @@
if max_context is None:
max_context = self.max_length

assert (
self.parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE
), "No reason to have tp_mode==REDUCE_SCATTER when doing inference"
assert self.parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE, (
"No reason to have tp_mode==REDUCE_SCATTER when doing inference"
)
# if max_context % self.parallel_config.tp != 0:
# # We need to round up to the next multiple of self.parallel_config.tp
# if (max_context + (self.parallel_config.tp - max_context % self.parallel_config.tp)) < self.max_length:
@@ -860,9 +860,9 @@ def _loglikelihood_single_token(
# print(f"i {i} padded: {r.padded}")

if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
assert (
len(res) == total_length
), f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
assert len(res) == total_length, (
f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
)

if len(res) == 0:
# We are in a process which return no output (beginning/middle of the PP group)
@@ -1338,9 +1338,9 @@ def greedy_until(
res = res[: len(res) - to_remove_at_the_end]

if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
assert (
len(res) == total_length
), f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
assert len(res) == total_length, (
f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
)

if len(res) == 0:
# We are in a process which return no output (beginning/middle of the PP group)
4 changes: 3 additions & 1 deletion src/lighteval/models/vllm/vllm_model.py
@@ -80,7 +80,9 @@ class VLLMModelConfig(ModelConfig):
data_parallel_size: PositiveInt = 1 # how many GPUs to use for data parallelism
pipeline_parallel_size: PositiveInt = 1 # how many GPUs to use for pipeline parallelism
gpu_memory_utilization: NonNegativeFloat = 0.9 # lower this if you are running out of memory
max_model_length: PositiveInt | None = None # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough
max_model_length: PositiveInt | None = (
None # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough
)
quantization: str | None = None
load_format: str | None = None
swap_space: PositiveInt = 4 # CPU swap space size (GiB) per GPU.
2 changes: 1 addition & 1 deletion src/lighteval/pipeline.py
@@ -383,7 +383,7 @@ def tensor_replacer(match):
try:
return ast.literal_eval(processed)
except Exception as e:
raise ValueError(f"Failed to parse after preprocessing. " f"Processed string:\n{processed}\n\nError: {e}")
raise ValueError(f"Failed to parse after preprocessing. Processed string:\n{processed}\n\nError: {e}")

def _load_responses_from_details(self):
logger.info("--- LOADING RESPONSES FROM DETAILS ---")
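The pipeline.py change just merges two adjacent f-string literals into one; implicit concatenation of string literals happens at compile time, so the message is byte-for-byte the same, and newer ruff (the formatter, or the ISC rules if enabled) prefers the single literal when it fits on one line. Illustrated with placeholder values:

```python
processed = "{'key': 1}"                # illustrative
e = ValueError("malformed literal")     # illustrative

msg_old = f"Failed to parse after preprocessing. " f"Processed string:\n{processed}\n\nError: {e}"
msg_new = f"Failed to parse after preprocessing. Processed string:\n{processed}\n\nError: {e}"

assert msg_old == msg_new
```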
20 changes: 8 additions & 12 deletions src/lighteval/tasks/default_prompts.py
@@ -826,7 +826,7 @@ def ethics_commonsense(line, task_name: str = None):
def ethics_deontology(line, task_name: str = None):
return Doc(
task_name=task_name,
query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']} {line['excuse']}\"\nAnswer:",
query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]} {line["excuse"]}"\nAnswer:',
choices=[" unreasonable", " reasonable"],
gold_index=int(line["label"]),
instruction="Question: Would most people believe this reasonable or unreasonable to say?",
@@ -836,7 +836,7 @@ def ethics_deontology(line, task_name: str = None):
def ethics_justice(line, task_name: str = None):
return Doc(
task_name=task_name,
query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']}\"\nAnswer:",
query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]}"\nAnswer:',
choices=[" unreasonable", " reasonable"],
gold_index=int(line["label"]),
instruction="Question: Would most people believe this reasonable or unreasonable to say?",
@@ -859,7 +859,7 @@ def ethics_utilitarianism(line, task_name: str = None):
def ethics_virtue(line, task_name: str = None):
return Doc(
task_name=task_name,
query=f"Sentence: {line['scenario']}\nQuestion: Does the character in this sentence exhibit the trait \"{line['trait']}\"?\nAnswer:",
query=f'Sentence: {line["scenario"]}\nQuestion: Does the character in this sentence exhibit the trait "{line["trait"]}"?\nAnswer:',
choices=[" no", " yes"],
gold_index=int(line["label"]),
)
@@ -1236,33 +1236,29 @@ def lextreme_covid19_emergency_event(line, task_name: str = None):

def lextreme_multi_eurlex_level_1(line, task_name: str = None):
instruction = (
"In this task, you are given a document from an EU law. "
"Predict the level 1 concept in the EUROVOC taxonomy."
"In this task, you are given a document from an EU law. Predict the level 1 concept in the EUROVOC taxonomy."
)
return lextreme(line, instruction, task_name)


def lextreme_multi_eurlex_level_2(line, task_name: str = None):
instruction = (
"In this task, you are given a document from an EU law. "
"Predict the level 2 concept in the EUROVOC taxonomy."
"In this task, you are given a document from an EU law. Predict the level 2 concept in the EUROVOC taxonomy."
)
return lextreme(line, instruction, task_name)


def lextreme_multi_eurlex_level_3(line, task_name: str = None):
instruction = (
"In this task, you are given a document from an EU law. "
"Predict the level 3 concept in the EUROVOC taxonomy."
"In this task, you are given a document from an EU law. Predict the level 3 concept in the EUROVOC taxonomy."
)

return lextreme(line, instruction, task_name)


def lextreme_greek_legal_ner(line, task_name: str = None):
instruction = (
"In this task, you are given a sentence from Greek legislation. "
"Predict the named entity type for each token."
"In this task, you are given a sentence from Greek legislation. Predict the named entity type for each token."
)
return lextreme(line, instruction, task_name)

@@ -1313,7 +1309,7 @@ def legal_summarization(line, task_name: str = None):
def mgsm(line, question_key, answer_key, task_name: str = None):
if line["answer"] is not None:
query = f"{line['question']}\n{answer_key}"
gold = f" {line['answer'][len(answer_key) + 1:]}"
gold = f" {line['answer'][len(answer_key) + 1 :]}"
else:
query = f"{question_key} {line['question']}\n{answer_key}"
gold = f" {str(line['answer_number'])}"
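The remaining churn in this file is also formatter-driven: the newer ruff format rewrites f-strings so the outer quote can differ from the quotes used inside replacement fields (dropping the escaped `\"` sequences), and it spaces slices whose bounds are expressions per PEP 8 (`[len(answer_key) + 1 :]`). The strings themselves are unchanged; a hedged before/after check with a made-up line dict:

```python
line = {"scenario": "I told my friend the truth."}  # made-up example row

# Before: double-quoted f-string with escaped inner quotes.
old = f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']}\"\nAnswer:"
# After: single-quoted f-string, double quotes usable directly inside it.
new = f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]}"\nAnswer:'

assert old == new
```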
2 changes: 1 addition & 1 deletion src/lighteval/tasks/extended/hle/main.py
@@ -208,7 +208,7 @@ def hle_text_only(line, task_name: str = None):

hle_metrics = CorpusLevelMetricGrouping(
metric_name=["accuracy", "confidence_half_width", "calibration_error"],
higher_is_better={n: True for n in ["accuracy", "confidence_half_width", "calibration_error"]},
higher_is_better=dict.fromkeys(["accuracy", "confidence_half_width", "calibration_error"], True),
category=MetricCategory.LLM_AS_JUDGE,
use_case=MetricUseCase.ACCURACY,
sample_level_fn=JudgeLLMHLE().compute,