
Commit c9c19e1

Bump ruff version (#774)
* Bump ruff version
* Bump ruff version
1 parent 034c23b commit c9c19e1

26 files changed, +84 -78 lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ repos:
 
   - repo: https://github.com/charliermarsh/ruff-pre-commit
     # Ruff version.
-    rev: 'v0.2.2'
+    rev: 'v0.11.10'
     hooks:
       - id: ruff
         args: ['--fix']

community_tasks/arabic_evals.py

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@
 
 This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
+
 import random
 import re
 from typing import Any, Dict, List, Optional, Union
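The only change here is a blank line inserted between the module docstring and the first import, which newer ruff formatter releases appear to enforce. A minimal sketch of the resulting layout (the docstring text is illustrative, not the file's actual docstring):

```python
"""Custom community tasks for LightEval (illustrative docstring)."""

# Newer ruff formatter releases separate the module docstring from the first
# import with exactly one blank line, as in the diff above.
import random
import re
from typing import Any, Dict, List, Optional, Union
```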

examples/nanotron/custom_evaluation_tasks.py

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@
 
 This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 """
+
 import re
 from dataclasses import asdict
 from typing import Dict, List, Tuple

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -95,7 +95,7 @@ nanotron = [
 ]
 tensorboardX = ["tensorboardX"]
 vllm = ["vllm>=0.7.0", "ray", "more_itertools"]
-quality = ["ruff==v0.2.2","pre-commit"]
+quality = ["ruff>=v0.11.0","pre-commit"]
 tests = ["pytest==7.4.0","deepdiff"]
 dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
 docs = ["hf-doc-builder", "watchdog"]

src/lighteval/logging/evaluation_tracker.py

Lines changed: 1 addition & 1 deletion
@@ -603,7 +603,7 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901
                 f"To load the details from a run, you can for instance do the following:\n"
                 f'```python\nfrom datasets import load_dataset\ndata = load_dataset("{repo_id}",\n\t"{sanitized_task}",\n\tsplit="train")\n```\n\n'
                 f"## Latest results\n\n"
-                f'These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace("/resolve/", "/blob/")})'
+                f"These are the [latest results from run {max_last_eval_date_results}]({last_results_file_path.replace('/resolve/', '/blob/')})"
                 f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
                 f'You find each in the results and the "latest" split for each eval):\n\n'
                 f"```python\n{results_string}\n```",
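The quoting flip above reflects ruff's preference for double-quoted strings: with the outer f-string quotes switched to double, the `.replace()` arguments inside the braces switch to single quotes, so nothing needs escaping and the rendered text is unchanged. A small sketch of the equivalence, using a made-up path value:

```python
# Hypothetical path value, just to show the two quoting styles render identically.
last_results_file_path = "org/model/resolve/main/results_2024.json"

old_style = f'[latest results]({last_results_file_path.replace("/resolve/", "/blob/")})'
new_style = f"[latest results]({last_results_file_path.replace('/resolve/', '/blob/')})"

assert old_style == new_style
```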

src/lighteval/logging/info_loggers.py

Lines changed: 1 addition & 1 deletion
@@ -556,7 +556,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
         if len(list_of_subtasks) > 1:
             metrics = list(self.metric_aggregated[list_of_subtasks[0]].keys())
             self.metric_aggregated[average_task] = {
-                metric: sum([self.metric_aggregated[k][metric] for k in list_of_subtasks]) / len(list_of_subtasks)
+                metric: sum(self.metric_aggregated[k][metric] for k in list_of_subtasks) / len(list_of_subtasks)
                 for metric in metrics
             }
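Dropping the square brackets hands `sum()` a generator expression instead of building an intermediate list first; ruff's comprehension rules flag the list form as unnecessary. A short sketch with made-up metric values showing the two spellings agree:

```python
metric_aggregated = {"task_a": {"acc": 0.5}, "task_b": {"acc": 1.0}}  # illustrative values
list_of_subtasks = list(metric_aggregated)

with_list = sum([metric_aggregated[k]["acc"] for k in list_of_subtasks]) / len(list_of_subtasks)
with_genexp = sum(metric_aggregated[k]["acc"] for k in list_of_subtasks) / len(list_of_subtasks)

assert with_list == with_genexp == 0.75
```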

src/lighteval/metrics/imports/bert_scorer.py

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@
 # SOFTWARE.
 
 """Simplified version of the BertScorer lib - we only import what we need."""
+
 import logging
 import os
 import time

src/lighteval/metrics/llm_as_judge.py

Lines changed: 1 addition & 1 deletion
@@ -127,7 +127,7 @@ def __init__(
         if self.backend == "inference-providers" and self.hf_provider is None:
             raise ValueError("When using 'inference-providers' as backend, you must specify an 'hf_provider'")
 
-    def __lazy_load_client(self):
+    def __lazy_load_client(self):  # noqa: C901
         match self.backend:
             # Both "openai" and "tgi" backends use the OpenAI-compatible API
             # They are handled separately to allow for backend-specific validation and setup
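The added `# noqa: C901` suppresses ruff's McCabe complexity check (rule C901) for this one method instead of refactoring it; the newer ruff presumably started flagging `__lazy_load_client` once the rule set changed. A generic sketch of the suppression pattern on a hypothetical dispatcher:

```python
def load_client(backend: str):  # noqa: C901
    # Hypothetical example: enough branches to trip the complexity limit,
    # silenced for this single function by the trailing noqa comment.
    if backend == "openai":
        return "openai-client"
    elif backend == "tgi":
        return "tgi-client"
    elif backend == "litellm":
        return "litellm-client"
    elif backend == "inference-providers":
        return "inference-providers-client"
    raise ValueError(f"Unknown backend: {backend}")
```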

src/lighteval/metrics/metrics.py

Lines changed: 8 additions & 8 deletions
@@ -624,16 +624,16 @@ class Metrics(Enum):
         sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_8_16 = SampleLevelMetricGrouping(
         metric_name="G-Pass@8-16:48_samples",
         sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_16_expr_gold = SampleLevelMetricGrouping(
         metric_name="G-Pass@16:48_samples",
@@ -653,8 +653,8 @@
         ).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     g_pass_at_16_latex_gold = SampleLevelMetricGrouping(
         metric_name="G-Pass@16:48_samples",
@@ -674,8 +674,8 @@
         ).compute,
         category=MetricCategory.GENERATIVE_SAMPLING,
         use_case=MetricUseCase.REASONING,
-        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
-        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        corpus_level_fn=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, np.mean),
+        higher_is_better=dict.fromkeys(GPassAtK(k=16, n=48, strip_strings=True).all_metrics, True),
     )
     perfect_exact_match = SampleLevelMetric(
         metric_name="perfect_em",
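`dict.fromkeys(iterable, value)` builds the same mapping as a comprehension that assigns one constant value to every key, which is the rewrite ruff's newer comprehension rules suggest here. A quick equivalence check with illustrative metric names:

```python
import numpy as np

all_metrics = ["G-Pass@16", "mG-Pass@16"]  # illustrative metric names

comprehension = {metric: np.mean for metric in all_metrics}
fromkeys = dict.fromkeys(all_metrics, np.mean)

# Note: fromkeys reuses the single value object for every key, which is
# harmless here because the values (a function, or True) are shared anyway.
assert comprehension == fromkeys
```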

src/lighteval/metrics/metrics_corpus.py

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@
 Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus.
 A number of these aggregations come from the EleutherAIHarness
 """
+
 import logging
 import math
 from typing import Literal

src/lighteval/metrics/stderr.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@
 
 def _stddev(arr):
     mu = np.mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
+    return math.sqrt(sum((x - mu) ** 2 for x in arr) / (len(arr) - 1))
 
 
 def mean_stderr(arr):
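Behaviour is unchanged: `_stddev` still computes the Bessel-corrected sample standard deviation (dividing by n - 1); only the throwaway list inside `sum()` is gone. A sanity check against numpy, assuming numpy is installed:

```python
import math

import numpy as np


def _stddev(arr):
    mu = np.mean(arr)
    return math.sqrt(sum((x - mu) ** 2 for x in arr) / (len(arr) - 1))


arr = [1.0, 2.0, 3.0, 4.0]
assert math.isclose(_stddev(arr), np.std(arr, ddof=1))  # ddof=1 gives the sample estimator
```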

src/lighteval/metrics/utils/math_comparison.py

Lines changed: 1 addition & 1 deletion
@@ -374,7 +374,7 @@ def are_flipped_inequalities_equal(a: Relational, b: Relational) -> bool:
 
     # Same type of relation (e.g. both <= or both >=)
     try:
-        if type(gold) == type(pred) and sympy_expr_eq(gold.lhs - gold.rhs, pred.lhs - pred.rhs, precision):  # type: ignore
+        if type(gold) is type(pred) and sympy_expr_eq(gold.lhs - gold.rhs, pred.lhs - pred.rhs, precision):  # type: ignore
             return True
     except TimeoutError:
         raise
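Ruff's E721 rule prefers identity checks when comparing type objects: `type(gold) is type(pred)` asks whether both objects are instances of exactly the same class, without going through `==`. A tiny illustration with built-in types standing in for the sympy relations:

```python
gold, pred = 3, 4
assert type(gold) is type(pred)  # both are int, the very same class object

gold, pred = 3, True
assert type(gold) is not type(pred)  # bool subclasses int, but int and bool are distinct classes
```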

src/lighteval/models/endpoints/inference_providers_model.py

Lines changed: 3 additions & 3 deletions
@@ -157,9 +157,9 @@ async def __call_api_parallel(
         results = []
 
         num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples
-        assert len(prompts) == len(
-            num_sampless
-        ), f"Length of prompts and max_new_tokenss should be the same but are {len(prompts)}, {len(num_sampless)}"
+        assert len(prompts) == len(num_sampless), (
+            f"Length of prompts and max_new_tokenss should be the same but are {len(prompts)}, {len(num_sampless)}"
+        )
 
         async def bounded_api_call(prompt, num_samples):
             async with self.semaphore:
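This reflow (repeated in the openai, litellm and nanotron files below) is the newer ruff formatter's layout for long asserts: keep the condition on one line and wrap the message in its own parenthesized block, rather than splitting the condition across lines. A self-contained sketch of the pattern with illustrative lists:

```python
prompts = ["p1", "p2"]  # illustrative inputs
num_sampless = [1, 1]

# Condition stays on one line; only the long message is parenthesized and indented.
assert len(prompts) == len(num_sampless), (
    f"Length of prompts and num_sampless should be the same but are {len(prompts)}, {len(num_sampless)}"
)
```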

src/lighteval/models/endpoints/openai_model.py

Lines changed: 7 additions & 7 deletions
@@ -147,9 +147,9 @@ def __call_api_parallel(
         num_sampless = [num_samples for _ in prompts] if not isinstance(num_samples, list) else num_samples
         logit_biass = [logit_bias for _ in prompts] if logit_bias is None else logit_bias
 
-        assert (
-            len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(logit_biass)
-        ), "Length of prompts, return_logitss, max_new_tokenss, num_sampless, logit_biass should be same"
+        assert len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(logit_biass), (
+            "Length of prompts, return_logitss, max_new_tokenss, num_sampless, logit_biass should be same"
+        )
 
         with ThreadPoolExecutor(self.CONCURENT_CALLS) as executor:
             for entry in tqdm(
@@ -255,11 +255,11 @@ def _loglikelihood_tokens(
         inputs = [sample.context for sample in split]
         max_new_tokens = [len(sample.tokenized_continuation) for sample in split]
 
-        assert all(
-            new_tokens == 1 for new_tokens in max_new_tokens
-        ), "Only single token continuations are supported when using openai API."
+        assert all(new_tokens == 1 for new_tokens in max_new_tokens), (
+            "Only single token continuations are supported when using openai API."
+        )
 
-        logit_biases = [{tok: 100 for tok in sample.tokenized_continuation} for sample in split]
+        logit_biases = [dict.fromkeys(sample.tokenized_continuation, 100) for sample in split]
 
         outputs = self.__call_api_parallel(
             inputs, return_logits=True, max_new_tokens=max_new_tokens, num_samples=1, logit_bias=logit_biases

src/lighteval/models/litellm_model.py

Lines changed: 3 additions & 1 deletion
@@ -185,7 +185,9 @@ def __call_api_parallel(
         stop_sequencess = [stop_sequence for _ in prompts]
         assert (
             len(prompts) == len(return_logitss) == len(max_new_tokenss) == len(num_sampless) == len(stop_sequencess)
-        ), f"Length of prompts, return_logitss, max_new_tokenss, num_sampless, stop_sequences, system_prompts should be the same but are {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, {len(num_sampless)}, {len(stop_sequencess)}"
+        ), (
+            f"Length of prompts, return_logitss, max_new_tokenss, num_sampless, stop_sequences, system_prompts should be the same but are {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, {len(num_sampless)}, {len(stop_sequencess)}"
+        )
 
         with ThreadPoolExecutor(self.CONCURENT_CALLS) as executor:
             for entry in tqdm(

src/lighteval/models/nanotron/nanotron_model.py

Lines changed: 12 additions & 12 deletions
@@ -486,9 +486,9 @@ def prepare_batch(
         We truncate to keep only at most `max_context` tokens
         We pad to `padding_length` tokens
         """
-        assert (
-            full_attention_masks is False
-        ), "full_attention_masks=True means we would be doing attention of padding tokens, which would affect negatively the results."
+        assert full_attention_masks is False, (
+            "full_attention_masks=True means we would be doing attention of padding tokens, which would affect negatively the results."
+        )
         assert pad_on_left is False, "pad_on_left=True not supported yet, see TODOs below"
         current_pp_rank = dist.get_rank(self.parallel_context.pp_pg)
 
@@ -505,9 +505,9 @@ def prepare_batch(
         if max_context is None:
             max_context = self.max_length
 
-        assert (
-            self.parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE
-        ), "No reason to have tp_mode==REDUCE_SCATTER when doing inference"
+        assert self.parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE, (
+            "No reason to have tp_mode==REDUCE_SCATTER when doing inference"
+        )
         # if max_context % self.parallel_config.tp != 0:
         #     # We need to round up to the next multiple of self.parallel_config.tp
         #     if (max_context + (self.parallel_config.tp - max_context % self.parallel_config.tp)) < self.max_length:
@@ -860,9 +860,9 @@ def _loglikelihood_single_token(
             # print(f"i {i} padded: {r.padded}")
 
         if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
-            assert (
-                len(res) == total_length
-            ), f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            assert len(res) == total_length, (
+                f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            )
 
         if len(res) == 0:
             # We are in a process which return no output (beginning/middle of the PP group)
@@ -1338,9 +1338,9 @@ def greedy_until(
             res = res[: len(res) - to_remove_at_the_end]
 
         if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
-            assert (
-                len(res) == total_length
-            ), f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            assert len(res) == total_length, (
+                f"we didn't cover all the data: len(res) == total_length ({len(res)} == {total_length})"
+            )
 
         if len(res) == 0:
             # We are in a process which return no output (beginning/middle of the PP group)

src/lighteval/models/vllm/vllm_model.py

Lines changed: 3 additions & 1 deletion
@@ -80,7 +80,9 @@ class VLLMModelConfig(ModelConfig):
     data_parallel_size: PositiveInt = 1  # how many GPUs to use for data parallelism
     pipeline_parallel_size: PositiveInt = 1  # how many GPUs to use for pipeline parallelism
     gpu_memory_utilization: NonNegativeFloat = 0.9  # lower this if you are running out of memory
-    max_model_length: PositiveInt | None = None  # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough
+    max_model_length: PositiveInt | None = (
+        None  # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough
+    )
     quantization: str | None = None
     load_format: str | None = None
     swap_space: PositiveInt = 4  # CPU swap space size (GiB) per GPU.

src/lighteval/pipeline.py

Lines changed: 1 addition & 1 deletion
@@ -383,7 +383,7 @@ def tensor_replacer(match):
         try:
             return ast.literal_eval(processed)
         except Exception as e:
-            raise ValueError(f"Failed to parse after preprocessing. " f"Processed string:\n{processed}\n\nError: {e}")
+            raise ValueError(f"Failed to parse after preprocessing. Processed string:\n{processed}\n\nError: {e}")
 
     def _load_responses_from_details(self):
         logger.info("--- LOADING RESPONSES FROM DETAILS ---")
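The two adjacent f-string literals were being concatenated implicitly at compile time; merging them into one literal is what ruff's implicit-string-concatenation (ISC) checks push for when the result still fits on a line. A short sketch showing the merged message is identical, with made-up values:

```python
processed = "[1, 2, 3]"          # illustrative preprocessed string
err = ValueError("bad literal")  # illustrative error

split = f"Failed to parse after preprocessing. " f"Processed string:\n{processed}\n\nError: {err}"
merged = f"Failed to parse after preprocessing. Processed string:\n{processed}\n\nError: {err}"

assert split == merged
```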

src/lighteval/tasks/default_prompts.py

Lines changed: 8 additions & 12 deletions
@@ -826,7 +826,7 @@ def ethics_commonsense(line, task_name: str = None):
 def ethics_deontology(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']} {line['excuse']}\"\nAnswer:",
+        query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]} {line["excuse"]}"\nAnswer:',
         choices=[" unreasonable", " reasonable"],
         gold_index=int(line["label"]),
         instruction="Question: Would most people believe this reasonable or unreasonable to say?",
@@ -836,7 +836,7 @@ def ethics_deontology(line, task_name: str = None):
 def ethics_justice(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"Question: Would most people believe this reasonable or unreasonable to say? \"{line['scenario']}\"\nAnswer:",
+        query=f'Question: Would most people believe this reasonable or unreasonable to say? "{line["scenario"]}"\nAnswer:',
         choices=[" unreasonable", " reasonable"],
         gold_index=int(line["label"]),
         instruction="Question: Would most people believe this reasonable or unreasonable to say?",
@@ -859,7 +859,7 @@ def ethics_utilitarianism(line, task_name: str = None):
 def ethics_virtue(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"Sentence: {line['scenario']}\nQuestion: Does the character in this sentence exhibit the trait \"{line['trait']}\"?\nAnswer:",
+        query=f'Sentence: {line["scenario"]}\nQuestion: Does the character in this sentence exhibit the trait "{line["trait"]}"?\nAnswer:',
         choices=[" no", " yes"],
         gold_index=int(line["label"]),
     )
@@ -1236,33 +1236,29 @@ def lextreme_covid19_emergency_event(line, task_name: str = None):
 
 def lextreme_multi_eurlex_level_1(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a document from an EU law. "
-        "Predict the level 1 concept in the EUROVOC taxonomy."
+        "In this task, you are given a document from an EU law. Predict the level 1 concept in the EUROVOC taxonomy."
     )
     return lextreme(line, instruction, task_name)
 
 
 def lextreme_multi_eurlex_level_2(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a document from an EU law. "
-        "Predict the level 2 concept in the EUROVOC taxonomy."
+        "In this task, you are given a document from an EU law. Predict the level 2 concept in the EUROVOC taxonomy."
    )
     return lextreme(line, instruction, task_name)
 
 
 def lextreme_multi_eurlex_level_3(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a document from an EU law. "
-        "Predict the level 3 concept in the EUROVOC taxonomy."
+        "In this task, you are given a document from an EU law. Predict the level 3 concept in the EUROVOC taxonomy."
     )
 
     return lextreme(line, instruction, task_name)
 
 
 def lextreme_greek_legal_ner(line, task_name: str = None):
     instruction = (
-        "In this task, you are given a sentence from Greek legislation. "
-        "Predict the named entity type for each token."
+        "In this task, you are given a sentence from Greek legislation. Predict the named entity type for each token."
     )
     return lextreme(line, instruction, task_name)
 
@@ -1313,7 +1309,7 @@ def legal_summarization(line, task_name: str = None):
 def mgsm(line, question_key, answer_key, task_name: str = None):
     if line["answer"] is not None:
         query = f"{line['question']}\n{answer_key}"
-        gold = f" {line['answer'][len(answer_key) + 1:]}"
+        gold = f" {line['answer'][len(answer_key) + 1 :]}"
     else:
         query = f"{question_key} {line['question']}\n{answer_key}"
         gold = f" {str(line['answer_number'])}"
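Two mechanical rewrites recur in this file: f-strings flip to double outer quotes (with lookups inside the braces switching to double quotes, and literal quotes no longer needing escapes), and slices whose lower bound is an expression gain a space on each side of the colon, following PEP 8's advice to treat the slice colon like a binary operator with equal spacing. A minimal sketch of the slice spacing with an illustrative answer prefix:

```python
answer_key = "Answer:"  # illustrative prefix
answer = "Answer: 42"

# Identical slices; only the whitespace around the colon differs.
assert answer[len(answer_key) + 1:] == answer[len(answer_key) + 1 :] == "42"
```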

src/lighteval/tasks/extended/hle/main.py

Lines changed: 1 addition & 1 deletion
@@ -208,7 +208,7 @@ def hle_text_only(line, task_name: str = None):
 
 hle_metrics = CorpusLevelMetricGrouping(
     metric_name=["accuracy", "confidence_half_width", "calibration_error"],
-    higher_is_better={n: True for n in ["accuracy", "confidence_half_width", "calibration_error"]},
+    higher_is_better=dict.fromkeys(["accuracy", "confidence_half_width", "calibration_error"], True),
     category=MetricCategory.LLM_AS_JUDGE,
     use_case=MetricUseCase.ACCURACY,
     sample_level_fn=JudgeLLMHLE().compute,
