Skip to content

Commit 3db87e5

Browse files
committed
some further cleanup
1 parent 7ff247f commit 3db87e5

File tree

10 files changed

+110
-61
lines changed

10 files changed

+110
-61
lines changed

mlscorecheck/check/binary/_check_1_dataset_unknown_folds_mos.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ def check_1_dataset_unknown_folds_mos(
4545
folding: dict,
4646
scores: dict,
4747
eps,
48-
fold_score_bounds: dict | None = None,
4948
*,
49+
score_bounds: dict | None = None,
5050
solver_name: str | None = None,
5151
timeout: int | None = None,
5252
verbosity: int = 1,

mlscorecheck/check/binary/_check_n_datasets_mos_unknown_folds_mos.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ def check_n_datasets_mos_unknown_folds_mos(
4848
evaluations: list,
4949
scores: dict,
5050
eps,
51-
dataset_score_bounds: dict | None = None,
5251
*,
52+
score_bounds: dict | None = None,
5353
solver_name: str | None = None,
5454
timeout: int | None = None,
5555
verbosity: int = 1,

mlscorecheck/check/bundles/retina/_chasedb1.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
]
2020

2121

22-
def _filter_chasedb1(data, imageset, annotator):
22+
def _filter_chasedb1(data: dict, imageset, annotator: str) -> list:
2323
"""
2424
Filters the CHASEDB1 dataset
2525
@@ -107,8 +107,8 @@ def check_chasedb1_vessel_aggregated_mos(
107107

108108

109109
def check_chasedb1_vessel_aggregated_som(
110-
imageset, annotator, scores, eps, numerical_tolerance=NUMERICAL_TOLERANCE
111-
):
110+
imageset, annotator: str, scores: dict, eps, numerical_tolerance=NUMERICAL_TOLERANCE
111+
) -> dict:
112112
"""
113113
Tests the consistency of scores calculated on the CHASEDB1 dataset using
114114
the score-of-means aggregation.
@@ -252,7 +252,7 @@ def check_chasedb1_vessel_image(
252252
eps,
253253
*,
254254
numerical_tolerance: float = NUMERICAL_TOLERANCE,
255-
):
255+
) -> dict:
256256
"""
257257
Testing the scores calculated for one image of the CHASEDB1 dataset
258258

mlscorecheck/check/bundles/retina/_hrf.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
]
2222

2323

24-
def _filter_hrf(data, imageset, assumption):
24+
def _filter_hrf(data: dict, imageset, assumption: str) -> list:
2525
"""
2626
Filters the HRF dataset
2727
@@ -118,7 +118,7 @@ def check_hrf_vessel_aggregated_mos_assumption(
118118

119119
def check_hrf_vessel_aggregated_som_assumption(
120120
imageset, assumption: str, scores: dict, eps, numerical_tolerance=NUMERICAL_TOLERANCE
121-
):
121+
) -> dict:
122122
"""
123123
Tests the consistency of scores calculated on the HRF dataset using
124124
the score-of-means aggregation and an assumption on the region of evaluation.
@@ -183,7 +183,7 @@ def check_hrf_vessel_image_assumption(
183183
eps,
184184
*,
185185
numerical_tolerance: float = NUMERICAL_TOLERANCE,
186-
):
186+
) -> dict:
187187
"""
188188
Testing the scores calculated for one image of the HRF dataset using an
189189
assumption on the region of evaluation.
@@ -242,6 +242,7 @@ def check_hrf_vessel_image_assumption(
242242

243243
def check_hrf_vessel_aggregated(
244244
imageset,
245+
assumption: str,
245246
scores: dict,
246247
eps,
247248
*,
@@ -325,7 +326,7 @@ def check_hrf_vessel_aggregated(
325326

326327
def check_hrf_vessel_image(
327328
image_identifier: str, scores: dict, eps, *, numerical_tolerance: float = NUMERICAL_TOLERANCE
328-
):
329+
) -> dict:
329330
"""
330331
Testing the scores calculated for one image of the HRF dataset with
331332
both assumptions on the region of evaluation ('fov'/'all')

mlscorecheck/check/bundles/retina/_stare.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
]
2020

2121

22-
def _filter_stare(data, imageset, annotator):
22+
def _filter_stare(data: dict, imageset, annotator: str) -> list:
2323
"""
2424
Filters the STARE dataset
2525
@@ -111,8 +111,8 @@ def check_stare_vessel_aggregated_mos(
111111

112112

113113
def check_stare_vessel_aggregated_som(
114-
imageset, annotator, scores, eps, numerical_tolerance=NUMERICAL_TOLERANCE
115-
):
114+
imageset, annotator: str, scores: dict, eps, numerical_tolerance=NUMERICAL_TOLERANCE
115+
) -> dict:
116116
"""
117117
Tests the consistency of scores calculated on the STARE dataset using
118118
the score-of-means aggregation.
@@ -257,7 +257,7 @@ def check_stare_vessel_image(
257257
eps,
258258
*,
259259
numerical_tolerance: float = NUMERICAL_TOLERANCE,
260-
):
260+
) -> dict:
261261
"""
262262
Testing the scores calculated for one image of the STARE dataset
263263

mlscorecheck/check/bundles/skinlesion/_isic2016.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
__all__ = ["check_isic2016"]
1010

1111

12-
def check_isic2016(*, scores: dict, eps: float, numerical_tolerance: float = NUMERICAL_TOLERANCE):
12+
def check_isic2016(*, scores: dict, eps: float, numerical_tolerance: float = NUMERICAL_TOLERANCE) -> dict:
1313
"""
1414
Tests if the scores are consistent with the test set of the ISIC2016
1515
melanoma classification dataset

mlscorecheck/check/bundles/skinlesion/_isic2017.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
__all__ = ["check_isic2017", "_prepare_testset_isic2017"]
1010

1111

12-
def _prepare_testset_isic2017(target, against):
12+
def _prepare_testset_isic2017(target: str | list, against: str | list | None) -> dict:
1313
"""
1414
Preparation of the test set
1515
@@ -25,7 +25,12 @@ def _prepare_testset_isic2017(target, against):
2525
data = get_experiment("skinlesion.isic2017")
2626

2727
target = [target] if isinstance(target, str) else target
28-
against = [against] if isinstance(against, str) else against
28+
29+
if against is None:
30+
all_classes = ['M', 'SK', 'N']
31+
against = [cls for cls in all_classes if cls not in target]
32+
else:
33+
against = [against] if isinstance(against, str) else against
2934

3035
mapping = {"M": "melanoma", "SK": "seborrheic keratosis", "N": "nevus"}
3136

@@ -36,8 +41,13 @@ def _prepare_testset_isic2017(target, against):
3641

3742

3843
def check_isic2017(
39-
*, target, against, scores: dict, eps: float, numerical_tolerance: float = NUMERICAL_TOLERANCE
40-
):
44+
target: str,
45+
scores: dict,
46+
eps,
47+
*,
48+
against: str | None = None,
49+
numerical_tolerance: float = NUMERICAL_TOLERANCE,
50+
) -> dict:
4151
"""
4252
Tests if the scores are consistent with the test set of the ISIC2017
4353
skin lesion classification dataset. The dataset contains three classes,

tests/aggregated/_evaluate_lp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212

1313

1414
def evaluate_timeout(
15-
result: pl.LpProblem, problem: Experiment, scores: dict, eps, score_subset: list
16-
):
15+
result: pl.LpProblem, problem: Experiment, scores: dict, eps, score_subset: list[str]
16+
) -> None:
1717
"""
1818
Evaluate the stopped or succeeded tests
1919

tests/aggregated/test_evaluation.py

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from mlscorecheck.aggregated import (
1111
Evaluation,
12+
Experiment,
1213
compare_scores,
1314
generate_dataset,
1415
generate_evaluation,
@@ -44,7 +45,7 @@
4445
random_seeds = list(range(5))
4546

4647

47-
def test_evaluate_timeout():
48+
def test_evaluate_timeout() -> None:
4849
"""
4950
Testing the evaluate_timeout function
5051
"""
@@ -54,22 +55,28 @@ class Mock: # pylint: disable=too-few-public-methods
5455
Mock lp_problem class
5556
"""
5657

57-
def __init__(self):
58+
def __init__(self) -> None:
5859
"""
5960
Constructor of the mock class
6061
"""
6162
self.status = 0
6263

6364
mock = Mock()
6465

66+
# Create dummy objects for testing - need Experiment, not Evaluation
67+
dummy_evaluation_dict = generate_evaluation(random_state=42)
68+
dummy_experiment = Experiment(evaluations=[dummy_evaluation_dict], aggregation="som")
69+
dummy_scores: dict = {"acc": 0.5}
70+
dummy_subset: list[str] = ["acc"]
71+
6572
with warnings.catch_warnings(record=True) as warn:
66-
evaluate_timeout(mock, None, None, None, None)
73+
evaluate_timeout(mock, dummy_experiment, dummy_scores, 0.1, dummy_subset)
6774
assert len(warn) == 1
6875

6976

7077
@pytest.mark.parametrize("random_seed", random_seeds)
7178
@pytest.mark.parametrize("aggregation", ["mos", "som"])
72-
def test_instantiation(random_seed: int, aggregation: str):
79+
def test_instantiation(random_seed: int, aggregation: str) -> None:
7380
"""
7481
Testing the instantiation of evaluations
7582
@@ -95,7 +102,7 @@ def test_instantiation(random_seed: int, aggregation: str):
95102

96103
@pytest.mark.parametrize("random_seed", random_seeds)
97104
@pytest.mark.parametrize("aggregation", ["mos", "som"])
98-
def test_sample_figures(random_seed: int, aggregation: str):
105+
def test_sample_figures(random_seed: int, aggregation: str) -> None:
99106
"""
100107
Testing the sampling of figures
101108
@@ -119,8 +126,8 @@ def test_sample_figures(random_seed: int, aggregation: str):
119126
@pytest.mark.parametrize("aggregation", ["mos", "som"])
120127
@pytest.mark.parametrize("rounding_decimals", [2, 3, 4])
121128
def test_linear_programming_success(
122-
subset: list, random_seed: int, aggregation: str, rounding_decimals: int
123-
):
129+
subset: list[str], random_seed: int, aggregation: str, rounding_decimals: int
130+
) -> None:
124131
"""
125132
Testing the linear programming functionalities
126133
@@ -163,8 +170,8 @@ def test_linear_programming_success(
163170
@pytest.mark.parametrize("aggregation", ["mos", "som"])
164171
@pytest.mark.parametrize("rounding_decimals", [2, 3, 4])
165172
def test_linear_programming_evaluation_generation_success(
166-
subset: list, random_seed: int, aggregation: str, rounding_decimals: int
167-
):
173+
subset: list[str], random_seed: int, aggregation: str, rounding_decimals: int
174+
) -> None:
168175
"""
169176
Testing the linear programming functionalities by generating the evaluation
170177
@@ -175,9 +182,15 @@ def test_linear_programming_evaluation_generation_success(
175182
rounding_decimals (int): the number of decimals to round to
176183
"""
177184

178-
evaluation = generate_evaluation(random_state=random_seed, aggregation=aggregation)
185+
evaluation_dict = generate_evaluation(random_state=random_seed, aggregation=aggregation)
186+
assert isinstance(evaluation_dict, dict), "generate_evaluation should return dict when return_scores=False"
179187

180-
evaluation = Evaluation(**evaluation)
188+
evaluation = Evaluation(
189+
dataset=evaluation_dict["dataset"],
190+
folding=evaluation_dict["folding"],
191+
aggregation=evaluation_dict["aggregation"],
192+
fold_score_bounds=evaluation_dict.get("fold_score_bounds"),
193+
)
181194

182195
evaluation.sample_figures(random_state=random_seed)
183196

@@ -203,7 +216,7 @@ def test_linear_programming_evaluation_generation_success(
203216
@pytest.mark.parametrize("aggregation", ["mos", "som"])
204217
def test_linear_programming_evaluation_generation_failure(
205218
random_seed: int, aggregation: str
206-
):
219+
) -> None:
207220
"""
208221
Testing the linear programming functionalities by generating the evaluation
209222
@@ -212,9 +225,15 @@ def test_linear_programming_evaluation_generation_failure(
212225
aggregation (str): the aggregation to use ('mos'/'som')
213226
"""
214227

215-
evaluation = generate_evaluation(random_state=random_seed, aggregation=aggregation)
228+
evaluation_dict = generate_evaluation(random_state=random_seed, aggregation=aggregation)
229+
assert isinstance(evaluation_dict, dict), "generate_evaluation should return dict when return_scores=False"
216230

217-
evaluation = Evaluation(**evaluation)
231+
evaluation = Evaluation(
232+
dataset=evaluation_dict["dataset"],
233+
folding=evaluation_dict["folding"],
234+
aggregation=evaluation_dict["aggregation"],
235+
fold_score_bounds=evaluation_dict.get("fold_score_bounds"),
236+
)
218237

219238
evaluation.sample_figures(random_state=random_seed)
220239

@@ -229,7 +248,7 @@ def test_linear_programming_evaluation_generation_failure(
229248

230249
@pytest.mark.parametrize("random_seed", random_seeds)
231250
@pytest.mark.parametrize("aggregation", ["mos", "som"])
232-
def test_get_fold_score_bounds(random_seed: int, aggregation: str):
251+
def test_get_fold_score_bounds(random_seed: int, aggregation: str) -> None:
233252
"""
234253
Testing the extraction of fold score bounds
235254
@@ -238,9 +257,15 @@ def test_get_fold_score_bounds(random_seed: int, aggregation: str):
238257
aggregation (str): the aggregation to use ('mos'/'som')
239258
"""
240259

241-
evaluation = generate_evaluation(random_state=random_seed, aggregation=aggregation)
260+
evaluation_dict = generate_evaluation(random_state=random_seed, aggregation=aggregation)
261+
assert isinstance(evaluation_dict, dict), "generate_evaluation should return dict when return_scores=False"
242262

243-
evaluation = Evaluation(**evaluation)
263+
evaluation = Evaluation(
264+
dataset=evaluation_dict["dataset"],
265+
folding=evaluation_dict["folding"],
266+
aggregation=evaluation_dict["aggregation"],
267+
fold_score_bounds=evaluation_dict.get("fold_score_bounds"),
268+
)
244269
evaluation.sample_figures().calculate_scores()
245270

246271
score_bounds = get_fold_score_bounds(evaluation, feasible=True)
@@ -255,8 +280,8 @@ def test_get_fold_score_bounds(random_seed: int, aggregation: str):
255280
@pytest.mark.parametrize("aggregation", ["mos"])
256281
@pytest.mark.parametrize("rounding_decimals", [3, 4])
257282
def test_linear_programming_success_bounds(
258-
subset: list, random_seed: int, aggregation: str, rounding_decimals: int
259-
):
283+
subset: list[str], random_seed: int, aggregation: str, rounding_decimals: int
284+
) -> None:
260285
"""
261286
Testing the linear programming functionalities by generating the evaluation
262287
with bounds
@@ -287,16 +312,22 @@ def test_linear_programming_success_bounds(
287312

288313
assert lp_program.status in (0, 1)
289314

290-
evaluate_timeout(lp_program, skeleton, scores, 10 ** (-rounding_decimals), subset)
315+
# Direct evaluation instead of evaluate_timeout since we have an Evaluation, not Experiment
316+
if lp_program.status == 1:
317+
populated = skeleton.populate(lp_program)
318+
assert compare_scores(
319+
scores, populated.calculate_scores(), 10 ** (-rounding_decimals), subset
320+
)
321+
assert populated.check_bounds()["bounds_flag"] is True
291322

292323

293324
@pytest.mark.parametrize("subset", two_combs + three_combs + four_combs)
294325
@pytest.mark.parametrize("random_seed", random_seeds)
295326
@pytest.mark.parametrize("aggregation", ["mos"])
296327
@pytest.mark.parametrize("rounding_decimals", [3, 4])
297328
def test_linear_programming_failure_bounds(
298-
subset: list, random_seed: int, aggregation: str, rounding_decimals: int
299-
):
329+
subset: list[str], random_seed: int, aggregation: str, rounding_decimals: int
330+
) -> None:
300331
"""
301332
Testing the linear programming functionalities by generating the evaluation
302333
with bounds
@@ -327,16 +358,23 @@ def test_linear_programming_failure_bounds(
327358

328359
assert lp_program.status in (-1, 0)
329360

330-
evaluate_timeout(lp_program, skeleton, scores, 10 ** (-rounding_decimals), subset)
361+
# Direct evaluation instead of evaluate_timeout since we have an Evaluation, not Experiment
362+
# For infeasible problems, just check the status
331363

332364

333-
def test_others():
365+
def test_others() -> None:
334366
"""
335367
Testing other functionalities
336368
"""
337369

338-
evaluation = generate_evaluation(aggregation="som",
370+
evaluation_dict = generate_evaluation(aggregation="som",
339371
feasible_fold_score_bounds=True,
340372
random_state=5)
373+
assert isinstance(evaluation_dict, dict), "generate_evaluation should return dict when return_scores=False"
341374
with pytest.raises(ValueError):
342-
Evaluation(**evaluation)
375+
Evaluation(
376+
dataset=evaluation_dict["dataset"],
377+
folding=evaluation_dict["folding"],
378+
aggregation=evaluation_dict["aggregation"],
379+
fold_score_bounds=evaluation_dict.get("fold_score_bounds"),
380+
)

0 commit comments

Comments (0)