From 4d785444f6da924217fc6d3380fb880b99d79f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= <2116466+MaxiBoether@users.noreply.github.com> Date: Sat, 22 Jun 2024 15:26:02 +0200 Subject: [PATCH 1/4] Implement `ratio_max` scaling for downsamplers (#541) Before, we only were able to enter ratios from 0-100 as percentages for downsamplers. With this PR, we allow scaling by a ratio_max factor. --- modyn/config/schema/pipeline/config.py | 29 +++++++++++++++++-- .../pipeline/sampling/downsampling_config.py | 21 ++++++++++++-- .../abstract_downsampling_strategy.py | 2 ++ .../test_rho_loss_downsampling_strategy.py | 1 + .../downsampling_strategies/test_scheduler.py | 6 ++++ ...t_abstract_matrix_downsampling_strategy.py | 1 + ...t_abstract_remote_downsampling_strategy.py | 2 +- .../test_craig_remote_downsampling.py | 7 +++-- ..._remote_gradmatch_downsampling_strategy.py | 5 ++-- .../test_remote_gradnorm_downsample.py | 7 +++-- ...st_remote_kcenter_downsampling_strategy.py | 5 ++-- .../test_remote_loss_downsample.py | 8 ++--- .../test_remote_rho_loss_downsampling.py | 1 + .../test_remote_rs2_downsampling.py | 12 ++++---- ...remote_submodular_downsampling_strategy.py | 2 ++ ...emote_uncertainty_downsampling_strategy.py | 1 + .../internal/trainer/test_pytorch_trainer.py | 13 ++++++--- modyn/tests/utils/test_utils.py | 2 +- .../internal/trainer/pytorch_trainer.py | 9 ++++-- .../abstract_matrix_downsampling_strategy.py | 2 +- .../abstract_remote_downsampling_strategy.py | 1 + .../remote_craig_downsampling.py | 2 +- .../remote_gradnorm_downsampling.py | 2 +- .../remote_loss_downsampling.py | 2 +- .../remote_rho_loss_downsampling.py | 2 +- .../remote_rs2_downsampling.py | 2 +- ...emote_uncertainty_downsampling_strategy.py | 2 +- 27 files changed, 109 insertions(+), 40 deletions(-) diff --git a/modyn/config/schema/pipeline/config.py b/modyn/config/schema/pipeline/config.py index 4a1a95851..f7031224f 100644 --- a/modyn/config/schema/pipeline/config.py +++ b/modyn/config/schema/pipeline/config.py @@ -1,15 +1,16 @@ from __future__ import annotations -from typing import Optional +from typing import Optional, Self from modyn.config.schema.base_model import ModynBaseModel -from pydantic import Field +from pydantic import Field, model_validator from .data import DataConfig from .evaluation.config import EvaluationConfig from .model import ModelConfig from .model_storage import PipelineModelStorageConfig -from .sampling.config import SelectionStrategy +from .sampling.config import CoresetStrategyConfig, SelectionStrategy +from .sampling.downsampling_config import MultiDownsamplingConfig from .training import TrainingConfig from .trigger import TriggerConfig @@ -32,3 +33,25 @@ class ModynPipelineConfig(ModynBaseModel): trigger: TriggerConfig selection_strategy: SelectionStrategy evaluation: EvaluationConfig | None = Field(None) + + @model_validator(mode="after") + def validate_bts_training_selection_works(self) -> Self: + # Validates that when using Downsampling with BtS, we choose a functional ratio + if isinstance(self.selection_strategy, CoresetStrategyConfig) and not isinstance( + self.selection_strategy.downsampling_config, MultiDownsamplingConfig + ): + if not self.selection_strategy.downsampling_config.sample_then_batch: # bts + ratio = self.selection_strategy.downsampling_config.ratio + ratio_max = self.selection_strategy.downsampling_config.ratio_max + batch_size = self.training.batch_size + + post_downsampling_size = max((ratio * batch_size) // ratio_max, 1) + if batch_size % 
post_downsampling_size != 0: + raise ValueError( + f"The target batch size of {batch_size} is not a multiple of the batch size " + + f"after downsampling with ratio {ratio} a batch in BtS mode ({post_downsampling_size}). " + + "We cannot accumulate batches. " + + "Please choose the downsampling ratio and batch size such that this is possible." + ) + + return self diff --git a/modyn/config/schema/pipeline/sampling/downsampling_config.py b/modyn/config/schema/pipeline/sampling/downsampling_config.py index 9db2004f3..8c54e58a4 100644 --- a/modyn/config/schema/pipeline/sampling/downsampling_config.py +++ b/modyn/config/schema/pipeline/sampling/downsampling_config.py @@ -20,9 +20,20 @@ class BaseDownsamplingConfig(ModynBaseModel): ), ) ratio: int = Field( - description="Ratio post_sampling_size/pre_sampling_size. E.g. with 160 records and a ratio of 50 we keep 80.", + description=( + "Ratio post_sampling_size/pre_sampling_size * ratio_max. " + "For the default of ratio_max of 100, this implies percent, " + "e.g., with 160 records and a ratio of 50 we keep 80." + ), min=0, - max=100, + ) + ratio_max: int = Field( + description=( + "Reference maximum ratio value. Defaults to 100, which implies percent." + " If you set this to 1000, ratio describes promille instead." + ), + default=100, + min=1, ) period: int = Field( 1, @@ -34,6 +45,12 @@ class BaseDownsamplingConfig(ModynBaseModel): min=0, ) + @model_validator(mode="after") + def validate_ratio(self) -> Self: + if self.ratio > self.ratio_max: + raise ValueError("ratio cannot be greater than ratio_max.") + return self + class UncertaintyDownsamplingConfig(BaseDownsamplingConfig): """Config for the Craig downsampling strategy.""" diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py index 584423a31..a8dae1333 100644 --- a/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py @@ -38,6 +38,7 @@ def __init__( self.downsampling_period = downsampling_config.period self.downsampling_ratio = downsampling_config.ratio + self.ratio_max = downsampling_config.ratio_max self.requires_remote_computation = True self.maximum_keys_in_memory = maximum_keys_in_memory @@ -60,6 +61,7 @@ def _compute_status_bar_scale(self) -> int: def downsampling_params(self) -> dict: config = { "downsampling_ratio": self.downsampling_ratio, + "ratio_max": self.ratio_max, "maximum_keys_in_memory": self.maximum_keys_in_memory, "sample_then_batch": self.downsampling_mode == DownsamplingMode.SAMPLE_THEN_BATCH, } diff --git a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_rho_loss_downsampling_strategy.py b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_rho_loss_downsampling_strategy.py index 4249db700..b8664688e 100644 --- a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_rho_loss_downsampling_strategy.py +++ b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_rho_loss_downsampling_strategy.py @@ -301,6 +301,7 @@ def test_downsampling_params(il_training_config: ILTrainingConfig, data_config: expected = { "downsampling_ratio": 60, + "ratio_max": 100, "maximum_keys_in_memory": maximum_keys_in_memory, "sample_then_batch": False, "il_model_id": 3, diff 
--git a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py index e07cfba02..e3783c66b 100644 --- a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py +++ b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py @@ -86,6 +86,7 @@ def test_switch_functions(): "downsampling_ratio": 50, "maximum_keys_in_memory": 1000, "sample_then_batch": True, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteLossDownsampling" assert downs.training_status_bar_scale == 50 @@ -98,6 +99,7 @@ def test_switch_functions(): "downsampling_ratio": 25, "maximum_keys_in_memory": 1000, "sample_then_batch": False, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" assert downs.training_status_bar_scale == 100 @@ -140,6 +142,7 @@ def test_double_threshold(): "downsampling_ratio": 50, "maximum_keys_in_memory": 1000, "sample_then_batch": True, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteLossDownsampling" assert downs.training_status_bar_scale == 50 @@ -152,6 +155,7 @@ def test_double_threshold(): "downsampling_ratio": 25, "maximum_keys_in_memory": 1000, "sample_then_batch": False, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" assert downs.training_status_bar_scale == 100 @@ -179,6 +183,7 @@ def test_wrong_trigger(): "downsampling_ratio": 50, "maximum_keys_in_memory": 1000, "sample_then_batch": True, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteLossDownsampling" assert downs.training_status_bar_scale == 50 @@ -195,6 +200,7 @@ def test_wrong_trigger(): "downsampling_ratio": 25, "maximum_keys_in_memory": 1000, "sample_then_batch": False, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" assert downs.training_status_bar_scale == 100 diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py index 7ce654f55..4745b7f09 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py @@ -19,6 +19,7 @@ def get_sampler_config(dummy_system_config: ModynConfig, balance=False): "sample_then_batch": False, "args": {}, "balance": balance, + "ratio_max": 100, } return ( 0, diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py index 899cd507b..8a428e8cc 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py @@ -11,7 +11,7 @@ def test_batch_then_sample_general(dummy_system_config: ModynConfig): downsampling_ratio = 50 - params_from_selector = {"downsampling_ratio": downsampling_ratio} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "ratio_max": 100} sampler = AbstractRemoteDownsamplingStrategy( 154, 128, 64, params_from_selector, 
dummy_system_config.model_dump(by_alias=True), "cpu" ) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py index 16891d54c..87719cfef 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py @@ -20,6 +20,7 @@ def get_sampler_config(modyn_config, balance=False): "balance": balance, "selection_batch": 64, "greedy": "NaiveGreedy", + "ratio_max": 100, } return 0, 0, 0, params_from_selector, modyn_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" @@ -347,7 +348,7 @@ def test_matching_results_with_deepcore(dummy_system_config: ModynConfig): 0, 0, 5, - {"downsampling_ratio": 20, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy"}, + {"downsampling_ratio": 20, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy", "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", @@ -402,7 +403,7 @@ def test_matching_results_with_deepcore_permutation(dummy_system_config: ModynCo 0, 0, 5, - {"downsampling_ratio": 30, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy"}, + {"downsampling_ratio": 30, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy", "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", @@ -461,7 +462,7 @@ def test_matching_results_with_deepcore_permutation_fancy_ids(dummy_system_confi 0, 0, 5, - {"downsampling_ratio": 50, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy"}, + {"downsampling_ratio": 50, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy", "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py index 70eabe39d..d6355eee7 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py @@ -19,6 +19,7 @@ def get_sampler_config(modyn_config: ModynConfig, balance=False): "sample_then_batch": False, "args": {}, "balance": balance, + "ratio_max": 100, } return 0, 0, 0, params_from_selector, modyn_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" @@ -185,7 +186,7 @@ def test_matching_results_with_deepcore(dummy_system_config: ModynConfig): 0, 0, 5, - {"downsampling_ratio": 10 * num_of_target_samples, "balance": False}, + {"downsampling_ratio": 10 * num_of_target_samples, "balance": False, "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", @@ -237,7 +238,7 @@ def test_matching_results_with_deepcore_permutation_fancy_ids(dummy_system_confi 0, 0, 5, - {"downsampling_ratio": 50, "balance": False}, + {"downsampling_ratio": 50, "balance": False, "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", diff --git 
a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py index c6fe03829..a3211b7af 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py @@ -14,7 +14,7 @@ def test_sample_shape_ce(dummy_system_config: ModynConfig): downsampling_ratio = 50 per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteGradNormDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) @@ -45,7 +45,7 @@ def test_sample_shape_other_losses(dummy_system_config: ModynConfig): downsampling_ratio = 50 per_sample_loss_fct = torch.nn.BCEWithLogitsLoss(reduction="none") - params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteGradNormDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) @@ -84,6 +84,7 @@ def test_sampling_crossentropy(dummy_system_config: ModynConfig): "downsampling_ratio": downsampling_ratio, "replacement": False, "sample_then_batch": False, + "ratio_max": 100, } # Here we use autograd since the number of classes is not provided @@ -135,7 +136,7 @@ def test_sample_dict_input(dummy_system_config: ModynConfig): model = DictLikeModel() per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - params_from_selector = {"downsampling_ratio": 50, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": 50, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteGradNormDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py index 9b39754bd..1779a12e1 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py @@ -17,6 +17,7 @@ def get_sampler_config(modyn_config: ModynConfig, balance=False): "sample_then_batch": False, "args": {}, "balance": balance, + "ratio_max": 100, } return 0, 0, 0, params_from_selector, modyn_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" @@ -137,7 +138,7 @@ def test_matching_results_with_deepcore(dummy_system_config: ModynConfig): 0, 0, 5, - {"downsampling_ratio": 10 * num_of_target_samples, "balance": False}, + {"downsampling_ratio": 10 * num_of_target_samples, "balance": False, "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", @@ -166,7 +167,7 @@ def test_matching_results_with_deepcore_permutation_fancy_ids(dummy_system_confi 0, 0, 5, - {"downsampling_ratio": 50, "balance": False}, + 
{"downsampling_ratio": 50, "balance": False, "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py index d875fb930..3a2cd27bc 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py @@ -12,7 +12,7 @@ def test_sample_shape(dummy_system_config: ModynConfig): downsampling_ratio = 50 per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteLossDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) @@ -39,7 +39,7 @@ def test_sample_weights(dummy_system_config: ModynConfig): downsampling_ratio = 50 per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteLossDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) @@ -67,7 +67,7 @@ def test_sample_loss_dependent_sampling(dummy_system_config: ModynConfig): downsampling_ratio = 50 per_sample_loss_fct = torch.nn.MSELoss(reduction="none") - params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteLossDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) @@ -116,7 +116,7 @@ def test_sample_dict_input(dummy_system_config: ModynConfig): mymodel = DictLikeModel() per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - params_from_selector = {"downsampling_ratio": 50, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": 50, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteLossDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rho_loss_downsampling.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rho_loss_downsampling.py index 7091c330e..a159884ce 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rho_loss_downsampling.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rho_loss_downsampling.py @@ -29,6 +29,7 @@ def dummy_init_params(dummy_system_config: ModynConfig): "il_model_id": 2, "downsampling_ratio": 50, "sample_then_batch": False, + "ratio_max": 100, } modyn_config = dummy_system_config.model_dump(by_alias=True) per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rs2_downsampling.py 
b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rs2_downsampling.py index a1dd4d836..c5e764d3b 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rs2_downsampling.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rs2_downsampling.py @@ -10,7 +10,7 @@ def test_init(dummy_system_config: ModynConfig): pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": True, "downsampling_ratio": 50} + params_from_selector = {"replacement": True, "downsampling_ratio": 50, "ratio_max": 100} per_sample_loss = None device = "cpu" @@ -41,7 +41,7 @@ def test_inform_samples(dummy_system_config: ModynConfig): pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": True, "downsampling_ratio": 50} + params_from_selector = {"replacement": True, "downsampling_ratio": 50, "ratio_max": 100} per_sample_loss = None device = "cpu" @@ -76,7 +76,7 @@ def test_multiple_epochs_with_replacement(dummy_system_config: ModynConfig): pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": True, "downsampling_ratio": 50} + params_from_selector = {"replacement": True, "downsampling_ratio": 50, "ratio_max": 100} per_sample_loss = None device = "cpu" @@ -110,7 +110,7 @@ def test_multiple_epochs_without_replacement(dummy_system_config: ModynConfig): pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": False, "downsampling_ratio": 50} + params_from_selector = {"replacement": False, "downsampling_ratio": 50, "ratio_max": 100} per_sample_loss = None device = "cpu" @@ -171,7 +171,7 @@ def test_multiple_epochs_without_replacement_leftover_data(dummy_system_config: pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": False, "downsampling_ratio": 40} + params_from_selector = {"replacement": False, "downsampling_ratio": 40, "ratio_max": 100} per_sample_loss = None device = "cpu" @@ -207,7 +207,7 @@ def test_multiple_epochs_empty_without_replacement_leftover_data(dummy_system_co pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": False, "downsampling_ratio": 40} + params_from_selector = {"replacement": False, "downsampling_ratio": 40, "ratio_max": 100} per_sample_loss = None device = "cpu" diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py index 9adca1d03..6cc0366cc 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py @@ -19,6 +19,7 @@ def get_sampler_config(modyn_config: ModynConfig, submodular: str = "GraphCut", "submodular_function": submodular, "balance": balance, "selection_batch": 64, + "ratio_max": 100, } return 0, 0, 0, params_from_selector, modyn_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" @@ -124,6 +125,7 @@ def _get_selected_samples( "submodular_function": submodular, "balance": False, "selection_batch": 64, + "ratio_max": 100, }, modyn_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py 
b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py index fab0b6d17..92013d235 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py @@ -18,6 +18,7 @@ def sampler_config(dummy_system_config: ModynConfig, request): "args": {}, "balance": False, "score_metric": request.param, + "ratio_max": 100, } return 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" diff --git a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py index 455e557c0..52527184d 100644 --- a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py +++ b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py @@ -291,6 +291,7 @@ def get_mock_trainer( batch_size: int = 32, downsampling_mode: DownsamplingMode = DownsamplingMode.DISABLED, downsampling_ratio: int = 25, + ratio_max: int = 100, ): model_dynamic_module_patch.return_value = MockModule(num_optimizers) lr_scheduler_dynamic_module_patch.return_value = MockLRSchedulerModule() @@ -300,7 +301,7 @@ def get_mock_trainer( mock_selection_strategy.return_value = ( True, "RemoteGradNormDownsampling", - {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False}, + {"downsampling_ratio": downsampling_ratio, "ratio_max": ratio_max, "sample_then_batch": False}, ) elif downsampling_mode == DownsamplingMode.SAMPLE_THEN_BATCH: raise NotImplementedError() @@ -868,6 +869,7 @@ def test_create_trainer_with_exception( assert pathlib.Path(temp.name).exists() +@pytest.mark.parametrize("downsampling_ratio, ratio_max", [(25, 100), (50, 100), (250, 1000), (125, 1000)]) @patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) @patch.object(BaseCallback, "on_train_begin", return_value=None) @patch.object(BaseCallback, "on_train_end", return_value=None) @@ -893,10 +895,11 @@ def test_train_batch_then_sample_accumulation( test_on_train_end, test_on_train_begin, dummy_system_config: ModynConfig, + downsampling_ratio, + ratio_max, ): num_batches = 100 # hardcoded into mock dataloader batch_size = 32 - downsampling_ratio = 25 query_status_queue = mp.Queue() status_queue = mp.Queue() @@ -913,11 +916,12 @@ def test_train_batch_then_sample_accumulation( batch_size=batch_size, downsampling_mode=DownsamplingMode.BATCH_THEN_SAMPLE, downsampling_ratio=downsampling_ratio, + ratio_max=ratio_max, ) assert trainer._downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE # Mock the downsample_batch method to return batches of the expected size - expected_bts_size = int(batch_size * (downsampling_ratio / 100.0)) + expected_bts_size = int(batch_size * (downsampling_ratio / ratio_max)) bts_accumulate_period = batch_size // expected_bts_size def mock_downsample_batch(data, sample_ids, target): @@ -945,7 +949,8 @@ def mock_forward(data): assert trainer._num_samples == batch_size * num_batches assert trainer._log["num_samples"] == batch_size * num_batches - assert trainer._log["num_samples_trained"] == expected_bts_size * num_batches + # We only train on whole batches, hence we have to scale by batch size + assert trainer._log["num_samples_trained"] == ((expected_bts_size * num_batches) // batch_size) * batch_size assert test_on_batch_begin.call_count 
== len(trainer._callbacks) * num_batches assert test_on_batch_end.call_count == len(trainer._callbacks) * num_batches assert test_downsample_batch.call_count == num_batches diff --git a/modyn/tests/utils/test_utils.py b/modyn/tests/utils/test_utils.py index 8ce245719..9b22dd841 100644 --- a/modyn/tests/utils/test_utils.py +++ b/modyn/tests/utils/test_utils.py @@ -189,7 +189,7 @@ def test_instantiate_class_existing(dummy_system_config: ModynConfig): 10, 11, 64, - {"downsampling_ratio": 67}, + {"downsampling_ratio": 67, "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), {}, "cpu", diff --git a/modyn/trainer_server/internal/trainer/pytorch_trainer.py b/modyn/trainer_server/internal/trainer/pytorch_trainer.py index dc6f73df4..b4a755765 100644 --- a/modyn/trainer_server/internal/trainer/pytorch_trainer.py +++ b/modyn/trainer_server/internal/trainer/pytorch_trainer.py @@ -223,7 +223,9 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches # assertion since model validation by pydantic should catch this. assert self._downsampler.supports_bts, "The downsampler does not support batch then sample" # We cannot pass the target size from the trainer server since that depends on StB vs BtS. - post_downsampling_size = max((self._downsampler.downsampling_ratio * self._batch_size) // 100, 1) + post_downsampling_size = max( + (self._downsampler.downsampling_ratio * self._batch_size) // self._downsampling_ratio_max, 1 + ) assert post_downsampling_size < self._batch_size if self._batch_size % post_downsampling_size != 0: raise ValueError( @@ -727,6 +729,7 @@ def _setup_downsampling( self._downsampler = self._instantiate_downsampler( strategy_name, downsampler_config, modyn_config, self._criterion_nored ) + self._downsampling_ratio_max = downsampler_config["ratio_max"] assert "sample_then_batch" in downsampler_config self._log["received_downsampler_config"] = downsampler_config if downsampler_config["sample_then_batch"]: @@ -833,7 +836,9 @@ def _calc_expected_sizes(self, downsampling_enabled: bool) -> None: ) # scale up again to multiples of batch size if downsampling_enabled: - num_samples_per_epoch = max((self._downsampler.downsampling_ratio * num_samples_per_epoch) // 100, 1) + num_samples_per_epoch = max( + (self._downsampler.downsampling_ratio * num_samples_per_epoch) // self._downsampling_ratio_max, 1 + ) self._expected_num_batches = (num_samples_per_epoch // self._batch_size) * self.epochs_per_trigger self._expected_num_epochs = self.epochs_per_trigger diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py index 440c715d2..51e7a3794 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py @@ -110,7 +110,7 @@ def select_points(self) -> tuple[list[int], torch.Tensor]: def _select_from_matrix(self) -> tuple[list[int], torch.Tensor]: matrix = np.concatenate(self.matrix_elements) number_of_samples = len(matrix) - target_size = max(int(self.downsampling_ratio * number_of_samples / 100), 1) + target_size = max(int(self.downsampling_ratio * number_of_samples / self.ratio_max), 1) selected_indices, weights = self._select_indexes_from_matrix(matrix, target_size) selected_ids = [self.index_sampleid_map[index] for index in selected_indices] return selected_ids, 
weights diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py index 600119ec7..e1dc49d28 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py @@ -49,6 +49,7 @@ def __init__( assert "downsampling_ratio" in params_from_selector self.downsampling_ratio = params_from_selector["downsampling_ratio"] + self.ratio_max = params_from_selector["ratio_max"] # The next variable is used to keep a mapping index <-> sample_id # This is needed since the data selection policy works on indexes (the policy does not care what the sample_id diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py index 0ea0102bd..fee662dad 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py @@ -178,7 +178,7 @@ def select_points(self) -> tuple[list[int], torch.Tensor]: def _select_points_from_distance_matrix(self) -> tuple[list[int], torch.Tensor]: number_of_samples = self.distance_matrix.shape[0] - target_size = max(int(self.downsampling_ratio * number_of_samples / 100), 1) + target_size = max(int(self.downsampling_ratio * number_of_samples / self.ratio_max), 1) all_index = np.arange(number_of_samples) submod_function = FacilityLocation(index=all_index, similarity_matrix=self.distance_matrix) diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py index fd3e737a5..ac456f6a6 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py @@ -79,7 +79,7 @@ def select_points(self) -> tuple[list[int], torch.Tensor]: return [], torch.Tensor([]) # select always at least 1 point - target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / 100), 1) + target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / self.ratio_max), 1) probabilities = torch.cat(self.probabilities, dim=0) probabilities = probabilities / probabilities.sum() diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py index e7c4fa866..fd0906e08 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py @@ -61,7 +61,7 @@ def select_points(self) -> tuple[list[int], torch.Tensor]: return [], torch.Tensor([]) # select always at least 1 point - target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / 100), 1) + target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / self.ratio_max), 1) probabilities = torch.cat(self.probabilities, dim=0) probabilities = probabilities / probabilities.sum() diff --git 
a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rho_loss_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rho_loss_downsampling.py index c1ccb84f2..4cdb2d482 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rho_loss_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rho_loss_downsampling.py @@ -58,7 +58,7 @@ def inform_samples( self.number_of_points_seen += forward_output.shape[0] def select_points(self) -> tuple[list[int], torch.Tensor]: - target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / 100), 1) + target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / self.ratio_max), 1) # find the indices of maximal "target_size" elements in the list of rho_loss selected_indices = torch.topk(self.rho_loss, target_size).indices # use sorted() because we keep the relative order of the selected samples diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rs2_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rs2_downsampling.py index 3332fce46..706544ca2 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rs2_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rs2_downsampling.py @@ -71,7 +71,7 @@ def _epoch_step_no_r(self, target_size: int) -> None: self._subsets = [self._all_sample_ids[i * target_size : (i + 1) * target_size] for i in range(max_subset)] def _epoch_step(self) -> None: - target_size = max(int(self.downsampling_ratio * len(self._all_sample_ids) / 100), 1) + target_size = max(int(self.downsampling_ratio * len(self._all_sample_ids) / self.ratio_max), 1) if self._with_replacement: self._epoch_step_wr(target_size) diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py index 7040569cd..da0067858 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py @@ -119,7 +119,7 @@ def select_points(self) -> tuple[list[int], torch.Tensor]: def _select_from_scores(self) -> tuple[list[int], torch.Tensor]: number_of_samples = len(self.scores) - target_size = max(int(self.downsampling_ratio * number_of_samples / 100), 1) + target_size = max(int(self.downsampling_ratio * number_of_samples / self.ratio_max), 1) selected_indices, weights = self._select_indexes_from_scores(target_size) selected_ids = [self.index_sampleid_map[index] for index in selected_indices] return selected_ids, weights From bf96bfa546c829a4844f25e0143497deb9539e04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= <2116466+MaxiBoether@users.noreply.github.com> Date: Sat, 22 Jun 2024 22:12:10 +0200 Subject: [PATCH 2/4] Add `ratio_max` for presampling strategies (#544) I realized we also need to set `ratio_max` for presampling strategies, otherwise they cannot run 12.5% as well. 
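To make the new semantics concrete, the following is an illustrative, self-contained sketch
(the helper name `target_size` is chosen here for exposition and is not part of the Modyn API)
of how a ratio/ratio_max pair is turned into a sample count. It mirrors the arithmetic used by
the remote downsamplers and the BtS validator in these patches; the presampling variant floors
in the same way but does not clamp to a minimum of one sample:

    def target_size(num_samples: int, ratio: int, ratio_max: int = 100) -> int:
        # ratio is interpreted relative to ratio_max: with the default ratio_max=100
        # it is a percentage, with ratio_max=1000 it is a permille value, and so on.
        # The config validators added in these patches reject ratio > ratio_max.
        assert 0 <= ratio <= ratio_max
        # Integer division, but always keep at least one sample (downsampler behavior).
        return max((num_samples * ratio) // ratio_max, 1)

    # 12.5% of 128 samples is only expressible with a finer-grained ratio_max:
    assert target_size(128, 125, ratio_max=1000) == 16
    # The default ratio_max=100 preserves the previous percent semantics:
    assert target_size(160, 50) == 80

In batch-then-sample mode the same computation yields the post-downsampling batch size
(e.g., batch_size=32, ratio=25, ratio_max=100 gives 8), and the pipeline validator added in
patch 1 additionally requires the training batch size to be a multiple of this value so that
downsampled batches can be accumulated back into full batches.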
--- .../config/schema/pipeline/sampling/config.py | 21 +++++++++++++++++-- .../abstract_presampling_strategy.py | 3 ++- .../test_random_presampling_strategy.py | 19 +++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/modyn/config/schema/pipeline/sampling/config.py b/modyn/config/schema/pipeline/sampling/config.py index 2a6c95851..28b0846cc 100644 --- a/modyn/config/schema/pipeline/sampling/config.py +++ b/modyn/config/schema/pipeline/sampling/config.py @@ -19,13 +19,30 @@ class PresamplingConfig(ModynBaseModel): "Only the prefix, i.e. without `PresamplingStrategy`, is needed." ) ratio: int = Field( - description="Percentage of points on which the metric (loss, gradient norm,..) is computed.", + description=( + "Ratio of points on which the metric (loss, gradient norm,..) is computed." + "By default with ratio_max=100, this describes the selection ratio in percent." + ), min=0, - max=100, + ) + + ratio_max: int = Field( + description=( + "Reference maximum ratio value. Defaults to 100, which implies percent." + " If you set this to 1000, ratio describes promille instead." + ), + default=100, + min=1, ) force_column_balancing: bool = Field(False) force_required_target_size: bool = Field(False) + @model_validator(mode="after") + def validate_ratio(self) -> Self: + if self.ratio > self.ratio_max: + raise ValueError("ratio cannot be greater than ratio_max.") + return self + StorageBackend = Literal["database", "local"] diff --git a/modyn/selector/internal/selector_strategies/presampling_strategies/abstract_presampling_strategy.py b/modyn/selector/internal/selector_strategies/presampling_strategies/abstract_presampling_strategy.py index adbbc7bc3..65c536b55 100644 --- a/modyn/selector/internal/selector_strategies/presampling_strategies/abstract_presampling_strategy.py +++ b/modyn/selector/internal/selector_strategies/presampling_strategies/abstract_presampling_strategy.py @@ -18,6 +18,7 @@ def __init__( self.pipeline_id = pipeline_id self._storage_backend = storage_backend self.presampling_ratio = presampling_config.ratio + self.ratio_max = presampling_config.ratio_max self.requires_trigger_dataset_size = False @abstractmethod @@ -36,7 +37,7 @@ def get_presampling_query( def get_target_size(self, trigger_dataset_size: int, limit: Optional[int]) -> int: assert trigger_dataset_size >= 0 - target_presampling = int(trigger_dataset_size * self.presampling_ratio / 100) + target_presampling = (trigger_dataset_size * self.presampling_ratio) // self.ratio_max if limit is not None: assert limit >= 0 diff --git a/modyn/tests/selector/internal/selector_strategies/presampling_strategies/test_random_presampling_strategy.py b/modyn/tests/selector/internal/selector_strategies/presampling_strategies/test_random_presampling_strategy.py index c69f499c3..429205035 100644 --- a/modyn/tests/selector/internal/selector_strategies/presampling_strategies/test_random_presampling_strategy.py +++ b/modyn/tests/selector/internal/selector_strategies/presampling_strategies/test_random_presampling_strategy.py @@ -164,3 +164,22 @@ def test_dataset_size_various_scenarios(): strat.tail_triggers = 1 trigger_size = strat._get_trigger_dataset_size() assert presampling_strat.get_target_size(trigger_size, None) == 22 # 75% of presampling + + +def test_target_size_ratio_max(): + config = get_config() + config.ratio_max = 1000 + config.ratio = 125 + strat = RandomPresamplingStrategy( + config, + get_minimal_modyn_config(), + 10, + DatabaseStorageBackend(0, get_minimal_modyn_config(), 123), + ) + assert 
strat.get_target_size(128, None) == 16 + assert strat.get_target_size(100, None) == 12 + assert strat.get_target_size(12, None) == 1 + assert strat.get_target_size(0, None) == 0 + + with pytest.raises(AssertionError): + strat.get_target_size(-1, None) From 57803eaf25f1d0e687dd39c5335fa7981062bbdc Mon Sep 17 00:00:00 2001 From: Robin Holzinger Date: Sun, 23 Jun 2024 14:09:27 +0200 Subject: [PATCH 3/4] feat: More sophisticated evaluation logic (#534) --- analytics/app/data/const.py | 3 + analytics/app/data/load.py | 2 +- analytics/app/data/transform.py | 183 +++++++------ analytics/app/pages/compare.py | 109 ++++---- analytics/app/pages/const/__init__.py | 0 analytics/app/pages/const/text.py | 12 + analytics/app/pages/pipeline.py | 87 +++---- analytics/app/pages/plots/cost_over_time.py | 27 +- .../pages/plots/cost_vs_eval_metric_agg.py | 78 +++--- analytics/app/pages/plots/eval_heatmap.py | 103 +++++--- analytics/app/pages/plots/eval_over_time.py | 44 ++-- analytics/app/pages/plots/num_samples.py | 56 ++-- .../pages/plots/num_triggers_eval_metric.py | 78 +++--- .../pages/plots/one_dimensional_comparison.py | 71 +++-- analytics/app/pages/plots/pipeline_info.py | 27 +- analytics/app/pages/state.py | 75 ++++++ analytics/tools/__init__.py | 0 analytics/tools/aggregate_runs/__init__.py | 0 .../tools/aggregate_runs/core_aggregation.py | 119 +++++++++ analytics/tools/aggregate_runs/dir_utils.py | 31 +++ analytics/tools/aggregate_runs/main.py | 43 ++++ .../aggregate_runs/pipeline_equivalence.py | 32 +++ analytics/tools/patch_logfile.ipynb | 174 +++++++++++-- dev-requirements.txt | 2 +- environment.yml | 2 +- .../internal/pipeline_executor/models.py | 242 +++++++++++------- .../pipeline_executor/pipeline_executor.py | 6 +- 27 files changed, 1127 insertions(+), 479 deletions(-) create mode 100644 analytics/app/data/const.py create mode 100644 analytics/app/pages/const/__init__.py create mode 100644 analytics/app/pages/const/text.py create mode 100644 analytics/app/pages/state.py create mode 100644 analytics/tools/__init__.py create mode 100644 analytics/tools/aggregate_runs/__init__.py create mode 100644 analytics/tools/aggregate_runs/core_aggregation.py create mode 100644 analytics/tools/aggregate_runs/dir_utils.py create mode 100644 analytics/tools/aggregate_runs/main.py create mode 100644 analytics/tools/aggregate_runs/pipeline_equivalence.py diff --git a/analytics/app/data/const.py b/analytics/app/data/const.py new file mode 100644 index 000000000..3830018cc --- /dev/null +++ b/analytics/app/data/const.py @@ -0,0 +1,3 @@ +from typing import Literal + +CompositeModelOptions = Literal["currently_active_model", "currently_trained_model"] diff --git a/analytics/app/data/load.py b/analytics/app/data/load.py index 8331119c4..49df34e87 100644 --- a/analytics/app/data/load.py +++ b/analytics/app/data/load.py @@ -27,7 +27,7 @@ def list_pipelines() -> dict[int, tuple[str, Path]]: pipelines[pipeline_id] = (pipeline_name, Path(pipeline)) - return pipelines + return dict(sorted(pipelines.items())) def load_pipeline_logs(pipeline_id: int) -> PipelineLogs: diff --git a/analytics/app/data/transform.py b/analytics/app/data/transform.py index d3206f173..1e7cb7234 100644 --- a/analytics/app/data/transform.py +++ b/analytics/app/data/transform.py @@ -3,7 +3,9 @@ import pandas as pd from modyn.supervisor.internal.grpc.enums import PipelineStage -from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs, SingleEvaluationInfo +from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs, 
SingleEvaluationInfo, StageLog +from modyn.supervisor.internal.utils.time_tools import generate_real_training_end_timestamp +from modyn.utils.utils import SECONDS_PER_UNIT AGGREGATION_FUNCTION = Literal["mean", "median", "max", "min", "sum", "std"] EVAL_AGGREGATION_FUNCTION = Literal["time_weighted_avg", "mean", "median", "max", "min", "sum", "std"] @@ -14,8 +16,9 @@ # -------------------------------------------------------------------------------------------------------------------- # -def logs_dataframe(logs: PipelineLogs) -> pd.DataFrame: +def logs_dataframe(logs: PipelineLogs, pipeline_ref: str = "pipeline") -> pd.DataFrame: df = logs.supervisor_logs.df + df["pipeline_ref"] = pipeline_ref df["duration"] = df["duration"].apply(lambda x: x.total_seconds()) convert_epoch_to_datetime(df, "sample_time") return df @@ -23,7 +26,7 @@ def logs_dataframe(logs: PipelineLogs) -> pd.DataFrame: def logs_dataframe_agg_by_stage(stage_run_df: pd.DataFrame) -> pd.DataFrame: df_agg = ( - stage_run_df.groupby(["id"] + [c for c in stage_run_df.columns if c == "pipeline_ref"]) + stage_run_df.groupby((["pipeline_ref"] if "pipeline_ref" in stage_run_df.columns else []) + ["id"]) .agg( max=("duration", "max"), min=("duration", "min"), @@ -33,77 +36,126 @@ def logs_dataframe_agg_by_stage(stage_run_df: pd.DataFrame) -> pd.DataFrame: sum=("duration", "sum"), count=("duration", "count"), ) - .reset_index() .fillna(-1) ) + df_agg.reset_index(inplace=True) return df_agg +def pipeline_stage_parents(logs: PipelineLogs) -> pd.DataFrame: + ids = [] + parents = [] + for i, (_, parent_list) in logs.pipeline_stages.items(): + if len(parent_list) == 1: + ids.append(i) + parents.append(parent_list[0]) + if len(parent_list) > 1: + if i == PipelineStage.PROCESS_NEW_DATA.name: + if logs.experiment: + ids.append(i) + parents.append(PipelineStage.REPLAY_DATA.name) + else: + ids.append(i) + parents.append(PipelineStage.FETCH_NEW_DATA.name) + else: + raise ValueError(f"Stage {i} has multiple parents: {parent_list}") + + df = pd.DataFrame({"id": ids, "parent_id": parents}) + return df + + def dfs_models_and_evals( - logs: PipelineLogs, max_sample_time: Any + logs: PipelineLogs, max_sample_time: Any, pipeline_ref: str = "pipeline" ) -> tuple[pd.DataFrame, pd.DataFrame | None, pd.DataFrame | None]: """Returns a dataframe with the stored models and the dataframe for evaluations""" # ---------------------------------------------------- MODELS ---------------------------------------------------- # - store_models = [x for x in logs.supervisor_logs.stage_runs if x.id == PipelineStage.STORE_TRAINED_MODEL.name] - df_models = pd.concat([x.df(extended=True) for x in store_models]) - # df_models.sort_values(by=["sample_time"]) + # PipelineStage.STORE_TRAINED_MODEL + df_store_models = StageLog.df( + (x for x in logs.supervisor_logs.stage_runs if x.id == PipelineStage.STORE_TRAINED_MODEL.name), extended=True + ) + df_store_models.set_index(["trigger_idx"], inplace=True) - _list_single_triggers = [ - x for x in logs.supervisor_logs.stage_runs if x.id == PipelineStage.HANDLE_SINGLE_TRIGGER.name - ] - df_single_triggers = pd.concat([x.df(extended=True) for x in _list_single_triggers]) + # PipelineStage.HANDLE_SINGLE_TRIGGER + df_single_triggers = StageLog.df( + (x for x in logs.supervisor_logs.stage_runs if x.id == PipelineStage.HANDLE_SINGLE_TRIGGER.name), extended=True + )[["trigger_idx", "trigger_id", "first_timestamp", "last_timestamp"]] + df_single_triggers.set_index(["trigger_idx"], inplace=True) - _list_single_trainings = [x for x in 
logs.supervisor_logs.stage_runs if x.id == PipelineStage.TRAIN.name] - df_single_trainings = pd.concat([x.df(extended=True) for x in _list_single_trainings]) + # PipelineStage.TRAIN + df_single_trainings = StageLog.df( + (x for x in logs.supervisor_logs.stage_runs if x.id == PipelineStage.TRAIN.name), extended=True + )[["trigger_idx", "num_batches", "num_samples"]] + df_single_trainings.set_index(["trigger_idx"], inplace=True) + + # MERGE + joined_models = df_store_models.merge( + df_single_triggers, on="trigger_idx", how="left", suffixes=("", "_trigger") + ).merge(df_single_trainings, on="trigger_idx", how="left", suffixes=("", "_training")) + + # sort models by trigger_id (we need that for the shift functions in generate_real_training_end_timestamp etc.) + joined_models.sort_index(level="trigger_idx", inplace=True) - joined_models = df_models.merge(df_single_triggers, on="trigger_idx", how="left", suffixes=("", "_trigger")).merge( - df_single_trainings, on="trigger_idx", how="left", suffixes=("", "_training") - ) joined_models["train_start"] = joined_models["first_timestamp"] joined_models["train_end"] = joined_models["last_timestamp"] + joined_models["real_train_end"] = generate_real_training_end_timestamp(joined_models) + + convert_epoch_to_datetime(joined_models, "sample_time") + convert_epoch_to_datetime(joined_models, "train_start") + convert_epoch_to_datetime(joined_models, "train_end") + convert_epoch_to_datetime(joined_models, "real_train_end") df_models = joined_models[ - [col for col in df_models.columns] + ["train_start", "train_end", "num_batches", "num_samples"] + [col for col in df_store_models.columns if col not in joined_models.index.names] + + ["train_start", "train_end", "real_train_end", "num_batches", "num_samples"] ] - convert_epoch_to_datetime(df_models, "train_start") - convert_epoch_to_datetime(df_models, "train_end") - - # sort models by trigger_id - df_models.sort_values(by=["trigger_id"], inplace=True) + df_models.reset_index(inplace=True) # model_usage period - df_models["usage_start"] = df_models["train_end"] + pd.DateOffset(seconds=1) - df_models["usage_end"] = df_models["train_end"].shift(-1) + df_models["usage_start"] = df_models["real_train_end"] + pd.DateOffset(seconds=1) + df_models["usage_end"] = df_models["real_train_end"].shift(-1) df_models["usage_end"] = df_models["usage_end"].fillna(max_sample_time) # linearize ids: + df_models["trigger_idx"] = df_models["trigger_id"] df_models["training_idx"] = df_models["training_id"] df_models["model_idx"] = df_models["id_model"] _, trigger_idx_mappings = linearize_ids(df_models, [], "training_idx") _, model_idx_mappings = linearize_ids(df_models, [], "model_idx") + df_models["pipeline_ref"] = pipeline_ref + # -------------------------------------------------- EVALUATIONS ------------------------------------------------- # - dfs_requests = [ - run.df(extended=True) - for run in logs.supervisor_logs.stage_runs - if run.id == PipelineStage.EVALUATE_SINGLE.name and run.info.failure_reason is None and run.info.eval_request - ] - dfs_metrics = [ - cast(SingleEvaluationInfo, run.info).results_df() - for run in logs.supervisor_logs.stage_runs - if run.id == PipelineStage.EVALUATE_SINGLE.name and run.info.failure_reason is None and run.info.eval_request - ] - if not dfs_requests and not dfs_metrics: - return df_models, None, None + dfs_requests = StageLog.df( + ( + run + for run in logs.supervisor_logs.stage_runs + if ( + run.id == PipelineStage.EVALUATE_SINGLE.name + and run.info.failure_reason is None + and 
run.info.eval_request + ) + ), + extended=True, + ) + + dfs_metrics = SingleEvaluationInfo.results_df( + ( + cast(SingleEvaluationInfo, run.info) + for run in logs.supervisor_logs.stage_runs + if run.id == PipelineStage.EVALUATE_SINGLE.name + and run.info.failure_reason is None + and run.info.eval_request + ) + ) - eval_requests = pd.concat(dfs_requests) - evals_metrics = pd.concat(dfs_metrics) + if dfs_requests.shape[0] == 0 or dfs_metrics.shape[0] == 0: + return df_models, None, None - for evals_df in [eval_requests, evals_metrics]: + for evals_df in [dfs_requests, dfs_metrics]: evals_df["interval_center"] = (evals_df["interval_start"] + evals_df["interval_end"]) / 2 convert_epoch_to_datetime(evals_df, "interval_start") convert_epoch_to_datetime(evals_df, "interval_end") @@ -116,11 +168,10 @@ def dfs_models_and_evals( linearize_ids(evals_df, [], "training_idx", trigger_idx_mappings) linearize_ids(evals_df, [], "model_idx", model_idx_mappings) - return df_models, eval_requests, evals_metrics - + dfs_requests["pipeline_ref"] = pipeline_ref + dfs_metrics["pipeline_ref"] = pipeline_ref -def logs_dataframe_pipeline_stage_logs(logs: PipelineLogs, stage: PipelineStage) -> pd.DateOffset: - return pd.concat([x.df(extended=True) for x in logs.supervisor_logs.stage_runs if x.id == stage.name]) + return df_models, dfs_requests, dfs_metrics # -------------------------------------------------------------------------------------------------------------------- # @@ -137,32 +188,6 @@ def leaf_stages(logs: PipelineLogs) -> list[str]: return [stage for stage in logs.pipeline_stages if stage not in referenced_as_parent] -def pipeline_stage_parents(logs: PipelineLogs) -> pd.DataFrame: - ids = [] - parents = [] - for i, (_, parent_list) in logs.pipeline_stages.items(): - if len(parent_list) == 1: - ids.append(i) - parents.append(parent_list[0]) - if len(parent_list) > 1: - if i == PipelineStage.PROCESS_NEW_DATA.name: - if logs.experiment: - ids.append(i) - parents.append(PipelineStage.REPLAY_DATA.name) - else: - ids.append(i) - parents.append(PipelineStage.FETCH_NEW_DATA.name) - else: - raise ValueError(f"Stage {i} has multiple parents: {parent_list}") - - return pd.DataFrame( - { - "id": ids, - "parent_id": parents, - } - ) - - # -------------------------------------------------------------------------------------------------------------------- # # TRANSFORM dataframe # # -------------------------------------------------------------------------------------------------------------------- # @@ -232,7 +257,13 @@ def patch_yearbook_time(df: pd.DataFrame, column: str) -> pd.DataFrame: Returns: DataFrame with patched yearbook time. 
""" - df[column] = pd.to_datetime(1930 + (df[column] - datetime.datetime(1970, 1, 1)).dt.days, format="%Y") + if df.shape[0] == 0: + df[column] = pd.to_datetime([]) + return df + delta = df[column] - pd.to_datetime("1970-01-01") + partial_years = delta.dt.seconds / SECONDS_PER_UNIT["d"] + partial_years_delta = partial_years.apply(lambda x: datetime.timedelta(seconds=x * SECONDS_PER_UNIT["y"])) + df[column] = pd.to_datetime(delta.apply(lambda x: f"{1930 + x.days}-01-01")) + partial_years_delta return df @@ -259,13 +290,15 @@ def df_aggregate_eval_metric( if aggregate_func == "time_weighted_avg": # Compute the duration (end - start) as the weight df["weight"] = df[interval_end] - df[interval_start] - group_total_weights = df.groupby(group_by)["weight"].agg(weight_sum="sum").reset_index() + group_total_weights = df.groupby(group_by)["weight"].agg(weight_sum="sum") + group_total_weights.reset_index(inplace=True) # Compute the weighted value df["weighted_value"] = df[in_col] * df["weight"] # Group by `group_by` and compute the weighted average - grouped = df.groupby(group_by)["weighted_value"].agg(sum_weighted_value="sum").reset_index() + grouped = df.groupby(group_by)["weighted_value"].agg(sum_weighted_value="sum") + grouped.reset_index(inplace=True) # add weightsum info grouped = grouped.merge(group_total_weights, on=group_by) @@ -275,4 +308,6 @@ def df_aggregate_eval_metric( else: # normal average - return df.groupby(group_by).agg({in_col: aggregate_func}).reset_index().rename(columns={in_col: out_col}) + df = df.groupby(group_by).agg({in_col: aggregate_func}) + df.reset_index(inplace=True) + return df.rename(columns={in_col: out_col}) diff --git a/analytics/app/pages/compare.py b/analytics/app/pages/compare.py index 7e7f59924..c70d54eae 100644 --- a/analytics/app/pages/compare.py +++ b/analytics/app/pages/compare.py @@ -1,38 +1,35 @@ import dash import pandas as pd -from analytics.app.data.load import list_pipelines, load_pipeline_logs -from analytics.app.data.transform import ( - add_pipeline_ref, - dfs_models_and_evals, - leaf_stages, - logs_dataframe, - logs_dataframe_agg_by_stage, -) -from analytics.app.pages.plots.cost_over_time import section1_stacked_bar +from analytics.app.data.const import CompositeModelOptions +from analytics.app.pages.const.text import COMPOSITE_MODEL_TEXT +from analytics.app.pages.plots.cost_over_time import section_cost_over_time from analytics.app.pages.plots.eval_heatmap import section_evalheatmap from analytics.app.pages.plots.eval_over_time import section_metricovertime from analytics.app.pages.plots.num_samples import section_num_samples from dash import Input, Output, callback, dcc, html +from typing_extensions import get_args from .plots.cost_vs_eval_metric_agg import section3_scatter_cost_eval_metric from .plots.num_triggers_eval_metric import section3_scatter_num_triggers from .plots.one_dimensional_comparison import section4_1d_boxplots +from .state import pipeline_data, pipelines, process_pipeline_data dash.register_page(__name__, path="/compare", title="Pipeline Comparison") -pipelines = list_pipelines() +initial_pipeline_ids = list(sorted(pipelines.keys()))[:1] # -------------------------------------------------------------------------------------------------------------------- # # PAGE # # -------------------------------------------------------------------------------------------------------------------- # -pipelines = list_pipelines() -initial_pipeline_ids = list(sorted(pipelines.keys()))[:1] - -@callback(Output("pipelines-info", "children"), 
Input("pipelines-selector", "value")) -def switch_pipelines(pipeline_ids: list[int]): - return render_pipeline_infos(pipeline_ids) +@callback( + Output("pipelines-info", "children"), + Input("pipelines-selector", "value"), + Input("composite-model-variant", "value"), +) +def switch_pipelines(pipeline_ids: list[int], composite_model_variant: CompositeModelOptions) -> list[html.Div]: + return render_pipeline_infos(pipeline_ids, composite_model_variant) ui_pipelines_selection = html.Div( @@ -50,56 +47,50 @@ def switch_pipelines(pipeline_ids: list[int]): persistence=True, style={"color": "black"}, ), + html.Br(), + dcc.Markdown(COMPOSITE_MODEL_TEXT), + dcc.RadioItems( + id="composite-model-variant", + options=[{"label": variant, "value": variant} for variant in get_args(CompositeModelOptions)], + value="currently_active_model", + persistence=True, + ), ] ) -def render_pipeline_infos(pipeline_ids: list[int]) -> list[html.Div]: - # --------------------------------------------------- DATA --------------------------------------------------- # - - pipeline_refs = {pipeline_id: f"{pipeline_id} - {pipelines[pipeline_id][0]}" for pipeline_id in pipeline_ids} +def render_pipeline_infos(pipeline_ids: list[int], composite_model_variant: CompositeModelOptions) -> list[html.Div]: + # ----------------------------------------------------- DATA ----------------------------------------------------- # - log_list = {pipeline_id: load_pipeline_logs(pipeline_id) for pipeline_id in pipeline_ids} - df_logs_dict = { - pipeline_id: add_pipeline_ref(logs_dataframe(logs), pipeline_refs[pipeline_id]) - for pipeline_id, logs in log_list.items() - } + for pipeline_id in pipeline_ids: + if pipeline_id not in pipeline_data: + pipeline_data[pipeline_id] = process_pipeline_data(pipeline_id) - pipeline_leaf_stages = {leaf for log in log_list.values() for leaf in leaf_stages(log)} - df_logs = pd.concat(df_logs_dict.values()) - df_logs_leaf = df_logs[df_logs["id"].isin(pipeline_leaf_stages)] - - df_logs_agg = pd.concat([logs_dataframe_agg_by_stage(df_log) for pipeline_id, df_log in df_logs_dict.items()]) - df_logs_agg_leaf = df_logs_agg[df_logs_agg["id"].isin(pipeline_leaf_stages)] - - _dfs_models_evals: list[str, tuple[str, pd.DataFrame, pd.DataFrame | None]] = [ - (pipeline_refs[pipeline_id], *dfs_models_and_evals(logs, df_logs["sample_time"].max())) - for pipeline_id, logs in log_list.items() - ] - - df_logs_models = pd.concat( - [add_pipeline_ref(single_df_models, pipeline_ref) for pipeline_ref, single_df_models, _, _ in _dfs_models_evals] - ) - - df_logs_eval_requests = pd.concat( + df_all = pd.concat([pipeline_data[pipeline_id].df_all for pipeline_id in pipeline_ids]) + df_agg = pd.concat([pipeline_data[pipeline_id].df_agg for pipeline_id in pipeline_ids]) + df_leaf = pd.concat([pipeline_data[pipeline_id].df_leaf for pipeline_id in pipeline_ids]) + df_agg = pd.concat([pipeline_data[pipeline_id].df_agg for pipeline_id in pipeline_ids]) + df_agg_leaf = pd.concat([pipeline_data[pipeline_id].df_agg_leaf for pipeline_id in pipeline_ids]) + df_models = pd.concat([pipeline_data[pipeline_id].df_models for pipeline_id in pipeline_ids]) + df_eval_requests = pd.concat( [ - add_pipeline_ref(_single_eval_req_df, pipeline_ref) - for pipeline_ref, _, _single_eval_req_df, _ in _dfs_models_evals - if _single_eval_req_df is not None + pipeline_data[pipeline_id].df_eval_requests + for pipeline_id in pipeline_ids + if pipeline_data[pipeline_id].df_eval_requests is not None ] ) - df_logs_eval_single = pd.concat( + df_eval_single = pd.concat( [ 
- add_pipeline_ref(_single_eval_df, pipeline_ref) - for pipeline_ref, _, _, _single_eval_df in _dfs_models_evals - if _single_eval_df is not None + pipeline_data[pipeline_id].df_eval_single + for pipeline_id in pipeline_ids + if pipeline_data[pipeline_id].df_eval_single is not None ] ) # -------------------------------------------------- LAYOUT -------------------------------------------------- # eval_items = [] - if df_logs_eval_single is None or df_logs_agg is None: + if df_eval_single is None or df_agg is None: eval_items.append( dcc.Markdown( """ @@ -110,22 +101,20 @@ def render_pipeline_infos(pipeline_ids: list[int]) -> list[html.Div]: ) ) else: + eval_items.append(section_metricovertime("compare", True, df_eval_single, composite_model_variant)) + eval_items.append(section_evalheatmap("compare", True, df_models, df_eval_single, composite_model_variant)) + eval_items.append(section_num_samples("compare", True, df_models, df_eval_requests, composite_model_variant)) eval_items.append( - section_metricovertime("compare", True, df_logs_eval_single), - ) - eval_items.append(section_evalheatmap("compare", True, df_logs_eval_single, df_logs_models)) - eval_items.append(section_num_samples("compare", True, df_logs_models, df_logs_eval_requests)) - eval_items.append( - section3_scatter_num_triggers("compare", True, df_logs_agg, df_logs_eval_single), + section3_scatter_num_triggers("compare", True, df_agg, df_eval_single, composite_model_variant) ) eval_items.append( - section3_scatter_cost_eval_metric("compare", df_logs, df_logs_agg_leaf, df_logs_eval_single), + section3_scatter_cost_eval_metric("compare", df_all, df_agg_leaf, df_eval_single, composite_model_variant) ) - eval_items.append(section4_1d_boxplots("compare", True, df_logs, df_logs_eval_single)) + eval_items.append(section4_1d_boxplots("compare", True, df_all, df_eval_single, composite_model_variant)) return [ html.H1("Cost over time comparison"), - section1_stacked_bar("compare", df_logs_leaf), + section_cost_over_time("compare", df_leaf), html.Div(children=eval_items), ] @@ -141,6 +130,6 @@ def render_pipeline_infos(pipeline_ids: list[int]) -> list[html.Div]: """ ), ui_pipelines_selection, - html.Div(id="pipelines-info", children=render_pipeline_infos(initial_pipeline_ids)), + html.Div(id="pipelines-info", children=render_pipeline_infos(initial_pipeline_ids, "currently_active_model")), ] ) diff --git a/analytics/app/pages/const/__init__.py b/analytics/app/pages/const/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/analytics/app/pages/const/text.py b/analytics/app/pages/const/text.py new file mode 100644 index 000000000..4a69c9196 --- /dev/null +++ b/analytics/app/pages/const/text.py @@ -0,0 +1,12 @@ +COMPOSITE_MODEL_TEXT = """ + ## Composite model variant + + The composite model is the pipeline model that is made up by patching together the individual models + of the pipeline. We support two variants of the composite model: + - `currently_active_model`: For a certain point in time we make a fixed model the `pipeline` pipeline model, + that shows up in the composite model, iff it is the most recent model which was trained on an interval + that is strictly before the point of evaluation. + - `currently_trained_model`: For a fixed point in time this is the model that was trained after the + `currently_active_model`. So it is the model which training / training sample collection is still + ongoing during the point of evaluation. 
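To make the two variants concrete, one plausible way to derive the per-evaluation boolean flags from the models dataframe is sketched below. This is illustrative only and not the supervisor's actual implementation; it assumes train_end, interval_center and id_model columns like the ones used elsewhere in this diff:

import pandas as pd

def tag_composite_variants(evals: pd.DataFrame, models: pd.DataFrame) -> pd.DataFrame:
    """Hypothetical derivation of the currently_active_model / currently_trained_model flags."""
    evals = evals.copy()
    models = models.sort_values("train_end")
    active, trained = [], []
    for _, ev in evals.iterrows():
        # models whose training data ends strictly before the evaluation point
        finished = models[models["train_end"] < ev["interval_center"]]
        active_id = finished["id_model"].iloc[-1] if len(finished) else None
        # the model trained right after the currently active one
        later = models[models["train_end"] >= ev["interval_center"]]
        trained_id = later["id_model"].iloc[0] if len(later) else None
        active.append(ev["id_model"] == active_id)
        trained.append(ev["id_model"] == trained_id)
    evals["currently_active_model"] = active
    evals["currently_trained_model"] = trained
    return evals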
+""" diff --git a/analytics/app/pages/pipeline.py b/analytics/app/pages/pipeline.py index b28a877ae..a954fa197 100644 --- a/analytics/app/pages/pipeline.py +++ b/analytics/app/pages/pipeline.py @@ -1,39 +1,34 @@ import dash -from analytics.app.data.load import list_pipelines, load_pipeline_logs -from analytics.app.data.transform import ( - add_pipeline_ref, - dfs_models_and_evals, - leaf_stages, - logs_dataframe, - logs_dataframe_agg_by_stage, - pipeline_stage_parents, -) +from analytics.app.data.const import CompositeModelOptions +from analytics.app.pages.const.text import COMPOSITE_MODEL_TEXT from analytics.app.pages.plots.eval_heatmap import section_evalheatmap from analytics.app.pages.plots.eval_over_time import section_metricovertime from analytics.app.pages.plots.num_samples import section_num_samples from analytics.app.pages.plots.one_dimensional_comparison import section4_1d_boxplots from dash import Input, Output, callback, dcc, html +from typing_extensions import get_args -from .plots.cost_over_time import section1_stacked_bar +from .plots.cost_over_time import section_cost_over_time from .plots.num_triggers_eval_metric import section3_scatter_num_triggers from .plots.pipeline_info import section0_pipeline +from .state import pipeline_data, pipelines, process_pipeline_data dash.register_page(__name__, path="/", title="Pipeline Evaluation") +initial_pipeline_id = min(pipelines.keys()) # -------------------------------------------------------------------------------------------------------------------- # # PAGE # # -------------------------------------------------------------------------------------------------------------------- # -pipelines = list_pipelines() -initial_pipeline_id = min(pipelines.keys()) - @callback( - Output("pipeline-info", "children"), Input("pipeline-selector", "value"), prevent_initial_call="initial_duplicate" + Output("pipeline-info", "children"), + Input("pipeline-selector", "value"), + Input("composite-model-variant", "value"), ) -def switch_pipeline(pipeline_id: int): - return render_pipeline_info(pipeline_id) +def switch_pipeline(pipeline_id: int, composite_model_variant: CompositeModelOptions) -> list[html.Div]: + return render_pipeline_info(pipeline_id, composite_model_variant) ui_pipeline_selection = html.Div( @@ -50,34 +45,30 @@ def switch_pipeline(pipeline_id: int): persistence=True, style={"color": "black", "width": "65%"}, ), + html.Br(), + dcc.Markdown(COMPOSITE_MODEL_TEXT), + dcc.RadioItems( + id="composite-model-variant", + options=[{"label": variant, "value": variant} for variant in get_args(CompositeModelOptions)], + value="currently_active_model", + persistence=True, + ), ] ) -def render_pipeline_info(pipeline_id: int) -> list[html.Div]: +def render_pipeline_info(pipeline_id: int, composite_model_variant: CompositeModelOptions) -> list[html.Div]: # ----------------------------------------------------- DATA ----------------------------------------------------- # - pipeline_ref = f"{pipeline_id} - {pipelines[pipeline_id][1]}" + if pipeline_id not in pipeline_data: + pipeline_data[pipeline_id] = process_pipeline_data(pipeline_id) - logs = load_pipeline_logs(pipeline_id) - pipeline_leaf_stages = leaf_stages(logs) - df_logs = logs_dataframe(logs) - df_logs_leaf = df_logs[df_logs["id"].isin(pipeline_leaf_stages)] - - df_logs_agg = logs_dataframe_agg_by_stage(df_logs) - df_logs_agg_leaf = df_logs_agg[df_logs_agg["id"].isin(pipeline_leaf_stages)] - - df_parents = pipeline_stage_parents(logs) - df_logs_add_parents = df_logs_agg.merge(df_parents, 
left_on="id", right_on="id", how="left") - - df_logs_models, df_logs_eval_requests, df_logs_eval_single = dfs_models_and_evals( - logs, df_logs["sample_time"].max() - ) + data = pipeline_data[pipeline_id] # ---------------------------------------------------- LAYOUT ---------------------------------------------------- # eval_items = [] - if df_logs_eval_single is None or df_logs_agg is None: + if data.df_eval_single is None or data.df_agg is None: eval_items.append( dcc.Markdown( """ @@ -88,46 +79,48 @@ def render_pipeline_info(pipeline_id: int) -> list[html.Div]: ) ) else: - eval_items.append( - section_metricovertime("pipeline", False, add_pipeline_ref(df_logs_eval_single, pipeline_ref)) - ) + eval_items.append(section_metricovertime("pipeline", False, data.df_eval_single, composite_model_variant)) eval_items.append( section_evalheatmap( "pipeline", False, - add_pipeline_ref(df_logs_eval_single, pipeline_ref), - add_pipeline_ref(df_logs_models, pipeline_ref), + data.df_models, + data.df_eval_single, + composite_model_variant, ) ) eval_items.append( section_num_samples( "pipeline", False, - add_pipeline_ref(df_logs_models, pipeline_ref), - add_pipeline_ref(df_logs_eval_requests, pipeline_ref), + data.df_models, + data.df_eval_requests, + composite_model_variant, ) ) eval_items.append( section3_scatter_num_triggers( "pipeline", False, - add_pipeline_ref(df_logs_agg, pipeline_ref), - add_pipeline_ref(df_logs_eval_single, pipeline_ref), + data.df_agg, + data.df_eval_single, + composite_model_variant, ) ) eval_items.append( section4_1d_boxplots( "pipeline", False, - add_pipeline_ref(df_logs, pipeline_ref), - add_pipeline_ref(df_logs_eval_single, pipeline_ref), + data.df_all, + data.df_eval_single, + composite_model_variant, ) ) return [ html.Div( [ - section0_pipeline(logs, df_logs, df_logs_agg_leaf, df_logs_add_parents), + section0_pipeline(data.logs, data.df_all, data.df_agg_leaf, data.df_add_parents), dcc.Markdown( """ ## Cost-/Accuracy triggering tradeoff @@ -139,7 +132,7 @@ def render_pipeline_info(pipeline_id: int) -> list[html.Div]: executed (i.e. batches without triggers). """ ), - section1_stacked_bar("pipeline", add_pipeline_ref(df_logs_leaf, pipeline_ref)), + section_cost_over_time("pipeline", data.df_leaf), html.Div(eval_items), ] ) @@ -157,6 +150,6 @@ def render_pipeline_info(pipeline_id: int) -> list[html.Div]: """ ), ui_pipeline_selection, - html.Div(id="pipeline-info", children=render_pipeline_info(initial_pipeline_id)), + html.Div(id="pipeline-info", children=render_pipeline_info(initial_pipeline_id, "currently_active_model")), ] ) diff --git a/analytics/app/pages/plots/cost_over_time.py b/analytics/app/pages/plots/cost_over_time.py index c20ec9d60..ccc543669 100644 --- a/analytics/app/pages/plots/cost_over_time.py +++ b/analytics/app/pages/plots/cost_over_time.py @@ -1,4 +1,3 @@ -import dataclasses from dataclasses import dataclass import pandas as pd @@ -9,16 +8,16 @@ @dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. 
""" - df_logs_leaf: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_leaf: pd.DataFrame -_shared_data = _SharedData() +_shared_data: dict[str, _PageState] = {} # page -> _PageState + # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -43,7 +42,7 @@ def gen_figure( histogram: Whether to use histogram over barplot nbins: Number of bins; only used in the histogram=True case """ - df_adjusted = _shared_data.df_logs_leaf[page].copy() + df_adjusted = _shared_data[page].df_leaf.copy() # TODO: remove if cumulative and not histogram: # as bar plots don't support cumulation natively @@ -113,8 +112,10 @@ def gen_figure( # -------------------------------------------------------------------------------------------------------------------- # -def section1_stacked_bar(page: str, df_logs_leaf: pd.DataFrame) -> html.Div: - _shared_data.df_logs_leaf[page] = df_logs_leaf +def section_cost_over_time(page: str, df_leaf: pd.DataFrame) -> html.Div: + if page not in _shared_data: + _shared_data[page] = _PageState(df_leaf=df_leaf) + _shared_data[page].df_leaf = df_leaf @callback( Output(f"{page}-costovertime-plot", "figure"), @@ -124,14 +125,16 @@ def section1_stacked_bar(page: str, df_logs_leaf: pd.DataFrame) -> html.Div: Input(f"{page}-costovertime-nbins-slider", "value"), Input(f"{page}-costovertime-radio-time-patch-yearbook", "value"), ) - def update_figure(time_metric: str, cumulative: bool, histogram: bool, nbins: int, patch_yearbook: bool): + def update_figure( + time_metric: str, cumulative: bool, histogram: bool, nbins: int, patch_yearbook: bool + ) -> go.Figure: return gen_figure(page, time_metric, cumulative, histogram, nbins, patch_yearbook) @callback( Output(f"{page}-costovertime-nbins-slider", "disabled"), Input(f"{page}-costovertime-checkbox-histogram", "value"), ) - def hide_bin_slider(histogram: bool): + def hide_bin_slider(histogram: bool) -> bool: return not histogram time_metrics = { diff --git a/analytics/app/pages/plots/cost_vs_eval_metric_agg.py b/analytics/app/pages/plots/cost_vs_eval_metric_agg.py index 1895cdeaa..65e8ae769 100644 --- a/analytics/app/pages/plots/cost_vs_eval_metric_agg.py +++ b/analytics/app/pages/plots/cost_vs_eval_metric_agg.py @@ -1,26 +1,29 @@ -import dataclasses +from dataclasses import dataclass from typing import get_args import pandas as pd import plotly.express as px +from analytics.app.data.const import CompositeModelOptions from analytics.app.data.transform import AGGREGATION_FUNCTION, EVAL_AGGREGATION_FUNCTION, df_aggregate_eval_metric from dash import Input, Output, callback, dcc, html from plotly import graph_objects as go -@dataclasses.dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +@dataclass +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. 
""" - df_logs: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - df_logs_agg_leaf: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - df_logs_eval_single: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_all: pd.DataFrame + df_agg_leaf: pd.DataFrame + df_eval_single: pd.DataFrame + composite_model_variant: CompositeModelOptions = "currently_active_model" + + +_shared_data: dict[str, _PageState] = {} # page -> _PageState -_shared_data = _SharedData() # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -37,17 +40,18 @@ def gen_fig_scatter_num_triggers( stages: list[str], ) -> go.Figure: # unpack data - df_logs = _shared_data.df_logs[page] - df_logs_eval_single = _shared_data.df_logs_eval_single[page].copy() - df_logs_eval_single = df_logs_eval_single[ - (df_logs_eval_single["dataset_id"] == dataset_id) - & (df_logs_eval_single["eval_handler"] == eval_handler) - & (df_logs_eval_single["most_recent_model"]) + composite_model_variant = _shared_data[page].composite_model_variant + df_all = _shared_data[page].df_all + df_eval_single = _shared_data[page].df_eval_single + df_eval_single = df_eval_single[ + (df_eval_single["dataset_id"] == dataset_id) + & (df_eval_single["eval_handler"] == eval_handler) + & (df_eval_single[composite_model_variant]) # & (df_adjusted["metric"] == metric) ] agg_eval_metric = df_aggregate_eval_metric( - df_logs_eval_single, + df_eval_single, group_by=["pipeline_ref", "metric"], in_col="value", out_col="metric_value", @@ -55,10 +59,13 @@ def gen_fig_scatter_num_triggers( ) agg_duration = ( - df_logs[df_logs["id"].isin(stages)].groupby(["pipeline_ref"]).agg(cost=("duration", agg_func_x)).reset_index() + df_all[df_all["id"].isin(stages)].groupby(["pipeline_ref"]).agg(cost=("duration", agg_func_x)).reset_index() ) merged = agg_eval_metric.merge(agg_duration, on="pipeline_ref") + assert ( + agg_eval_metric.shape[0] == merged.shape[0] == agg_duration.shape[0] * len(agg_eval_metric["metric"].unique()) + ) fig = px.scatter( merged, x="cost", @@ -67,7 +74,7 @@ def gen_fig_scatter_num_triggers( facet_col="metric", labels={ "cost": f"{agg_func_x} duration in sec. 
(proxy for cost)", - "metric_value": f"{agg_func_y} {metric}", + "metric_value": f"{agg_func_y}", "pipeline_ref": "Pipeline", }, category_orders={ @@ -85,13 +92,26 @@ def gen_fig_scatter_num_triggers( def section3_scatter_cost_eval_metric( - page: str, df_logs: pd.DataFrame, df_logs_agg_leaf: pd.DataFrame, df_logs_eval_single: pd.DataFrame + page: str, + df_all: pd.DataFrame, + df_agg_leaf: pd.DataFrame, + df_eval_single: pd.DataFrame, + composite_model_variant: CompositeModelOptions, ) -> html.Div: - assert "pipeline_ref" in df_logs.columns.tolist() - assert "pipeline_ref" in df_logs_eval_single.columns.tolist() - _shared_data.df_logs[page] = df_logs - _shared_data.df_logs_agg_leaf[page] = df_logs_agg_leaf - _shared_data.df_logs_eval_single[page] = df_logs_eval_single + assert "pipeline_ref" in list(df_all.columns) + assert "pipeline_ref" in list(df_eval_single.columns) + + if page not in _shared_data: + _shared_data[page] = _PageState( + composite_model_variant=composite_model_variant, + df_all=df_all, + df_agg_leaf=df_agg_leaf, + df_eval_single=df_eval_single, + ) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_all = df_all + _shared_data[page].df_agg_leaf = df_agg_leaf + _shared_data[page].df_eval_single = df_eval_single @callback( Output(f"{page}-scatter-cost-eval", "figure"), @@ -114,11 +134,11 @@ def update_scatter_num_triggers( page, eval_handler_ref, dataset_id, metric_y, agg_func_x, agg_func_y, stages ) - eval_handler_refs = list(df_logs_eval_single["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_single["dataset_id"].unique()) - eval_metrics = list(df_logs_eval_single["metric"].unique()) + eval_handler_refs = list(df_eval_single["eval_handler"].unique()) + eval_datasets = list(df_eval_single["dataset_id"].unique()) + eval_metrics = list(df_eval_single["metric"].unique()) - stages = list(df_logs_agg_leaf["id"].unique()) + stages = list(df_agg_leaf["id"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/eval_heatmap.py b/analytics/app/pages/plots/eval_heatmap.py index be026225d..2f095ab19 100644 --- a/analytics/app/pages/plots/eval_heatmap.py +++ b/analytics/app/pages/plots/eval_heatmap.py @@ -1,26 +1,26 @@ -import dataclasses from dataclasses import dataclass import pandas as pd -from analytics.app.data.transform import patch_yearbook_time +from analytics.app.data.const import CompositeModelOptions +from analytics.app.data.transform import linearize_ids, patch_yearbook_time from dash import Input, Output, callback, dcc, html from plotly import graph_objects as go @dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. 
""" - df_logs_eval_single: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_models: pd.DataFrame + df_eval_single: pd.DataFrame - df_logs_models: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + composite_model_variant: CompositeModelOptions = "currently_active_model" -_shared_data = _SharedData() +_shared_data: dict[str, _PageState] = {} # page -> _PageState + # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -28,7 +28,12 @@ class _SharedData: def gen_figure( - page: str, multi_pipeline_mode: bool, patch_yearbook: bool, eval_handler: str, dataset_id: str, metric: str + page: str, + multi_pipeline_mode: bool, + patch_yearbook: bool, + eval_handler: str, + dataset_id: str, + metric: str, ) -> go.Figure: """ Create the cost over time figure with barplot or histogram. Histogram has nice binning while barplot is precise. @@ -42,8 +47,10 @@ def gen_figure( dataset_id: Dataset id metric: Evaluation metric (replaced with facet) """ - df_logs_models = _shared_data.df_logs_models[page].copy() - df_adjusted = _shared_data.df_logs_eval_single[page].copy() + composite_model_variant = _shared_data[page].composite_model_variant + + df_logs_models = _shared_data[page].df_models.copy() # TODO: remove copy + df_adjusted = _shared_data[page].df_eval_single.copy() # TODO: remove copy df_adjusted = df_adjusted[ (df_adjusted["dataset_id"] == dataset_id) & (df_adjusted["eval_handler"] == eval_handler) @@ -52,25 +59,44 @@ def gen_figure( # Yearbook as a mapped time dimension (to display the correct timestamps we need to convert back from days to years) if patch_yearbook: - for column in ["interval_center", "interval_start", "interval_end", "sample_time", "sample_time_until"]: + for column in ["interval_start", "interval_center", "interval_end"]: patch_yearbook_time(df_adjusted, column) + for column in ["train_start", "train_end", "real_train_end", "usage_start", "usage_end"]: + patch_yearbook_time(df_logs_models, column) df_adjusted = df_adjusted.sort_values(by=["interval_center"]) if multi_pipeline_mode: # we only want the pipeline performance (composed of the models active periods stitched together) - df_adjusted = df_adjusted[df_adjusted["most_recent_model"]] + df_adjusted = df_adjusted[df_adjusted[composite_model_variant]] # in model dataframe convert pipeline_ref to pipeline_id as we need int for the heatmap df_adjusted["pipeline_id"] = df_adjusted["pipeline_ref"].str.split("-").str[0].astype(int) df_logs_models["pipeline_id"] = df_logs_models["pipeline_ref"].str.split("-").str[0].astype(int) + full_refs = { + pipeline_id: pipeline_name + for pipeline_id, pipeline_name in df_logs_models[["pipeline_id", "pipeline_ref"]].values + } + + _, mapping = linearize_ids(df_adjusted, [], "pipeline_id") + linearize_ids(df_logs_models, [], "pipeline_id", mapping) + + # invert the mapping + label_map = {v: full_refs[k] for k, v in mapping[()].items()} + else: assert df_adjusted["pipeline_ref"].nunique() == 1 # add the pipeline time series which is the performance of different models stitched together dep. 
# w.r.t which model was active - pipeline_composite_model = df_adjusted[df_adjusted["most_recent_model"]] - pipeline_composite_model["model_idx"] = "0-pipeline-composite-model" + pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]] + pipeline_composite_model["model_idx"] = 0 + pipeline_composite_model["id_model"] = 0 + + label_map = {k: f"model_idx={k}, id={v}" for k, v in df_adjusted[["model_idx", "id_model"]].values} + label_map[0] = "Pipeline composite model" + + df_adjusted = pd.concat([pipeline_composite_model, df_adjusted]) # build heatmap matrix dataframe: heatmap_data = df_adjusted.pivot( @@ -83,6 +109,8 @@ def gen_figure( x=heatmap_data.columns, y=heatmap_data.index, colorscale="RdBu_r", + dx=0.5, + dy=1, ) ) fig.update_layout( @@ -90,9 +118,9 @@ def gen_figure( yaxis_nticks=2 * min(20, len(heatmap_data.index)), width=2200, height=1100, - # "pipeline_id": "Pipeline", - # "metric": "Metric", - # "interval_center": "Evaluation time (interval center)", + showlegend=True, + yaxis=dict(tickmode="array", tickvals=heatmap_data.index, ticktext=[label_map[y] for y in heatmap_data.index]), + xaxis=dict(tickangle=45), ) shapes = [] @@ -105,10 +133,10 @@ def gen_figure( y0=active_[1]["model_idx"] - 0.5, x1=active_[1]["interval_end"], y1=active_[1]["model_idx"] + 0.5, - line=dict(color="Green", width=5), + line=dict(color="Green", width=2), ) for active_ in df_adjusted[ - df_adjusted["most_recent_model"] + df_adjusted[composite_model_variant] ].iterrows() # if "pipeline-composite-model" not in active_[1]["id_model"] ] # diagonal 2 @@ -119,10 +147,10 @@ def gen_figure( y0=active_[1]["model_idx"] + 0.5, x1=active_[1]["interval_end"], y1=active_[1]["model_idx"] - 0.5, - line=dict(color="Green", width=5), + line=dict(color="Green", width=2), ) for active_ in df_adjusted[ - df_adjusted["most_recent_model"] + df_adjusted[composite_model_variant] ].iterrows() # if "pipeline-composite-model" not in active_[1]["id_model"] ] @@ -133,10 +161,10 @@ def gen_figure( dict( type="rect", x0=active_[1][f"{type_}_start"], - x1=active_[1][f"{type_}_end"], + x1=active_[1][f"{'real_' if type_ == 'train' else ''}{type_}_end"], y0=active_[1][y_column] - 0.5, y1=active_[1][y_column] + 0.5, - line=dict(color="Orange" if type_ == "train" else "Black", width=4), + line=dict(color="Orange" if type_ == "train" else "Black", width=2), ) for active_ in df_logs_models.iterrows() ] @@ -150,10 +178,21 @@ def gen_figure( def section_evalheatmap( - page: str, multi_pipeline_mode: bool, df_logs_eval_single: pd.DataFrame, df_logs_models: pd.DataFrame + page: str, + multi_pipeline_mode: bool, + df_models: pd.DataFrame, + df_eval_single: pd.DataFrame, + composite_model_variant: CompositeModelOptions, ) -> html.Div: - _shared_data.df_logs_eval_single[page] = df_logs_eval_single - _shared_data.df_logs_models[page] = df_logs_models + if page not in _shared_data: + _shared_data[page] = _PageState( + composite_model_variant=composite_model_variant, + df_models=df_models, + df_eval_single=df_eval_single, + ) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_models = df_models + _shared_data[page].df_eval_single = df_eval_single @callback( Output(f"{page}-evalheatmap-plot", "figure"), @@ -165,9 +204,9 @@ def section_evalheatmap( def update_figure(patch_yearbook: bool, eval_handler_ref: str, dataset_id: str, metric: str) -> go.Figure: return gen_figure(page, multi_pipeline_mode, patch_yearbook, eval_handler_ref, dataset_id, metric) - eval_handler_refs = 
list(df_logs_eval_single["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_single["dataset_id"].unique()) - eval_metrics = list(df_logs_eval_single["metric"].unique()) + eval_handler_refs = list(df_eval_single["eval_handler"].unique()) + eval_datasets = list(df_eval_single["dataset_id"].unique()) + eval_metrics = list(df_eval_single["metric"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/eval_over_time.py b/analytics/app/pages/plots/eval_over_time.py index 7e87c3f01..eb5d90927 100644 --- a/analytics/app/pages/plots/eval_over_time.py +++ b/analytics/app/pages/plots/eval_over_time.py @@ -1,24 +1,24 @@ -import dataclasses from dataclasses import dataclass import pandas as pd import plotly.express as px +from analytics.app.data.const import CompositeModelOptions from analytics.app.data.transform import patch_yearbook_time from dash import Input, Output, callback, dcc, html from plotly import graph_objects as go @dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. """ - df_logs_eval_single: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_eval_single: pd.DataFrame + composite_model_variant: CompositeModelOptions = "currently_active_model" -_shared_data = _SharedData() +_shared_data: dict[str, _PageState] = {} # page -> _PageState # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -29,7 +29,7 @@ def gen_figure( page: str, multi_pipeline_mode: bool, patch_yearbook: bool, eval_handler: str, dataset_id: str, metric: str ) -> go.Figure: """ - Create the cost over time figure with barplot or histogram. Histogram has nice binning while barplot is precise. + Create the evaluation over time figure with a line plot. Args: page: Page name where the plot is displayed @@ -40,7 +40,9 @@ def gen_figure( dataset_id: Dataset id metric: Evaluation metric (replaced with facet) """ - df_adjusted = _shared_data.df_logs_eval_single[page].copy() + composite_model_variant = _shared_data[page].composite_model_variant + + df_adjusted = _shared_data[page].df_eval_single.copy() df_adjusted = df_adjusted[ (df_adjusted["dataset_id"] == dataset_id) & (df_adjusted["eval_handler"] == eval_handler) @@ -49,17 +51,17 @@ def gen_figure( # Yearbook as a mapped time dimension (to display the correct timestamps we need to convert back from days to years) if patch_yearbook: - for column in ["interval_center", "interval_start", "interval_end", "sample_time", "sample_time_until"]: + for column in ["interval_center", "interval_start", "interval_end"]: patch_yearbook_time(df_adjusted, column) if multi_pipeline_mode: # we only want the pipeline performance (composed of the models active periods stitched together) - df_adjusted = df_adjusted[df_adjusted["most_recent_model"]] + df_adjusted = df_adjusted[df_adjusted[composite_model_variant]] else: assert df_adjusted["pipeline_ref"].nunique() == 1 # add the pipeline time series which is the performance of different models stitched together dep. 
# w.r.t which model was active - pipeline_composite_model = df_adjusted[df_adjusted["most_recent_model"]] + pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]] pipeline_composite_model["model_idx"] = "00-pipeline-composite-model" number_digits = len(str(df_adjusted["model_idx"].max())) df_adjusted["model_idx"] = df_adjusted["model_idx"].astype(str).str.zfill(number_digits) @@ -97,8 +99,16 @@ def gen_figure( # -------------------------------------------------------------------------------------------------------------------- # -def section_metricovertime(page: str, multi_pipeline_mode: bool, df_logs_eval_single: pd.DataFrame) -> html.Div: - _shared_data.df_logs_eval_single[page] = df_logs_eval_single +def section_metricovertime( + page: str, + multi_pipeline_mode: bool, + df_eval_single: pd.DataFrame, + composite_model_variant: CompositeModelOptions, +) -> html.Div: + if page not in _shared_data: + _shared_data[page] = _PageState(composite_model_variant=composite_model_variant, df_eval_single=df_eval_single) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_eval_single = df_eval_single @callback( Output(f"{page}-evalovertime-plot", "figure"), @@ -110,9 +120,9 @@ def section_metricovertime(page: str, multi_pipeline_mode: bool, df_logs_eval_si def update_figure(patch_yearbook: bool, eval_handler_ref: str, dataset_id: str, metric: str) -> go.Figure: return gen_figure(page, multi_pipeline_mode, patch_yearbook, eval_handler_ref, dataset_id, metric) - eval_handler_refs = list(df_logs_eval_single["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_single["dataset_id"].unique()) - eval_metrics = list(df_logs_eval_single["metric"].unique()) + eval_handler_refs = list(df_eval_single["eval_handler"].unique()) + eval_datasets = list(df_eval_single["dataset_id"].unique()) + eval_metrics = list(df_eval_single["metric"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/num_samples.py b/analytics/app/pages/plots/num_samples.py index c2356780c..2aa2d9a38 100644 --- a/analytics/app/pages/plots/num_samples.py +++ b/analytics/app/pages/plots/num_samples.py @@ -1,26 +1,28 @@ -import dataclasses from dataclasses import dataclass from typing import Literal import pandas as pd import plotly.express as px +from analytics.app.data.const import CompositeModelOptions from analytics.app.data.transform import patch_yearbook_time from dash import Input, Output, callback, dcc, html from plotly import graph_objects as go @dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. 
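Both the metric-over-time and the 1D-comparison plots stitch a "pipeline composite model" series out of the rows selected by the chosen variant and zero-pad the remaining model indices so that string sorting matches numeric order. The core of that stitching, isolated with invented values:

import pandas as pd

def add_composite_series(df: pd.DataFrame, variant_col: str = "currently_active_model") -> pd.DataFrame:
    """Duplicate the rows of the active models as an extra 'composite' series."""
    composite = df[df[variant_col]].copy()
    composite["model_idx"] = "00-pipeline-composite-model"
    # zero-pad the remaining indices so lexicographic sorting matches numeric order
    digits = len(str(df["model_idx"].max()))
    df = df.copy()
    df["model_idx"] = df["model_idx"].astype(str).str.zfill(digits)
    return pd.concat([composite, df])

demo = pd.DataFrame(
    {"model_idx": [1, 2, 10], "value": [0.7, 0.8, 0.9], "currently_active_model": [False, True, True]}
)
print(add_composite_series(demo)["model_idx"].tolist())
# ['00-pipeline-composite-model', '00-pipeline-composite-model', '01', '02', '10']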
""" - df_logs_models: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - df_logs_eval_requests: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_models: pd.DataFrame + df_eval_requests: pd.DataFrame + composite_model_variant: CompositeModelOptions = "currently_active_model" + + +_shared_data: dict[str, _PageState] = {} # page -> _PageState -_shared_data = _SharedData() # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -50,12 +52,14 @@ def gen_figure( use_scatter_size: If True, the size of the scatter points is proportional to the number of samples patch_yearbook: If True, the time metric is patched to be a yearbook """ + composite_model_variant = _shared_data[page].composite_model_variant + if y_axis == "eval_samples": - df_evals = _shared_data.df_logs_eval_requests[page].copy() + df_evals = _shared_data[page].df_eval_requests df_evals = df_evals[(df_evals["dataset_id"] == dataset_id) & (df_evals["eval_handler"] == eval_handler)] if multi_pipeline_mode: - df_evals = df_evals[df_evals["most_recent_model"]] + df_evals = df_evals[df_evals[composite_model_variant]] # Yearbook as a mapped time dimension (to display the correct timestamps we need to convert back from days to years) if time_metric == "sample_time" and patch_yearbook: @@ -75,7 +79,7 @@ def gen_figure( assert y_axis != "eval_center" # y_axis = "train_*"" - df_trainings = _shared_data.df_logs_models[page].copy() + df_trainings = _shared_data[page].df_models.copy() # TODO: remove copy # Yearbook as a mapped time dimension (to display the correct timestamps we need to convert back from days to years) if time_metric == "sample_time" and patch_yearbook: @@ -103,10 +107,21 @@ def gen_figure( def section_num_samples( - page: str, multi_pipeline_mode: bool, df_logs_models: pd.DataFrame, df_logs_eval_requests: pd.DataFrame + page: str, + multi_pipeline_mode: bool, + df_models: pd.DataFrame, + df_eval_requests: pd.DataFrame, + composite_model_variant: CompositeModelOptions, ) -> html.Div: - _shared_data.df_logs_models[page] = df_logs_models - _shared_data.df_logs_eval_requests[page] = df_logs_eval_requests + if page not in _shared_data: + _shared_data[page] = _PageState( + composite_model_variant=composite_model_variant, + df_models=df_models, + df_eval_requests=df_eval_requests, + ) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_models = df_models + _shared_data[page].df_eval_requests = df_eval_requests @callback( Output(f"{page}-num-samples-plot", "figure"), @@ -127,7 +142,14 @@ def update_figure( eval_handler: str, ) -> go.Figure: return gen_figure( - page, multi_pipeline_mode, time_metric, y_axis, use_scatter_size, patch_yearbook, dataset_id, eval_handler + page, + multi_pipeline_mode, + time_metric, + y_axis, + use_scatter_size, + patch_yearbook, + dataset_id, + eval_handler, ) @callback( @@ -145,8 +167,8 @@ def show_eval_config(y_axis: YAxis) -> bool: "interval_center": "Evaluation interval center (only for y=eval_samples)", } - eval_handler_refs = list(df_logs_eval_requests["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_requests["dataset_id"].unique()) + eval_handler_refs = list(df_eval_requests["eval_handler"].unique()) + eval_datasets = list(df_eval_requests["dataset_id"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/num_triggers_eval_metric.py b/analytics/app/pages/plots/num_triggers_eval_metric.py index 
e492856bc..024459bd9 100644 --- a/analytics/app/pages/plots/num_triggers_eval_metric.py +++ b/analytics/app/pages/plots/num_triggers_eval_metric.py @@ -2,6 +2,7 @@ import pandas as pd import plotly.express as px +from analytics.app.data.const import CompositeModelOptions from analytics.app.data.transform import df_aggregate_eval_metric from dash import Input, Output, callback, dcc, html from modyn.supervisor.internal.grpc.enums import PipelineStage @@ -9,17 +10,18 @@ @dataclasses.dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. """ - df_logs_agg: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - df_logs_eval_single: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_agg: pd.DataFrame + df_eval_single: pd.DataFrame + composite_model_variant: CompositeModelOptions = "currently_active_model" -_shared_data = _SharedData() + +_shared_data: dict[str, _PageState] = {} # page -> _PageState # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -48,37 +50,35 @@ def gen_fig_scatter_num_triggers( time_weighted: Whether to weight the aggregation by the evaluation interval length """ # unpack data - df_logs_agg = _shared_data.df_logs_agg[page] - - df_logs_eval_single = _shared_data.df_logs_eval_single[page] - df_logs_eval_single = df_logs_eval_single[ - (df_logs_eval_single["dataset_id"] == dataset_id) - & (df_logs_eval_single["eval_handler"] == eval_handler) + composite_model_variant = _shared_data[page].composite_model_variant + df_agg = _shared_data[page].df_agg + df_eval_single = _shared_data[page].df_eval_single + df_eval_single = df_eval_single[ + (df_eval_single["dataset_id"] == dataset_id) + & (df_eval_single["eval_handler"] == eval_handler) # & (df_adjusted["metric"] == metric) ] if multi_pipeline_mode or only_active_periods: # we only want the pipeline performance (composed of the models active periods stitched together) - df_logs_eval_single = df_logs_eval_single[df_logs_eval_single["most_recent_model"]] + df_eval_single = df_eval_single[df_eval_single[composite_model_variant]] if not multi_pipeline_mode: - assert df_logs_eval_single["pipeline_ref"].nunique() == 1 - # add the pipeline time series which is the performance of different models stitched together dep. 
# w.r.t which model was active - pipeline_composite_model = df_logs_eval_single[df_logs_eval_single["most_recent_model"]] + pipeline_composite_model = df_eval_single[df_eval_single[composite_model_variant]] pipeline_composite_model["id_model"] = "0-pipeline-composite-model" - df_logs_eval_single["id_model"] = df_logs_eval_single["id_model"].astype(str) - df_logs_eval_single = pd.concat([df_logs_eval_single, pipeline_composite_model]) + df_eval_single["id_model"] = df_eval_single["id_model"].astype(str) + df_eval_single = pd.concat([df_eval_single, pipeline_composite_model]) col_map = {"value": "metric_value", "count": "num_triggers"} - num_triggers = df_logs_agg[df_logs_agg["id"] == PipelineStage.HANDLE_SINGLE_TRIGGER.name][["pipeline_ref", "count"]] - accuracies = df_logs_eval_single + num_triggers = df_agg[df_agg["id"] == PipelineStage.HANDLE_SINGLE_TRIGGER.name][["pipeline_ref", "count"]] + accuracies = df_eval_single labels = { "pipeline_ref": "Pipeline", "metric": "Metric", "num_triggers": "#triggers (proxy for cost)", - "metric_value": f"Metric value {'(mean)' if aggregate_metric else ''}", + "metric_value": f"Metric value {'(aggregated)' if aggregate_metric else ''}", } category_orders = { "pipeline_ref": list(sorted(accuracies["pipeline_ref"].unique())), @@ -93,6 +93,11 @@ def gen_fig_scatter_num_triggers( aggregate_func="time_weighted_avg" if time_weighted else "mean", ) merged = num_triggers.merge(mean_accuracies, on="pipeline_ref").rename(columns=col_map, inplace=False) + assert ( + mean_accuracies.shape[0] + == merged.shape[0] + == num_triggers.shape[0] * len(mean_accuracies["metric"].unique()) + ) fig = px.scatter( merged, x="num_triggers", @@ -124,12 +129,23 @@ def gen_fig_scatter_num_triggers( def section3_scatter_num_triggers( - page: str, multi_pipeline_mode: bool, df_logs_agg: pd.DataFrame, df_logs_eval_single: pd.DataFrame + page: str, + multi_pipeline_mode: bool, + df_agg: pd.DataFrame, + df_eval_single: pd.DataFrame, + composite_model_variant: CompositeModelOptions, ) -> html.Div: - assert "pipeline_ref" in df_logs_agg.columns.tolist() - assert "pipeline_ref" in df_logs_eval_single.columns.tolist() - _shared_data.df_logs_agg[page] = df_logs_agg - _shared_data.df_logs_eval_single[page] = df_logs_eval_single + assert "pipeline_ref" in list(df_agg.columns) + assert "pipeline_ref" in list(df_eval_single.columns) + if page not in _shared_data: + _shared_data[page] = _PageState( + composite_model_variant=composite_model_variant, + df_agg=df_agg, + df_eval_single=df_eval_single, + ) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_agg = df_agg + _shared_data[page].df_eval_single = df_eval_single @callback( Output(f"{page}-scatter-plot-num-triggers", "figure"), @@ -137,7 +153,7 @@ def section3_scatter_num_triggers( Input(f"{page}-radio-scatter-number-triggers-dataset-id", "value"), Input(f"{page}-radio-scatter-number-triggers-metric", "value"), Input(f"{page}-radio-scatter-number-triggers-agg-y", "value"), - Input(f"{page}-radio-1d-eval-metric-only-active-model-periods", "value"), + Input(f"{page}-radio-scatter-number-triggers-agg-time-weighted", "value"), Input(f"{page}-radio-scatter-number-triggers-only-active-model-periods", "value"), ) def update_scatter_num_triggers( @@ -159,9 +175,9 @@ def update_scatter_num_triggers( only_active_periods, ) - eval_handler_refs = list(df_logs_eval_single["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_single["dataset_id"].unique()) - eval_metrics = 
list(df_logs_eval_single["metric"].unique()) + eval_handler_refs = list(df_eval_single["eval_handler"].unique()) + eval_datasets = list(df_eval_single["dataset_id"].unique()) + eval_metrics = list(df_eval_single["metric"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/one_dimensional_comparison.py b/analytics/app/pages/plots/one_dimensional_comparison.py index bb852f05c..619aa703e 100644 --- a/analytics/app/pages/plots/one_dimensional_comparison.py +++ b/analytics/app/pages/plots/one_dimensional_comparison.py @@ -1,7 +1,8 @@ -import dataclasses +from dataclasses import dataclass import pandas as pd import plotly.express as px +from analytics.app.data.const import CompositeModelOptions from analytics.app.data.transform import OPTIONAL_EVAL_AGGREGATION_FUNCTION, df_aggregate_eval_metric from dash import Input, Output, callback, dcc, html from modyn.supervisor.internal.grpc.enums import PipelineStage @@ -9,18 +10,20 @@ from typing_extensions import get_args -@dataclasses.dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +@dataclass +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. """ - df_logs: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - df_logs_eval_single: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_all: pd.DataFrame + df_eval_single: pd.DataFrame + composite_model_variant: CompositeModelOptions = "currently_active_model" + + +_shared_data: dict[str, _PageState] = {} # page -> _PageState -_shared_data = _SharedData() # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -28,14 +31,14 @@ class _SharedData: def gen_fig_1d_cost(page: str) -> go.Figure: - df_logs = _shared_data.df_logs[page] return px.box( - df_logs, + _shared_data[page].df_all, x="pipeline_ref", y="duration", color="id", labels={"pipeline_ref": "Pipeline", "duration": "duration in seconds", "id": "Pipeline Stage"}, title="Stage costs", + height=900, ) @@ -47,22 +50,28 @@ def gen_figs_1d_eval( agg_func_eval_metric: OPTIONAL_EVAL_AGGREGATION_FUNCTION, only_active_periods: bool = True, ) -> go.Figure: - df_logs = _shared_data.df_logs[page] - df_logs_eval_single = _shared_data.df_logs_eval_single[page] + composite_model_variant = _shared_data[page].composite_model_variant + + df_logs = _shared_data[page].df_all + df_logs_eval_single = _shared_data[page].df_eval_single df_logs_eval_single = df_logs_eval_single[ (df_logs_eval_single["dataset_id"] == dataset_id) & (df_logs_eval_single["eval_handler"] == eval_handler) ] if multi_pipeline_mode or only_active_periods: # we only want the pipeline performance (composed of the models active periods stitched together) - df_logs_eval_single = df_logs_eval_single[df_logs_eval_single["most_recent_model"]] + df_logs_eval_single = df_logs_eval_single[df_logs_eval_single[composite_model_variant]] if not multi_pipeline_mode: assert df_logs_eval_single["pipeline_ref"].nunique() == 1 + digits = len(str(df_logs_eval_single["id_model"].max())) + # fill with leading spaces to have a consistent sorting + df_logs_eval_single["id_model"] = df_logs_eval_single["id_model"].astype(str).str.zfill(digits) + # add the pipeline time 
series which is the performance of different models stitched together dep. # w.r.t which model was active - pipeline_composite_model = df_logs_eval_single[df_logs_eval_single["most_recent_model"]] + pipeline_composite_model = df_logs_eval_single[df_logs_eval_single[composite_model_variant]] pipeline_composite_model["id_model"] = "0-pipeline-composite-model" df_logs_eval_single["id_model"] = df_logs_eval_single["id_model"].astype(str) df_logs_eval_single = pd.concat([df_logs_eval_single, pipeline_composite_model]) @@ -110,13 +119,24 @@ def gen_figs_1d_eval( def section4_1d_boxplots( - page: str, multi_pipeline_mode: bool, df_logs: pd.DataFrame, df_logs_eval_single: pd.DataFrame + page: str, + multi_pipeline_mode: bool, + df_all: pd.DataFrame, + df_eval_single: pd.DataFrame, + composite_model_variant: CompositeModelOptions, ) -> html.Div: - assert "pipeline_ref" in df_logs.columns.tolist() - assert "pipeline_ref" in df_logs_eval_single.columns.tolist() + assert "pipeline_ref" in list(df_all.columns) + assert "pipeline_ref" in list(df_eval_single.columns) - _shared_data.df_logs[page] = df_logs - _shared_data.df_logs_eval_single[page] = df_logs_eval_single + if page not in _shared_data: + _shared_data[page] = _PageState( + composite_model_variant=composite_model_variant, + df_all=df_all, + df_eval_single=df_eval_single, + ) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_all = df_all + _shared_data[page].df_eval_single = df_eval_single @callback( Output(f"{page}-1d-box-plot-metrics", "figure"), @@ -132,13 +152,18 @@ def update_scatter_num_triggers( only_active_periods: bool = True, ) -> go.Figure: return gen_figs_1d_eval( - page, multi_pipeline_mode, eval_handler_ref, dataset_id, agg_func_eval_metric, only_active_periods + page, + multi_pipeline_mode, + eval_handler_ref, + dataset_id, + agg_func_eval_metric, + only_active_periods, ) # DATA (bring all metrics into columns of one dataframe) - eval_handler_refs = list(df_logs_eval_single["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_single["dataset_id"].unique()) + eval_handler_refs = list(df_eval_single["eval_handler"].unique()) + eval_datasets = list(df_eval_single["dataset_id"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/pipeline_info.py b/analytics/app/pages/plots/pipeline_info.py index 4733a0cc5..bb8506437 100644 --- a/analytics/app/pages/plots/pipeline_info.py +++ b/analytics/app/pages/plots/pipeline_info.py @@ -1,3 +1,5 @@ +from typing import Any + import dash_cytoscape as cyto import pandas as pd import plotly.express as px @@ -8,13 +10,13 @@ def section0_pipeline( - logs: PipelineLogs, df_logs: pd.DataFrame, df_logs_agg_leaf: pd.DataFrame, df_logs_add_parents: pd.DataFrame + logs: PipelineLogs, df_all: pd.DataFrame, df_agg_leaf: pd.DataFrame, df_add_parents: pd.DataFrame ) -> html.Div: def gen_stage_duration_histogram(stage_id: str) -> go.Figure: return px.histogram( - df_logs[df_logs["id"] == stage_id], + df_all[df_all["id"] == stage_id], title="Stage Duration Histogram", - hover_data=df_logs.columns, + hover_data=df_all.columns, marginal="rug", # rug, box, violin x="duration", labels={"duration": "duration in seconds", "id": "Pipeline Stage"}, @@ -24,13 +26,13 @@ def gen_stage_duration_histogram(stage_id: str) -> go.Figure: ) @callback(Output("pipeline-graph-info", "children"), Input("pipeline-graph", "tapNodeData")) - def display_tap_node_info(data) -> str: + def display_tap_node_info(data: Any) -> str: if not data or "id" not in data: return "Click 
a node to get more information" - series_info = df_logs[df_logs["id"] == data["id"]]["duration"].describe().to_string() + series_info = df_all[df_all["id"] == data["id"]]["duration"].describe().to_string() return ( f"Pipeline Stage: {data['id']}\n" - f"Number of Runs: {df_logs[df_logs['id'] == data['id']].shape[0]}\n" + f"Number of Runs: {df_all[df_all['id'] == data['id']].shape[0]}\n" f"Info about pipeline stage duration:\n" f"{series_info}" ) @@ -39,9 +41,8 @@ def display_tap_node_info(data) -> str: @callback( Output("hist-stage-duration", "figure"), Input("pipeline-graph", "tapNodeData"), - prevent_initial_call="initial_duplicate", ) - def display_tap_node_duration(data) -> go.Figure: + def display_tap_node_duration(data: Any) -> go.Figure: if not data or "id" not in data: stage_id = PipelineStage.MAIN.name else: @@ -50,11 +51,11 @@ def display_tap_node_duration(data) -> go.Figure: return fig_hist_stage_duration fig_pie_pipeline = px.pie( - df_logs_agg_leaf, + df_agg_leaf, values="sum", names="id", hole=0.4, - hover_data=df_logs_agg_leaf, + hover_data=df_agg_leaf, custom_data=["max", "min", "mean", "median", "std", "count"], ) # fig_pie_pipeline.update_traces(textposition='inside', textinfo='percent+label') @@ -76,9 +77,9 @@ def display_tap_node_duration(data) -> go.Figure: fig_sunburst = go.Figure( go.Sunburst( - labels=df_logs_add_parents["id"], - parents=df_logs_add_parents["parent_id"], - values=df_logs_add_parents["sum"], + labels=df_add_parents["id"], + parents=df_add_parents["parent_id"], + values=df_add_parents["sum"], ) ) diff --git a/analytics/app/pages/state.py b/analytics/app/pages/state.py new file mode 100644 index 000000000..6f64c4dfa --- /dev/null +++ b/analytics/app/pages/state.py @@ -0,0 +1,75 @@ +from dataclasses import dataclass + +import pandas as pd +from analytics.app.data.load import list_pipelines, load_pipeline_logs +from analytics.app.data.transform import ( + dfs_models_and_evals, + leaf_stages, + logs_dataframe, + logs_dataframe_agg_by_stage, + pipeline_stage_parents, +) +from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs + + +@dataclass +class ProcessedPipelineData: + pipeline_ref: str + + logs: PipelineLogs + pipeline_leaf_stages: list[str] + + df_all: pd.DataFrame + df_leaf: pd.DataFrame + + df_agg: pd.DataFrame + df_agg_leaf: pd.DataFrame + + df_parents: pd.DataFrame + df_add_parents: pd.DataFrame + + df_models: pd.DataFrame + df_eval_requests: pd.DataFrame | None + df_eval_single: pd.DataFrame | None + + +# ---------------------------------------- Global state (shared by all pages) ---------------------------------------- # + +pipelines = list_pipelines() +max_pipeline_id = max(pipelines.keys()) + +pipeline_data: dict[int, ProcessedPipelineData] = {} + + +def process_pipeline_data(pipeline_id: int) -> ProcessedPipelineData: + pipeline_ref = f"{pipeline_id}".zfill(len(str(max_pipeline_id))) + f" - {pipelines[pipeline_id][0]}" + + logs = load_pipeline_logs(pipeline_id) + pipeline_leaf_stages = leaf_stages(logs) + df_all = logs_dataframe(logs, pipeline_ref) + df_leaf = df_all[df_all["id"].isin(pipeline_leaf_stages)] + + df_agg = logs_dataframe_agg_by_stage(df_all) + df_agg_leaf = df_agg[df_agg["id"].isin(pipeline_leaf_stages)] + + df_parents = pipeline_stage_parents(logs) + df_add_parents = df_agg.merge(df_parents, left_on="id", right_on="id", how="left") + + df_logs_models, df_eval_requests, df_eval_single = dfs_models_and_evals( + logs, df_all["sample_time"].max(), pipeline_ref + ) + + return ProcessedPipelineData( + 
pipeline_ref=pipeline_ref, + logs=logs, + pipeline_leaf_stages=pipeline_leaf_stages, + df_all=df_all, + df_leaf=df_leaf, + df_agg=df_agg, + df_agg_leaf=df_agg_leaf, + df_parents=df_parents, + df_add_parents=df_add_parents, + df_models=df_logs_models, + df_eval_requests=df_eval_requests, + df_eval_single=df_eval_single, + ) diff --git a/analytics/tools/__init__.py b/analytics/tools/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/analytics/tools/aggregate_runs/__init__.py b/analytics/tools/aggregate_runs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/analytics/tools/aggregate_runs/core_aggregation.py b/analytics/tools/aggregate_runs/core_aggregation.py new file mode 100644 index 000000000..d26cc7074 --- /dev/null +++ b/analytics/tools/aggregate_runs/core_aggregation.py @@ -0,0 +1,119 @@ +from copy import deepcopy +from pathlib import Path + +import pandas as pd +from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe +from analytics.tools.aggregate_runs.dir_utils import load_multiple_logfiles +from analytics.tools.aggregate_runs.pipeline_equivalence import assert_pipeline_equivalence +from modyn.supervisor.internal.grpc.enums import PipelineStage +from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs, SingleEvaluationInfo + +DEBUGGING_MODE = True +"""if True, the the process will halt on breakpoints to allow for manual verification""" + + +def merge_files_for_equivalence_group(pipeline_files: list[Path], output_directory: Path) -> None: + """ + Merges the logfiles of a group of equivalent pipelines into one file. + """ + logs = load_multiple_logfiles(pipeline_files) + assert_pipeline_equivalence(logs) + + dfs_logs = [logs_dataframe(log) for log in logs] + + max_sample_time = max([df["sample_time"].max() for df in dfs_logs]) + + dfs_models_evals: list[tuple[pd.DataFrame, pd.DataFrame | None, pd.DataFrame | None]] = [ + dfs_models_and_evals(log, max_sample_time) for log in logs + ] + + df_models = pd.concat([_df_models for _df_models, _, _ in dfs_models_evals]) + assert df_models.shape[0] > 0 + + df_eval_requests = pd.concat( + [ + single_df_eval_requests + for _, single_df_eval_requests, _ in dfs_models_evals + if single_df_eval_requests is not None + ] + ) + assert df_eval_requests.shape[0] > 0 + + df_eval_single = pd.concat( + [_single_eval_df for _, _, _single_eval_df in dfs_models_evals if _single_eval_df is not None] + ) + + if DEBUGGING_MODE: + # TEMPLATE + # df_eval_single[ + # (df_eval_single["model_idx"] == 1) + # & (df_eval_single["eval_handler"] == "exactmatrix") # ADJUST THIS + # & (df_eval_single["dataset_id"] == "cglm_landmark_min25-test") # ADJUST THIS + # & (df_eval_single["interval_start"] == "2004-01-01") # ADJUST THIS + # & (df_eval_single["interval_end"] == "2004-12-31") # ADJUST THIS + # & (df_eval_single["metric"] == "Accuracy") + # ] + breakpoint() + + aggregated_logs = aggregate_eval_metrics(df_eval_single, logs) + aggregated_logs.materialize(output_directory, mode="final") + + if DEBUGGING_MODE: + breakpoint() + + +def aggregate_eval_metrics(df_eval_single: pd.DataFrame, logs: list[PipelineLogs]) -> PipelineLogs: + """ + Aggregates the evaluation metrics group-wise and updates the creates a new PipelineLogs object using + the first log in the list as a template. 
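A toy version of the group-wise aggregation implemented below, with a reduced primary key (the real key also includes eval_handler and dataset_id) and invented metric values:

import pandas as pd

# two seeded runs of the same pipeline, evaluated on the same interval
df = pd.DataFrame(
    {
        "model_idx": [1, 1],
        "metric": ["Accuracy", "Accuracy"],
        "interval_start": ["2004-01-01", "2004-01-01"],
        "interval_end": ["2004-12-31", "2004-12-31"],
        "value": [0.62, 0.68],
        "id_model": [11, 42],
    }
)

groups = df.groupby(["model_idx", "metric", "interval_start", "interval_end"])
num_runs = 2
assert (groups.size() == num_runs).all(), "primary key does not identify exactly one row per run"

aggregated = groups.agg(agg_value=("value", "mean"), id_model_list=("id_model", list)).reset_index()
print(aggregated[["metric", "agg_value", "id_model_list"]])
# Accuracy  0.65  [11, 42]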
+ """ + + # --------------------------------------- Aggregation within eval dataframe -------------------------------------- # + groups = df_eval_single.groupby( + ["model_idx", "eval_handler", "dataset_id", "interval_start", "interval_end", "metric"] + ) + + for size in groups.size(): + assert size == len(logs), "Wrong primary key" + + aggregated_metrics = groups.agg( + agg_value=("value", "mean"), id_model_list=("id_model", lambda x: list(x)) + ).reset_index() + + # sanity check: per aggregated row we find len(logs) unique id_model + assert all( + len(row[1]["id_model_list"]) == len(logs) + for row in aggregated_metrics[["model_idx", "id_model_list"]].iterrows() + ) + + if DEBUGGING_MODE: + # print(aggregated_metrics[["model_idx", "id_model_list"]]) + breakpoint() + + # ---------------------------------- Write back dataframe to PipelineLogs object --------------------------------- # + + aggregated_logs = deepcopy(logs[0]) + for log in aggregated_logs.supervisor_logs.stage_runs: + if log.id == PipelineStage.EVALUATE_SINGLE.name: + assert isinstance(log.info, SingleEvaluationInfo) + if not log.info.results: + continue + + eval_req = log.info.eval_request + + # will yield multiple rows (one per each metric) + request_lookup = aggregated_metrics[ + (aggregated_metrics["id_model_list"].apply(lambda x: eval_req.id_model in x)) + & (aggregated_metrics["eval_handler"] == eval_req.eval_handler) + & (aggregated_metrics["dataset_id"] == eval_req.dataset_id) + & (aggregated_metrics["interval_start"] == pd.to_datetime(eval_req.interval_start, unit="s")) + & (aggregated_metrics["interval_end"] == pd.to_datetime(eval_req.interval_end, unit="s")) + ] + + # find aggregated value + for metric in log.info.results["metrics"]: + lookup = request_lookup[request_lookup["metric"] == metric["name"]] + assert len(lookup) == 1, f"Primary key not unique: {metric['name']}" + metric["result"] = float(lookup["agg_value"].iloc[0]) + + return aggregated_logs diff --git a/analytics/tools/aggregate_runs/dir_utils.py b/analytics/tools/aggregate_runs/dir_utils.py new file mode 100644 index 000000000..af1f431e6 --- /dev/null +++ b/analytics/tools/aggregate_runs/dir_utils.py @@ -0,0 +1,31 @@ +import os +from pathlib import Path + +from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs + + +def group_pipelines_by_name(pipeline_logs_directory: Path) -> dict[str, list[Path]]: + # find the groups of equivalent pipelines via the .name file + + pipeline_directories = [ + pipeline_logs_directory / d for d in os.listdir(pipeline_logs_directory) if str(d).startswith("pipeline_") + ] + + pipeline_names: list[tuple[Path, str]] = [(d, (d / ".name").read_text()) for d in pipeline_directories if (d / "pipeline.log").exists()] + + pipeline_groups = {name: [d for d, n in pipeline_names if n == name] for name in set(n for _, n in pipeline_names)} + return pipeline_groups + + +def load_multiple_logfiles(pipeline_files: list[Path]) -> list[PipelineLogs]: + """ + Args: + pipeline_files: list of paths to pipeline log directories (not files!) 
+ Returns: + list of PipelineLogs + """ + logs = [ + PipelineLogs.model_validate_json((pipeline_logfile / "pipeline.log").read_text()) + for pipeline_logfile in pipeline_files + ] + return logs diff --git a/analytics/tools/aggregate_runs/main.py b/analytics/tools/aggregate_runs/main.py new file mode 100644 index 000000000..dde60a19f --- /dev/null +++ b/analytics/tools/aggregate_runs/main.py @@ -0,0 +1,43 @@ +""" +# Motivation + +We want to increase the confidence in our pipeline run results by running the same experiment pipelines with different +seeds. + +This yields different evaluation metrics. In consequence, we want to aggregate (e.g. mean, median) the evaluation +metrics over runs. +""" + +from pathlib import Path +from typing import Annotated, Optional + +import typer +from analytics.tools.aggregate_runs.core_aggregation import merge_files_for_equivalence_group +from analytics.tools.aggregate_runs.dir_utils import group_pipelines_by_name + + +def main( + logs_directory: Annotated[Path, typer.Argument(help="Path to read the pipelines in from")], + aggregated_log_dir: Annotated[Path, typer.Argument(help="Path to output the aggregated pipelines to")], + pipeline_name: Annotated[ + Optional[str], + typer.Option( + help=( + "If not all pipelines should be aggregated, specify the name of the " + "pipeline to aggregate (as specified in the .name file)" + ) + ), + ] = None, +) -> None: + # find the groups of equivalent pipelines via the .name file + + pipeline_groups = group_pipelines_by_name(logs_directory) + + for group_name, group_pipelines in pipeline_groups.items(): + if pipeline_name is not None and group_name != pipeline_name: + continue + merge_files_for_equivalence_group(group_pipelines, output_directory=aggregated_log_dir) + + +if __name__ == "__main__": + typer.run(main) diff --git a/analytics/tools/aggregate_runs/pipeline_equivalence.py b/analytics/tools/aggregate_runs/pipeline_equivalence.py new file mode 100644 index 000000000..ab64a08a8 --- /dev/null +++ b/analytics/tools/aggregate_runs/pipeline_equivalence.py @@ -0,0 +1,32 @@ +from copy import deepcopy + +from modyn.config.schema.pipeline.sampling.config import CoresetStrategyConfig +from modyn.config.schema.pipeline.sampling.downsampling_config import RHOLossDownsamplingConfig +from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs + + +def assert_pipeline_equivalence(logs: list[PipelineLogs]) -> None: + # assert that all pipelines are the same except from the seed + assert len(logs) >= 1 + + candidates = [deepcopy(log) for log in logs] + # set seeds to seed of first pipeline + # set device to first pipeline since that does not matter + for i, candidate in enumerate(candidates): + candidate.config.pipeline.training.seed = candidates[0].config.pipeline.training.seed + candidate.config.pipeline.training.device = candidates[0].config.pipeline.training.device + candidate.config.pipeline.evaluation.device = candidates[0].config.pipeline.evaluation.device + + if isinstance(candidate.config.pipeline.selection_strategy, CoresetStrategyConfig) and isinstance( + candidate.config.pipeline.selection_strategy.downsampling_config, RHOLossDownsamplingConfig + ): + candidate.config.pipeline.selection_strategy.downsampling_config.il_training_config.device = candidates[ + 0 + ].config.pipeline.selection_strategy.downsampling_config.il_training_config.device + candidate.config.pipeline.selection_strategy.downsampling_config.il_training_config.seed = candidates[ + 0 + 
].config.pipeline.selection_strategy.downsampling_config.il_training_config.seed + + assert all( + [candidate.config == candidates[0].config for candidate in candidates] + ), "Not all pipelines are the same (ignoring seed)" diff --git a/analytics/tools/patch_logfile.ipynb b/analytics/tools/patch_logfile.ipynb index cafc1ca4b..9acf9eef2 100644 --- a/analytics/tools/patch_logfile.ipynb +++ b/analytics/tools/patch_logfile.ipynb @@ -31,6 +31,7 @@ "\n", "from analytics.app.data.transform import logs_dataframe\n", "from pathlib import Path\n", + "from analytics.app.data.transform import dfs_models_and_evals\n", "\n", "\n", "%load_ext autoreload\n", @@ -52,7 +53,7 @@ "source": [ "# VARIABLES\n", "\n", - "pipeline_logfile = Path(\"/Users/robinholzinger/robin/dev/eth/modyn/.data/evaluation_results/pipeline_5/pipeline.log\")" + "pipeline_logfile = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/pipeline_11/pipeline.log\")" ] }, { @@ -70,22 +71,11 @@ "metadata": {}, "outputs": [], "source": [ - "trains = [(l_ for l_ in logs.supervisor_logs.stage_runs if l_.id == PipelineStage.HANDLE_SINGLE_TRIGGER.name)]\n", - "evals = [(l_ for l_ in logs.supervisor_logs.stage_runs if l_.id == PipelineStage.EVALUATE_SINGLE.name and l_.info.eval_request.dataset_id == \"cglm_landmark_min25-test\")]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from analytics.app.data.transform import dfs_models_and_evals\n", "\n", "df_logs = logs_dataframe(logs)\n", "# max_timestamp = df_logs[\"sample_time\"].max()\n", - "max_timestamp = 1703682949\n", - "df_models, df_evals = dfs_models_and_evals(logs, max_timestamp)" + "max_timestamp = df_logs[\"sample_time\"].max()\n", + "df_models, eval_requests, evals_metrics = dfs_models_and_evals(logs, max_timestamp)" ] }, { @@ -110,7 +100,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_evals" + "eval_requests" ] }, { @@ -119,7 +109,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_evals[df_evals[\"most_recent_model\"]]" + "eval_requests[eval_requests[\"currently_active_model\"]]" ] }, { @@ -137,9 +127,6 @@ "source": [ "for eval_log in logs.supervisor_logs.stage_runs:\n", " if eval_log.id == PipelineStage.EVALUATE_SINGLE.name:\n", - " # Let's throw away all information about the most recent model, let's rebuild it\n", - " eval_log.info.eval_request.most_recent_model = False\n", - "\n", " # For a fixed interval the evaluation request of a certain model is the most recent, if the model training\n", " # interval center lies within the evaluation interval.\n", " # Note: this is not a generic solution, but works for the slicing case with fixed evaluation and trigger\n", @@ -148,7 +135,7 @@ " assert len(model_row) == 1\n", "\n", " training_center = (model_row.iloc[0][\"train_start\"].to_pydatetime().timestamp() + model_row.iloc[0][\"train_end\"].to_pydatetime().timestamp()) / 2\n", - " eval_log.info.eval_request.most_recent_model = eval_log.info.eval_request.interval_start <= training_center <= eval_log.info.eval_request.interval_end" + " eval_log.info.eval_request.currently_trained_model = eval_log.info.eval_request.interval_start <= training_center <= eval_log.info.eval_request.interval_end" ] }, { @@ -160,11 +147,152 @@ "# Write results back\n", "pipeline_logfile.write_text(logs.model_dump_json(by_alias=True))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def patch_logfile(path):\n", + " logs = 
PipelineLogs.model_validate_json(path.read_text())\n", + " df_logs = logs_dataframe(logs)\n", + " max_timestamp = df_logs[\"sample_time\"].max()\n", + " df_models, eval_requests, evals_metrics = dfs_models_and_evals(logs, max_timestamp)\n", + "\n", + " for eval_log in logs.supervisor_logs.stage_runs:\n", + " if eval_log.id == PipelineStage.EVALUATE_SINGLE.name:\n", + " # Let's throw away all information about the most recent model, let's rebuild it\n", + " eval_log.info.eval_request.currently_active_model = False\n", + "\n", + " # For a fixed interval the evaluation request of a certain model is the most recent, if the model training\n", + " # interval center lies within the evaluation interval.\n", + " # Note: this is not a generic solution, but works for the slicing case with fixed evaluation and trigger\n", + " # intervals in the same order of magnitude.\n", + " model_row = df_models[df_models[\"id_model\"] == eval_log.info.eval_request.id_model]\n", + " assert len(model_row) == 1\n", + "\n", + " training_center = (model_row.iloc[0][\"train_start\"].to_pydatetime().timestamp() + model_row.iloc[0][\"train_end\"].to_pydatetime().timestamp()) / 2\n", + " eval_log.info.eval_request.currently_active_model = eval_log.info.eval_request.interval_start <= training_center <= eval_log.info.eval_request.interval_end\n", + " eval_log.info.eval_request.currently_trained_model = eval_log.info.eval_request.interval_start <= training_center <= eval_log.info.eval_request.interval_end\n", + "\n", + " patched_path = path.parent / \"pipeline.patched\"\n", + " patched_path.write_text(logs.model_dump_json(by_alias=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "log_dir = Path(\"/Users/mboether/phd/dynamic-data/sigmod-data/cglm-landmark/data_selection_50%/logs\")\n", + "logfiles = [logfile for logfile in log_dir.glob(\"**/pipeline.log\") if (logfile.parent / \"snapshot\").exists()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "for logfile in tqdm(logfiles):\n", + " patch_logfile(logfile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "models_red = df_models[[\"trigger_id\", \"id_model\", \"train_start\", \"train_end\"]]\n", + "models_red" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eval_red = eval_requests[[\"trigger_id\", \"training_idx\", \"model_idx\", \"interval_start\", \"interval_end\", \"eval_handler\", \"dataset_id\"]]\n", + "eval_red" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_cross = models_red.merge(eval_red, on=\"trigger_id\").rename(columns={\"train_start\": \"first_timestamp\", \"train_end\": \"last_timestamp\"})\n", + "assert df_cross.shape[0] == eval_red.shape[0]\n", + "df_cross" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Adapted logic from handler.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# df_cross[\"active_candidate\"] = df_cross[\"last_timestamp\"] < df_cross[\"active_model_trained_before\"]\n", + "\n", + "# # find the maximum model for every EvalCandidate that doesn't violate that 
constraint\n", + "# max_model_id = (\n", + "# df_cross[df_cross[\"active_candidate\"]]\n", + "# .groupby(\"active_model_trained_before\")[\"id_model\"]\n", + "# .aggregate(max_model_id=\"max\")\n", + "# )\n", + "\n", + "# # combine: a model in the cross product is most recent for a certain interval iff\n", + "# # it has maximum model id for its active_model_trained_before\n", + "# df_active_models = df_cross.merge(max_model_id, on=\"active_model_trained_before\", how=\"left\")\n", + "# df_active_models[\"active_model\"] = df_active_models[\"id_model\"] == df_active_models[\"max_model_id\"]\n", + "\n", + "# # for a given interval, the currently trained model is the model with the smallest id\n", + "# # from all models that have a strictly bigger id than the most recent model. Hence it is the model after the\n", + "# # most recent model.\n", + "# # For that we first build a model -> successor model mapping:\n", + "# model_successor_relation = df_active_models[[\"id_model\"]].drop_duplicates().sort_values(by=\"id_model\")\n", + "# model_successor_relation[\"next_id_model\"] = model_successor_relation[\"id_model\"].shift(-1, fill_value=-1)\n", + "\n", + "# # if there's no active model for the first interval(s), we still need to define the next model as the\n", + "# # trained model\n", + "# model_successor_relation = pd.concat(\n", + "# [\n", + "# model_successor_relation,\n", + "# pd.DataFrame([{\"id_model\": None, \"next_id_model\": df_active_models[\"id_model\"].min()}]),\n", + "# ]\n", + "# )\n", + "\n", + "# df_trained_models = df_active_models.merge(\n", + "# model_successor_relation, how=\"left\", left_on=\"max_model_id\", right_on=\"id_model\", suffixes=(\"\", \"__\")\n", + "# )\n", + "# df_trained_models[\"trained_model\"] = df_trained_models[\"id_model\"] == df_trained_models[\"next_id_model\"]\n" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -178,9 +306,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.1.-1" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/dev-requirements.txt b/dev-requirements.txt index 34927185d..b8b416fee 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -22,4 +22,4 @@ seaborn dash dash-daq dash_cytoscape - +dash_bootstrap_components diff --git a/environment.yml b/environment.yml index 5de4819b7..7e494c791 100644 --- a/environment.yml +++ b/environment.yml @@ -14,7 +14,7 @@ channels: - huggingface dependencies: - - python>=3.11 + - python=3.11 - pip - tqdm - conda-forge::enlighten diff --git a/modyn/supervisor/internal/pipeline_executor/models.py b/modyn/supervisor/internal/pipeline_executor/models.py index 470c40c3d..380e34d6d 100644 --- a/modyn/supervisor/internal/pipeline_executor/models.py +++ b/modyn/supervisor/internal/pipeline_executor/models.py @@ -2,13 +2,14 @@ import dataclasses import datetime +import itertools import logging import multiprocessing as mp import os from dataclasses import dataclass from functools import cached_property from pathlib import Path -from typing import Any, Callable, Literal, Optional, Union, cast +from typing import Any, Callable, Iterator, Literal, Optional, Union, cast import pandas as pd from modyn.config.schema.pipeline import ModynPipelineConfig @@ -144,29 +145,34 @@ class StageInfo(BaseModel): `StageInfo` class is therefore intended to be subclassed for different pipeline stage information. 
""" + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return [] + @property - def df(self) -> pd.DataFrame | None: + def df_row(self) -> tuple: """ While appending StageLog subclasses to `StageLog.info` is sufficient to persist additional information in the - logs, this method is used to provide a DataFrame representation of the data for online analysis. - - `Online` refers the to the ability to analyze the data while the pipeline is running as we do not only - want to analyze the data after the pipeline has finished (e.g. for triggering policies). + logs, this method is used to provide a DataFrame representation of the data for analytical purposes. Returns: - A DataFrame if the stage should collect data, else None. + The dataframe rows. """ - return None + return () class FetchDataInfo(StageInfo): num_samples: int = Field(..., description="Number of samples processed in the new data.") trigger_indexes: list[int] = Field(..., description="Indices of triggers in the new data.") + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["num_samples", "trigger_indexes"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame([(self.num_samples, str(self.trigger_indexes))], columns=["num_samples", "trigger_indexes"]) + def df_row(self) -> tuple: + return (self.num_samples, str(self.trigger_indexes)) class ProcessNewDataInfo(StageInfo): @@ -174,10 +180,14 @@ class ProcessNewDataInfo(StageInfo): num_samples: int = Field(..., description="Number of samples processed") trigger_indexes: list[int] = Field(..., description="Indices of triggers in the new data.") + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["fetch_time", "num_samples"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame([(self.fetch_time, self.num_samples)], columns=["fetch_time", "num_samples"]) + def df_row(self) -> tuple: + return (self.fetch_time, self.num_samples) class EvaluateTriggerInfo(StageInfo): @@ -186,10 +196,14 @@ class EvaluateTriggerInfo(StageInfo): trigger_eval_times: list[int] = Field(default_factory=list) """Time in milliseconds that every next(...) call of the trigger.inform(...) 
generator took.""" + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["batch_size", "trigger_indexes"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame([(self.batch_size, list(self.trigger_indexes))], columns=["batch_size", "trigger_indexes"]) + def df_row(self) -> tuple: + return (self.batch_size, list(self.trigger_indexes)) class _TriggerLogMixin(StageInfo): @@ -207,26 +221,28 @@ class SelectorInformTriggerInfo(_TriggerLogMixin): selector_log: dict[str, Any] num_samples_in_trigger: int + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["trigger_i", "trigger_index", "trigger_id", "num_samples_in_trigger"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame( - [(self.trigger_i, self.trigger_index, self.trigger_i, self.num_samples_in_trigger)], - columns=["trigger_i", "trigger_index", "trigger_id", "num_samples_in_trigger"], - ) + def df_row(self) -> tuple: + return (self.trigger_i, self.trigger_index, self.trigger_i, self.num_samples_in_trigger) class TriggerExecutionInfo(_TriggerLogMixin): first_timestamp: int | None last_timestamp: int | None + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["trigger_i", "trigger_index", "trigger_id", "first_timestamp", "last_timestamp"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame( - [(self.trigger_i, self.trigger_index, self.trigger_id, self.first_timestamp, self.last_timestamp)], - columns=["trigger_i", "trigger_index", "trigger_id", "first_timestamp", "last_timestamp"], - ) + def df_row(self) -> tuple: + return (self.trigger_i, self.trigger_index, self.trigger_id, self.first_timestamp, self.last_timestamp) class _TrainInfoMixin(StageInfo): @@ -237,25 +253,27 @@ class _TrainInfoMixin(StageInfo): class TrainingInfo(_TrainInfoMixin): trainer_log: dict[str, Any] + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["trigger_id", "training_id", "num_batches", "num_samples"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame( - [(self.trigger_id, self.training_id, self.trainer_log["num_batches"], self.trainer_log["num_samples"])], - columns=["trigger_id", "training_id", "num_batches", "num_samples"], - ) + def df_row(self) -> tuple: + return (self.trigger_id, self.training_id, self.trainer_log["num_batches"], self.trainer_log["num_samples"]) class StoreModelInfo(_TrainInfoMixin): id_model: int # model_ prefix not allowed in pydantic + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["trigger_id", "training_id", "id_model"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame( - [(self.trigger_id, self.training_id, self.id_model)], - columns=["trigger_id", "training_id", "id_model"], - ) + def df_row(self) -> tuple: + return (self.trigger_id, self.training_id, self.id_model) class SingleEvaluationInfo(StageInfo): @@ -263,24 +281,60 @@ class SingleEvaluationInfo(StageInfo): results: dict[str, Any] = Field(default_factory=dict) failure_reason: str | None = None + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return [ + "trigger_id", + "training_id", + "id_model", + "currently_active_model", + 
"currently_trained_model", + "eval_handler", + "dataset_id", + "interval_start", + "interval_end", + "num_samples", + ] + @override @property - def df(self) -> pd.DataFrame: - """One dataframe per requests (does not contain metrics)""" + def df_row(self) -> tuple: + return ( + self.eval_request.trigger_id, + self.eval_request.training_id, + self.eval_request.id_model, + self.eval_request.currently_active_model, + self.eval_request.currently_trained_model, + self.eval_request.eval_handler, + self.eval_request.dataset_id, + self.eval_request.interval_start, + self.eval_request.interval_end, + self.results.get("dataset_size", 0), + ) + + @classmethod + def results_df(cls, infos: list[SingleEvaluationInfo]) -> pd.DataFrame: + """As one evaluation can have multiple metrics, we return a DataFrame with one row per metric.""" return pd.DataFrame( [ ( - self.eval_request.trigger_id, - self.eval_request.training_id, - self.eval_request.id_model, - self.eval_request.currently_active_model, - self.eval_request.currently_trained_model, - self.eval_request.eval_handler, - self.eval_request.dataset_id, - self.eval_request.interval_start, - self.eval_request.interval_end, - self.results.get("dataset_size", 0), + # per request + info.eval_request.trigger_id, + info.eval_request.training_id, + info.eval_request.id_model, + info.eval_request.currently_active_model, + info.eval_request.currently_trained_model, + info.eval_request.eval_handler, + info.eval_request.dataset_id, + info.eval_request.interval_start, + info.eval_request.interval_end, + info.results.get("dataset_size", 0), + # per metric + metric["name"], + metric["result"], ) + for info in infos + for metric in info.results["metrics"] # pylint: disable=unsubscriptable-object ], columns=[ "trigger_id", @@ -292,35 +346,26 @@ def df(self) -> pd.DataFrame: "dataset_id", "interval_start", "interval_end", - "num_samples", + "dataset_size", + "metric", + "value", ], ) - def results_df(self) -> pd.DataFrame: - """As one evaluation can have multiple metrics, we return a DataFrame with one row per metric.""" - return self.df.merge( - pd.DataFrame( - [ - (metric["name"], metric["result"]) - for metric in self.results["metrics"] # pylint: disable=unsubscriptable-object - ], - columns=["metric", "value"], - ), - how="cross", - ) - class SelectorInformInfo(StageInfo): selector_log: dict[str, Any] | None remaining_data: bool trigger_indexes: list[int] + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["remaining_data", "trigger_indexes"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame( - [(self.remaining_data, self.trigger_indexes)], columns=["remaining_data", "trigger_indexes"] - ) + def df_row(self) -> tuple: + return (self.remaining_data, self.trigger_indexes) StageInfoUnion = Union[ @@ -359,40 +404,23 @@ class StageLog(BaseModel): # stage specific log info info: StageInfo | None = Field(None) - def df(self, extended: bool = False) -> pd.DataFrame | None: - """ - Provides a DataFrame with the log information of this stage. - - To conveniently allow analysis of lists of log entries, this method provides a DataFrame representation of the - log entry. - - Args: - extended: If True, include the columns of the info attribute. Requires all logs to have the same type. - - Returns: - A DataFrame with the log information of this stage. 
- """ - df = pd.DataFrame( - [ - ( - self.id, - self.start, - self.end, - self.duration, - self.batch_idx, - self.sample_idx, - self.sample_time, - self.trigger_idx, - ) - ], - columns=["id", "start", "end", "duration", "batch_idx", "sample_idx", "sample_time", "trigger_idx"], + def df_columns(self, extended: bool = False) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["id", "start", "end", "duration", "batch_idx", "sample_idx", "sample_time", "trigger_idx"] + ( + self.info.df_columns() if extended and self.info else [] ) - info_df = self.info.df if self.info else None - if info_df is not None and extended: - # add additional columns - df = pd.concat([df, info_df], axis=1) - return df + def df_row(self, extended: bool = False) -> tuple: + return ( + self.id, + self.start, + self.end, + self.duration, + self.batch_idx, + self.sample_idx, + self.sample_time, + self.trigger_idx, + ) + (self.info.df_row if extended and self.info else ()) # (De)Serialization to enable parsing all classes in the StageInfoUnion; # with that logic we avoid having to add disciminator fields to every subclass of StageInfo @@ -417,6 +445,28 @@ def deserializer(cls, data: Any) -> Any: data["info"] = None return data + @classmethod + def df(cls, stage_logs: Iterator[StageLog], extended: bool = False) -> pd.DataFrame: + """ + Provides a DataFrame with the log information of this stage. + + To conveniently allow analysis of lists of log entries, this method provides a DataFrame representation of the + log entry. + + Args: + extended: If True, include the columns of the info attribute. Requires all logs to have the same type. + + Returns: + A DataFrame row with the log information of this stage. + """ + if not stage_logs: + return pd.DataFrame() + stage_logs, iter_copy = itertools.tee(stage_logs) + return pd.DataFrame( + [stage.df_row(extended=extended) for stage in stage_logs], + columns=next(iter_copy).df_columns(extended=extended), + ) + class SupervisorLogs(BaseModel): stage_runs: list[StageLog] = Field(default_factory=list) diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index 52412502b..3062243d8 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -114,8 +114,10 @@ def report_results(stage_log: StageLog) -> None: if track and stage_log.info: # ensure df exists old_df = state.tracking.get(stage_log.id, None) - if (new_rows := stage_log.df(extended=True)) is not None: - state.tracking[stage_log.id] = pd.concat([old_df, new_rows]) if old_df is not None else new_rows + columns = old_df.columns if old_df is not None else stage_log.df_columns(extended=True) + if (new_row := stage_log.df_row(extended=True)) is not None: + new_df = pd.DataFrame([new_row], columns=columns) + state.tracking[stage_log.id] = pd.concat([old_df, new_df]) if old_df is not None else new_df # record logs if log: From 97a2b5f91f80156eeba122a4dbb3eb01aeaa97b7 Mon Sep 17 00:00:00 2001 From: Xianzhe Ma Date: Sun, 23 Jun 2024 20:27:23 +0200 Subject: [PATCH 4/4] Fix batch number (#533) Previously, we didn't record the number of passed batches correctly: We use a `batch_number` which is generated purely from enumeration in `dataloader`. Therefore this number is irrelevant to the number of epochs (only shows how many batches there are in one epoch). 
A similar issue exists in the `StB` iteration when we calculate scores class by class: the number of batches
passed in the previous class is not correctly carried over to the current class.

This PR fixes it.
---
 .../abstract_downsampling_strategy.py         |  18 +-
 .../downsampling_strategies/test_scheduler.py |   6 +-
 .../internal/trainer/test_pytorch_trainer.py  | 196 +++++++++++++++---
 .../internal/trainer/pytorch_trainer.py       |  48 +++--
 4 files changed, 200 insertions(+), 68 deletions(-)

diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py
index a8dae1333..a71879fbb 100644
--- a/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py
+++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py
@@ -43,19 +43,11 @@ def __init__(
         self.requires_remote_computation = True
         self.maximum_keys_in_memory = maximum_keys_in_memory
         self.downsampling_config = downsampling_config
-        self.status_bar_scale = self._compute_status_bar_scale()
-
-    def _compute_status_bar_scale(self) -> int:
-        """
-        This function is used to create the downsampling status bar and handle the training one accordingly.
-
-        For BTS, we return 100 since the training status bar sees all the samples
-        For STB, we return the downsampling_ratio since the training status bar sees only a fraction of points
-        (while the downsampling status bas sees all the points)
-        """
-        if self.downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE:
-            return 100
-        return self.downsampling_ratio
+        # The status bar scale is used in conjunction with the total number of samples (after presampling)
+        # and the number of already trained samples to show the current training progress.
+        # No matter whether it is BtS or StB, the number of trained samples should be compared to the total number
+        # of samples scaled by the downsampling ratio. Therefore, the status bar scale is the downsampling ratio.
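+        # For example, with the default ratio_max of 100 and a downsampling_ratio of 25, roughly a quarter of the
+        # presampled samples end up being trained on, regardless of BtS or StB.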
+ self.status_bar_scale = self.downsampling_ratio @property def downsampling_params(self) -> dict: diff --git a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py index e3783c66b..7b993cb7d 100644 --- a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py +++ b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py @@ -102,7 +102,7 @@ def test_switch_functions(): "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" - assert downs.training_status_bar_scale == 100 + assert downs.training_status_bar_scale == 25 def test_wrong_number_threshold(): @@ -158,7 +158,7 @@ def test_double_threshold(): "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" - assert downs.training_status_bar_scale == 100 + assert downs.training_status_bar_scale == 25 # above the last threshold for i in range(15, 25): @@ -203,7 +203,7 @@ def test_wrong_trigger(): "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" - assert downs.training_status_bar_scale == 100 + assert downs.training_status_bar_scale == 25 def test_instantiate_scheduler_just_one(): diff --git a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py index 52527184d..aceb19b3b 100644 --- a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py +++ b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py @@ -10,7 +10,7 @@ from collections import OrderedDict from io import BytesIO from time import sleep -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, call, patch import grpc import pytest @@ -29,6 +29,7 @@ from modyn.trainer_server.internal.metadata_collector.metadata_collector import MetadataCollector from modyn.trainer_server.internal.trainer.metadata_pytorch_callbacks.base_callback import BaseCallback from modyn.trainer_server.internal.trainer.pytorch_trainer import PytorchTrainer, train +from modyn.trainer_server.internal.trainer.remote_downsamplers import RemoteGradMatchDownsamplingStrategy from modyn.trainer_server.internal.utils.trainer_messages import TrainerMessages from modyn.trainer_server.internal.utils.training_info import TrainingInfo from modyn.utils import DownsamplingMode @@ -117,6 +118,28 @@ def get_mock_label_transformer(): ) +class MockDataloader: + def __init__(self, batch_size, num_batches): + self.batch_size = batch_size + self.num_batches = num_batches + self.dataset = MagicMock() + + def __iter__(self): + return iter( + [ + ( + ("1",) * self.batch_size, + torch.ones(self.batch_size, 10, requires_grad=True), + torch.ones(self.batch_size, dtype=torch.uint8), + ) + for _ in range(self.num_batches) + ] + ) + + def __len__(self): + return self.num_batches + + def mock_get_dataloaders( pipeline_id, trigger_id, @@ -135,12 +158,7 @@ def mock_get_dataloaders( log_path, num_batches: int = 100, ): - mock_train_dataloader = iter( - [ - (("1",) * batch_size, torch.ones(batch_size, 10, requires_grad=True), torch.ones(batch_size, dtype=int)) - for _ in range(num_batches) - ] - ) + mock_train_dataloader = MockDataloader(batch_size, num_batches) return mock_train_dataloader, None @@ -257,6 +275,7 @@ def get_training_info( @patch.object(StorageStub, "__init__", noop_constructor_mock) 
@patch.object(SelectorStub, "__init__", noop_constructor_mock) +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch( "modyn.trainer_server.internal.dataset.key_sources.selector_key_source.grpc_connection_established", @@ -266,13 +285,13 @@ def get_training_info( @patch("modyn.trainer_server.internal.utils.training_info.dynamic_module_import") @patch("modyn.trainer_server.internal.trainer.pytorch_trainer.dynamic_module_import") @patch.object(PytorchTrainer, "connect_to_selector", return_value=None) -@patch.object(PytorchTrainer, "get_selection_strategy", return_value=(False, "", {})) +@patch.object(PytorchTrainer, "get_selection_strategy") @patch.object(PytorchTrainer, "get_num_samples_in_trigger") @patch.object(SelectorKeySource, "uses_weights", return_value=False) def get_mock_trainer( modyn_config: ModynConfig, - query_queue: mp.Queue, - response_queue: mp.Queue, + query_queue_training: mp.Queue, + response_queue_training: mp.Queue, use_pretrained: bool, load_optimizer_state: bool, pretrained_model_path: pathlib.Path, @@ -289,22 +308,13 @@ def get_mock_trainer( test_grpc_connection_established_selector: MagicMock, test_grpc_connection_established: MagicMock, batch_size: int = 32, - downsampling_mode: DownsamplingMode = DownsamplingMode.DISABLED, - downsampling_ratio: int = 25, - ratio_max: int = 100, + selection_strategy: tuple[bool, str, dict] = (False, "", {}), ): model_dynamic_module_patch.return_value = MockModule(num_optimizers) lr_scheduler_dynamic_module_patch.return_value = MockLRSchedulerModule() mock_get_num_samples.return_value = batch_size * 100 - if downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE: - mock_selection_strategy.return_value = ( - True, - "RemoteGradNormDownsampling", - {"downsampling_ratio": downsampling_ratio, "ratio_max": ratio_max, "sample_then_batch": False}, - ) - elif downsampling_mode == DownsamplingMode.SAMPLE_THEN_BATCH: - raise NotImplementedError() + mock_selection_strategy.return_value = selection_strategy training_info = get_training_info( 0, @@ -323,8 +333,8 @@ def get_mock_trainer( modyn_config.model_dump(by_alias=True), training_info, "cpu", - query_queue, - response_queue, + query_queue_training, + response_queue_training, mp.Queue(), mp.Queue(), logging.getLogger(__name__), @@ -621,7 +631,6 @@ def test_send_model_state_to_server(dummy_system_config: ModynConfig): } -@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) @patch.object(PytorchTrainer, "weights_handling", return_value=(False, False)) def test_train_invalid_query_message(test_weight_handling, dummy_system_config: ModynConfig): query_status_queue = mp.Queue() @@ -652,7 +661,6 @@ def test_train_invalid_query_message(test_weight_handling, dummy_system_config: # # pylint: disable=too-many-locals -@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) @patch.object(BaseCallback, "on_train_begin", return_value=None) @patch.object(BaseCallback, "on_train_end", return_value=None) @patch.object(BaseCallback, "on_batch_begin", return_value=None) @@ -870,7 +878,6 @@ def test_create_trainer_with_exception( @pytest.mark.parametrize("downsampling_ratio, ratio_max", [(25, 100), (50, 100), (250, 1000), (125, 1000)]) -@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) 
@patch.object(BaseCallback, "on_train_begin", return_value=None) @patch.object(BaseCallback, "on_train_end", return_value=None) @patch.object(BaseCallback, "on_batch_begin", return_value=None) @@ -914,9 +921,11 @@ def test_train_batch_then_sample_accumulation( "custom", False, batch_size=batch_size, - downsampling_mode=DownsamplingMode.BATCH_THEN_SAMPLE, - downsampling_ratio=downsampling_ratio, - ratio_max=ratio_max, + selection_strategy=( + True, + "RemoteGradNormDownsampling", + {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": ratio_max}, + ), ) assert trainer._downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE @@ -949,6 +958,7 @@ def mock_forward(data): assert trainer._num_samples == batch_size * num_batches assert trainer._log["num_samples"] == batch_size * num_batches + assert trainer._log["num_batches"] == num_batches # We only train on whole batches, hence we have to scale by batch size assert trainer._log["num_samples_trained"] == ((expected_bts_size * num_batches) // batch_size) * batch_size assert test_on_batch_begin.call_count == len(trainer._callbacks) * num_batches @@ -970,7 +980,6 @@ def mock_forward(data): assert torch.allclose(data, expected_data) -@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) @patch.object(MetadataCollector, "send_metadata", return_value=None) @patch.object(MetadataCollector, "cleanup", return_value=None) @patch.object(CustomLRScheduler, "step", return_value=None) @@ -1003,3 +1012,130 @@ def test_lr_scheduler_init( ) assert trainer._lr_scheduler.T_max == 100 + + +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.SelectorKeySource") +@patch.object(PytorchTrainer, "get_available_labels_from_selector") +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_per_class_dataloader_from_online_dataset") +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.LocalDatasetWriter") +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.LocalKeySource") +@patch.object(PytorchTrainer, "start_embedding_recording_if_needed") +@patch.object(PytorchTrainer, "end_embedding_recorder_if_needed") +@patch.object(PytorchTrainer, "get_embeddings_if_recorded") +@patch.object(RemoteGradMatchDownsamplingStrategy, "inform_samples") +@patch.object(RemoteGradMatchDownsamplingStrategy, "inform_end_of_current_label") +@patch.object(PytorchTrainer, "update_queue") +def test_downsample_trigger_training_set_label_by_label( + test_update_queue, + test_inform_end_of_current_label, + test_inform_samples, + test_get_embeddings, + test_end_embedding_recording, + test_start_embedding_recording, + test_local_key_source, + test_local_dataset_writer, + test_prepare_per_class_dataloader, + test_get_available_labels, + test_selector_key_source, + dummy_system_config: ModynConfig, +): + batch_size = 4 + available_labels = [0, 1, 2, 3, 4, 5] + test_prepare_per_class_dataloader.return_value = MockDataloader(batch_size, 100) + test_get_available_labels.return_value = available_labels + num_batches = 100 # hardcoded into mock dataloader + query_status_queue_training = mp.Queue() + status_queue_training = mp.Queue() + trainer = get_mock_trainer( + dummy_system_config, + query_status_queue_training, + status_queue_training, + False, + False, + None, + 2, + "custom", + False, + batch_size=batch_size, + selection_strategy=( + True, + "RemoteGradMatchDownsamplingStrategy", + { + "downsampling_ratio": 25, + "downsampling_period": 1, + "sample_then_batch": True, + 
"balance": True, + "ratio_max": 100, + }, + ), + ) + assert trainer._downsampling_mode == DownsamplingMode.SAMPLE_THEN_BATCH + assert trainer._downsampler.requires_data_label_by_label + trainer.downsample_trigger_training_set() + assert test_prepare_per_class_dataloader.call_count == 1 + assert test_update_queue.call_count == len(available_labels) * num_batches + 1 + # check the args of the last call + last_call_args = test_update_queue.call_args_list[-1] + expected_batch_number = len(available_labels) * num_batches + expected_num_samples = expected_batch_number * batch_size + assert last_call_args == call("DOWNSAMPLING", expected_batch_number, expected_num_samples, training_active=True) + assert test_inform_end_of_current_label.call_count == len(available_labels) + + +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.SelectorKeySource") +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.LocalDatasetWriter") +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.LocalKeySource") +@patch.object(PytorchTrainer, "start_embedding_recording_if_needed") +@patch.object(PytorchTrainer, "end_embedding_recorder_if_needed") +@patch.object(PytorchTrainer, "get_embeddings_if_recorded") +@patch.object(RemoteGradMatchDownsamplingStrategy, "inform_samples") +@patch.object(RemoteGradMatchDownsamplingStrategy, "select_points", return_value=([1, 2], torch.ones(2))) +@patch.object(PytorchTrainer, "update_queue") +def test_downsample_trigger_training_set( + test_update_queue, + test_select_points, + test_inform_samples, + test_get_embeddings, + test_end_embedding_recording, + test_start_embedding_recording, + test_local_key_source, + test_local_dataset_writer, + test_selector_key_source, + dummy_system_config: ModynConfig, +): + batch_size = 4 + num_batches = 100 # hardcoded into mock dataloader + query_status_queue_training = mp.Queue() + status_queue_training = mp.Queue() + trainer = get_mock_trainer( + dummy_system_config, + query_status_queue_training, + status_queue_training, + False, + False, + None, + 2, + "custom", + False, + batch_size=batch_size, + selection_strategy=( + True, + "RemoteGradMatchDownsamplingStrategy", + { + "downsampling_ratio": 25, + "downsampling_period": 1, + "sample_then_batch": True, + "balance": False, + "ratio_max": 100, + }, + ), + ) + assert trainer._downsampling_mode == DownsamplingMode.SAMPLE_THEN_BATCH + assert not trainer._downsampler.requires_data_label_by_label + trainer.downsample_trigger_training_set() + assert test_update_queue.call_count == num_batches + 1 + # check the args of the last call + last_call_args = test_update_queue.call_args_list[-1] + expected_batch_number = num_batches + expected_num_samples = expected_batch_number * batch_size + assert last_call_args == call("DOWNSAMPLING", expected_batch_number, expected_num_samples, training_active=True) diff --git a/modyn/trainer_server/internal/trainer/pytorch_trainer.py b/modyn/trainer_server/internal/trainer/pytorch_trainer.py index b4a755765..82d3da8d6 100644 --- a/modyn/trainer_server/internal/trainer/pytorch_trainer.py +++ b/modyn/trainer_server/internal/trainer/pytorch_trainer.py @@ -211,7 +211,6 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches self._info("Handled OnBegin Callbacks.") self._log["epochs"] = [] - batch_number = -1 if self.num_samples_to_pass == 0: epoch_num_generator: Iterable[int] = range(self.epochs_per_trigger) else: @@ -236,30 +235,33 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches 
batch_accumulator = BatchAccumulator(self._batch_size // post_downsampling_size, self._device) trained_batches = 0 + passed_batches = 0 for epoch in epoch_num_generator: stopw = Stopwatch() # Reset timings per epoch self._log["epochs"].append({}) batch_timings = [] if self._sample_then_batch_this_epoch(epoch): - self.update_queue("TRAINING", batch_number, self._num_samples, training_active=False) + self.update_queue( + "TRAINING", trained_batches, trained_batches * self._batch_size, training_active=False + ) with GPUMeasurement(self._measure_gpu_ops, "DownsampleSTB", self._device, stopw): self.downsample_trigger_training_set() stopw.start("IndivFetchBatch", overwrite=True) stopw.start("FetchBatch", resume=True) - for batch_number, batch in enumerate(self._train_dataloader): + for batch in self._train_dataloader: stopw.stop("FetchBatch") batch_timings.append(stopw.stop("IndivFetchBatch")) retrieve_weights_from_dataloader, weighted_optimization = self.weights_handling(len(batch)) stopw.start("OnBatchBeginCallbacks", resume=True) for _, callback in self._callbacks.items(): - callback.on_batch_begin(self._model.model, self._optimizers, batch, batch_number) + callback.on_batch_begin(self._model.model, self._optimizers, batch, passed_batches) stopw.stop() - self.update_queue("TRAINING", batch_number, self._num_samples, training_active=True) - + self.update_queue("TRAINING", trained_batches, trained_batches * self._batch_size, training_active=True) + passed_batches += 1 with GPUMeasurement(self._measure_gpu_ops, "PreprocessBatch", self._device, stopw, resume=True): sample_ids, target, data = self.preprocess_batch(batch, stopw) @@ -285,6 +287,7 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches data, sample_ids, target, weights = batch_accumulator.get_accumulated_batch() self._assert_data_size(self._batch_size, data, sample_ids, target) + with GPUMeasurement(self._measure_gpu_ops, "Forward", self._device, stopw, resume=True): output = self._model.model(data) @@ -299,7 +302,7 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches stopw.start("OnBatchBeforeUpdate", resume=True) for _, callback in self._callbacks.items(): callback.on_batch_before_update( - self._model.model, self._optimizers, batch_number, sample_ids, data, target, output, loss + self._model.model, self._optimizers, trained_batches, sample_ids, data, target, output, loss ) stopw.stop() @@ -315,10 +318,10 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches self._step_lr_if_necessary(True) - if self._checkpoint_interval > 0 and batch_number % self._checkpoint_interval == 0: + if self._checkpoint_interval > 0 and trained_batches % self._checkpoint_interval == 0: stopw.start("Checkpoint", resume=True) - checkpoint_file_name = self._checkpoint_path / f"model_{batch_number}.modyn" - self.save_state(checkpoint_file_name, batch_number) + checkpoint_file_name = self._checkpoint_path / f"model_{trained_batches}.modyn" + self.save_state(checkpoint_file_name, trained_batches) stopw.stop("Checkpoint") self._num_samples += self._batch_size @@ -326,7 +329,7 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches stopw.start("OnBatchEnd", resume=True) for _, callback in self._callbacks.items(): callback.on_batch_end( - self._model.model, self._optimizers, batch_number, sample_ids, data, target, output, loss + self._model.model, self._optimizers, trained_batches, sample_ids, data, target, output, loss ) stopw.stop() if 0 < 
self.num_samples_to_pass <= self._num_samples: @@ -376,10 +379,11 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches total_stopw.stop("TotalTrain") - self._info(f"Finished training: {self._num_samples} samples, {batch_number + 1} batches.") + self._info(f"Finished training: {self._num_samples} samples, {passed_batches} batches.") self._log["num_samples"] = self._num_samples self._log["num_samples_trained"] = trained_batches * self._batch_size - self._log["num_batches"] = batch_number + 1 + self._log["num_batches"] = passed_batches + self._log["num_batches_trained"] = trained_batches self._log["total_train"] = total_stopw.measurements.get("TotalTrain", 0) self._assert_training_size(epoch, trained_batches) @@ -387,7 +391,7 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches self._persist_pipeline_log() for _, callback in self._callbacks.items(): - callback.on_train_end(self._model.model, self._optimizers, self._num_samples, batch_number) + callback.on_train_end(self._model.model, self._optimizers, self._num_samples, passed_batches) for metric in self._callbacks: self._metadata_collector.send_metadata(metric) @@ -435,7 +439,7 @@ def downsample_trigger_training_set(self) -> None: available_labels = self.get_available_labels_from_selector() number_of_samples = 0 - batch_number = 0 + batch_number = -1 first_label = True for label in available_labels: if first_label: @@ -480,7 +484,7 @@ def downsample_trigger_training_set(self) -> None: ) self._train_dataloader.dataset.change_key_source(new_key_source) - self.update_queue("DOWNSAMPLING", batch_number, number_of_samples, training_active=True) + self.update_queue("DOWNSAMPLING", batch_number + 1, number_of_samples, training_active=True) # set the model to train self._model.model.train() @@ -863,16 +867,16 @@ def _sample_then_batch_this_epoch(self, epoch: int) -> bool: def _iterate_dataloader_and_compute_scores( self, dataloader: torch.utils.data.DataLoader, - previous_batch_number: int = 0, + previous_batch_number: int = -1, previous_number_of_samples: int = 0, ) -> Tuple[int, int]: """ Function to iterate a dataloader, compute the forward pass and send the forward output to the downsampler. Args: dataloader: torch.dataloader to get the data - previous_batch_number: number of batches processed before calling this function. Useful when this function - is called several times to keep track of previous invocations (ex label by label dataloader). We need to - have a total to correctly update the queue and show the progress in the supervisor counter. + previous_batch_number: The batch number returned from the last call to this method. Useful when this + function is called several times to keep track of previous invocations (ex label by label dataloader). We + need to have a total to correctly update the queue and show the progress in the supervisor counter. previous_number_of_samples: number of samples processed before calling this function. See above for the use. Returns: @@ -880,9 +884,9 @@ def _iterate_dataloader_and_compute_scores( """ number_of_samples = previous_number_of_samples batch_number = previous_batch_number - for batch_number, batch in enumerate(dataloader): + for batch in dataloader: self.update_queue("DOWNSAMPLING", batch_number, number_of_samples, training_active=False) - + batch_number += 1 sample_ids, target, data = self.preprocess_batch(batch) number_of_samples += len(sample_ids)