From 4d785444f6da924217fc6d3380fb880b99d79f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= <2116466+MaxiBoether@users.noreply.github.com> Date: Sat, 22 Jun 2024 15:26:02 +0200 Subject: [PATCH 1/4] Implement `ratio_max` scaling for downsamplers (#541) Before, we only were able to enter ratios from 0-100 as percentages for downsamplers. With this PR, we allow scaling by a ratio_max factor. --- modyn/config/schema/pipeline/config.py | 29 +++++++++++++++++-- .../pipeline/sampling/downsampling_config.py | 21 ++++++++++++-- .../abstract_downsampling_strategy.py | 2 ++ .../test_rho_loss_downsampling_strategy.py | 1 + .../downsampling_strategies/test_scheduler.py | 6 ++++ ...t_abstract_matrix_downsampling_strategy.py | 1 + ...t_abstract_remote_downsampling_strategy.py | 2 +- .../test_craig_remote_downsampling.py | 7 +++-- ..._remote_gradmatch_downsampling_strategy.py | 5 ++-- .../test_remote_gradnorm_downsample.py | 7 +++-- ...st_remote_kcenter_downsampling_strategy.py | 5 ++-- .../test_remote_loss_downsample.py | 8 ++--- .../test_remote_rho_loss_downsampling.py | 1 + .../test_remote_rs2_downsampling.py | 12 ++++---- ...remote_submodular_downsampling_strategy.py | 2 ++ ...emote_uncertainty_downsampling_strategy.py | 1 + .../internal/trainer/test_pytorch_trainer.py | 13 ++++++--- modyn/tests/utils/test_utils.py | 2 +- .../internal/trainer/pytorch_trainer.py | 9 ++++-- .../abstract_matrix_downsampling_strategy.py | 2 +- .../abstract_remote_downsampling_strategy.py | 1 + .../remote_craig_downsampling.py | 2 +- .../remote_gradnorm_downsampling.py | 2 +- .../remote_loss_downsampling.py | 2 +- .../remote_rho_loss_downsampling.py | 2 +- .../remote_rs2_downsampling.py | 2 +- ...emote_uncertainty_downsampling_strategy.py | 2 +- 27 files changed, 109 insertions(+), 40 deletions(-) diff --git a/modyn/config/schema/pipeline/config.py b/modyn/config/schema/pipeline/config.py index 4a1a95851..f7031224f 100644 --- a/modyn/config/schema/pipeline/config.py +++ b/modyn/config/schema/pipeline/config.py @@ -1,15 +1,16 @@ from __future__ import annotations -from typing import Optional +from typing import Optional, Self from modyn.config.schema.base_model import ModynBaseModel -from pydantic import Field +from pydantic import Field, model_validator from .data import DataConfig from .evaluation.config import EvaluationConfig from .model import ModelConfig from .model_storage import PipelineModelStorageConfig -from .sampling.config import SelectionStrategy +from .sampling.config import CoresetStrategyConfig, SelectionStrategy +from .sampling.downsampling_config import MultiDownsamplingConfig from .training import TrainingConfig from .trigger import TriggerConfig @@ -32,3 +33,25 @@ class ModynPipelineConfig(ModynBaseModel): trigger: TriggerConfig selection_strategy: SelectionStrategy evaluation: EvaluationConfig | None = Field(None) + + @model_validator(mode="after") + def validate_bts_training_selection_works(self) -> Self: + # Validates that when using Downsampling with BtS, we choose a functional ratio + if isinstance(self.selection_strategy, CoresetStrategyConfig) and not isinstance( + self.selection_strategy.downsampling_config, MultiDownsamplingConfig + ): + if not self.selection_strategy.downsampling_config.sample_then_batch: # bts + ratio = self.selection_strategy.downsampling_config.ratio + ratio_max = self.selection_strategy.downsampling_config.ratio_max + batch_size = self.training.batch_size + + post_downsampling_size = max((ratio * batch_size) // ratio_max, 1) + if batch_size % 
post_downsampling_size != 0: + raise ValueError( + f"The target batch size of {batch_size} is not a multiple of the batch size " + + f"after downsampling with ratio {ratio} a batch in BtS mode ({post_downsampling_size}). " + + "We cannot accumulate batches. " + + "Please choose the downsampling ratio and batch size such that this is possible." + ) + + return self diff --git a/modyn/config/schema/pipeline/sampling/downsampling_config.py b/modyn/config/schema/pipeline/sampling/downsampling_config.py index 9db2004f3..8c54e58a4 100644 --- a/modyn/config/schema/pipeline/sampling/downsampling_config.py +++ b/modyn/config/schema/pipeline/sampling/downsampling_config.py @@ -20,9 +20,20 @@ class BaseDownsamplingConfig(ModynBaseModel): ), ) ratio: int = Field( - description="Ratio post_sampling_size/pre_sampling_size. E.g. with 160 records and a ratio of 50 we keep 80.", + description=( + "Ratio post_sampling_size/pre_sampling_size * ratio_max. " + "For the default of ratio_max of 100, this implies percent, " + "e.g., with 160 records and a ratio of 50 we keep 80." + ), min=0, - max=100, + ) + ratio_max: int = Field( + description=( + "Reference maximum ratio value. Defaults to 100, which implies percent." + " If you set this to 1000, ratio describes promille instead." + ), + default=100, + min=1, ) period: int = Field( 1, @@ -34,6 +45,12 @@ class BaseDownsamplingConfig(ModynBaseModel): min=0, ) + @model_validator(mode="after") + def validate_ratio(self) -> Self: + if self.ratio > self.ratio_max: + raise ValueError("ratio cannot be greater than ratio_max.") + return self + class UncertaintyDownsamplingConfig(BaseDownsamplingConfig): """Config for the Craig downsampling strategy.""" diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py index 584423a31..a8dae1333 100644 --- a/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py @@ -38,6 +38,7 @@ def __init__( self.downsampling_period = downsampling_config.period self.downsampling_ratio = downsampling_config.ratio + self.ratio_max = downsampling_config.ratio_max self.requires_remote_computation = True self.maximum_keys_in_memory = maximum_keys_in_memory @@ -60,6 +61,7 @@ def _compute_status_bar_scale(self) -> int: def downsampling_params(self) -> dict: config = { "downsampling_ratio": self.downsampling_ratio, + "ratio_max": self.ratio_max, "maximum_keys_in_memory": self.maximum_keys_in_memory, "sample_then_batch": self.downsampling_mode == DownsamplingMode.SAMPLE_THEN_BATCH, } diff --git a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_rho_loss_downsampling_strategy.py b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_rho_loss_downsampling_strategy.py index 4249db700..b8664688e 100644 --- a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_rho_loss_downsampling_strategy.py +++ b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_rho_loss_downsampling_strategy.py @@ -301,6 +301,7 @@ def test_downsampling_params(il_training_config: ILTrainingConfig, data_config: expected = { "downsampling_ratio": 60, + "ratio_max": 100, "maximum_keys_in_memory": maximum_keys_in_memory, "sample_then_batch": False, "il_model_id": 3, diff 
--git a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py index e07cfba02..e3783c66b 100644 --- a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py +++ b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py @@ -86,6 +86,7 @@ def test_switch_functions(): "downsampling_ratio": 50, "maximum_keys_in_memory": 1000, "sample_then_batch": True, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteLossDownsampling" assert downs.training_status_bar_scale == 50 @@ -98,6 +99,7 @@ def test_switch_functions(): "downsampling_ratio": 25, "maximum_keys_in_memory": 1000, "sample_then_batch": False, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" assert downs.training_status_bar_scale == 100 @@ -140,6 +142,7 @@ def test_double_threshold(): "downsampling_ratio": 50, "maximum_keys_in_memory": 1000, "sample_then_batch": True, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteLossDownsampling" assert downs.training_status_bar_scale == 50 @@ -152,6 +155,7 @@ def test_double_threshold(): "downsampling_ratio": 25, "maximum_keys_in_memory": 1000, "sample_then_batch": False, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" assert downs.training_status_bar_scale == 100 @@ -179,6 +183,7 @@ def test_wrong_trigger(): "downsampling_ratio": 50, "maximum_keys_in_memory": 1000, "sample_then_batch": True, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteLossDownsampling" assert downs.training_status_bar_scale == 50 @@ -195,6 +200,7 @@ def test_wrong_trigger(): "downsampling_ratio": 25, "maximum_keys_in_memory": 1000, "sample_then_batch": False, + "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" assert downs.training_status_bar_scale == 100 diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py index 7ce654f55..4745b7f09 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py @@ -19,6 +19,7 @@ def get_sampler_config(dummy_system_config: ModynConfig, balance=False): "sample_then_batch": False, "args": {}, "balance": balance, + "ratio_max": 100, } return ( 0, diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py index 899cd507b..8a428e8cc 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py @@ -11,7 +11,7 @@ def test_batch_then_sample_general(dummy_system_config: ModynConfig): downsampling_ratio = 50 - params_from_selector = {"downsampling_ratio": downsampling_ratio} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "ratio_max": 100} sampler = AbstractRemoteDownsamplingStrategy( 154, 128, 64, params_from_selector, 
dummy_system_config.model_dump(by_alias=True), "cpu" ) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py index 16891d54c..87719cfef 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py @@ -20,6 +20,7 @@ def get_sampler_config(modyn_config, balance=False): "balance": balance, "selection_batch": 64, "greedy": "NaiveGreedy", + "ratio_max": 100, } return 0, 0, 0, params_from_selector, modyn_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" @@ -347,7 +348,7 @@ def test_matching_results_with_deepcore(dummy_system_config: ModynConfig): 0, 0, 5, - {"downsampling_ratio": 20, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy"}, + {"downsampling_ratio": 20, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy", "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", @@ -402,7 +403,7 @@ def test_matching_results_with_deepcore_permutation(dummy_system_config: ModynCo 0, 0, 5, - {"downsampling_ratio": 30, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy"}, + {"downsampling_ratio": 30, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy", "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", @@ -461,7 +462,7 @@ def test_matching_results_with_deepcore_permutation_fancy_ids(dummy_system_confi 0, 0, 5, - {"downsampling_ratio": 50, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy"}, + {"downsampling_ratio": 50, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy", "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py index 70eabe39d..d6355eee7 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py @@ -19,6 +19,7 @@ def get_sampler_config(modyn_config: ModynConfig, balance=False): "sample_then_batch": False, "args": {}, "balance": balance, + "ratio_max": 100, } return 0, 0, 0, params_from_selector, modyn_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" @@ -185,7 +186,7 @@ def test_matching_results_with_deepcore(dummy_system_config: ModynConfig): 0, 0, 5, - {"downsampling_ratio": 10 * num_of_target_samples, "balance": False}, + {"downsampling_ratio": 10 * num_of_target_samples, "balance": False, "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", @@ -237,7 +238,7 @@ def test_matching_results_with_deepcore_permutation_fancy_ids(dummy_system_confi 0, 0, 5, - {"downsampling_ratio": 50, "balance": False}, + {"downsampling_ratio": 50, "balance": False, "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", diff --git 
a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py index c6fe03829..a3211b7af 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py @@ -14,7 +14,7 @@ def test_sample_shape_ce(dummy_system_config: ModynConfig): downsampling_ratio = 50 per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteGradNormDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) @@ -45,7 +45,7 @@ def test_sample_shape_other_losses(dummy_system_config: ModynConfig): downsampling_ratio = 50 per_sample_loss_fct = torch.nn.BCEWithLogitsLoss(reduction="none") - params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteGradNormDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) @@ -84,6 +84,7 @@ def test_sampling_crossentropy(dummy_system_config: ModynConfig): "downsampling_ratio": downsampling_ratio, "replacement": False, "sample_then_batch": False, + "ratio_max": 100, } # Here we use autograd since the number of classes is not provided @@ -135,7 +136,7 @@ def test_sample_dict_input(dummy_system_config: ModynConfig): model = DictLikeModel() per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - params_from_selector = {"downsampling_ratio": 50, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": 50, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteGradNormDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py index 9b39754bd..1779a12e1 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py @@ -17,6 +17,7 @@ def get_sampler_config(modyn_config: ModynConfig, balance=False): "sample_then_batch": False, "args": {}, "balance": balance, + "ratio_max": 100, } return 0, 0, 0, params_from_selector, modyn_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" @@ -137,7 +138,7 @@ def test_matching_results_with_deepcore(dummy_system_config: ModynConfig): 0, 0, 5, - {"downsampling_ratio": 10 * num_of_target_samples, "balance": False}, + {"downsampling_ratio": 10 * num_of_target_samples, "balance": False, "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", @@ -166,7 +167,7 @@ def test_matching_results_with_deepcore_permutation_fancy_ids(dummy_system_confi 0, 0, 5, - {"downsampling_ratio": 50, "balance": False}, + 
{"downsampling_ratio": 50, "balance": False, "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), "cpu", diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py index d875fb930..3a2cd27bc 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py @@ -12,7 +12,7 @@ def test_sample_shape(dummy_system_config: ModynConfig): downsampling_ratio = 50 per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteLossDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) @@ -39,7 +39,7 @@ def test_sample_weights(dummy_system_config: ModynConfig): downsampling_ratio = 50 per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteLossDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) @@ -67,7 +67,7 @@ def test_sample_loss_dependent_sampling(dummy_system_config: ModynConfig): downsampling_ratio = 50 per_sample_loss_fct = torch.nn.MSELoss(reduction="none") - params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteLossDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) @@ -116,7 +116,7 @@ def test_sample_dict_input(dummy_system_config: ModynConfig): mymodel = DictLikeModel() per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - params_from_selector = {"downsampling_ratio": 50, "sample_then_batch": False} + params_from_selector = {"downsampling_ratio": 50, "sample_then_batch": False, "ratio_max": 100} sampler = RemoteLossDownsampling( 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" ) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rho_loss_downsampling.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rho_loss_downsampling.py index 7091c330e..a159884ce 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rho_loss_downsampling.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rho_loss_downsampling.py @@ -29,6 +29,7 @@ def dummy_init_params(dummy_system_config: ModynConfig): "il_model_id": 2, "downsampling_ratio": 50, "sample_then_batch": False, + "ratio_max": 100, } modyn_config = dummy_system_config.model_dump(by_alias=True) per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rs2_downsampling.py 
b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rs2_downsampling.py index a1dd4d836..c5e764d3b 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rs2_downsampling.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_rs2_downsampling.py @@ -10,7 +10,7 @@ def test_init(dummy_system_config: ModynConfig): pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": True, "downsampling_ratio": 50} + params_from_selector = {"replacement": True, "downsampling_ratio": 50, "ratio_max": 100} per_sample_loss = None device = "cpu" @@ -41,7 +41,7 @@ def test_inform_samples(dummy_system_config: ModynConfig): pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": True, "downsampling_ratio": 50} + params_from_selector = {"replacement": True, "downsampling_ratio": 50, "ratio_max": 100} per_sample_loss = None device = "cpu" @@ -76,7 +76,7 @@ def test_multiple_epochs_with_replacement(dummy_system_config: ModynConfig): pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": True, "downsampling_ratio": 50} + params_from_selector = {"replacement": True, "downsampling_ratio": 50, "ratio_max": 100} per_sample_loss = None device = "cpu" @@ -110,7 +110,7 @@ def test_multiple_epochs_without_replacement(dummy_system_config: ModynConfig): pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": False, "downsampling_ratio": 50} + params_from_selector = {"replacement": False, "downsampling_ratio": 50, "ratio_max": 100} per_sample_loss = None device = "cpu" @@ -171,7 +171,7 @@ def test_multiple_epochs_without_replacement_leftover_data(dummy_system_config: pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": False, "downsampling_ratio": 40} + params_from_selector = {"replacement": False, "downsampling_ratio": 40, "ratio_max": 100} per_sample_loss = None device = "cpu" @@ -207,7 +207,7 @@ def test_multiple_epochs_empty_without_replacement_leftover_data(dummy_system_co pipeline_id = 0 trigger_id = 0 batch_size = 32 - params_from_selector = {"replacement": False, "downsampling_ratio": 40} + params_from_selector = {"replacement": False, "downsampling_ratio": 40, "ratio_max": 100} per_sample_loss = None device = "cpu" diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py index 9adca1d03..6cc0366cc 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py @@ -19,6 +19,7 @@ def get_sampler_config(modyn_config: ModynConfig, submodular: str = "GraphCut", "submodular_function": submodular, "balance": balance, "selection_batch": 64, + "ratio_max": 100, } return 0, 0, 0, params_from_selector, modyn_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" @@ -124,6 +125,7 @@ def _get_selected_samples( "submodular_function": submodular, "balance": False, "selection_batch": 64, + "ratio_max": 100, }, modyn_config.model_dump(by_alias=True), BCEWithLogitsLoss(reduction="none"), diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py 
b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py index fab0b6d17..92013d235 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py @@ -18,6 +18,7 @@ def sampler_config(dummy_system_config: ModynConfig, request): "args": {}, "balance": False, "score_metric": request.param, + "ratio_max": 100, } return 0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu" diff --git a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py index 455e557c0..52527184d 100644 --- a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py +++ b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py @@ -291,6 +291,7 @@ def get_mock_trainer( batch_size: int = 32, downsampling_mode: DownsamplingMode = DownsamplingMode.DISABLED, downsampling_ratio: int = 25, + ratio_max: int = 100, ): model_dynamic_module_patch.return_value = MockModule(num_optimizers) lr_scheduler_dynamic_module_patch.return_value = MockLRSchedulerModule() @@ -300,7 +301,7 @@ def get_mock_trainer( mock_selection_strategy.return_value = ( True, "RemoteGradNormDownsampling", - {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False}, + {"downsampling_ratio": downsampling_ratio, "ratio_max": ratio_max, "sample_then_batch": False}, ) elif downsampling_mode == DownsamplingMode.SAMPLE_THEN_BATCH: raise NotImplementedError() @@ -868,6 +869,7 @@ def test_create_trainer_with_exception( assert pathlib.Path(temp.name).exists() +@pytest.mark.parametrize("downsampling_ratio, ratio_max", [(25, 100), (50, 100), (250, 1000), (125, 1000)]) @patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) @patch.object(BaseCallback, "on_train_begin", return_value=None) @patch.object(BaseCallback, "on_train_end", return_value=None) @@ -893,10 +895,11 @@ def test_train_batch_then_sample_accumulation( test_on_train_end, test_on_train_begin, dummy_system_config: ModynConfig, + downsampling_ratio, + ratio_max, ): num_batches = 100 # hardcoded into mock dataloader batch_size = 32 - downsampling_ratio = 25 query_status_queue = mp.Queue() status_queue = mp.Queue() @@ -913,11 +916,12 @@ def test_train_batch_then_sample_accumulation( batch_size=batch_size, downsampling_mode=DownsamplingMode.BATCH_THEN_SAMPLE, downsampling_ratio=downsampling_ratio, + ratio_max=ratio_max, ) assert trainer._downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE # Mock the downsample_batch method to return batches of the expected size - expected_bts_size = int(batch_size * (downsampling_ratio / 100.0)) + expected_bts_size = int(batch_size * (downsampling_ratio / ratio_max)) bts_accumulate_period = batch_size // expected_bts_size def mock_downsample_batch(data, sample_ids, target): @@ -945,7 +949,8 @@ def mock_forward(data): assert trainer._num_samples == batch_size * num_batches assert trainer._log["num_samples"] == batch_size * num_batches - assert trainer._log["num_samples_trained"] == expected_bts_size * num_batches + # We only train on whole batches, hence we have to scale by batch size + assert trainer._log["num_samples_trained"] == ((expected_bts_size * num_batches) // batch_size) * batch_size assert test_on_batch_begin.call_count 
== len(trainer._callbacks) * num_batches assert test_on_batch_end.call_count == len(trainer._callbacks) * num_batches assert test_downsample_batch.call_count == num_batches diff --git a/modyn/tests/utils/test_utils.py b/modyn/tests/utils/test_utils.py index 8ce245719..9b22dd841 100644 --- a/modyn/tests/utils/test_utils.py +++ b/modyn/tests/utils/test_utils.py @@ -189,7 +189,7 @@ def test_instantiate_class_existing(dummy_system_config: ModynConfig): 10, 11, 64, - {"downsampling_ratio": 67}, + {"downsampling_ratio": 67, "ratio_max": 100}, dummy_system_config.model_dump(by_alias=True), {}, "cpu", diff --git a/modyn/trainer_server/internal/trainer/pytorch_trainer.py b/modyn/trainer_server/internal/trainer/pytorch_trainer.py index dc6f73df4..b4a755765 100644 --- a/modyn/trainer_server/internal/trainer/pytorch_trainer.py +++ b/modyn/trainer_server/internal/trainer/pytorch_trainer.py @@ -223,7 +223,9 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches # assertion since model validation by pydantic should catch this. assert self._downsampler.supports_bts, "The downsampler does not support batch then sample" # We cannot pass the target size from the trainer server since that depends on StB vs BtS. - post_downsampling_size = max((self._downsampler.downsampling_ratio * self._batch_size) // 100, 1) + post_downsampling_size = max( + (self._downsampler.downsampling_ratio * self._batch_size) // self._downsampling_ratio_max, 1 + ) assert post_downsampling_size < self._batch_size if self._batch_size % post_downsampling_size != 0: raise ValueError( @@ -727,6 +729,7 @@ def _setup_downsampling( self._downsampler = self._instantiate_downsampler( strategy_name, downsampler_config, modyn_config, self._criterion_nored ) + self._downsampling_ratio_max = downsampler_config["ratio_max"] assert "sample_then_batch" in downsampler_config self._log["received_downsampler_config"] = downsampler_config if downsampler_config["sample_then_batch"]: @@ -833,7 +836,9 @@ def _calc_expected_sizes(self, downsampling_enabled: bool) -> None: ) # scale up again to multiples of batch size if downsampling_enabled: - num_samples_per_epoch = max((self._downsampler.downsampling_ratio * num_samples_per_epoch) // 100, 1) + num_samples_per_epoch = max( + (self._downsampler.downsampling_ratio * num_samples_per_epoch) // self._downsampling_ratio_max, 1 + ) self._expected_num_batches = (num_samples_per_epoch // self._batch_size) * self.epochs_per_trigger self._expected_num_epochs = self.epochs_per_trigger diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py index 440c715d2..51e7a3794 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py @@ -110,7 +110,7 @@ def select_points(self) -> tuple[list[int], torch.Tensor]: def _select_from_matrix(self) -> tuple[list[int], torch.Tensor]: matrix = np.concatenate(self.matrix_elements) number_of_samples = len(matrix) - target_size = max(int(self.downsampling_ratio * number_of_samples / 100), 1) + target_size = max(int(self.downsampling_ratio * number_of_samples / self.ratio_max), 1) selected_indices, weights = self._select_indexes_from_matrix(matrix, target_size) selected_ids = [self.index_sampleid_map[index] for index in selected_indices] return selected_ids, 
weights diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py index 600119ec7..e1dc49d28 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py @@ -49,6 +49,7 @@ def __init__( assert "downsampling_ratio" in params_from_selector self.downsampling_ratio = params_from_selector["downsampling_ratio"] + self.ratio_max = params_from_selector["ratio_max"] # The next variable is used to keep a mapping index <-> sample_id # This is needed since the data selection policy works on indexes (the policy does not care what the sample_id diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py index 0ea0102bd..fee662dad 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py @@ -178,7 +178,7 @@ def select_points(self) -> tuple[list[int], torch.Tensor]: def _select_points_from_distance_matrix(self) -> tuple[list[int], torch.Tensor]: number_of_samples = self.distance_matrix.shape[0] - target_size = max(int(self.downsampling_ratio * number_of_samples / 100), 1) + target_size = max(int(self.downsampling_ratio * number_of_samples / self.ratio_max), 1) all_index = np.arange(number_of_samples) submod_function = FacilityLocation(index=all_index, similarity_matrix=self.distance_matrix) diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py index fd3e737a5..ac456f6a6 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py @@ -79,7 +79,7 @@ def select_points(self) -> tuple[list[int], torch.Tensor]: return [], torch.Tensor([]) # select always at least 1 point - target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / 100), 1) + target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / self.ratio_max), 1) probabilities = torch.cat(self.probabilities, dim=0) probabilities = probabilities / probabilities.sum() diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py index e7c4fa866..fd0906e08 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py @@ -61,7 +61,7 @@ def select_points(self) -> tuple[list[int], torch.Tensor]: return [], torch.Tensor([]) # select always at least 1 point - target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / 100), 1) + target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / self.ratio_max), 1) probabilities = torch.cat(self.probabilities, dim=0) probabilities = probabilities / probabilities.sum() diff --git 
a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rho_loss_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rho_loss_downsampling.py index c1ccb84f2..4cdb2d482 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rho_loss_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rho_loss_downsampling.py @@ -58,7 +58,7 @@ def inform_samples( self.number_of_points_seen += forward_output.shape[0] def select_points(self) -> tuple[list[int], torch.Tensor]: - target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / 100), 1) + target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / self.ratio_max), 1) # find the indices of maximal "target_size" elements in the list of rho_loss selected_indices = torch.topk(self.rho_loss, target_size).indices # use sorted() because we keep the relative order of the selected samples diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rs2_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rs2_downsampling.py index 3332fce46..706544ca2 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rs2_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_rs2_downsampling.py @@ -71,7 +71,7 @@ def _epoch_step_no_r(self, target_size: int) -> None: self._subsets = [self._all_sample_ids[i * target_size : (i + 1) * target_size] for i in range(max_subset)] def _epoch_step(self) -> None: - target_size = max(int(self.downsampling_ratio * len(self._all_sample_ids) / 100), 1) + target_size = max(int(self.downsampling_ratio * len(self._all_sample_ids) / self.ratio_max), 1) if self._with_replacement: self._epoch_step_wr(target_size) diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py index 7040569cd..da0067858 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py @@ -119,7 +119,7 @@ def select_points(self) -> tuple[list[int], torch.Tensor]: def _select_from_scores(self) -> tuple[list[int], torch.Tensor]: number_of_samples = len(self.scores) - target_size = max(int(self.downsampling_ratio * number_of_samples / 100), 1) + target_size = max(int(self.downsampling_ratio * number_of_samples / self.ratio_max), 1) selected_indices, weights = self._select_indexes_from_scores(target_size) selected_ids = [self.index_sampleid_map[index] for index in selected_indices] return selected_ids, weights From bf96bfa546c829a4844f25e0143497deb9539e04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20B=C3=B6ther?= <2116466+MaxiBoether@users.noreply.github.com> Date: Sat, 22 Jun 2024 22:12:10 +0200 Subject: [PATCH 2/4] Add `ratio_max` for presampling strategies (#544) I realized we also need to set `ratio_max` for presampling strategies, otherwise they cannot run 12.5% as well. 
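To make the new semantics concrete, the following is an illustrative, self-contained sketch
(the helper name `target_size` is chosen here for exposition and is not part of the Modyn API)
of how a ratio/ratio_max pair is turned into a sample count. It mirrors the arithmetic used by
the remote downsamplers and the BtS validator in these patches; the presampling variant floors
in the same way but does not clamp to a minimum of one sample:

    def target_size(num_samples: int, ratio: int, ratio_max: int = 100) -> int:
        # ratio is interpreted relative to ratio_max: with the default ratio_max=100
        # it is a percentage, with ratio_max=1000 it is a permille value, and so on.
        # The config validators added in these patches reject ratio > ratio_max.
        assert 0 <= ratio <= ratio_max
        # Integer division, but always keep at least one sample (downsampler behavior).
        return max((num_samples * ratio) // ratio_max, 1)

    # 12.5% of 128 samples is only expressible with a finer-grained ratio_max:
    assert target_size(128, 125, ratio_max=1000) == 16
    # The default ratio_max=100 preserves the previous percent semantics:
    assert target_size(160, 50) == 80

In batch-then-sample mode the same computation yields the post-downsampling batch size
(e.g., batch_size=32, ratio=25, ratio_max=100 gives 8), and the pipeline validator added in
patch 1 additionally requires the training batch size to be a multiple of this value so that
downsampled batches can be accumulated back into full batches.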
--- .../config/schema/pipeline/sampling/config.py | 21 +++++++++++++++++-- .../abstract_presampling_strategy.py | 3 ++- .../test_random_presampling_strategy.py | 19 +++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/modyn/config/schema/pipeline/sampling/config.py b/modyn/config/schema/pipeline/sampling/config.py index 2a6c95851..28b0846cc 100644 --- a/modyn/config/schema/pipeline/sampling/config.py +++ b/modyn/config/schema/pipeline/sampling/config.py @@ -19,13 +19,30 @@ class PresamplingConfig(ModynBaseModel): "Only the prefix, i.e. without `PresamplingStrategy`, is needed." ) ratio: int = Field( - description="Percentage of points on which the metric (loss, gradient norm,..) is computed.", + description=( + "Ratio of points on which the metric (loss, gradient norm,..) is computed." + "By default with ratio_max=100, this describes the selection ratio in percent." + ), min=0, - max=100, + ) + + ratio_max: int = Field( + description=( + "Reference maximum ratio value. Defaults to 100, which implies percent." + " If you set this to 1000, ratio describes promille instead." + ), + default=100, + min=1, ) force_column_balancing: bool = Field(False) force_required_target_size: bool = Field(False) + @model_validator(mode="after") + def validate_ratio(self) -> Self: + if self.ratio > self.ratio_max: + raise ValueError("ratio cannot be greater than ratio_max.") + return self + StorageBackend = Literal["database", "local"] diff --git a/modyn/selector/internal/selector_strategies/presampling_strategies/abstract_presampling_strategy.py b/modyn/selector/internal/selector_strategies/presampling_strategies/abstract_presampling_strategy.py index adbbc7bc3..65c536b55 100644 --- a/modyn/selector/internal/selector_strategies/presampling_strategies/abstract_presampling_strategy.py +++ b/modyn/selector/internal/selector_strategies/presampling_strategies/abstract_presampling_strategy.py @@ -18,6 +18,7 @@ def __init__( self.pipeline_id = pipeline_id self._storage_backend = storage_backend self.presampling_ratio = presampling_config.ratio + self.ratio_max = presampling_config.ratio_max self.requires_trigger_dataset_size = False @abstractmethod @@ -36,7 +37,7 @@ def get_presampling_query( def get_target_size(self, trigger_dataset_size: int, limit: Optional[int]) -> int: assert trigger_dataset_size >= 0 - target_presampling = int(trigger_dataset_size * self.presampling_ratio / 100) + target_presampling = (trigger_dataset_size * self.presampling_ratio) // self.ratio_max if limit is not None: assert limit >= 0 diff --git a/modyn/tests/selector/internal/selector_strategies/presampling_strategies/test_random_presampling_strategy.py b/modyn/tests/selector/internal/selector_strategies/presampling_strategies/test_random_presampling_strategy.py index c69f499c3..429205035 100644 --- a/modyn/tests/selector/internal/selector_strategies/presampling_strategies/test_random_presampling_strategy.py +++ b/modyn/tests/selector/internal/selector_strategies/presampling_strategies/test_random_presampling_strategy.py @@ -164,3 +164,22 @@ def test_dataset_size_various_scenarios(): strat.tail_triggers = 1 trigger_size = strat._get_trigger_dataset_size() assert presampling_strat.get_target_size(trigger_size, None) == 22 # 75% of presampling + + +def test_target_size_ratio_max(): + config = get_config() + config.ratio_max = 1000 + config.ratio = 125 + strat = RandomPresamplingStrategy( + config, + get_minimal_modyn_config(), + 10, + DatabaseStorageBackend(0, get_minimal_modyn_config(), 123), + ) + assert 
strat.get_target_size(128, None) == 16 + assert strat.get_target_size(100, None) == 12 + assert strat.get_target_size(12, None) == 1 + assert strat.get_target_size(0, None) == 0 + + with pytest.raises(AssertionError): + strat.get_target_size(-1, None) From 57803eaf25f1d0e687dd39c5335fa7981062bbdc Mon Sep 17 00:00:00 2001 From: Robin Holzinger Date: Sun, 23 Jun 2024 14:09:27 +0200 Subject: [PATCH 3/4] feat: More sophisticated evaluation logic (#534) --- analytics/app/data/const.py | 3 + analytics/app/data/load.py | 2 +- analytics/app/data/transform.py | 183 +++++++------ analytics/app/pages/compare.py | 109 ++++---- analytics/app/pages/const/__init__.py | 0 analytics/app/pages/const/text.py | 12 + analytics/app/pages/pipeline.py | 87 +++---- analytics/app/pages/plots/cost_over_time.py | 27 +- .../pages/plots/cost_vs_eval_metric_agg.py | 78 +++--- analytics/app/pages/plots/eval_heatmap.py | 103 +++++--- analytics/app/pages/plots/eval_over_time.py | 44 ++-- analytics/app/pages/plots/num_samples.py | 56 ++-- .../pages/plots/num_triggers_eval_metric.py | 78 +++--- .../pages/plots/one_dimensional_comparison.py | 71 +++-- analytics/app/pages/plots/pipeline_info.py | 27 +- analytics/app/pages/state.py | 75 ++++++ analytics/tools/__init__.py | 0 analytics/tools/aggregate_runs/__init__.py | 0 .../tools/aggregate_runs/core_aggregation.py | 119 +++++++++ analytics/tools/aggregate_runs/dir_utils.py | 31 +++ analytics/tools/aggregate_runs/main.py | 43 ++++ .../aggregate_runs/pipeline_equivalence.py | 32 +++ analytics/tools/patch_logfile.ipynb | 174 +++++++++++-- dev-requirements.txt | 2 +- environment.yml | 2 +- .../internal/pipeline_executor/models.py | 242 +++++++++++------- .../pipeline_executor/pipeline_executor.py | 6 +- 27 files changed, 1127 insertions(+), 479 deletions(-) create mode 100644 analytics/app/data/const.py create mode 100644 analytics/app/pages/const/__init__.py create mode 100644 analytics/app/pages/const/text.py create mode 100644 analytics/app/pages/state.py create mode 100644 analytics/tools/__init__.py create mode 100644 analytics/tools/aggregate_runs/__init__.py create mode 100644 analytics/tools/aggregate_runs/core_aggregation.py create mode 100644 analytics/tools/aggregate_runs/dir_utils.py create mode 100644 analytics/tools/aggregate_runs/main.py create mode 100644 analytics/tools/aggregate_runs/pipeline_equivalence.py diff --git a/analytics/app/data/const.py b/analytics/app/data/const.py new file mode 100644 index 000000000..3830018cc --- /dev/null +++ b/analytics/app/data/const.py @@ -0,0 +1,3 @@ +from typing import Literal + +CompositeModelOptions = Literal["currently_active_model", "currently_trained_model"] diff --git a/analytics/app/data/load.py b/analytics/app/data/load.py index 8331119c4..49df34e87 100644 --- a/analytics/app/data/load.py +++ b/analytics/app/data/load.py @@ -27,7 +27,7 @@ def list_pipelines() -> dict[int, tuple[str, Path]]: pipelines[pipeline_id] = (pipeline_name, Path(pipeline)) - return pipelines + return dict(sorted(pipelines.items())) def load_pipeline_logs(pipeline_id: int) -> PipelineLogs: diff --git a/analytics/app/data/transform.py b/analytics/app/data/transform.py index d3206f173..1e7cb7234 100644 --- a/analytics/app/data/transform.py +++ b/analytics/app/data/transform.py @@ -3,7 +3,9 @@ import pandas as pd from modyn.supervisor.internal.grpc.enums import PipelineStage -from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs, SingleEvaluationInfo +from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs, 
SingleEvaluationInfo, StageLog +from modyn.supervisor.internal.utils.time_tools import generate_real_training_end_timestamp +from modyn.utils.utils import SECONDS_PER_UNIT AGGREGATION_FUNCTION = Literal["mean", "median", "max", "min", "sum", "std"] EVAL_AGGREGATION_FUNCTION = Literal["time_weighted_avg", "mean", "median", "max", "min", "sum", "std"] @@ -14,8 +16,9 @@ # -------------------------------------------------------------------------------------------------------------------- # -def logs_dataframe(logs: PipelineLogs) -> pd.DataFrame: +def logs_dataframe(logs: PipelineLogs, pipeline_ref: str = "pipeline") -> pd.DataFrame: df = logs.supervisor_logs.df + df["pipeline_ref"] = pipeline_ref df["duration"] = df["duration"].apply(lambda x: x.total_seconds()) convert_epoch_to_datetime(df, "sample_time") return df @@ -23,7 +26,7 @@ def logs_dataframe(logs: PipelineLogs) -> pd.DataFrame: def logs_dataframe_agg_by_stage(stage_run_df: pd.DataFrame) -> pd.DataFrame: df_agg = ( - stage_run_df.groupby(["id"] + [c for c in stage_run_df.columns if c == "pipeline_ref"]) + stage_run_df.groupby((["pipeline_ref"] if "pipeline_ref" in stage_run_df.columns else []) + ["id"]) .agg( max=("duration", "max"), min=("duration", "min"), @@ -33,77 +36,126 @@ def logs_dataframe_agg_by_stage(stage_run_df: pd.DataFrame) -> pd.DataFrame: sum=("duration", "sum"), count=("duration", "count"), ) - .reset_index() .fillna(-1) ) + df_agg.reset_index(inplace=True) return df_agg +def pipeline_stage_parents(logs: PipelineLogs) -> pd.DataFrame: + ids = [] + parents = [] + for i, (_, parent_list) in logs.pipeline_stages.items(): + if len(parent_list) == 1: + ids.append(i) + parents.append(parent_list[0]) + if len(parent_list) > 1: + if i == PipelineStage.PROCESS_NEW_DATA.name: + if logs.experiment: + ids.append(i) + parents.append(PipelineStage.REPLAY_DATA.name) + else: + ids.append(i) + parents.append(PipelineStage.FETCH_NEW_DATA.name) + else: + raise ValueError(f"Stage {i} has multiple parents: {parent_list}") + + df = pd.DataFrame({"id": ids, "parent_id": parents}) + return df + + def dfs_models_and_evals( - logs: PipelineLogs, max_sample_time: Any + logs: PipelineLogs, max_sample_time: Any, pipeline_ref: str = "pipeline" ) -> tuple[pd.DataFrame, pd.DataFrame | None, pd.DataFrame | None]: """Returns a dataframe with the stored models and the dataframe for evaluations""" # ---------------------------------------------------- MODELS ---------------------------------------------------- # - store_models = [x for x in logs.supervisor_logs.stage_runs if x.id == PipelineStage.STORE_TRAINED_MODEL.name] - df_models = pd.concat([x.df(extended=True) for x in store_models]) - # df_models.sort_values(by=["sample_time"]) + # PipelineStage.STORE_TRAINED_MODEL + df_store_models = StageLog.df( + (x for x in logs.supervisor_logs.stage_runs if x.id == PipelineStage.STORE_TRAINED_MODEL.name), extended=True + ) + df_store_models.set_index(["trigger_idx"], inplace=True) - _list_single_triggers = [ - x for x in logs.supervisor_logs.stage_runs if x.id == PipelineStage.HANDLE_SINGLE_TRIGGER.name - ] - df_single_triggers = pd.concat([x.df(extended=True) for x in _list_single_triggers]) + # PipelineStage.HANDLE_SINGLE_TRIGGER + df_single_triggers = StageLog.df( + (x for x in logs.supervisor_logs.stage_runs if x.id == PipelineStage.HANDLE_SINGLE_TRIGGER.name), extended=True + )[["trigger_idx", "trigger_id", "first_timestamp", "last_timestamp"]] + df_single_triggers.set_index(["trigger_idx"], inplace=True) - _list_single_trainings = [x for x in 
logs.supervisor_logs.stage_runs if x.id == PipelineStage.TRAIN.name] - df_single_trainings = pd.concat([x.df(extended=True) for x in _list_single_trainings]) + # PipelineStage.TRAIN + df_single_trainings = StageLog.df( + (x for x in logs.supervisor_logs.stage_runs if x.id == PipelineStage.TRAIN.name), extended=True + )[["trigger_idx", "num_batches", "num_samples"]] + df_single_trainings.set_index(["trigger_idx"], inplace=True) + + # MERGE + joined_models = df_store_models.merge( + df_single_triggers, on="trigger_idx", how="left", suffixes=("", "_trigger") + ).merge(df_single_trainings, on="trigger_idx", how="left", suffixes=("", "_training")) + + # sort models by trigger_id (we need that for the shift functions in generate_real_training_end_timestamp etc.) + joined_models.sort_index(level="trigger_idx", inplace=True) - joined_models = df_models.merge(df_single_triggers, on="trigger_idx", how="left", suffixes=("", "_trigger")).merge( - df_single_trainings, on="trigger_idx", how="left", suffixes=("", "_training") - ) joined_models["train_start"] = joined_models["first_timestamp"] joined_models["train_end"] = joined_models["last_timestamp"] + joined_models["real_train_end"] = generate_real_training_end_timestamp(joined_models) + + convert_epoch_to_datetime(joined_models, "sample_time") + convert_epoch_to_datetime(joined_models, "train_start") + convert_epoch_to_datetime(joined_models, "train_end") + convert_epoch_to_datetime(joined_models, "real_train_end") df_models = joined_models[ - [col for col in df_models.columns] + ["train_start", "train_end", "num_batches", "num_samples"] + [col for col in df_store_models.columns if col not in joined_models.index.names] + + ["train_start", "train_end", "real_train_end", "num_batches", "num_samples"] ] - convert_epoch_to_datetime(df_models, "train_start") - convert_epoch_to_datetime(df_models, "train_end") - - # sort models by trigger_id - df_models.sort_values(by=["trigger_id"], inplace=True) + df_models.reset_index(inplace=True) # model_usage period - df_models["usage_start"] = df_models["train_end"] + pd.DateOffset(seconds=1) - df_models["usage_end"] = df_models["train_end"].shift(-1) + df_models["usage_start"] = df_models["real_train_end"] + pd.DateOffset(seconds=1) + df_models["usage_end"] = df_models["real_train_end"].shift(-1) df_models["usage_end"] = df_models["usage_end"].fillna(max_sample_time) # linearize ids: + df_models["trigger_idx"] = df_models["trigger_id"] df_models["training_idx"] = df_models["training_id"] df_models["model_idx"] = df_models["id_model"] _, trigger_idx_mappings = linearize_ids(df_models, [], "training_idx") _, model_idx_mappings = linearize_ids(df_models, [], "model_idx") + df_models["pipeline_ref"] = pipeline_ref + # -------------------------------------------------- EVALUATIONS ------------------------------------------------- # - dfs_requests = [ - run.df(extended=True) - for run in logs.supervisor_logs.stage_runs - if run.id == PipelineStage.EVALUATE_SINGLE.name and run.info.failure_reason is None and run.info.eval_request - ] - dfs_metrics = [ - cast(SingleEvaluationInfo, run.info).results_df() - for run in logs.supervisor_logs.stage_runs - if run.id == PipelineStage.EVALUATE_SINGLE.name and run.info.failure_reason is None and run.info.eval_request - ] - if not dfs_requests and not dfs_metrics: - return df_models, None, None + dfs_requests = StageLog.df( + ( + run + for run in logs.supervisor_logs.stage_runs + if ( + run.id == PipelineStage.EVALUATE_SINGLE.name + and run.info.failure_reason is None + and 
run.info.eval_request + ) + ), + extended=True, + ) + + dfs_metrics = SingleEvaluationInfo.results_df( + ( + cast(SingleEvaluationInfo, run.info) + for run in logs.supervisor_logs.stage_runs + if run.id == PipelineStage.EVALUATE_SINGLE.name + and run.info.failure_reason is None + and run.info.eval_request + ) + ) - eval_requests = pd.concat(dfs_requests) - evals_metrics = pd.concat(dfs_metrics) + if dfs_requests.shape[0] == 0 or dfs_metrics.shape[0] == 0: + return df_models, None, None - for evals_df in [eval_requests, evals_metrics]: + for evals_df in [dfs_requests, dfs_metrics]: evals_df["interval_center"] = (evals_df["interval_start"] + evals_df["interval_end"]) / 2 convert_epoch_to_datetime(evals_df, "interval_start") convert_epoch_to_datetime(evals_df, "interval_end") @@ -116,11 +168,10 @@ def dfs_models_and_evals( linearize_ids(evals_df, [], "training_idx", trigger_idx_mappings) linearize_ids(evals_df, [], "model_idx", model_idx_mappings) - return df_models, eval_requests, evals_metrics - + dfs_requests["pipeline_ref"] = pipeline_ref + dfs_metrics["pipeline_ref"] = pipeline_ref -def logs_dataframe_pipeline_stage_logs(logs: PipelineLogs, stage: PipelineStage) -> pd.DateOffset: - return pd.concat([x.df(extended=True) for x in logs.supervisor_logs.stage_runs if x.id == stage.name]) + return df_models, dfs_requests, dfs_metrics # -------------------------------------------------------------------------------------------------------------------- # @@ -137,32 +188,6 @@ def leaf_stages(logs: PipelineLogs) -> list[str]: return [stage for stage in logs.pipeline_stages if stage not in referenced_as_parent] -def pipeline_stage_parents(logs: PipelineLogs) -> pd.DataFrame: - ids = [] - parents = [] - for i, (_, parent_list) in logs.pipeline_stages.items(): - if len(parent_list) == 1: - ids.append(i) - parents.append(parent_list[0]) - if len(parent_list) > 1: - if i == PipelineStage.PROCESS_NEW_DATA.name: - if logs.experiment: - ids.append(i) - parents.append(PipelineStage.REPLAY_DATA.name) - else: - ids.append(i) - parents.append(PipelineStage.FETCH_NEW_DATA.name) - else: - raise ValueError(f"Stage {i} has multiple parents: {parent_list}") - - return pd.DataFrame( - { - "id": ids, - "parent_id": parents, - } - ) - - # -------------------------------------------------------------------------------------------------------------------- # # TRANSFORM dataframe # # -------------------------------------------------------------------------------------------------------------------- # @@ -232,7 +257,13 @@ def patch_yearbook_time(df: pd.DataFrame, column: str) -> pd.DataFrame: Returns: DataFrame with patched yearbook time. 
""" - df[column] = pd.to_datetime(1930 + (df[column] - datetime.datetime(1970, 1, 1)).dt.days, format="%Y") + if df.shape[0] == 0: + df[column] = pd.to_datetime([]) + return df + delta = df[column] - pd.to_datetime("1970-01-01") + partial_years = delta.dt.seconds / SECONDS_PER_UNIT["d"] + partial_years_delta = partial_years.apply(lambda x: datetime.timedelta(seconds=x * SECONDS_PER_UNIT["y"])) + df[column] = pd.to_datetime(delta.apply(lambda x: f"{1930 + x.days}-01-01")) + partial_years_delta return df @@ -259,13 +290,15 @@ def df_aggregate_eval_metric( if aggregate_func == "time_weighted_avg": # Compute the duration (end - start) as the weight df["weight"] = df[interval_end] - df[interval_start] - group_total_weights = df.groupby(group_by)["weight"].agg(weight_sum="sum").reset_index() + group_total_weights = df.groupby(group_by)["weight"].agg(weight_sum="sum") + group_total_weights.reset_index(inplace=True) # Compute the weighted value df["weighted_value"] = df[in_col] * df["weight"] # Group by `group_by` and compute the weighted average - grouped = df.groupby(group_by)["weighted_value"].agg(sum_weighted_value="sum").reset_index() + grouped = df.groupby(group_by)["weighted_value"].agg(sum_weighted_value="sum") + grouped.reset_index(inplace=True) # add weightsum info grouped = grouped.merge(group_total_weights, on=group_by) @@ -275,4 +308,6 @@ def df_aggregate_eval_metric( else: # normal average - return df.groupby(group_by).agg({in_col: aggregate_func}).reset_index().rename(columns={in_col: out_col}) + df = df.groupby(group_by).agg({in_col: aggregate_func}) + df.reset_index(inplace=True) + return df.rename(columns={in_col: out_col}) diff --git a/analytics/app/pages/compare.py b/analytics/app/pages/compare.py index 7e7f59924..c70d54eae 100644 --- a/analytics/app/pages/compare.py +++ b/analytics/app/pages/compare.py @@ -1,38 +1,35 @@ import dash import pandas as pd -from analytics.app.data.load import list_pipelines, load_pipeline_logs -from analytics.app.data.transform import ( - add_pipeline_ref, - dfs_models_and_evals, - leaf_stages, - logs_dataframe, - logs_dataframe_agg_by_stage, -) -from analytics.app.pages.plots.cost_over_time import section1_stacked_bar +from analytics.app.data.const import CompositeModelOptions +from analytics.app.pages.const.text import COMPOSITE_MODEL_TEXT +from analytics.app.pages.plots.cost_over_time import section_cost_over_time from analytics.app.pages.plots.eval_heatmap import section_evalheatmap from analytics.app.pages.plots.eval_over_time import section_metricovertime from analytics.app.pages.plots.num_samples import section_num_samples from dash import Input, Output, callback, dcc, html +from typing_extensions import get_args from .plots.cost_vs_eval_metric_agg import section3_scatter_cost_eval_metric from .plots.num_triggers_eval_metric import section3_scatter_num_triggers from .plots.one_dimensional_comparison import section4_1d_boxplots +from .state import pipeline_data, pipelines, process_pipeline_data dash.register_page(__name__, path="/compare", title="Pipeline Comparison") -pipelines = list_pipelines() +initial_pipeline_ids = list(sorted(pipelines.keys()))[:1] # -------------------------------------------------------------------------------------------------------------------- # # PAGE # # -------------------------------------------------------------------------------------------------------------------- # -pipelines = list_pipelines() -initial_pipeline_ids = list(sorted(pipelines.keys()))[:1] - -@callback(Output("pipelines-info", "children"), 
Input("pipelines-selector", "value")) -def switch_pipelines(pipeline_ids: list[int]): - return render_pipeline_infos(pipeline_ids) +@callback( + Output("pipelines-info", "children"), + Input("pipelines-selector", "value"), + Input("composite-model-variant", "value"), +) +def switch_pipelines(pipeline_ids: list[int], composite_model_variant: CompositeModelOptions) -> list[html.Div]: + return render_pipeline_infos(pipeline_ids, composite_model_variant) ui_pipelines_selection = html.Div( @@ -50,56 +47,50 @@ def switch_pipelines(pipeline_ids: list[int]): persistence=True, style={"color": "black"}, ), + html.Br(), + dcc.Markdown(COMPOSITE_MODEL_TEXT), + dcc.RadioItems( + id="composite-model-variant", + options=[{"label": variant, "value": variant} for variant in get_args(CompositeModelOptions)], + value="currently_active_model", + persistence=True, + ), ] ) -def render_pipeline_infos(pipeline_ids: list[int]) -> list[html.Div]: - # --------------------------------------------------- DATA --------------------------------------------------- # - - pipeline_refs = {pipeline_id: f"{pipeline_id} - {pipelines[pipeline_id][0]}" for pipeline_id in pipeline_ids} +def render_pipeline_infos(pipeline_ids: list[int], composite_model_variant: CompositeModelOptions) -> list[html.Div]: + # ----------------------------------------------------- DATA ----------------------------------------------------- # - log_list = {pipeline_id: load_pipeline_logs(pipeline_id) for pipeline_id in pipeline_ids} - df_logs_dict = { - pipeline_id: add_pipeline_ref(logs_dataframe(logs), pipeline_refs[pipeline_id]) - for pipeline_id, logs in log_list.items() - } + for pipeline_id in pipeline_ids: + if pipeline_id not in pipeline_data: + pipeline_data[pipeline_id] = process_pipeline_data(pipeline_id) - pipeline_leaf_stages = {leaf for log in log_list.values() for leaf in leaf_stages(log)} - df_logs = pd.concat(df_logs_dict.values()) - df_logs_leaf = df_logs[df_logs["id"].isin(pipeline_leaf_stages)] - - df_logs_agg = pd.concat([logs_dataframe_agg_by_stage(df_log) for pipeline_id, df_log in df_logs_dict.items()]) - df_logs_agg_leaf = df_logs_agg[df_logs_agg["id"].isin(pipeline_leaf_stages)] - - _dfs_models_evals: list[str, tuple[str, pd.DataFrame, pd.DataFrame | None]] = [ - (pipeline_refs[pipeline_id], *dfs_models_and_evals(logs, df_logs["sample_time"].max())) - for pipeline_id, logs in log_list.items() - ] - - df_logs_models = pd.concat( - [add_pipeline_ref(single_df_models, pipeline_ref) for pipeline_ref, single_df_models, _, _ in _dfs_models_evals] - ) - - df_logs_eval_requests = pd.concat( + df_all = pd.concat([pipeline_data[pipeline_id].df_all for pipeline_id in pipeline_ids]) + df_agg = pd.concat([pipeline_data[pipeline_id].df_agg for pipeline_id in pipeline_ids]) + df_leaf = pd.concat([pipeline_data[pipeline_id].df_leaf for pipeline_id in pipeline_ids]) + df_agg = pd.concat([pipeline_data[pipeline_id].df_agg for pipeline_id in pipeline_ids]) + df_agg_leaf = pd.concat([pipeline_data[pipeline_id].df_agg_leaf for pipeline_id in pipeline_ids]) + df_models = pd.concat([pipeline_data[pipeline_id].df_models for pipeline_id in pipeline_ids]) + df_eval_requests = pd.concat( [ - add_pipeline_ref(_single_eval_req_df, pipeline_ref) - for pipeline_ref, _, _single_eval_req_df, _ in _dfs_models_evals - if _single_eval_req_df is not None + pipeline_data[pipeline_id].df_eval_requests + for pipeline_id in pipeline_ids + if pipeline_data[pipeline_id].df_eval_requests is not None ] ) - df_logs_eval_single = pd.concat( + df_eval_single = pd.concat( [ 
- add_pipeline_ref(_single_eval_df, pipeline_ref) - for pipeline_ref, _, _, _single_eval_df in _dfs_models_evals - if _single_eval_df is not None + pipeline_data[pipeline_id].df_eval_single + for pipeline_id in pipeline_ids + if pipeline_data[pipeline_id].df_eval_single is not None ] ) # -------------------------------------------------- LAYOUT -------------------------------------------------- # eval_items = [] - if df_logs_eval_single is None or df_logs_agg is None: + if df_eval_single is None or df_agg is None: eval_items.append( dcc.Markdown( """ @@ -110,22 +101,20 @@ def render_pipeline_infos(pipeline_ids: list[int]) -> list[html.Div]: ) ) else: + eval_items.append(section_metricovertime("compare", True, df_eval_single, composite_model_variant)) + eval_items.append(section_evalheatmap("compare", True, df_models, df_eval_single, composite_model_variant)) + eval_items.append(section_num_samples("compare", True, df_models, df_eval_requests, composite_model_variant)) eval_items.append( - section_metricovertime("compare", True, df_logs_eval_single), - ) - eval_items.append(section_evalheatmap("compare", True, df_logs_eval_single, df_logs_models)) - eval_items.append(section_num_samples("compare", True, df_logs_models, df_logs_eval_requests)) - eval_items.append( - section3_scatter_num_triggers("compare", True, df_logs_agg, df_logs_eval_single), + section3_scatter_num_triggers("compare", True, df_agg, df_eval_single, composite_model_variant) ) eval_items.append( - section3_scatter_cost_eval_metric("compare", df_logs, df_logs_agg_leaf, df_logs_eval_single), + section3_scatter_cost_eval_metric("compare", df_all, df_agg_leaf, df_eval_single, composite_model_variant) ) - eval_items.append(section4_1d_boxplots("compare", True, df_logs, df_logs_eval_single)) + eval_items.append(section4_1d_boxplots("compare", True, df_all, df_eval_single, composite_model_variant)) return [ html.H1("Cost over time comparison"), - section1_stacked_bar("compare", df_logs_leaf), + section_cost_over_time("compare", df_leaf), html.Div(children=eval_items), ] @@ -141,6 +130,6 @@ def render_pipeline_infos(pipeline_ids: list[int]) -> list[html.Div]: """ ), ui_pipelines_selection, - html.Div(id="pipelines-info", children=render_pipeline_infos(initial_pipeline_ids)), + html.Div(id="pipelines-info", children=render_pipeline_infos(initial_pipeline_ids, "currently_active_model")), ] ) diff --git a/analytics/app/pages/const/__init__.py b/analytics/app/pages/const/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/analytics/app/pages/const/text.py b/analytics/app/pages/const/text.py new file mode 100644 index 000000000..4a69c9196 --- /dev/null +++ b/analytics/app/pages/const/text.py @@ -0,0 +1,12 @@ +COMPOSITE_MODEL_TEXT = """ + ## Composite model variant + + The composite model is the pipeline model that is made up by patching together the individual models + of the pipeline. We support two variants of the composite model: + - `currently_active_model`: For a certain point in time we make a fixed model the `pipeline` pipeline model, + that shows up in the composite model, iff it is the most recent model which was trained on an interval + that is strictly before the point of evaluation. + - `currently_trained_model`: For a fixed point in time this is the model that was trained after the + `currently_active_model`. So it is the model which training / training sample collection is still + ongoing during the point of evaluation. 
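To make the two variants concrete, one plausible way to derive the per-evaluation boolean flags from the models dataframe is sketched below. This is illustrative only and not the supervisor's actual implementation; it assumes train_end, interval_center and id_model columns like the ones used elsewhere in this diff:

import pandas as pd

def tag_composite_variants(evals: pd.DataFrame, models: pd.DataFrame) -> pd.DataFrame:
    """Hypothetical derivation of the currently_active_model / currently_trained_model flags."""
    evals = evals.copy()
    models = models.sort_values("train_end")
    active, trained = [], []
    for _, ev in evals.iterrows():
        # models whose training data ends strictly before the evaluation point
        finished = models[models["train_end"] < ev["interval_center"]]
        active_id = finished["id_model"].iloc[-1] if len(finished) else None
        # the model trained right after the currently active one
        later = models[models["train_end"] >= ev["interval_center"]]
        trained_id = later["id_model"].iloc[0] if len(later) else None
        active.append(ev["id_model"] == active_id)
        trained.append(ev["id_model"] == trained_id)
    evals["currently_active_model"] = active
    evals["currently_trained_model"] = trained
    return evals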
+""" diff --git a/analytics/app/pages/pipeline.py b/analytics/app/pages/pipeline.py index b28a877ae..a954fa197 100644 --- a/analytics/app/pages/pipeline.py +++ b/analytics/app/pages/pipeline.py @@ -1,39 +1,34 @@ import dash -from analytics.app.data.load import list_pipelines, load_pipeline_logs -from analytics.app.data.transform import ( - add_pipeline_ref, - dfs_models_and_evals, - leaf_stages, - logs_dataframe, - logs_dataframe_agg_by_stage, - pipeline_stage_parents, -) +from analytics.app.data.const import CompositeModelOptions +from analytics.app.pages.const.text import COMPOSITE_MODEL_TEXT from analytics.app.pages.plots.eval_heatmap import section_evalheatmap from analytics.app.pages.plots.eval_over_time import section_metricovertime from analytics.app.pages.plots.num_samples import section_num_samples from analytics.app.pages.plots.one_dimensional_comparison import section4_1d_boxplots from dash import Input, Output, callback, dcc, html +from typing_extensions import get_args -from .plots.cost_over_time import section1_stacked_bar +from .plots.cost_over_time import section_cost_over_time from .plots.num_triggers_eval_metric import section3_scatter_num_triggers from .plots.pipeline_info import section0_pipeline +from .state import pipeline_data, pipelines, process_pipeline_data dash.register_page(__name__, path="/", title="Pipeline Evaluation") +initial_pipeline_id = min(pipelines.keys()) # -------------------------------------------------------------------------------------------------------------------- # # PAGE # # -------------------------------------------------------------------------------------------------------------------- # -pipelines = list_pipelines() -initial_pipeline_id = min(pipelines.keys()) - @callback( - Output("pipeline-info", "children"), Input("pipeline-selector", "value"), prevent_initial_call="initial_duplicate" + Output("pipeline-info", "children"), + Input("pipeline-selector", "value"), + Input("composite-model-variant", "value"), ) -def switch_pipeline(pipeline_id: int): - return render_pipeline_info(pipeline_id) +def switch_pipeline(pipeline_id: int, composite_model_variant: CompositeModelOptions) -> list[html.Div]: + return render_pipeline_info(pipeline_id, composite_model_variant) ui_pipeline_selection = html.Div( @@ -50,34 +45,30 @@ def switch_pipeline(pipeline_id: int): persistence=True, style={"color": "black", "width": "65%"}, ), + html.Br(), + dcc.Markdown(COMPOSITE_MODEL_TEXT), + dcc.RadioItems( + id="composite-model-variant", + options=[{"label": variant, "value": variant} for variant in get_args(CompositeModelOptions)], + value="currently_active_model", + persistence=True, + ), ] ) -def render_pipeline_info(pipeline_id: int) -> list[html.Div]: +def render_pipeline_info(pipeline_id: int, composite_model_variant: CompositeModelOptions) -> list[html.Div]: # ----------------------------------------------------- DATA ----------------------------------------------------- # - pipeline_ref = f"{pipeline_id} - {pipelines[pipeline_id][1]}" + if pipeline_id not in pipeline_data: + pipeline_data[pipeline_id] = process_pipeline_data(pipeline_id) - logs = load_pipeline_logs(pipeline_id) - pipeline_leaf_stages = leaf_stages(logs) - df_logs = logs_dataframe(logs) - df_logs_leaf = df_logs[df_logs["id"].isin(pipeline_leaf_stages)] - - df_logs_agg = logs_dataframe_agg_by_stage(df_logs) - df_logs_agg_leaf = df_logs_agg[df_logs_agg["id"].isin(pipeline_leaf_stages)] - - df_parents = pipeline_stage_parents(logs) - df_logs_add_parents = df_logs_agg.merge(df_parents, 
left_on="id", right_on="id", how="left") - - df_logs_models, df_logs_eval_requests, df_logs_eval_single = dfs_models_and_evals( - logs, df_logs["sample_time"].max() - ) + data = pipeline_data[pipeline_id] # ---------------------------------------------------- LAYOUT ---------------------------------------------------- # eval_items = [] - if df_logs_eval_single is None or df_logs_agg is None: + if data.df_eval_single is None or data.df_agg is None: eval_items.append( dcc.Markdown( """ @@ -88,46 +79,48 @@ def render_pipeline_info(pipeline_id: int) -> list[html.Div]: ) ) else: - eval_items.append( - section_metricovertime("pipeline", False, add_pipeline_ref(df_logs_eval_single, pipeline_ref)) - ) + eval_items.append(section_metricovertime("pipeline", False, data.df_eval_single, composite_model_variant)) eval_items.append( section_evalheatmap( "pipeline", False, - add_pipeline_ref(df_logs_eval_single, pipeline_ref), - add_pipeline_ref(df_logs_models, pipeline_ref), + data.df_models, + data.df_eval_single, + composite_model_variant, ) ) eval_items.append( section_num_samples( "pipeline", False, - add_pipeline_ref(df_logs_models, pipeline_ref), - add_pipeline_ref(df_logs_eval_requests, pipeline_ref), + data.df_models, + data.df_eval_requests, + composite_model_variant, ) ) eval_items.append( section3_scatter_num_triggers( "pipeline", False, - add_pipeline_ref(df_logs_agg, pipeline_ref), - add_pipeline_ref(df_logs_eval_single, pipeline_ref), + data.df_agg, + data.df_eval_single, + composite_model_variant, ) ) eval_items.append( section4_1d_boxplots( "pipeline", False, - add_pipeline_ref(df_logs, pipeline_ref), - add_pipeline_ref(df_logs_eval_single, pipeline_ref), + data.df_all, + data.df_eval_single, + composite_model_variant, ) ) return [ html.Div( [ - section0_pipeline(logs, df_logs, df_logs_agg_leaf, df_logs_add_parents), + section0_pipeline(data.logs, data.df_all, data.df_agg_leaf, data.df_add_parents), dcc.Markdown( """ ## Cost-/Accuracy triggering tradeoff @@ -139,7 +132,7 @@ def render_pipeline_info(pipeline_id: int) -> list[html.Div]: executed (i.e. batches without triggers). """ ), - section1_stacked_bar("pipeline", add_pipeline_ref(df_logs_leaf, pipeline_ref)), + section_cost_over_time("pipeline", data.df_leaf), html.Div(eval_items), ] ) @@ -157,6 +150,6 @@ def render_pipeline_info(pipeline_id: int) -> list[html.Div]: """ ), ui_pipeline_selection, - html.Div(id="pipeline-info", children=render_pipeline_info(initial_pipeline_id)), + html.Div(id="pipeline-info", children=render_pipeline_info(initial_pipeline_id, "currently_active_model")), ] ) diff --git a/analytics/app/pages/plots/cost_over_time.py b/analytics/app/pages/plots/cost_over_time.py index c20ec9d60..ccc543669 100644 --- a/analytics/app/pages/plots/cost_over_time.py +++ b/analytics/app/pages/plots/cost_over_time.py @@ -1,4 +1,3 @@ -import dataclasses from dataclasses import dataclass import pandas as pd @@ -9,16 +8,16 @@ @dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. 
""" - df_logs_leaf: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_leaf: pd.DataFrame -_shared_data = _SharedData() +_shared_data: dict[str, _PageState] = {} # page -> _PageState + # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -43,7 +42,7 @@ def gen_figure( histogram: Whether to use histogram over barplot nbins: Number of bins; only used in the histogram=True case """ - df_adjusted = _shared_data.df_logs_leaf[page].copy() + df_adjusted = _shared_data[page].df_leaf.copy() # TODO: remove if cumulative and not histogram: # as bar plots don't support cumulation natively @@ -113,8 +112,10 @@ def gen_figure( # -------------------------------------------------------------------------------------------------------------------- # -def section1_stacked_bar(page: str, df_logs_leaf: pd.DataFrame) -> html.Div: - _shared_data.df_logs_leaf[page] = df_logs_leaf +def section_cost_over_time(page: str, df_leaf: pd.DataFrame) -> html.Div: + if page not in _shared_data: + _shared_data[page] = _PageState(df_leaf=df_leaf) + _shared_data[page].df_leaf = df_leaf @callback( Output(f"{page}-costovertime-plot", "figure"), @@ -124,14 +125,16 @@ def section1_stacked_bar(page: str, df_logs_leaf: pd.DataFrame) -> html.Div: Input(f"{page}-costovertime-nbins-slider", "value"), Input(f"{page}-costovertime-radio-time-patch-yearbook", "value"), ) - def update_figure(time_metric: str, cumulative: bool, histogram: bool, nbins: int, patch_yearbook: bool): + def update_figure( + time_metric: str, cumulative: bool, histogram: bool, nbins: int, patch_yearbook: bool + ) -> go.Figure: return gen_figure(page, time_metric, cumulative, histogram, nbins, patch_yearbook) @callback( Output(f"{page}-costovertime-nbins-slider", "disabled"), Input(f"{page}-costovertime-checkbox-histogram", "value"), ) - def hide_bin_slider(histogram: bool): + def hide_bin_slider(histogram: bool) -> bool: return not histogram time_metrics = { diff --git a/analytics/app/pages/plots/cost_vs_eval_metric_agg.py b/analytics/app/pages/plots/cost_vs_eval_metric_agg.py index 1895cdeaa..65e8ae769 100644 --- a/analytics/app/pages/plots/cost_vs_eval_metric_agg.py +++ b/analytics/app/pages/plots/cost_vs_eval_metric_agg.py @@ -1,26 +1,29 @@ -import dataclasses +from dataclasses import dataclass from typing import get_args import pandas as pd import plotly.express as px +from analytics.app.data.const import CompositeModelOptions from analytics.app.data.transform import AGGREGATION_FUNCTION, EVAL_AGGREGATION_FUNCTION, df_aggregate_eval_metric from dash import Input, Output, callback, dcc, html from plotly import graph_objects as go -@dataclasses.dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +@dataclass +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. 
""" - df_logs: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - df_logs_agg_leaf: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - df_logs_eval_single: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_all: pd.DataFrame + df_agg_leaf: pd.DataFrame + df_eval_single: pd.DataFrame + composite_model_variant: CompositeModelOptions = "currently_active_model" + + +_shared_data: dict[str, _PageState] = {} # page -> _PageState -_shared_data = _SharedData() # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -37,17 +40,18 @@ def gen_fig_scatter_num_triggers( stages: list[str], ) -> go.Figure: # unpack data - df_logs = _shared_data.df_logs[page] - df_logs_eval_single = _shared_data.df_logs_eval_single[page].copy() - df_logs_eval_single = df_logs_eval_single[ - (df_logs_eval_single["dataset_id"] == dataset_id) - & (df_logs_eval_single["eval_handler"] == eval_handler) - & (df_logs_eval_single["most_recent_model"]) + composite_model_variant = _shared_data[page].composite_model_variant + df_all = _shared_data[page].df_all + df_eval_single = _shared_data[page].df_eval_single + df_eval_single = df_eval_single[ + (df_eval_single["dataset_id"] == dataset_id) + & (df_eval_single["eval_handler"] == eval_handler) + & (df_eval_single[composite_model_variant]) # & (df_adjusted["metric"] == metric) ] agg_eval_metric = df_aggregate_eval_metric( - df_logs_eval_single, + df_eval_single, group_by=["pipeline_ref", "metric"], in_col="value", out_col="metric_value", @@ -55,10 +59,13 @@ def gen_fig_scatter_num_triggers( ) agg_duration = ( - df_logs[df_logs["id"].isin(stages)].groupby(["pipeline_ref"]).agg(cost=("duration", agg_func_x)).reset_index() + df_all[df_all["id"].isin(stages)].groupby(["pipeline_ref"]).agg(cost=("duration", agg_func_x)).reset_index() ) merged = agg_eval_metric.merge(agg_duration, on="pipeline_ref") + assert ( + agg_eval_metric.shape[0] == merged.shape[0] == agg_duration.shape[0] * len(agg_eval_metric["metric"].unique()) + ) fig = px.scatter( merged, x="cost", @@ -67,7 +74,7 @@ def gen_fig_scatter_num_triggers( facet_col="metric", labels={ "cost": f"{agg_func_x} duration in sec. 
(proxy for cost)", - "metric_value": f"{agg_func_y} {metric}", + "metric_value": f"{agg_func_y}", "pipeline_ref": "Pipeline", }, category_orders={ @@ -85,13 +92,26 @@ def gen_fig_scatter_num_triggers( def section3_scatter_cost_eval_metric( - page: str, df_logs: pd.DataFrame, df_logs_agg_leaf: pd.DataFrame, df_logs_eval_single: pd.DataFrame + page: str, + df_all: pd.DataFrame, + df_agg_leaf: pd.DataFrame, + df_eval_single: pd.DataFrame, + composite_model_variant: CompositeModelOptions, ) -> html.Div: - assert "pipeline_ref" in df_logs.columns.tolist() - assert "pipeline_ref" in df_logs_eval_single.columns.tolist() - _shared_data.df_logs[page] = df_logs - _shared_data.df_logs_agg_leaf[page] = df_logs_agg_leaf - _shared_data.df_logs_eval_single[page] = df_logs_eval_single + assert "pipeline_ref" in list(df_all.columns) + assert "pipeline_ref" in list(df_eval_single.columns) + + if page not in _shared_data: + _shared_data[page] = _PageState( + composite_model_variant=composite_model_variant, + df_all=df_all, + df_agg_leaf=df_agg_leaf, + df_eval_single=df_eval_single, + ) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_all = df_all + _shared_data[page].df_agg_leaf = df_agg_leaf + _shared_data[page].df_eval_single = df_eval_single @callback( Output(f"{page}-scatter-cost-eval", "figure"), @@ -114,11 +134,11 @@ def update_scatter_num_triggers( page, eval_handler_ref, dataset_id, metric_y, agg_func_x, agg_func_y, stages ) - eval_handler_refs = list(df_logs_eval_single["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_single["dataset_id"].unique()) - eval_metrics = list(df_logs_eval_single["metric"].unique()) + eval_handler_refs = list(df_eval_single["eval_handler"].unique()) + eval_datasets = list(df_eval_single["dataset_id"].unique()) + eval_metrics = list(df_eval_single["metric"].unique()) - stages = list(df_logs_agg_leaf["id"].unique()) + stages = list(df_agg_leaf["id"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/eval_heatmap.py b/analytics/app/pages/plots/eval_heatmap.py index be026225d..2f095ab19 100644 --- a/analytics/app/pages/plots/eval_heatmap.py +++ b/analytics/app/pages/plots/eval_heatmap.py @@ -1,26 +1,26 @@ -import dataclasses from dataclasses import dataclass import pandas as pd -from analytics.app.data.transform import patch_yearbook_time +from analytics.app.data.const import CompositeModelOptions +from analytics.app.data.transform import linearize_ids, patch_yearbook_time from dash import Input, Output, callback, dcc, html from plotly import graph_objects as go @dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. 
""" - df_logs_eval_single: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_models: pd.DataFrame + df_eval_single: pd.DataFrame - df_logs_models: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + composite_model_variant: CompositeModelOptions = "currently_active_model" -_shared_data = _SharedData() +_shared_data: dict[str, _PageState] = {} # page -> _PageState + # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -28,7 +28,12 @@ class _SharedData: def gen_figure( - page: str, multi_pipeline_mode: bool, patch_yearbook: bool, eval_handler: str, dataset_id: str, metric: str + page: str, + multi_pipeline_mode: bool, + patch_yearbook: bool, + eval_handler: str, + dataset_id: str, + metric: str, ) -> go.Figure: """ Create the cost over time figure with barplot or histogram. Histogram has nice binning while barplot is precise. @@ -42,8 +47,10 @@ def gen_figure( dataset_id: Dataset id metric: Evaluation metric (replaced with facet) """ - df_logs_models = _shared_data.df_logs_models[page].copy() - df_adjusted = _shared_data.df_logs_eval_single[page].copy() + composite_model_variant = _shared_data[page].composite_model_variant + + df_logs_models = _shared_data[page].df_models.copy() # TODO: remove copy + df_adjusted = _shared_data[page].df_eval_single.copy() # TODO: remove copy df_adjusted = df_adjusted[ (df_adjusted["dataset_id"] == dataset_id) & (df_adjusted["eval_handler"] == eval_handler) @@ -52,25 +59,44 @@ def gen_figure( # Yearbook as a mapped time dimension (to display the correct timestamps we need to convert back from days to years) if patch_yearbook: - for column in ["interval_center", "interval_start", "interval_end", "sample_time", "sample_time_until"]: + for column in ["interval_start", "interval_center", "interval_end"]: patch_yearbook_time(df_adjusted, column) + for column in ["train_start", "train_end", "real_train_end", "usage_start", "usage_end"]: + patch_yearbook_time(df_logs_models, column) df_adjusted = df_adjusted.sort_values(by=["interval_center"]) if multi_pipeline_mode: # we only want the pipeline performance (composed of the models active periods stitched together) - df_adjusted = df_adjusted[df_adjusted["most_recent_model"]] + df_adjusted = df_adjusted[df_adjusted[composite_model_variant]] # in model dataframe convert pipeline_ref to pipeline_id as we need int for the heatmap df_adjusted["pipeline_id"] = df_adjusted["pipeline_ref"].str.split("-").str[0].astype(int) df_logs_models["pipeline_id"] = df_logs_models["pipeline_ref"].str.split("-").str[0].astype(int) + full_refs = { + pipeline_id: pipeline_name + for pipeline_id, pipeline_name in df_logs_models[["pipeline_id", "pipeline_ref"]].values + } + + _, mapping = linearize_ids(df_adjusted, [], "pipeline_id") + linearize_ids(df_logs_models, [], "pipeline_id", mapping) + + # invert the mapping + label_map = {v: full_refs[k] for k, v in mapping[()].items()} + else: assert df_adjusted["pipeline_ref"].nunique() == 1 # add the pipeline time series which is the performance of different models stitched together dep. 
# w.r.t which model was active - pipeline_composite_model = df_adjusted[df_adjusted["most_recent_model"]] - pipeline_composite_model["model_idx"] = "0-pipeline-composite-model" + pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]] + pipeline_composite_model["model_idx"] = 0 + pipeline_composite_model["id_model"] = 0 + + label_map = {k: f"model_idx={k}, id={v}" for k, v in df_adjusted[["model_idx", "id_model"]].values} + label_map[0] = "Pipeline composite model" + + df_adjusted = pd.concat([pipeline_composite_model, df_adjusted]) # build heatmap matrix dataframe: heatmap_data = df_adjusted.pivot( @@ -83,6 +109,8 @@ def gen_figure( x=heatmap_data.columns, y=heatmap_data.index, colorscale="RdBu_r", + dx=0.5, + dy=1, ) ) fig.update_layout( @@ -90,9 +118,9 @@ def gen_figure( yaxis_nticks=2 * min(20, len(heatmap_data.index)), width=2200, height=1100, - # "pipeline_id": "Pipeline", - # "metric": "Metric", - # "interval_center": "Evaluation time (interval center)", + showlegend=True, + yaxis=dict(tickmode="array", tickvals=heatmap_data.index, ticktext=[label_map[y] for y in heatmap_data.index]), + xaxis=dict(tickangle=45), ) shapes = [] @@ -105,10 +133,10 @@ def gen_figure( y0=active_[1]["model_idx"] - 0.5, x1=active_[1]["interval_end"], y1=active_[1]["model_idx"] + 0.5, - line=dict(color="Green", width=5), + line=dict(color="Green", width=2), ) for active_ in df_adjusted[ - df_adjusted["most_recent_model"] + df_adjusted[composite_model_variant] ].iterrows() # if "pipeline-composite-model" not in active_[1]["id_model"] ] # diagonal 2 @@ -119,10 +147,10 @@ def gen_figure( y0=active_[1]["model_idx"] + 0.5, x1=active_[1]["interval_end"], y1=active_[1]["model_idx"] - 0.5, - line=dict(color="Green", width=5), + line=dict(color="Green", width=2), ) for active_ in df_adjusted[ - df_adjusted["most_recent_model"] + df_adjusted[composite_model_variant] ].iterrows() # if "pipeline-composite-model" not in active_[1]["id_model"] ] @@ -133,10 +161,10 @@ def gen_figure( dict( type="rect", x0=active_[1][f"{type_}_start"], - x1=active_[1][f"{type_}_end"], + x1=active_[1][f"{'real_' if type_ == 'train' else ''}{type_}_end"], y0=active_[1][y_column] - 0.5, y1=active_[1][y_column] + 0.5, - line=dict(color="Orange" if type_ == "train" else "Black", width=4), + line=dict(color="Orange" if type_ == "train" else "Black", width=2), ) for active_ in df_logs_models.iterrows() ] @@ -150,10 +178,21 @@ def gen_figure( def section_evalheatmap( - page: str, multi_pipeline_mode: bool, df_logs_eval_single: pd.DataFrame, df_logs_models: pd.DataFrame + page: str, + multi_pipeline_mode: bool, + df_models: pd.DataFrame, + df_eval_single: pd.DataFrame, + composite_model_variant: CompositeModelOptions, ) -> html.Div: - _shared_data.df_logs_eval_single[page] = df_logs_eval_single - _shared_data.df_logs_models[page] = df_logs_models + if page not in _shared_data: + _shared_data[page] = _PageState( + composite_model_variant=composite_model_variant, + df_models=df_models, + df_eval_single=df_eval_single, + ) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_models = df_models + _shared_data[page].df_eval_single = df_eval_single @callback( Output(f"{page}-evalheatmap-plot", "figure"), @@ -165,9 +204,9 @@ def section_evalheatmap( def update_figure(patch_yearbook: bool, eval_handler_ref: str, dataset_id: str, metric: str) -> go.Figure: return gen_figure(page, multi_pipeline_mode, patch_yearbook, eval_handler_ref, dataset_id, metric) - eval_handler_refs = 
list(df_logs_eval_single["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_single["dataset_id"].unique()) - eval_metrics = list(df_logs_eval_single["metric"].unique()) + eval_handler_refs = list(df_eval_single["eval_handler"].unique()) + eval_datasets = list(df_eval_single["dataset_id"].unique()) + eval_metrics = list(df_eval_single["metric"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/eval_over_time.py b/analytics/app/pages/plots/eval_over_time.py index 7e87c3f01..eb5d90927 100644 --- a/analytics/app/pages/plots/eval_over_time.py +++ b/analytics/app/pages/plots/eval_over_time.py @@ -1,24 +1,24 @@ -import dataclasses from dataclasses import dataclass import pandas as pd import plotly.express as px +from analytics.app.data.const import CompositeModelOptions from analytics.app.data.transform import patch_yearbook_time from dash import Input, Output, callback, dcc, html from plotly import graph_objects as go @dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. """ - df_logs_eval_single: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_eval_single: pd.DataFrame + composite_model_variant: CompositeModelOptions = "currently_active_model" -_shared_data = _SharedData() +_shared_data: dict[str, _PageState] = {} # page -> _PageState # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -29,7 +29,7 @@ def gen_figure( page: str, multi_pipeline_mode: bool, patch_yearbook: bool, eval_handler: str, dataset_id: str, metric: str ) -> go.Figure: """ - Create the cost over time figure with barplot or histogram. Histogram has nice binning while barplot is precise. + Create the evaluation over time figure with a line plot. Args: page: Page name where the plot is displayed @@ -40,7 +40,9 @@ def gen_figure( dataset_id: Dataset id metric: Evaluation metric (replaced with facet) """ - df_adjusted = _shared_data.df_logs_eval_single[page].copy() + composite_model_variant = _shared_data[page].composite_model_variant + + df_adjusted = _shared_data[page].df_eval_single.copy() df_adjusted = df_adjusted[ (df_adjusted["dataset_id"] == dataset_id) & (df_adjusted["eval_handler"] == eval_handler) @@ -49,17 +51,17 @@ def gen_figure( # Yearbook as a mapped time dimension (to display the correct timestamps we need to convert back from days to years) if patch_yearbook: - for column in ["interval_center", "interval_start", "interval_end", "sample_time", "sample_time_until"]: + for column in ["interval_center", "interval_start", "interval_end"]: patch_yearbook_time(df_adjusted, column) if multi_pipeline_mode: # we only want the pipeline performance (composed of the models active periods stitched together) - df_adjusted = df_adjusted[df_adjusted["most_recent_model"]] + df_adjusted = df_adjusted[df_adjusted[composite_model_variant]] else: assert df_adjusted["pipeline_ref"].nunique() == 1 # add the pipeline time series which is the performance of different models stitched together dep. 
# w.r.t which model was active - pipeline_composite_model = df_adjusted[df_adjusted["most_recent_model"]] + pipeline_composite_model = df_adjusted[df_adjusted[composite_model_variant]] pipeline_composite_model["model_idx"] = "00-pipeline-composite-model" number_digits = len(str(df_adjusted["model_idx"].max())) df_adjusted["model_idx"] = df_adjusted["model_idx"].astype(str).str.zfill(number_digits) @@ -97,8 +99,16 @@ def gen_figure( # -------------------------------------------------------------------------------------------------------------------- # -def section_metricovertime(page: str, multi_pipeline_mode: bool, df_logs_eval_single: pd.DataFrame) -> html.Div: - _shared_data.df_logs_eval_single[page] = df_logs_eval_single +def section_metricovertime( + page: str, + multi_pipeline_mode: bool, + df_eval_single: pd.DataFrame, + composite_model_variant: CompositeModelOptions, +) -> html.Div: + if page not in _shared_data: + _shared_data[page] = _PageState(composite_model_variant=composite_model_variant, df_eval_single=df_eval_single) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_eval_single = df_eval_single @callback( Output(f"{page}-evalovertime-plot", "figure"), @@ -110,9 +120,9 @@ def section_metricovertime(page: str, multi_pipeline_mode: bool, df_logs_eval_si def update_figure(patch_yearbook: bool, eval_handler_ref: str, dataset_id: str, metric: str) -> go.Figure: return gen_figure(page, multi_pipeline_mode, patch_yearbook, eval_handler_ref, dataset_id, metric) - eval_handler_refs = list(df_logs_eval_single["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_single["dataset_id"].unique()) - eval_metrics = list(df_logs_eval_single["metric"].unique()) + eval_handler_refs = list(df_eval_single["eval_handler"].unique()) + eval_datasets = list(df_eval_single["dataset_id"].unique()) + eval_metrics = list(df_eval_single["metric"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/num_samples.py b/analytics/app/pages/plots/num_samples.py index c2356780c..2aa2d9a38 100644 --- a/analytics/app/pages/plots/num_samples.py +++ b/analytics/app/pages/plots/num_samples.py @@ -1,26 +1,28 @@ -import dataclasses from dataclasses import dataclass from typing import Literal import pandas as pd import plotly.express as px +from analytics.app.data.const import CompositeModelOptions from analytics.app.data.transform import patch_yearbook_time from dash import Input, Output, callback, dcc, html from plotly import graph_objects as go @dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. 
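Both the metric-over-time and the 1D-comparison plots stitch a "pipeline composite model" series out of the rows selected by the chosen variant and zero-pad the remaining model indices so that string sorting matches numeric order. The core of that stitching, isolated with invented values:

import pandas as pd

def add_composite_series(df: pd.DataFrame, variant_col: str = "currently_active_model") -> pd.DataFrame:
    """Duplicate the rows of the active models as an extra 'composite' series."""
    composite = df[df[variant_col]].copy()
    composite["model_idx"] = "00-pipeline-composite-model"
    # zero-pad the remaining indices so lexicographic sorting matches numeric order
    digits = len(str(df["model_idx"].max()))
    df = df.copy()
    df["model_idx"] = df["model_idx"].astype(str).str.zfill(digits)
    return pd.concat([composite, df])

demo = pd.DataFrame(
    {"model_idx": [1, 2, 10], "value": [0.7, 0.8, 0.9], "currently_active_model": [False, True, True]}
)
print(add_composite_series(demo)["model_idx"].tolist())
# ['00-pipeline-composite-model', '00-pipeline-composite-model', '01', '02', '10']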
""" - df_logs_models: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - df_logs_eval_requests: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_models: pd.DataFrame + df_eval_requests: pd.DataFrame + composite_model_variant: CompositeModelOptions = "currently_active_model" + + +_shared_data: dict[str, _PageState] = {} # page -> _PageState -_shared_data = _SharedData() # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -50,12 +52,14 @@ def gen_figure( use_scatter_size: If True, the size of the scatter points is proportional to the number of samples patch_yearbook: If True, the time metric is patched to be a yearbook """ + composite_model_variant = _shared_data[page].composite_model_variant + if y_axis == "eval_samples": - df_evals = _shared_data.df_logs_eval_requests[page].copy() + df_evals = _shared_data[page].df_eval_requests df_evals = df_evals[(df_evals["dataset_id"] == dataset_id) & (df_evals["eval_handler"] == eval_handler)] if multi_pipeline_mode: - df_evals = df_evals[df_evals["most_recent_model"]] + df_evals = df_evals[df_evals[composite_model_variant]] # Yearbook as a mapped time dimension (to display the correct timestamps we need to convert back from days to years) if time_metric == "sample_time" and patch_yearbook: @@ -75,7 +79,7 @@ def gen_figure( assert y_axis != "eval_center" # y_axis = "train_*"" - df_trainings = _shared_data.df_logs_models[page].copy() + df_trainings = _shared_data[page].df_models.copy() # TODO: remove copy # Yearbook as a mapped time dimension (to display the correct timestamps we need to convert back from days to years) if time_metric == "sample_time" and patch_yearbook: @@ -103,10 +107,21 @@ def gen_figure( def section_num_samples( - page: str, multi_pipeline_mode: bool, df_logs_models: pd.DataFrame, df_logs_eval_requests: pd.DataFrame + page: str, + multi_pipeline_mode: bool, + df_models: pd.DataFrame, + df_eval_requests: pd.DataFrame, + composite_model_variant: CompositeModelOptions, ) -> html.Div: - _shared_data.df_logs_models[page] = df_logs_models - _shared_data.df_logs_eval_requests[page] = df_logs_eval_requests + if page not in _shared_data: + _shared_data[page] = _PageState( + composite_model_variant=composite_model_variant, + df_models=df_models, + df_eval_requests=df_eval_requests, + ) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_models = df_models + _shared_data[page].df_eval_requests = df_eval_requests @callback( Output(f"{page}-num-samples-plot", "figure"), @@ -127,7 +142,14 @@ def update_figure( eval_handler: str, ) -> go.Figure: return gen_figure( - page, multi_pipeline_mode, time_metric, y_axis, use_scatter_size, patch_yearbook, dataset_id, eval_handler + page, + multi_pipeline_mode, + time_metric, + y_axis, + use_scatter_size, + patch_yearbook, + dataset_id, + eval_handler, ) @callback( @@ -145,8 +167,8 @@ def show_eval_config(y_axis: YAxis) -> bool: "interval_center": "Evaluation interval center (only for y=eval_samples)", } - eval_handler_refs = list(df_logs_eval_requests["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_requests["dataset_id"].unique()) + eval_handler_refs = list(df_eval_requests["eval_handler"].unique()) + eval_datasets = list(df_eval_requests["dataset_id"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/num_triggers_eval_metric.py b/analytics/app/pages/plots/num_triggers_eval_metric.py index 
e492856bc..024459bd9 100644 --- a/analytics/app/pages/plots/num_triggers_eval_metric.py +++ b/analytics/app/pages/plots/num_triggers_eval_metric.py @@ -2,6 +2,7 @@ import pandas as pd import plotly.express as px +from analytics.app.data.const import CompositeModelOptions from analytics.app.data.transform import df_aggregate_eval_metric from dash import Input, Output, callback, dcc, html from modyn.supervisor.internal.grpc.enums import PipelineStage @@ -9,17 +10,18 @@ @dataclasses.dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. """ - df_logs_agg: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - df_logs_eval_single: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_agg: pd.DataFrame + df_eval_single: pd.DataFrame + composite_model_variant: CompositeModelOptions = "currently_active_model" -_shared_data = _SharedData() + +_shared_data: dict[str, _PageState] = {} # page -> _PageState # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -48,37 +50,35 @@ def gen_fig_scatter_num_triggers( time_weighted: Whether to weight the aggregation by the evaluation interval length """ # unpack data - df_logs_agg = _shared_data.df_logs_agg[page] - - df_logs_eval_single = _shared_data.df_logs_eval_single[page] - df_logs_eval_single = df_logs_eval_single[ - (df_logs_eval_single["dataset_id"] == dataset_id) - & (df_logs_eval_single["eval_handler"] == eval_handler) + composite_model_variant = _shared_data[page].composite_model_variant + df_agg = _shared_data[page].df_agg + df_eval_single = _shared_data[page].df_eval_single + df_eval_single = df_eval_single[ + (df_eval_single["dataset_id"] == dataset_id) + & (df_eval_single["eval_handler"] == eval_handler) # & (df_adjusted["metric"] == metric) ] if multi_pipeline_mode or only_active_periods: # we only want the pipeline performance (composed of the models active periods stitched together) - df_logs_eval_single = df_logs_eval_single[df_logs_eval_single["most_recent_model"]] + df_eval_single = df_eval_single[df_eval_single[composite_model_variant]] if not multi_pipeline_mode: - assert df_logs_eval_single["pipeline_ref"].nunique() == 1 - # add the pipeline time series which is the performance of different models stitched together dep. 
# w.r.t which model was active - pipeline_composite_model = df_logs_eval_single[df_logs_eval_single["most_recent_model"]] + pipeline_composite_model = df_eval_single[df_eval_single[composite_model_variant]] pipeline_composite_model["id_model"] = "0-pipeline-composite-model" - df_logs_eval_single["id_model"] = df_logs_eval_single["id_model"].astype(str) - df_logs_eval_single = pd.concat([df_logs_eval_single, pipeline_composite_model]) + df_eval_single["id_model"] = df_eval_single["id_model"].astype(str) + df_eval_single = pd.concat([df_eval_single, pipeline_composite_model]) col_map = {"value": "metric_value", "count": "num_triggers"} - num_triggers = df_logs_agg[df_logs_agg["id"] == PipelineStage.HANDLE_SINGLE_TRIGGER.name][["pipeline_ref", "count"]] - accuracies = df_logs_eval_single + num_triggers = df_agg[df_agg["id"] == PipelineStage.HANDLE_SINGLE_TRIGGER.name][["pipeline_ref", "count"]] + accuracies = df_eval_single labels = { "pipeline_ref": "Pipeline", "metric": "Metric", "num_triggers": "#triggers (proxy for cost)", - "metric_value": f"Metric value {'(mean)' if aggregate_metric else ''}", + "metric_value": f"Metric value {'(aggregated)' if aggregate_metric else ''}", } category_orders = { "pipeline_ref": list(sorted(accuracies["pipeline_ref"].unique())), @@ -93,6 +93,11 @@ def gen_fig_scatter_num_triggers( aggregate_func="time_weighted_avg" if time_weighted else "mean", ) merged = num_triggers.merge(mean_accuracies, on="pipeline_ref").rename(columns=col_map, inplace=False) + assert ( + mean_accuracies.shape[0] + == merged.shape[0] + == num_triggers.shape[0] * len(mean_accuracies["metric"].unique()) + ) fig = px.scatter( merged, x="num_triggers", @@ -124,12 +129,23 @@ def gen_fig_scatter_num_triggers( def section3_scatter_num_triggers( - page: str, multi_pipeline_mode: bool, df_logs_agg: pd.DataFrame, df_logs_eval_single: pd.DataFrame + page: str, + multi_pipeline_mode: bool, + df_agg: pd.DataFrame, + df_eval_single: pd.DataFrame, + composite_model_variant: CompositeModelOptions, ) -> html.Div: - assert "pipeline_ref" in df_logs_agg.columns.tolist() - assert "pipeline_ref" in df_logs_eval_single.columns.tolist() - _shared_data.df_logs_agg[page] = df_logs_agg - _shared_data.df_logs_eval_single[page] = df_logs_eval_single + assert "pipeline_ref" in list(df_agg.columns) + assert "pipeline_ref" in list(df_eval_single.columns) + if page not in _shared_data: + _shared_data[page] = _PageState( + composite_model_variant=composite_model_variant, + df_agg=df_agg, + df_eval_single=df_eval_single, + ) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_agg = df_agg + _shared_data[page].df_eval_single = df_eval_single @callback( Output(f"{page}-scatter-plot-num-triggers", "figure"), @@ -137,7 +153,7 @@ def section3_scatter_num_triggers( Input(f"{page}-radio-scatter-number-triggers-dataset-id", "value"), Input(f"{page}-radio-scatter-number-triggers-metric", "value"), Input(f"{page}-radio-scatter-number-triggers-agg-y", "value"), - Input(f"{page}-radio-1d-eval-metric-only-active-model-periods", "value"), + Input(f"{page}-radio-scatter-number-triggers-agg-time-weighted", "value"), Input(f"{page}-radio-scatter-number-triggers-only-active-model-periods", "value"), ) def update_scatter_num_triggers( @@ -159,9 +175,9 @@ def update_scatter_num_triggers( only_active_periods, ) - eval_handler_refs = list(df_logs_eval_single["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_single["dataset_id"].unique()) - eval_metrics = 
list(df_logs_eval_single["metric"].unique()) + eval_handler_refs = list(df_eval_single["eval_handler"].unique()) + eval_datasets = list(df_eval_single["dataset_id"].unique()) + eval_metrics = list(df_eval_single["metric"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/one_dimensional_comparison.py b/analytics/app/pages/plots/one_dimensional_comparison.py index bb852f05c..619aa703e 100644 --- a/analytics/app/pages/plots/one_dimensional_comparison.py +++ b/analytics/app/pages/plots/one_dimensional_comparison.py @@ -1,7 +1,8 @@ -import dataclasses +from dataclasses import dataclass import pandas as pd import plotly.express as px +from analytics.app.data.const import CompositeModelOptions from analytics.app.data.transform import OPTIONAL_EVAL_AGGREGATION_FUNCTION, df_aggregate_eval_metric from dash import Input, Output, callback, dcc, html from modyn.supervisor.internal.grpc.enums import PipelineStage @@ -9,18 +10,20 @@ from typing_extensions import get_args -@dataclasses.dataclass -class _SharedData: - """We use the call by reference features asa the callbacks in the UI are not updated over the lifetime of the app. - Therefore the need a reference to the data structure at startup time (even though data is not available yet). +@dataclass +class _PageState: + """Callbacks cannot be updated after the initial rendering therefore we need to define and update state within + global references. """ - df_logs: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - df_logs_eval_single: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict) - """page, data""" + df_all: pd.DataFrame + df_eval_single: pd.DataFrame + composite_model_variant: CompositeModelOptions = "currently_active_model" + + +_shared_data: dict[str, _PageState] = {} # page -> _PageState -_shared_data = _SharedData() # -------------------------------------------------------------------------------------------------------------------- # # FIGURE # @@ -28,14 +31,14 @@ class _SharedData: def gen_fig_1d_cost(page: str) -> go.Figure: - df_logs = _shared_data.df_logs[page] return px.box( - df_logs, + _shared_data[page].df_all, x="pipeline_ref", y="duration", color="id", labels={"pipeline_ref": "Pipeline", "duration": "duration in seconds", "id": "Pipeline Stage"}, title="Stage costs", + height=900, ) @@ -47,22 +50,28 @@ def gen_figs_1d_eval( agg_func_eval_metric: OPTIONAL_EVAL_AGGREGATION_FUNCTION, only_active_periods: bool = True, ) -> go.Figure: - df_logs = _shared_data.df_logs[page] - df_logs_eval_single = _shared_data.df_logs_eval_single[page] + composite_model_variant = _shared_data[page].composite_model_variant + + df_logs = _shared_data[page].df_all + df_logs_eval_single = _shared_data[page].df_eval_single df_logs_eval_single = df_logs_eval_single[ (df_logs_eval_single["dataset_id"] == dataset_id) & (df_logs_eval_single["eval_handler"] == eval_handler) ] if multi_pipeline_mode or only_active_periods: # we only want the pipeline performance (composed of the models active periods stitched together) - df_logs_eval_single = df_logs_eval_single[df_logs_eval_single["most_recent_model"]] + df_logs_eval_single = df_logs_eval_single[df_logs_eval_single[composite_model_variant]] if not multi_pipeline_mode: assert df_logs_eval_single["pipeline_ref"].nunique() == 1 + digits = len(str(df_logs_eval_single["id_model"].max())) + # fill with leading spaces to have a consistent sorting + df_logs_eval_single["id_model"] = df_logs_eval_single["id_model"].astype(str).str.zfill(digits) + # add the pipeline time 
series which is the performance of different models stitched together dep. # w.r.t which model was active - pipeline_composite_model = df_logs_eval_single[df_logs_eval_single["most_recent_model"]] + pipeline_composite_model = df_logs_eval_single[df_logs_eval_single[composite_model_variant]] pipeline_composite_model["id_model"] = "0-pipeline-composite-model" df_logs_eval_single["id_model"] = df_logs_eval_single["id_model"].astype(str) df_logs_eval_single = pd.concat([df_logs_eval_single, pipeline_composite_model]) @@ -110,13 +119,24 @@ def gen_figs_1d_eval( def section4_1d_boxplots( - page: str, multi_pipeline_mode: bool, df_logs: pd.DataFrame, df_logs_eval_single: pd.DataFrame + page: str, + multi_pipeline_mode: bool, + df_all: pd.DataFrame, + df_eval_single: pd.DataFrame, + composite_model_variant: CompositeModelOptions, ) -> html.Div: - assert "pipeline_ref" in df_logs.columns.tolist() - assert "pipeline_ref" in df_logs_eval_single.columns.tolist() + assert "pipeline_ref" in list(df_all.columns) + assert "pipeline_ref" in list(df_eval_single.columns) - _shared_data.df_logs[page] = df_logs - _shared_data.df_logs_eval_single[page] = df_logs_eval_single + if page not in _shared_data: + _shared_data[page] = _PageState( + composite_model_variant=composite_model_variant, + df_all=df_all, + df_eval_single=df_eval_single, + ) + _shared_data[page].composite_model_variant = composite_model_variant + _shared_data[page].df_all = df_all + _shared_data[page].df_eval_single = df_eval_single @callback( Output(f"{page}-1d-box-plot-metrics", "figure"), @@ -132,13 +152,18 @@ def update_scatter_num_triggers( only_active_periods: bool = True, ) -> go.Figure: return gen_figs_1d_eval( - page, multi_pipeline_mode, eval_handler_ref, dataset_id, agg_func_eval_metric, only_active_periods + page, + multi_pipeline_mode, + eval_handler_ref, + dataset_id, + agg_func_eval_metric, + only_active_periods, ) # DATA (bring all metrics into columns of one dataframe) - eval_handler_refs = list(df_logs_eval_single["eval_handler"].unique()) - eval_datasets = list(df_logs_eval_single["dataset_id"].unique()) + eval_handler_refs = list(df_eval_single["eval_handler"].unique()) + eval_datasets = list(df_eval_single["dataset_id"].unique()) return html.Div( [ diff --git a/analytics/app/pages/plots/pipeline_info.py b/analytics/app/pages/plots/pipeline_info.py index 4733a0cc5..bb8506437 100644 --- a/analytics/app/pages/plots/pipeline_info.py +++ b/analytics/app/pages/plots/pipeline_info.py @@ -1,3 +1,5 @@ +from typing import Any + import dash_cytoscape as cyto import pandas as pd import plotly.express as px @@ -8,13 +10,13 @@ def section0_pipeline( - logs: PipelineLogs, df_logs: pd.DataFrame, df_logs_agg_leaf: pd.DataFrame, df_logs_add_parents: pd.DataFrame + logs: PipelineLogs, df_all: pd.DataFrame, df_agg_leaf: pd.DataFrame, df_add_parents: pd.DataFrame ) -> html.Div: def gen_stage_duration_histogram(stage_id: str) -> go.Figure: return px.histogram( - df_logs[df_logs["id"] == stage_id], + df_all[df_all["id"] == stage_id], title="Stage Duration Histogram", - hover_data=df_logs.columns, + hover_data=df_all.columns, marginal="rug", # rug, box, violin x="duration", labels={"duration": "duration in seconds", "id": "Pipeline Stage"}, @@ -24,13 +26,13 @@ def gen_stage_duration_histogram(stage_id: str) -> go.Figure: ) @callback(Output("pipeline-graph-info", "children"), Input("pipeline-graph", "tapNodeData")) - def display_tap_node_info(data) -> str: + def display_tap_node_info(data: Any) -> str: if not data or "id" not in data: return "Click 
a node to get more information" - series_info = df_logs[df_logs["id"] == data["id"]]["duration"].describe().to_string() + series_info = df_all[df_all["id"] == data["id"]]["duration"].describe().to_string() return ( f"Pipeline Stage: {data['id']}\n" - f"Number of Runs: {df_logs[df_logs['id'] == data['id']].shape[0]}\n" + f"Number of Runs: {df_all[df_all['id'] == data['id']].shape[0]}\n" f"Info about pipeline stage duration:\n" f"{series_info}" ) @@ -39,9 +41,8 @@ def display_tap_node_info(data) -> str: @callback( Output("hist-stage-duration", "figure"), Input("pipeline-graph", "tapNodeData"), - prevent_initial_call="initial_duplicate", ) - def display_tap_node_duration(data) -> go.Figure: + def display_tap_node_duration(data: Any) -> go.Figure: if not data or "id" not in data: stage_id = PipelineStage.MAIN.name else: @@ -50,11 +51,11 @@ def display_tap_node_duration(data) -> go.Figure: return fig_hist_stage_duration fig_pie_pipeline = px.pie( - df_logs_agg_leaf, + df_agg_leaf, values="sum", names="id", hole=0.4, - hover_data=df_logs_agg_leaf, + hover_data=df_agg_leaf, custom_data=["max", "min", "mean", "median", "std", "count"], ) # fig_pie_pipeline.update_traces(textposition='inside', textinfo='percent+label') @@ -76,9 +77,9 @@ def display_tap_node_duration(data) -> go.Figure: fig_sunburst = go.Figure( go.Sunburst( - labels=df_logs_add_parents["id"], - parents=df_logs_add_parents["parent_id"], - values=df_logs_add_parents["sum"], + labels=df_add_parents["id"], + parents=df_add_parents["parent_id"], + values=df_add_parents["sum"], ) ) diff --git a/analytics/app/pages/state.py b/analytics/app/pages/state.py new file mode 100644 index 000000000..6f64c4dfa --- /dev/null +++ b/analytics/app/pages/state.py @@ -0,0 +1,75 @@ +from dataclasses import dataclass + +import pandas as pd +from analytics.app.data.load import list_pipelines, load_pipeline_logs +from analytics.app.data.transform import ( + dfs_models_and_evals, + leaf_stages, + logs_dataframe, + logs_dataframe_agg_by_stage, + pipeline_stage_parents, +) +from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs + + +@dataclass +class ProcessedPipelineData: + pipeline_ref: str + + logs: PipelineLogs + pipeline_leaf_stages: list[str] + + df_all: pd.DataFrame + df_leaf: pd.DataFrame + + df_agg: pd.DataFrame + df_agg_leaf: pd.DataFrame + + df_parents: pd.DataFrame + df_add_parents: pd.DataFrame + + df_models: pd.DataFrame + df_eval_requests: pd.DataFrame | None + df_eval_single: pd.DataFrame | None + + +# ---------------------------------------- Global state (shared by all pages) ---------------------------------------- # + +pipelines = list_pipelines() +max_pipeline_id = max(pipelines.keys()) + +pipeline_data: dict[int, ProcessedPipelineData] = {} + + +def process_pipeline_data(pipeline_id: int) -> ProcessedPipelineData: + pipeline_ref = f"{pipeline_id}".zfill(len(str(max_pipeline_id))) + f" - {pipelines[pipeline_id][0]}" + + logs = load_pipeline_logs(pipeline_id) + pipeline_leaf_stages = leaf_stages(logs) + df_all = logs_dataframe(logs, pipeline_ref) + df_leaf = df_all[df_all["id"].isin(pipeline_leaf_stages)] + + df_agg = logs_dataframe_agg_by_stage(df_all) + df_agg_leaf = df_agg[df_agg["id"].isin(pipeline_leaf_stages)] + + df_parents = pipeline_stage_parents(logs) + df_add_parents = df_agg.merge(df_parents, left_on="id", right_on="id", how="left") + + df_logs_models, df_eval_requests, df_eval_single = dfs_models_and_evals( + logs, df_all["sample_time"].max(), pipeline_ref + ) + + return ProcessedPipelineData( + 
pipeline_ref=pipeline_ref, + logs=logs, + pipeline_leaf_stages=pipeline_leaf_stages, + df_all=df_all, + df_leaf=df_leaf, + df_agg=df_agg, + df_agg_leaf=df_agg_leaf, + df_parents=df_parents, + df_add_parents=df_add_parents, + df_models=df_logs_models, + df_eval_requests=df_eval_requests, + df_eval_single=df_eval_single, + ) diff --git a/analytics/tools/__init__.py b/analytics/tools/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/analytics/tools/aggregate_runs/__init__.py b/analytics/tools/aggregate_runs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/analytics/tools/aggregate_runs/core_aggregation.py b/analytics/tools/aggregate_runs/core_aggregation.py new file mode 100644 index 000000000..d26cc7074 --- /dev/null +++ b/analytics/tools/aggregate_runs/core_aggregation.py @@ -0,0 +1,119 @@ +from copy import deepcopy +from pathlib import Path + +import pandas as pd +from analytics.app.data.transform import dfs_models_and_evals, logs_dataframe +from analytics.tools.aggregate_runs.dir_utils import load_multiple_logfiles +from analytics.tools.aggregate_runs.pipeline_equivalence import assert_pipeline_equivalence +from modyn.supervisor.internal.grpc.enums import PipelineStage +from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs, SingleEvaluationInfo + +DEBUGGING_MODE = True +"""if True, the the process will halt on breakpoints to allow for manual verification""" + + +def merge_files_for_equivalence_group(pipeline_files: list[Path], output_directory: Path) -> None: + """ + Merges the logfiles of a group of equivalent pipelines into one file. + """ + logs = load_multiple_logfiles(pipeline_files) + assert_pipeline_equivalence(logs) + + dfs_logs = [logs_dataframe(log) for log in logs] + + max_sample_time = max([df["sample_time"].max() for df in dfs_logs]) + + dfs_models_evals: list[tuple[pd.DataFrame, pd.DataFrame | None, pd.DataFrame | None]] = [ + dfs_models_and_evals(log, max_sample_time) for log in logs + ] + + df_models = pd.concat([_df_models for _df_models, _, _ in dfs_models_evals]) + assert df_models.shape[0] > 0 + + df_eval_requests = pd.concat( + [ + single_df_eval_requests + for _, single_df_eval_requests, _ in dfs_models_evals + if single_df_eval_requests is not None + ] + ) + assert df_eval_requests.shape[0] > 0 + + df_eval_single = pd.concat( + [_single_eval_df for _, _, _single_eval_df in dfs_models_evals if _single_eval_df is not None] + ) + + if DEBUGGING_MODE: + # TEMPLATE + # df_eval_single[ + # (df_eval_single["model_idx"] == 1) + # & (df_eval_single["eval_handler"] == "exactmatrix") # ADJUST THIS + # & (df_eval_single["dataset_id"] == "cglm_landmark_min25-test") # ADJUST THIS + # & (df_eval_single["interval_start"] == "2004-01-01") # ADJUST THIS + # & (df_eval_single["interval_end"] == "2004-12-31") # ADJUST THIS + # & (df_eval_single["metric"] == "Accuracy") + # ] + breakpoint() + + aggregated_logs = aggregate_eval_metrics(df_eval_single, logs) + aggregated_logs.materialize(output_directory, mode="final") + + if DEBUGGING_MODE: + breakpoint() + + +def aggregate_eval_metrics(df_eval_single: pd.DataFrame, logs: list[PipelineLogs]) -> PipelineLogs: + """ + Aggregates the evaluation metrics group-wise and updates the creates a new PipelineLogs object using + the first log in the list as a template. 
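A toy version of the group-wise aggregation implemented below, with a reduced primary key (the real key also includes eval_handler and dataset_id) and invented metric values:

import pandas as pd

# two seeded runs of the same pipeline, evaluated on the same interval
df = pd.DataFrame(
    {
        "model_idx": [1, 1],
        "metric": ["Accuracy", "Accuracy"],
        "interval_start": ["2004-01-01", "2004-01-01"],
        "interval_end": ["2004-12-31", "2004-12-31"],
        "value": [0.62, 0.68],
        "id_model": [11, 42],
    }
)

groups = df.groupby(["model_idx", "metric", "interval_start", "interval_end"])
num_runs = 2
assert (groups.size() == num_runs).all(), "primary key does not identify exactly one row per run"

aggregated = groups.agg(agg_value=("value", "mean"), id_model_list=("id_model", list)).reset_index()
print(aggregated[["metric", "agg_value", "id_model_list"]])
# Accuracy  0.65  [11, 42]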
+ """ + + # --------------------------------------- Aggregation within eval dataframe -------------------------------------- # + groups = df_eval_single.groupby( + ["model_idx", "eval_handler", "dataset_id", "interval_start", "interval_end", "metric"] + ) + + for size in groups.size(): + assert size == len(logs), "Wrong primary key" + + aggregated_metrics = groups.agg( + agg_value=("value", "mean"), id_model_list=("id_model", lambda x: list(x)) + ).reset_index() + + # sanity check: per aggregated row we find len(logs) unique id_model + assert all( + len(row[1]["id_model_list"]) == len(logs) + for row in aggregated_metrics[["model_idx", "id_model_list"]].iterrows() + ) + + if DEBUGGING_MODE: + # print(aggregated_metrics[["model_idx", "id_model_list"]]) + breakpoint() + + # ---------------------------------- Write back dataframe to PipelineLogs object --------------------------------- # + + aggregated_logs = deepcopy(logs[0]) + for log in aggregated_logs.supervisor_logs.stage_runs: + if log.id == PipelineStage.EVALUATE_SINGLE.name: + assert isinstance(log.info, SingleEvaluationInfo) + if not log.info.results: + continue + + eval_req = log.info.eval_request + + # will yield multiple rows (one per each metric) + request_lookup = aggregated_metrics[ + (aggregated_metrics["id_model_list"].apply(lambda x: eval_req.id_model in x)) + & (aggregated_metrics["eval_handler"] == eval_req.eval_handler) + & (aggregated_metrics["dataset_id"] == eval_req.dataset_id) + & (aggregated_metrics["interval_start"] == pd.to_datetime(eval_req.interval_start, unit="s")) + & (aggregated_metrics["interval_end"] == pd.to_datetime(eval_req.interval_end, unit="s")) + ] + + # find aggregated value + for metric in log.info.results["metrics"]: + lookup = request_lookup[request_lookup["metric"] == metric["name"]] + assert len(lookup) == 1, f"Primary key not unique: {metric['name']}" + metric["result"] = float(lookup["agg_value"].iloc[0]) + + return aggregated_logs diff --git a/analytics/tools/aggregate_runs/dir_utils.py b/analytics/tools/aggregate_runs/dir_utils.py new file mode 100644 index 000000000..af1f431e6 --- /dev/null +++ b/analytics/tools/aggregate_runs/dir_utils.py @@ -0,0 +1,31 @@ +import os +from pathlib import Path + +from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs + + +def group_pipelines_by_name(pipeline_logs_directory: Path) -> dict[str, list[Path]]: + # find the groups of equivalent pipelines via the .name file + + pipeline_directories = [ + pipeline_logs_directory / d for d in os.listdir(pipeline_logs_directory) if str(d).startswith("pipeline_") + ] + + pipeline_names: list[tuple[Path, str]] = [(d, (d / ".name").read_text()) for d in pipeline_directories if (d / "pipeline.log").exists()] + + pipeline_groups = {name: [d for d, n in pipeline_names if n == name] for name in set(n for _, n in pipeline_names)} + return pipeline_groups + + +def load_multiple_logfiles(pipeline_files: list[Path]) -> list[PipelineLogs]: + """ + Args: + pipeline_files: list of paths to pipeline log directories (not files!) 
+ Returns: + list of PipelineLogs + """ + logs = [ + PipelineLogs.model_validate_json((pipeline_logfile / "pipeline.log").read_text()) + for pipeline_logfile in pipeline_files + ] + return logs diff --git a/analytics/tools/aggregate_runs/main.py b/analytics/tools/aggregate_runs/main.py new file mode 100644 index 000000000..dde60a19f --- /dev/null +++ b/analytics/tools/aggregate_runs/main.py @@ -0,0 +1,43 @@ +""" +# Motivation + +We want to increase the confidence in our pipeline run results by running the same experiment pipelines with different +seeds. + +This yields different evaluation metrics. In consequence, we want to aggregate (e.g. mean, median) the evaluation +metrics over runs. +""" + +from pathlib import Path +from typing import Annotated, Optional + +import typer +from analytics.tools.aggregate_runs.core_aggregation import merge_files_for_equivalence_group +from analytics.tools.aggregate_runs.dir_utils import group_pipelines_by_name + + +def main( + logs_directory: Annotated[Path, typer.Argument(help="Path to read the pipelines in from")], + aggregated_log_dir: Annotated[Path, typer.Argument(help="Path to output the aggregated pipelines to")], + pipeline_name: Annotated[ + Optional[str], + typer.Option( + help=( + "If not all pipelines should be aggregated, specify the name of the " + "pipeline to aggregate (as specified in the .name file)" + ) + ), + ] = None, +) -> None: + # find the groups of equivalent pipelines via the .name file + + pipeline_groups = group_pipelines_by_name(logs_directory) + + for group_name, group_pipelines in pipeline_groups.items(): + if pipeline_name is not None and group_name != pipeline_name: + continue + merge_files_for_equivalence_group(group_pipelines, output_directory=aggregated_log_dir) + + +if __name__ == "__main__": + typer.run(main) diff --git a/analytics/tools/aggregate_runs/pipeline_equivalence.py b/analytics/tools/aggregate_runs/pipeline_equivalence.py new file mode 100644 index 000000000..ab64a08a8 --- /dev/null +++ b/analytics/tools/aggregate_runs/pipeline_equivalence.py @@ -0,0 +1,32 @@ +from copy import deepcopy + +from modyn.config.schema.pipeline.sampling.config import CoresetStrategyConfig +from modyn.config.schema.pipeline.sampling.downsampling_config import RHOLossDownsamplingConfig +from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs + + +def assert_pipeline_equivalence(logs: list[PipelineLogs]) -> None: + # assert that all pipelines are the same except from the seed + assert len(logs) >= 1 + + candidates = [deepcopy(log) for log in logs] + # set seeds to seed of first pipeline + # set device to first pipeline since that does not matter + for i, candidate in enumerate(candidates): + candidate.config.pipeline.training.seed = candidates[0].config.pipeline.training.seed + candidate.config.pipeline.training.device = candidates[0].config.pipeline.training.device + candidate.config.pipeline.evaluation.device = candidates[0].config.pipeline.evaluation.device + + if isinstance(candidate.config.pipeline.selection_strategy, CoresetStrategyConfig) and isinstance( + candidate.config.pipeline.selection_strategy.downsampling_config, RHOLossDownsamplingConfig + ): + candidate.config.pipeline.selection_strategy.downsampling_config.il_training_config.device = candidates[ + 0 + ].config.pipeline.selection_strategy.downsampling_config.il_training_config.device + candidate.config.pipeline.selection_strategy.downsampling_config.il_training_config.seed = candidates[ + 0 + 
].config.pipeline.selection_strategy.downsampling_config.il_training_config.seed + + assert all( + [candidate.config == candidates[0].config for candidate in candidates] + ), "Not all pipelines are the same (ignoring seed)" diff --git a/analytics/tools/patch_logfile.ipynb b/analytics/tools/patch_logfile.ipynb index cafc1ca4b..9acf9eef2 100644 --- a/analytics/tools/patch_logfile.ipynb +++ b/analytics/tools/patch_logfile.ipynb @@ -31,6 +31,7 @@ "\n", "from analytics.app.data.transform import logs_dataframe\n", "from pathlib import Path\n", + "from analytics.app.data.transform import dfs_models_and_evals\n", "\n", "\n", "%load_ext autoreload\n", @@ -52,7 +53,7 @@ "source": [ "# VARIABLES\n", "\n", - "pipeline_logfile = Path(\"/Users/robinholzinger/robin/dev/eth/modyn/.data/evaluation_results/pipeline_5/pipeline.log\")" + "pipeline_logfile = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/pipeline_11/pipeline.log\")" ] }, { @@ -70,22 +71,11 @@ "metadata": {}, "outputs": [], "source": [ - "trains = [(l_ for l_ in logs.supervisor_logs.stage_runs if l_.id == PipelineStage.HANDLE_SINGLE_TRIGGER.name)]\n", - "evals = [(l_ for l_ in logs.supervisor_logs.stage_runs if l_.id == PipelineStage.EVALUATE_SINGLE.name and l_.info.eval_request.dataset_id == \"cglm_landmark_min25-test\")]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from analytics.app.data.transform import dfs_models_and_evals\n", "\n", "df_logs = logs_dataframe(logs)\n", "# max_timestamp = df_logs[\"sample_time\"].max()\n", - "max_timestamp = 1703682949\n", - "df_models, df_evals = dfs_models_and_evals(logs, max_timestamp)" + "max_timestamp = df_logs[\"sample_time\"].max()\n", + "df_models, eval_requests, evals_metrics = dfs_models_and_evals(logs, max_timestamp)" ] }, { @@ -110,7 +100,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_evals" + "eval_requests" ] }, { @@ -119,7 +109,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_evals[df_evals[\"most_recent_model\"]]" + "eval_requests[eval_requests[\"currently_active_model\"]]" ] }, { @@ -137,9 +127,6 @@ "source": [ "for eval_log in logs.supervisor_logs.stage_runs:\n", " if eval_log.id == PipelineStage.EVALUATE_SINGLE.name:\n", - " # Let's throw away all information about the most recent model, let's rebuild it\n", - " eval_log.info.eval_request.most_recent_model = False\n", - "\n", " # For a fixed interval the evaluation request of a certain model is the most recent, if the model training\n", " # interval center lies within the evaluation interval.\n", " # Note: this is not a generic solution, but works for the slicing case with fixed evaluation and trigger\n", @@ -148,7 +135,7 @@ " assert len(model_row) == 1\n", "\n", " training_center = (model_row.iloc[0][\"train_start\"].to_pydatetime().timestamp() + model_row.iloc[0][\"train_end\"].to_pydatetime().timestamp()) / 2\n", - " eval_log.info.eval_request.most_recent_model = eval_log.info.eval_request.interval_start <= training_center <= eval_log.info.eval_request.interval_end" + " eval_log.info.eval_request.currently_trained_model = eval_log.info.eval_request.interval_start <= training_center <= eval_log.info.eval_request.interval_end" ] }, { @@ -160,11 +147,152 @@ "# Write results back\n", "pipeline_logfile.write_text(logs.model_dump_json(by_alias=True))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def patch_logfile(path):\n", + " logs = 
PipelineLogs.model_validate_json(path.read_text())\n", + " df_logs = logs_dataframe(logs)\n", + " max_timestamp = df_logs[\"sample_time\"].max()\n", + " df_models, eval_requests, evals_metrics = dfs_models_and_evals(logs, max_timestamp)\n", + "\n", + " for eval_log in logs.supervisor_logs.stage_runs:\n", + " if eval_log.id == PipelineStage.EVALUATE_SINGLE.name:\n", + " # Let's throw away all information about the most recent model, let's rebuild it\n", + " eval_log.info.eval_request.currently_active_model = False\n", + "\n", + " # For a fixed interval the evaluation request of a certain model is the most recent, if the model training\n", + " # interval center lies within the evaluation interval.\n", + " # Note: this is not a generic solution, but works for the slicing case with fixed evaluation and trigger\n", + " # intervals in the same order of magnitude.\n", + " model_row = df_models[df_models[\"id_model\"] == eval_log.info.eval_request.id_model]\n", + " assert len(model_row) == 1\n", + "\n", + " training_center = (model_row.iloc[0][\"train_start\"].to_pydatetime().timestamp() + model_row.iloc[0][\"train_end\"].to_pydatetime().timestamp()) / 2\n", + " eval_log.info.eval_request.currently_active_model = eval_log.info.eval_request.interval_start <= training_center <= eval_log.info.eval_request.interval_end\n", + " eval_log.info.eval_request.currently_trained_model = eval_log.info.eval_request.interval_start <= training_center <= eval_log.info.eval_request.interval_end\n", + "\n", + " patched_path = path.parent / \"pipeline.patched\"\n", + " patched_path.write_text(logs.model_dump_json(by_alias=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "log_dir = Path(\"/Users/mboether/phd/dynamic-data/sigmod-data/cglm-landmark/data_selection_50%/logs\")\n", + "logfiles = [logfile for logfile in log_dir.glob(\"**/pipeline.log\") if (logfile.parent / \"snapshot\").exists()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "for logfile in tqdm(logfiles):\n", + " patch_logfile(logfile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "models_red = df_models[[\"trigger_id\", \"id_model\", \"train_start\", \"train_end\"]]\n", + "models_red" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eval_red = eval_requests[[\"trigger_id\", \"training_idx\", \"model_idx\", \"interval_start\", \"interval_end\", \"eval_handler\", \"dataset_id\"]]\n", + "eval_red" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_cross = models_red.merge(eval_red, on=\"trigger_id\").rename(columns={\"train_start\": \"first_timestamp\", \"train_end\": \"last_timestamp\"})\n", + "assert df_cross.shape[0] == eval_red.shape[0]\n", + "df_cross" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Adapted logic from handler.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# df_cross[\"active_candidate\"] = df_cross[\"last_timestamp\"] < df_cross[\"active_model_trained_before\"]\n", + "\n", + "# # find the maximum model for every EvalCandidate that doesn't violate that 
constraint\n", + "# max_model_id = (\n", + "# df_cross[df_cross[\"active_candidate\"]]\n", + "# .groupby(\"active_model_trained_before\")[\"id_model\"]\n", + "# .aggregate(max_model_id=\"max\")\n", + "# )\n", + "\n", + "# # combine: a model in the cross product is most recent for a certain interval iff\n", + "# # it has maximum model id for its active_model_trained_before\n", + "# df_active_models = df_cross.merge(max_model_id, on=\"active_model_trained_before\", how=\"left\")\n", + "# df_active_models[\"active_model\"] = df_active_models[\"id_model\"] == df_active_models[\"max_model_id\"]\n", + "\n", + "# # for a given interval, the currently trained model is the model with the smallest id\n", + "# # from all models that have a strictly bigger id than the most recent model. Hence it is the model after the\n", + "# # most recent model.\n", + "# # For that we first build a model -> successor model mapping:\n", + "# model_successor_relation = df_active_models[[\"id_model\"]].drop_duplicates().sort_values(by=\"id_model\")\n", + "# model_successor_relation[\"next_id_model\"] = model_successor_relation[\"id_model\"].shift(-1, fill_value=-1)\n", + "\n", + "# # if there's no active model for the first interval(s), we still need to define the next model as the\n", + "# # trained model\n", + "# model_successor_relation = pd.concat(\n", + "# [\n", + "# model_successor_relation,\n", + "# pd.DataFrame([{\"id_model\": None, \"next_id_model\": df_active_models[\"id_model\"].min()}]),\n", + "# ]\n", + "# )\n", + "\n", + "# df_trained_models = df_active_models.merge(\n", + "# model_successor_relation, how=\"left\", left_on=\"max_model_id\", right_on=\"id_model\", suffixes=(\"\", \"__\")\n", + "# )\n", + "# df_trained_models[\"trained_model\"] = df_trained_models[\"id_model\"] == df_trained_models[\"next_id_model\"]\n" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -178,9 +306,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.1.-1" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/dev-requirements.txt b/dev-requirements.txt index 34927185d..b8b416fee 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -22,4 +22,4 @@ seaborn dash dash-daq dash_cytoscape - +dash_bootstrap_components diff --git a/environment.yml b/environment.yml index 5de4819b7..7e494c791 100644 --- a/environment.yml +++ b/environment.yml @@ -14,7 +14,7 @@ channels: - huggingface dependencies: - - python>=3.11 + - python=3.11 - pip - tqdm - conda-forge::enlighten diff --git a/modyn/supervisor/internal/pipeline_executor/models.py b/modyn/supervisor/internal/pipeline_executor/models.py index 470c40c3d..380e34d6d 100644 --- a/modyn/supervisor/internal/pipeline_executor/models.py +++ b/modyn/supervisor/internal/pipeline_executor/models.py @@ -2,13 +2,14 @@ import dataclasses import datetime +import itertools import logging import multiprocessing as mp import os from dataclasses import dataclass from functools import cached_property from pathlib import Path -from typing import Any, Callable, Literal, Optional, Union, cast +from typing import Any, Callable, Iterator, Literal, Optional, Union, cast import pandas as pd from modyn.config.schema.pipeline import ModynPipelineConfig @@ -144,29 +145,34 @@ class StageInfo(BaseModel): `StageInfo` class is therefore intended to be subclassed for different pipeline stage information. 
""" + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return [] + @property - def df(self) -> pd.DataFrame | None: + def df_row(self) -> tuple: """ While appending StageLog subclasses to `StageLog.info` is sufficient to persist additional information in the - logs, this method is used to provide a DataFrame representation of the data for online analysis. - - `Online` refers the to the ability to analyze the data while the pipeline is running as we do not only - want to analyze the data after the pipeline has finished (e.g. for triggering policies). + logs, this method is used to provide a DataFrame representation of the data for analytical purposes. Returns: - A DataFrame if the stage should collect data, else None. + The dataframe rows. """ - return None + return () class FetchDataInfo(StageInfo): num_samples: int = Field(..., description="Number of samples processed in the new data.") trigger_indexes: list[int] = Field(..., description="Indices of triggers in the new data.") + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["num_samples", "trigger_indexes"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame([(self.num_samples, str(self.trigger_indexes))], columns=["num_samples", "trigger_indexes"]) + def df_row(self) -> tuple: + return (self.num_samples, str(self.trigger_indexes)) class ProcessNewDataInfo(StageInfo): @@ -174,10 +180,14 @@ class ProcessNewDataInfo(StageInfo): num_samples: int = Field(..., description="Number of samples processed") trigger_indexes: list[int] = Field(..., description="Indices of triggers in the new data.") + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["fetch_time", "num_samples"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame([(self.fetch_time, self.num_samples)], columns=["fetch_time", "num_samples"]) + def df_row(self) -> tuple: + return (self.fetch_time, self.num_samples) class EvaluateTriggerInfo(StageInfo): @@ -186,10 +196,14 @@ class EvaluateTriggerInfo(StageInfo): trigger_eval_times: list[int] = Field(default_factory=list) """Time in milliseconds that every next(...) call of the trigger.inform(...) 
generator took.""" + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["batch_size", "trigger_indexes"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame([(self.batch_size, list(self.trigger_indexes))], columns=["batch_size", "trigger_indexes"]) + def df_row(self) -> tuple: + return (self.batch_size, list(self.trigger_indexes)) class _TriggerLogMixin(StageInfo): @@ -207,26 +221,28 @@ class SelectorInformTriggerInfo(_TriggerLogMixin): selector_log: dict[str, Any] num_samples_in_trigger: int + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["trigger_i", "trigger_index", "trigger_id", "num_samples_in_trigger"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame( - [(self.trigger_i, self.trigger_index, self.trigger_i, self.num_samples_in_trigger)], - columns=["trigger_i", "trigger_index", "trigger_id", "num_samples_in_trigger"], - ) + def df_row(self) -> tuple: + return (self.trigger_i, self.trigger_index, self.trigger_i, self.num_samples_in_trigger) class TriggerExecutionInfo(_TriggerLogMixin): first_timestamp: int | None last_timestamp: int | None + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["trigger_i", "trigger_index", "trigger_id", "first_timestamp", "last_timestamp"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame( - [(self.trigger_i, self.trigger_index, self.trigger_id, self.first_timestamp, self.last_timestamp)], - columns=["trigger_i", "trigger_index", "trigger_id", "first_timestamp", "last_timestamp"], - ) + def df_row(self) -> tuple: + return (self.trigger_i, self.trigger_index, self.trigger_id, self.first_timestamp, self.last_timestamp) class _TrainInfoMixin(StageInfo): @@ -237,25 +253,27 @@ class _TrainInfoMixin(StageInfo): class TrainingInfo(_TrainInfoMixin): trainer_log: dict[str, Any] + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["trigger_id", "training_id", "num_batches", "num_samples"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame( - [(self.trigger_id, self.training_id, self.trainer_log["num_batches"], self.trainer_log["num_samples"])], - columns=["trigger_id", "training_id", "num_batches", "num_samples"], - ) + def df_row(self) -> tuple: + return (self.trigger_id, self.training_id, self.trainer_log["num_batches"], self.trainer_log["num_samples"]) class StoreModelInfo(_TrainInfoMixin): id_model: int # model_ prefix not allowed in pydantic + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["trigger_id", "training_id", "id_model"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame( - [(self.trigger_id, self.training_id, self.id_model)], - columns=["trigger_id", "training_id", "id_model"], - ) + def df_row(self) -> tuple: + return (self.trigger_id, self.training_id, self.id_model) class SingleEvaluationInfo(StageInfo): @@ -263,24 +281,60 @@ class SingleEvaluationInfo(StageInfo): results: dict[str, Any] = Field(default_factory=dict) failure_reason: str | None = None + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return [ + "trigger_id", + "training_id", + "id_model", + "currently_active_model", + 
"currently_trained_model", + "eval_handler", + "dataset_id", + "interval_start", + "interval_end", + "num_samples", + ] + @override @property - def df(self) -> pd.DataFrame: - """One dataframe per requests (does not contain metrics)""" + def df_row(self) -> tuple: + return ( + self.eval_request.trigger_id, + self.eval_request.training_id, + self.eval_request.id_model, + self.eval_request.currently_active_model, + self.eval_request.currently_trained_model, + self.eval_request.eval_handler, + self.eval_request.dataset_id, + self.eval_request.interval_start, + self.eval_request.interval_end, + self.results.get("dataset_size", 0), + ) + + @classmethod + def results_df(cls, infos: list[SingleEvaluationInfo]) -> pd.DataFrame: + """As one evaluation can have multiple metrics, we return a DataFrame with one row per metric.""" return pd.DataFrame( [ ( - self.eval_request.trigger_id, - self.eval_request.training_id, - self.eval_request.id_model, - self.eval_request.currently_active_model, - self.eval_request.currently_trained_model, - self.eval_request.eval_handler, - self.eval_request.dataset_id, - self.eval_request.interval_start, - self.eval_request.interval_end, - self.results.get("dataset_size", 0), + # per request + info.eval_request.trigger_id, + info.eval_request.training_id, + info.eval_request.id_model, + info.eval_request.currently_active_model, + info.eval_request.currently_trained_model, + info.eval_request.eval_handler, + info.eval_request.dataset_id, + info.eval_request.interval_start, + info.eval_request.interval_end, + info.results.get("dataset_size", 0), + # per metric + metric["name"], + metric["result"], ) + for info in infos + for metric in info.results["metrics"] # pylint: disable=unsubscriptable-object ], columns=[ "trigger_id", @@ -292,35 +346,26 @@ def df(self) -> pd.DataFrame: "dataset_id", "interval_start", "interval_end", - "num_samples", + "dataset_size", + "metric", + "value", ], ) - def results_df(self) -> pd.DataFrame: - """As one evaluation can have multiple metrics, we return a DataFrame with one row per metric.""" - return self.df.merge( - pd.DataFrame( - [ - (metric["name"], metric["result"]) - for metric in self.results["metrics"] # pylint: disable=unsubscriptable-object - ], - columns=["metric", "value"], - ), - how="cross", - ) - class SelectorInformInfo(StageInfo): selector_log: dict[str, Any] | None remaining_data: bool trigger_indexes: list[int] + def df_columns(self) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["remaining_data", "trigger_indexes"] + @override @property - def df(self) -> pd.DataFrame: - return pd.DataFrame( - [(self.remaining_data, self.trigger_indexes)], columns=["remaining_data", "trigger_indexes"] - ) + def df_row(self) -> tuple: + return (self.remaining_data, self.trigger_indexes) StageInfoUnion = Union[ @@ -359,40 +404,23 @@ class StageLog(BaseModel): # stage specific log info info: StageInfo | None = Field(None) - def df(self, extended: bool = False) -> pd.DataFrame | None: - """ - Provides a DataFrame with the log information of this stage. - - To conveniently allow analysis of lists of log entries, this method provides a DataFrame representation of the - log entry. - - Args: - extended: If True, include the columns of the info attribute. Requires all logs to have the same type. - - Returns: - A DataFrame with the log information of this stage. 
- """ - df = pd.DataFrame( - [ - ( - self.id, - self.start, - self.end, - self.duration, - self.batch_idx, - self.sample_idx, - self.sample_time, - self.trigger_idx, - ) - ], - columns=["id", "start", "end", "duration", "batch_idx", "sample_idx", "sample_time", "trigger_idx"], + def df_columns(self, extended: bool = False) -> list[str]: + """Provide the column names of the DataFrame representation of the data.""" + return ["id", "start", "end", "duration", "batch_idx", "sample_idx", "sample_time", "trigger_idx"] + ( + self.info.df_columns() if extended and self.info else [] ) - info_df = self.info.df if self.info else None - if info_df is not None and extended: - # add additional columns - df = pd.concat([df, info_df], axis=1) - return df + def df_row(self, extended: bool = False) -> tuple: + return ( + self.id, + self.start, + self.end, + self.duration, + self.batch_idx, + self.sample_idx, + self.sample_time, + self.trigger_idx, + ) + (self.info.df_row if extended and self.info else ()) # (De)Serialization to enable parsing all classes in the StageInfoUnion; # with that logic we avoid having to add disciminator fields to every subclass of StageInfo @@ -417,6 +445,28 @@ def deserializer(cls, data: Any) -> Any: data["info"] = None return data + @classmethod + def df(cls, stage_logs: Iterator[StageLog], extended: bool = False) -> pd.DataFrame: + """ + Provides a DataFrame with the log information of this stage. + + To conveniently allow analysis of lists of log entries, this method provides a DataFrame representation of the + log entry. + + Args: + extended: If True, include the columns of the info attribute. Requires all logs to have the same type. + + Returns: + A DataFrame row with the log information of this stage. + """ + if not stage_logs: + return pd.DataFrame() + stage_logs, iter_copy = itertools.tee(stage_logs) + return pd.DataFrame( + [stage.df_row(extended=extended) for stage in stage_logs], + columns=next(iter_copy).df_columns(extended=extended), + ) + class SupervisorLogs(BaseModel): stage_runs: list[StageLog] = Field(default_factory=list) diff --git a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py index 52412502b..3062243d8 100644 --- a/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py +++ b/modyn/supervisor/internal/pipeline_executor/pipeline_executor.py @@ -114,8 +114,10 @@ def report_results(stage_log: StageLog) -> None: if track and stage_log.info: # ensure df exists old_df = state.tracking.get(stage_log.id, None) - if (new_rows := stage_log.df(extended=True)) is not None: - state.tracking[stage_log.id] = pd.concat([old_df, new_rows]) if old_df is not None else new_rows + columns = old_df.columns if old_df is not None else stage_log.df_columns(extended=True) + if (new_row := stage_log.df_row(extended=True)) is not None: + new_df = pd.DataFrame([new_row], columns=columns) + state.tracking[stage_log.id] = pd.concat([old_df, new_df]) if old_df is not None else new_df # record logs if log: From 97a2b5f91f80156eeba122a4dbb3eb01aeaa97b7 Mon Sep 17 00:00:00 2001 From: Xianzhe Ma Date: Sun, 23 Jun 2024 20:27:23 +0200 Subject: [PATCH 4/4] Fix batch number (#533) Previously, we didn't record the number of passed batches correctly: We use a `batch_number` which is generated purely from enumeration in `dataloader`. Therefore this number is irrelevant to the number of epochs (only shows how many batches there are in one epoch). 
A similar issue exists in the `StB` iteration when we calculate scores class by class: the number of batches
passed in the previous class is not correctly carried over to the current class.

This PR fixes it.
---
 .../abstract_downsampling_strategy.py         |  18 +-
 .../downsampling_strategies/test_scheduler.py |   6 +-
 .../internal/trainer/test_pytorch_trainer.py  | 196 +++++++++++++++---
 .../internal/trainer/pytorch_trainer.py       |  48 +++--
 4 files changed, 200 insertions(+), 68 deletions(-)

diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py
index a8dae1333..a71879fbb 100644
--- a/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py
+++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py
@@ -43,19 +43,11 @@ def __init__(
         self.requires_remote_computation = True
         self.maximum_keys_in_memory = maximum_keys_in_memory
         self.downsampling_config = downsampling_config
-        self.status_bar_scale = self._compute_status_bar_scale()
-
-    def _compute_status_bar_scale(self) -> int:
-        """
-        This function is used to create the downsampling status bar and handle the training one accordingly.
-
-        For BTS, we return 100 since the training status bar sees all the samples
-        For STB, we return the downsampling_ratio since the training status bar sees only a fraction of points
-        (while the downsampling status bas sees all the points)
-        """
-        if self.downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE:
-            return 100
-        return self.downsampling_ratio
+        # The status bar scale is used in conjunction with the total number of samples (after presampling)
+        # and the number of already trained samples to show the current training progress.
+        # No matter whether it is BtS or StB, the number of trained samples should be compared to the total number
+        # of samples scaled by the downsampling ratio. Therefore, the status bar scale is the downsampling ratio.
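+        # For example, with the default ratio_max of 100 and a downsampling_ratio of 25, roughly a quarter of the
+        # presampled samples end up being trained on, regardless of BtS or StB.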
+ self.status_bar_scale = self.downsampling_ratio @property def downsampling_params(self) -> dict: diff --git a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py index e3783c66b..7b993cb7d 100644 --- a/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py +++ b/modyn/tests/selector/internal/selector_strategies/downsampling_strategies/test_scheduler.py @@ -102,7 +102,7 @@ def test_switch_functions(): "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" - assert downs.training_status_bar_scale == 100 + assert downs.training_status_bar_scale == 25 def test_wrong_number_threshold(): @@ -158,7 +158,7 @@ def test_double_threshold(): "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" - assert downs.training_status_bar_scale == 100 + assert downs.training_status_bar_scale == 25 # above the last threshold for i in range(15, 25): @@ -203,7 +203,7 @@ def test_wrong_trigger(): "ratio_max": 100, } assert downs.downsampling_strategy == "RemoteGradNormDownsampling" - assert downs.training_status_bar_scale == 100 + assert downs.training_status_bar_scale == 25 def test_instantiate_scheduler_just_one(): diff --git a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py index 52527184d..aceb19b3b 100644 --- a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py +++ b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py @@ -10,7 +10,7 @@ from collections import OrderedDict from io import BytesIO from time import sleep -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, call, patch import grpc import pytest @@ -29,6 +29,7 @@ from modyn.trainer_server.internal.metadata_collector.metadata_collector import MetadataCollector from modyn.trainer_server.internal.trainer.metadata_pytorch_callbacks.base_callback import BaseCallback from modyn.trainer_server.internal.trainer.pytorch_trainer import PytorchTrainer, train +from modyn.trainer_server.internal.trainer.remote_downsamplers import RemoteGradMatchDownsamplingStrategy from modyn.trainer_server.internal.utils.trainer_messages import TrainerMessages from modyn.trainer_server.internal.utils.training_info import TrainingInfo from modyn.utils import DownsamplingMode @@ -117,6 +118,28 @@ def get_mock_label_transformer(): ) +class MockDataloader: + def __init__(self, batch_size, num_batches): + self.batch_size = batch_size + self.num_batches = num_batches + self.dataset = MagicMock() + + def __iter__(self): + return iter( + [ + ( + ("1",) * self.batch_size, + torch.ones(self.batch_size, 10, requires_grad=True), + torch.ones(self.batch_size, dtype=torch.uint8), + ) + for _ in range(self.num_batches) + ] + ) + + def __len__(self): + return self.num_batches + + def mock_get_dataloaders( pipeline_id, trigger_id, @@ -135,12 +158,7 @@ def mock_get_dataloaders( log_path, num_batches: int = 100, ): - mock_train_dataloader = iter( - [ - (("1",) * batch_size, torch.ones(batch_size, 10, requires_grad=True), torch.ones(batch_size, dtype=int)) - for _ in range(num_batches) - ] - ) + mock_train_dataloader = MockDataloader(batch_size, num_batches) return mock_train_dataloader, None @@ -257,6 +275,7 @@ def get_training_info( @patch.object(StorageStub, "__init__", noop_constructor_mock) 
@patch.object(SelectorStub, "__init__", noop_constructor_mock) +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch( "modyn.trainer_server.internal.dataset.key_sources.selector_key_source.grpc_connection_established", @@ -266,13 +285,13 @@ def get_training_info( @patch("modyn.trainer_server.internal.utils.training_info.dynamic_module_import") @patch("modyn.trainer_server.internal.trainer.pytorch_trainer.dynamic_module_import") @patch.object(PytorchTrainer, "connect_to_selector", return_value=None) -@patch.object(PytorchTrainer, "get_selection_strategy", return_value=(False, "", {})) +@patch.object(PytorchTrainer, "get_selection_strategy") @patch.object(PytorchTrainer, "get_num_samples_in_trigger") @patch.object(SelectorKeySource, "uses_weights", return_value=False) def get_mock_trainer( modyn_config: ModynConfig, - query_queue: mp.Queue, - response_queue: mp.Queue, + query_queue_training: mp.Queue, + response_queue_training: mp.Queue, use_pretrained: bool, load_optimizer_state: bool, pretrained_model_path: pathlib.Path, @@ -289,22 +308,13 @@ def get_mock_trainer( test_grpc_connection_established_selector: MagicMock, test_grpc_connection_established: MagicMock, batch_size: int = 32, - downsampling_mode: DownsamplingMode = DownsamplingMode.DISABLED, - downsampling_ratio: int = 25, - ratio_max: int = 100, + selection_strategy: tuple[bool, str, dict] = (False, "", {}), ): model_dynamic_module_patch.return_value = MockModule(num_optimizers) lr_scheduler_dynamic_module_patch.return_value = MockLRSchedulerModule() mock_get_num_samples.return_value = batch_size * 100 - if downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE: - mock_selection_strategy.return_value = ( - True, - "RemoteGradNormDownsampling", - {"downsampling_ratio": downsampling_ratio, "ratio_max": ratio_max, "sample_then_batch": False}, - ) - elif downsampling_mode == DownsamplingMode.SAMPLE_THEN_BATCH: - raise NotImplementedError() + mock_selection_strategy.return_value = selection_strategy training_info = get_training_info( 0, @@ -323,8 +333,8 @@ def get_mock_trainer( modyn_config.model_dump(by_alias=True), training_info, "cpu", - query_queue, - response_queue, + query_queue_training, + response_queue_training, mp.Queue(), mp.Queue(), logging.getLogger(__name__), @@ -621,7 +631,6 @@ def test_send_model_state_to_server(dummy_system_config: ModynConfig): } -@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) @patch.object(PytorchTrainer, "weights_handling", return_value=(False, False)) def test_train_invalid_query_message(test_weight_handling, dummy_system_config: ModynConfig): query_status_queue = mp.Queue() @@ -652,7 +661,6 @@ def test_train_invalid_query_message(test_weight_handling, dummy_system_config: # # pylint: disable=too-many-locals -@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) @patch.object(BaseCallback, "on_train_begin", return_value=None) @patch.object(BaseCallback, "on_train_end", return_value=None) @patch.object(BaseCallback, "on_batch_begin", return_value=None) @@ -870,7 +878,6 @@ def test_create_trainer_with_exception( @pytest.mark.parametrize("downsampling_ratio, ratio_max", [(25, 100), (50, 100), (250, 1000), (125, 1000)]) -@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) 
@patch.object(BaseCallback, "on_train_begin", return_value=None) @patch.object(BaseCallback, "on_train_end", return_value=None) @patch.object(BaseCallback, "on_batch_begin", return_value=None) @@ -914,9 +921,11 @@ def test_train_batch_then_sample_accumulation( "custom", False, batch_size=batch_size, - downsampling_mode=DownsamplingMode.BATCH_THEN_SAMPLE, - downsampling_ratio=downsampling_ratio, - ratio_max=ratio_max, + selection_strategy=( + True, + "RemoteGradNormDownsampling", + {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": ratio_max}, + ), ) assert trainer._downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE @@ -949,6 +958,7 @@ def mock_forward(data): assert trainer._num_samples == batch_size * num_batches assert trainer._log["num_samples"] == batch_size * num_batches + assert trainer._log["num_batches"] == num_batches # We only train on whole batches, hence we have to scale by batch size assert trainer._log["num_samples_trained"] == ((expected_bts_size * num_batches) // batch_size) * batch_size assert test_on_batch_begin.call_count == len(trainer._callbacks) * num_batches @@ -970,7 +980,6 @@ def mock_forward(data): assert torch.allclose(data, expected_data) -@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_dataloaders", mock_get_dataloaders) @patch.object(MetadataCollector, "send_metadata", return_value=None) @patch.object(MetadataCollector, "cleanup", return_value=None) @patch.object(CustomLRScheduler, "step", return_value=None) @@ -1003,3 +1012,130 @@ def test_lr_scheduler_init( ) assert trainer._lr_scheduler.T_max == 100 + + +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.SelectorKeySource") +@patch.object(PytorchTrainer, "get_available_labels_from_selector") +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.prepare_per_class_dataloader_from_online_dataset") +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.LocalDatasetWriter") +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.LocalKeySource") +@patch.object(PytorchTrainer, "start_embedding_recording_if_needed") +@patch.object(PytorchTrainer, "end_embedding_recorder_if_needed") +@patch.object(PytorchTrainer, "get_embeddings_if_recorded") +@patch.object(RemoteGradMatchDownsamplingStrategy, "inform_samples") +@patch.object(RemoteGradMatchDownsamplingStrategy, "inform_end_of_current_label") +@patch.object(PytorchTrainer, "update_queue") +def test_downsample_trigger_training_set_label_by_label( + test_update_queue, + test_inform_end_of_current_label, + test_inform_samples, + test_get_embeddings, + test_end_embedding_recording, + test_start_embedding_recording, + test_local_key_source, + test_local_dataset_writer, + test_prepare_per_class_dataloader, + test_get_available_labels, + test_selector_key_source, + dummy_system_config: ModynConfig, +): + batch_size = 4 + available_labels = [0, 1, 2, 3, 4, 5] + test_prepare_per_class_dataloader.return_value = MockDataloader(batch_size, 100) + test_get_available_labels.return_value = available_labels + num_batches = 100 # hardcoded into mock dataloader + query_status_queue_training = mp.Queue() + status_queue_training = mp.Queue() + trainer = get_mock_trainer( + dummy_system_config, + query_status_queue_training, + status_queue_training, + False, + False, + None, + 2, + "custom", + False, + batch_size=batch_size, + selection_strategy=( + True, + "RemoteGradMatchDownsamplingStrategy", + { + "downsampling_ratio": 25, + "downsampling_period": 1, + "sample_then_batch": True, + 
"balance": True, + "ratio_max": 100, + }, + ), + ) + assert trainer._downsampling_mode == DownsamplingMode.SAMPLE_THEN_BATCH + assert trainer._downsampler.requires_data_label_by_label + trainer.downsample_trigger_training_set() + assert test_prepare_per_class_dataloader.call_count == 1 + assert test_update_queue.call_count == len(available_labels) * num_batches + 1 + # check the args of the last call + last_call_args = test_update_queue.call_args_list[-1] + expected_batch_number = len(available_labels) * num_batches + expected_num_samples = expected_batch_number * batch_size + assert last_call_args == call("DOWNSAMPLING", expected_batch_number, expected_num_samples, training_active=True) + assert test_inform_end_of_current_label.call_count == len(available_labels) + + +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.SelectorKeySource") +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.LocalDatasetWriter") +@patch("modyn.trainer_server.internal.trainer.pytorch_trainer.LocalKeySource") +@patch.object(PytorchTrainer, "start_embedding_recording_if_needed") +@patch.object(PytorchTrainer, "end_embedding_recorder_if_needed") +@patch.object(PytorchTrainer, "get_embeddings_if_recorded") +@patch.object(RemoteGradMatchDownsamplingStrategy, "inform_samples") +@patch.object(RemoteGradMatchDownsamplingStrategy, "select_points", return_value=([1, 2], torch.ones(2))) +@patch.object(PytorchTrainer, "update_queue") +def test_downsample_trigger_training_set( + test_update_queue, + test_select_points, + test_inform_samples, + test_get_embeddings, + test_end_embedding_recording, + test_start_embedding_recording, + test_local_key_source, + test_local_dataset_writer, + test_selector_key_source, + dummy_system_config: ModynConfig, +): + batch_size = 4 + num_batches = 100 # hardcoded into mock dataloader + query_status_queue_training = mp.Queue() + status_queue_training = mp.Queue() + trainer = get_mock_trainer( + dummy_system_config, + query_status_queue_training, + status_queue_training, + False, + False, + None, + 2, + "custom", + False, + batch_size=batch_size, + selection_strategy=( + True, + "RemoteGradMatchDownsamplingStrategy", + { + "downsampling_ratio": 25, + "downsampling_period": 1, + "sample_then_batch": True, + "balance": False, + "ratio_max": 100, + }, + ), + ) + assert trainer._downsampling_mode == DownsamplingMode.SAMPLE_THEN_BATCH + assert not trainer._downsampler.requires_data_label_by_label + trainer.downsample_trigger_training_set() + assert test_update_queue.call_count == num_batches + 1 + # check the args of the last call + last_call_args = test_update_queue.call_args_list[-1] + expected_batch_number = num_batches + expected_num_samples = expected_batch_number * batch_size + assert last_call_args == call("DOWNSAMPLING", expected_batch_number, expected_num_samples, training_active=True) diff --git a/modyn/trainer_server/internal/trainer/pytorch_trainer.py b/modyn/trainer_server/internal/trainer/pytorch_trainer.py index b4a755765..82d3da8d6 100644 --- a/modyn/trainer_server/internal/trainer/pytorch_trainer.py +++ b/modyn/trainer_server/internal/trainer/pytorch_trainer.py @@ -211,7 +211,6 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches self._info("Handled OnBegin Callbacks.") self._log["epochs"] = [] - batch_number = -1 if self.num_samples_to_pass == 0: epoch_num_generator: Iterable[int] = range(self.epochs_per_trigger) else: @@ -236,30 +235,33 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches 
batch_accumulator = BatchAccumulator(self._batch_size // post_downsampling_size, self._device) trained_batches = 0 + passed_batches = 0 for epoch in epoch_num_generator: stopw = Stopwatch() # Reset timings per epoch self._log["epochs"].append({}) batch_timings = [] if self._sample_then_batch_this_epoch(epoch): - self.update_queue("TRAINING", batch_number, self._num_samples, training_active=False) + self.update_queue( + "TRAINING", trained_batches, trained_batches * self._batch_size, training_active=False + ) with GPUMeasurement(self._measure_gpu_ops, "DownsampleSTB", self._device, stopw): self.downsample_trigger_training_set() stopw.start("IndivFetchBatch", overwrite=True) stopw.start("FetchBatch", resume=True) - for batch_number, batch in enumerate(self._train_dataloader): + for batch in self._train_dataloader: stopw.stop("FetchBatch") batch_timings.append(stopw.stop("IndivFetchBatch")) retrieve_weights_from_dataloader, weighted_optimization = self.weights_handling(len(batch)) stopw.start("OnBatchBeginCallbacks", resume=True) for _, callback in self._callbacks.items(): - callback.on_batch_begin(self._model.model, self._optimizers, batch, batch_number) + callback.on_batch_begin(self._model.model, self._optimizers, batch, passed_batches) stopw.stop() - self.update_queue("TRAINING", batch_number, self._num_samples, training_active=True) - + self.update_queue("TRAINING", trained_batches, trained_batches * self._batch_size, training_active=True) + passed_batches += 1 with GPUMeasurement(self._measure_gpu_ops, "PreprocessBatch", self._device, stopw, resume=True): sample_ids, target, data = self.preprocess_batch(batch, stopw) @@ -285,6 +287,7 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches data, sample_ids, target, weights = batch_accumulator.get_accumulated_batch() self._assert_data_size(self._batch_size, data, sample_ids, target) + with GPUMeasurement(self._measure_gpu_ops, "Forward", self._device, stopw, resume=True): output = self._model.model(data) @@ -299,7 +302,7 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches stopw.start("OnBatchBeforeUpdate", resume=True) for _, callback in self._callbacks.items(): callback.on_batch_before_update( - self._model.model, self._optimizers, batch_number, sample_ids, data, target, output, loss + self._model.model, self._optimizers, trained_batches, sample_ids, data, target, output, loss ) stopw.stop() @@ -315,10 +318,10 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches self._step_lr_if_necessary(True) - if self._checkpoint_interval > 0 and batch_number % self._checkpoint_interval == 0: + if self._checkpoint_interval > 0 and trained_batches % self._checkpoint_interval == 0: stopw.start("Checkpoint", resume=True) - checkpoint_file_name = self._checkpoint_path / f"model_{batch_number}.modyn" - self.save_state(checkpoint_file_name, batch_number) + checkpoint_file_name = self._checkpoint_path / f"model_{trained_batches}.modyn" + self.save_state(checkpoint_file_name, trained_batches) stopw.stop("Checkpoint") self._num_samples += self._batch_size @@ -326,7 +329,7 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches stopw.start("OnBatchEnd", resume=True) for _, callback in self._callbacks.items(): callback.on_batch_end( - self._model.model, self._optimizers, batch_number, sample_ids, data, target, output, loss + self._model.model, self._optimizers, trained_batches, sample_ids, data, target, output, loss ) stopw.stop() if 0 < 
self.num_samples_to_pass <= self._num_samples: @@ -376,10 +379,11 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches total_stopw.stop("TotalTrain") - self._info(f"Finished training: {self._num_samples} samples, {batch_number + 1} batches.") + self._info(f"Finished training: {self._num_samples} samples, {passed_batches} batches.") self._log["num_samples"] = self._num_samples self._log["num_samples_trained"] = trained_batches * self._batch_size - self._log["num_batches"] = batch_number + 1 + self._log["num_batches"] = passed_batches + self._log["num_batches_trained"] = trained_batches self._log["total_train"] = total_stopw.measurements.get("TotalTrain", 0) self._assert_training_size(epoch, trained_batches) @@ -387,7 +391,7 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches self._persist_pipeline_log() for _, callback in self._callbacks.items(): - callback.on_train_end(self._model.model, self._optimizers, self._num_samples, batch_number) + callback.on_train_end(self._model.model, self._optimizers, self._num_samples, passed_batches) for metric in self._callbacks: self._metadata_collector.send_metadata(metric) @@ -435,7 +439,7 @@ def downsample_trigger_training_set(self) -> None: available_labels = self.get_available_labels_from_selector() number_of_samples = 0 - batch_number = 0 + batch_number = -1 first_label = True for label in available_labels: if first_label: @@ -480,7 +484,7 @@ def downsample_trigger_training_set(self) -> None: ) self._train_dataloader.dataset.change_key_source(new_key_source) - self.update_queue("DOWNSAMPLING", batch_number, number_of_samples, training_active=True) + self.update_queue("DOWNSAMPLING", batch_number + 1, number_of_samples, training_active=True) # set the model to train self._model.model.train() @@ -863,16 +867,16 @@ def _sample_then_batch_this_epoch(self, epoch: int) -> bool: def _iterate_dataloader_and_compute_scores( self, dataloader: torch.utils.data.DataLoader, - previous_batch_number: int = 0, + previous_batch_number: int = -1, previous_number_of_samples: int = 0, ) -> Tuple[int, int]: """ Function to iterate a dataloader, compute the forward pass and send the forward output to the downsampler. Args: dataloader: torch.dataloader to get the data - previous_batch_number: number of batches processed before calling this function. Useful when this function - is called several times to keep track of previous invocations (ex label by label dataloader). We need to - have a total to correctly update the queue and show the progress in the supervisor counter. + previous_batch_number: The batch number returned from the last call to this method. Useful when this + function is called several times to keep track of previous invocations (ex label by label dataloader). We + need to have a total to correctly update the queue and show the progress in the supervisor counter. previous_number_of_samples: number of samples processed before calling this function. See above for the use. Returns: @@ -880,9 +884,9 @@ def _iterate_dataloader_and_compute_scores( """ number_of_samples = previous_number_of_samples batch_number = previous_batch_number - for batch_number, batch in enumerate(dataloader): + for batch in dataloader: self.update_queue("DOWNSAMPLING", batch_number, number_of_samples, training_active=False) - + batch_number += 1 sample_ids, target, data = self.preprocess_batch(batch) number_of_samples += len(sample_ids)