Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: More sophisticated evaluation logic #534

Merged
merged 22 commits into from
Jun 23, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
0b9af85
tmp
robinholzi Jun 20, 2024
a974941
strip notebook outputs
robinholzi Jun 20, 2024
f121dce
hotfixes
robinholzi Jun 20, 2024
823638d
extend notebook to support patching multiple runs
MaxiBoether Jun 20, 2024
70bf09d
Merge branch 'robinholzi/fix/eval-plotting' of github.com:eth-easl/modyn
MaxiBoether Jun 20, 2024
e5babff
aggregate
robinholzi Jun 20, 2024
c550b35
wip: merging pipeline dir
MaxiBoether Jun 20, 2024
1e38614
tmp
robinholzi Jun 20, 2024
bd565a4
Merge remote-tracking branch 'origin/main' into robinholzi/fix/eval-plotting
MaxiBoether Jun 21, 2024
edc9252
Merge branch 'main' into robinholzi/fix/eval-plotting
robinholzi Jun 21, 2024
40134df
Make aggregation tool more robust
robinholzi Jun 21, 2024
7d1545b
add button to select composite_model_variant
robinholzi Jun 21, 2024
21eccf4
Merge branch 'robinholzi/fix/eval-plotting' of github.com:eth-easl/modyn
MaxiBoether Jun 21, 2024
e86217a
Improvement to plotting logic
robinholzi Jun 22, 2024
245cc6f
Merge branch 'main' into robinholzi/fix/eval-plotting
robinholzi Jun 22, 2024
1b502d9
Merge branch 'robinholzi/fix/eval-plotting' of github.com:eth-easl/modyn
MaxiBoether Jun 22, 2024
56784d8
pipe equivalence adjustments
MaxiBoether Jun 22, 2024
1a3a548
UI patches
robinholzi Jun 22, 2024
ff38e38
Fix
robinholzi Jun 22, 2024
b77e6d8
also allow runs for which we only have a single log
MaxiBoether Jun 22, 2024
d05dc40
Merge branch 'robinholzi/fix/eval-plotting' of github.com:eth-easl/modyn
MaxiBoether Jun 22, 2024
dbfbb80
only merge complete pipelines
MaxiBoether Jun 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
hotfixes
  • Loading branch information
robinholzi committed Jun 20, 2024
commit f121dce8c2230d0eef9cb2060cabbffed5d4a5ad
1 change: 0 additions & 1 deletion analytics/app/data/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from typing import Any, Literal, cast

import pandas as pd

from modyn.supervisor.internal.grpc.enums import PipelineStage
from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs, SingleEvaluationInfo
from modyn.supervisor.internal.utils.time_tools import generate_real_training_end_timestamp
Expand Down
3 changes: 1 addition & 2 deletions analytics/app/pages/plots/eval_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
from dataclasses import dataclass

import pandas as pd
from analytics.app.data.transform import patch_yearbook_time
from dash import Input, Output, callback, dcc, html
from plotly import graph_objects as go

from analytics.app.data.transform import patch_yearbook_time


@dataclass
class _SharedData:
Expand Down
3 changes: 1 addition & 2 deletions analytics/app/pages/plots/eval_over_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@

import pandas as pd
import plotly.express as px
from analytics.app.data.transform import patch_yearbook_time
from dash import Input, Output, callback, dcc, html
from plotly import graph_objects as go

from analytics.app.data.transform import patch_yearbook_time


@dataclass
class _SharedData:
Expand Down
80 changes: 80 additions & 0 deletions analytics/tools/patch_logfile.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,86 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"models_red = df_models[[\"trigger_id\", \"id_model\", \"train_start\", \"train_end\"]]\n",
"models_red"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"eval_red = eval_requests[[\"trigger_id\", \"training_idx\", \"model_idx\", \"interval_start\", \"interval_end\", \"eval_handler\", \"dataset_id\"]]\n",
"eval_red"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_cross = models_red.merge(eval_red, on=\"trigger_id\").rename(columns={\"train_start\": \"first_timestamp\", \"train_end\": \"last_timestamp\"})\n",
"assert df_cross.shape[0] == eval_red.shape[0]\n",
"df_cross"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Adapted logic from handler.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# df_cross[\"active_candidate\"] = df_cross[\"last_timestamp\"] < df_cross[\"active_model_trained_before\"]\n",
"\n",
"# # find the maximum model for every EvalCandidate that doesn't violate that constraint\n",
"# max_model_id = (\n",
"# df_cross[df_cross[\"active_candidate\"]]\n",
"# .groupby(\"active_model_trained_before\")[\"id_model\"]\n",
"# .aggregate(max_model_id=\"max\")\n",
"# )\n",
"\n",
"# # combine: a model in the cross product is most recent for a certain interval iff\n",
"# # it has maximum model id for its active_model_trained_before\n",
"# df_active_models = df_cross.merge(max_model_id, on=\"active_model_trained_before\", how=\"left\")\n",
"# df_active_models[\"active_model\"] = df_active_models[\"id_model\"] == df_active_models[\"max_model_id\"]\n",
"\n",
"# # for a given interval, the currently trained model is the model with the smallest id\n",
"# # from all models that have a strictly bigger id than the most recent model. Hence it is the model after the\n",
"# # most recent model.\n",
"# # For that we first build a model -> successor model mapping:\n",
"# model_successor_relation = df_active_models[[\"id_model\"]].drop_duplicates().sort_values(by=\"id_model\")\n",
"# model_successor_relation[\"next_id_model\"] = model_successor_relation[\"id_model\"].shift(-1, fill_value=-1)\n",
"\n",
"# # if there's no active model for the first interval(s), we still need to define the next model as the\n",
"# # trained model\n",
"# model_successor_relation = pd.concat(\n",
"# [\n",
"# model_successor_relation,\n",
"# pd.DataFrame([{\"id_model\": None, \"next_id_model\": df_active_models[\"id_model\"].min()}]),\n",
"# ]\n",
"# )\n",
"\n",
"# df_trained_models = df_active_models.merge(\n",
"# model_successor_relation, how=\"left\", left_on=\"max_model_id\", right_on=\"id_model\", suffixes=(\"\", \"__\")\n",
"# )\n",
"# df_trained_models[\"trained_model\"] = df_trained_models[\"id_model\"] == df_trained_models[\"next_id_model\"]\n"
]
}
],
"metadata": {
Expand Down
7 changes: 7 additions & 0 deletions modyn/config/schema/pipeline/trigger.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ class TimeTriggerConfig(ModynBaseModel):
description="Interval length for the trigger as an integer followed by a time unit: s, m, h, d, w, y",
pattern=rf"^\d+{REGEX_TIME_UNIT}$",
)
start_timestamp: int | None = Field(
None,
description=(
"The timestamp at which the triggering schedule starts. First trigger will be at start_timestamp + every."
"Use None to start at the first timestamp of the data."
),
)

@cached_property
def every_seconds(self) -> int:
Expand Down
15 changes: 3 additions & 12 deletions modyn/supervisor/internal/eval/handler.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
from __future__ import annotations

import pandas as pd
from pydantic import BaseModel, Field

from modyn.config.schema.pipeline import EvalHandlerConfig
from modyn.supervisor.internal.eval.strategies.abstract import AbstractEvalStrategy
from modyn.supervisor.internal.utils.time_tools import generate_real_training_end_timestamp
from modyn.utils import dynamic_module_import
from pydantic import BaseModel, Field

eval_strategy_module = dynamic_module_import("modyn.supervisor.internal.eval.strategies")

Expand Down Expand Up @@ -89,15 +87,8 @@ def get_eval_requests_after_pipeline(self, df_trainings: pd.DataFrame) -> list[E
) == set()
df_trainings = df_trainings.copy()

# for sparse datasets we want to use next_training_start-1 as training interval end instead of last_timestamp
# as there could be a long gap between the max(sample_time) in one training batch and the min(sample_time) in
# the next training batch.
# e.g. if we want to train for 1.1.2020-31.12.2020 but only have timestamps on 1.1.2020, last_timestamp
# would be 1.1.2020, but the next training would start on 1.1.2021.
df_trainings["real_last_timestamp"] = generate_real_training_end_timestamp(df_trainings)

training_intervals: list[tuple[int, int]] = [
(row["first_timestamp"], row["real_last_timestamp"]) for _, row in df_trainings.iterrows()
(row["first_timestamp"], row["last_timestamp"]) for _, row in df_trainings.iterrows()
]
eval_intervals = self.eval_strategy.get_eval_intervals(training_intervals)
df_eval_intervals = pd.DataFrame(
Expand All @@ -114,7 +105,7 @@ def get_eval_requests_after_pipeline(self, df_trainings: pd.DataFrame) -> list[E
# Check if a combination is the active. We first compute if model was trained before
# the usage starts last_timestamp (df_trainings) defines the end of the training data;
# active_model_trained_before (df_eval_intervals): defines center of an eval intervals.
df_cross["active_candidate"] = df_cross["real_last_timestamp"] < df_cross["active_model_trained_before"]
df_cross["active_candidate"] = df_cross["last_timestamp"] < df_cross["active_model_trained_before"]

# find the maximum model for every EvalCandidate that doesn't violate that constraint
max_model_id = (
Expand Down
19 changes: 11 additions & 8 deletions modyn/supervisor/internal/triggers/timetrigger.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,23 @@ class TimeTrigger(Trigger):
Clock starts with the first observed datapoint"""

def __init__(self, config: TimeTriggerConfig):
self.trigger_every_s: int = config.every_seconds
self.config = config
self.next_trigger_at: int | None = None

if self.trigger_every_s < 1:
raise ValueError(f"trigger_every must be > 0, but is {self.trigger_every_s}")
if self.config.every_seconds < 1:
raise ValueError(f"trigger_every must be > 0, but is {self.config.every_seconds}")

super().__init__()

def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, None]:
if self.next_trigger_at is None:
if len(new_data) > 0:
self.next_trigger_at = new_data[0][1] + self.trigger_every_s # new_data is sorted
if self.config.start_timestamp is not None:
self.next_trigger_at = self.config.start_timestamp + self.config.every_seconds
else:
return
if len(new_data) > 0:
self.next_trigger_at = new_data[0][1] + self.config.every_seconds # new_data is sorted
else:
return

max_timestamp = new_data[-1][1] # new_data is sorted
triggering_indices = []
Expand All @@ -42,9 +45,9 @@ def inform(self, new_data: list[tuple[int, int, int]]) -> Generator[int, None, N
# This means that there was a trigger before the first item that we got informed about
# However, there might have been multiple triggers, e.g., if there is one trigger every second
# and 5 seconds have passed since the last item came through
# This is caught by our while loop which increases step by step for `trigger_every_s`.
# This is caught by our while loop which increases step by step for `config.every_seconds`.

triggering_indices.append(idx - 1)
self.next_trigger_at += self.trigger_every_s
self.next_trigger_at += self.config.every_seconds

yield from triggering_indices
Loading