Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add more powerful drift windowing strategies, warmup and dynamic thresholds #564

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9d71dfb
Add more powerful windowing strategies, warumup and dynamic thresholds
robinholzi Jul 4, 2024
244da67
Merge branch 'main' into robinholzi/feat/more-powerful-drift-windows-…
robinholzi Jul 4, 2024
85ed4f5
Merge branch 'main' into robinholzi/feat/more-powerful-drift-windows-…
robinholzi Jul 22, 2024
4ca50ed
Add tests
robinholzi Jul 24, 2024
91e0dda
Merge branch 'main' into robinholzi/feat/more-powerful-drift-windows-…
robinholzi Jul 24, 2024
28dbb9f
Merge branch 'main' into robinholzi/feat/more-powerful-drift-windows-…
robinholzi Jul 26, 2024
6b22b6d
fix linting
robinholzi Jul 26, 2024
0d42176
fix linting
robinholzi Jul 26, 2024
0028e1b
Implement suggestions, v1
robinholzi Aug 7, 2024
9f3b70f
Merge branch 'main' into robinholzi/feat/more-powerful-drift-windows-…
robinholzi Aug 7, 2024
bbacf1e
Integrate suggestions, rename things, more tests, documentation
robinholzi Aug 8, 2024
44f29d1
Fix
robinholzi Aug 8, 2024
836af4f
Merge branch 'main' into robinholzi/feat/more-powerful-drift-windows-…
robinholzi Aug 12, 2024
f950c25
Fix
robinholzi Aug 12, 2024
8912476
Merge branch 'main' into robinholzi/feat/more-powerful-drift-windows-…
robinholzi Aug 12, 2024
c9091aa
Tests and adjustments to warmup
robinholzi Aug 12, 2024
4792fd4
Integrate suggestions, rename things
robinholzi Aug 12, 2024
ed0baaa
fix
robinholzi Aug 12, 2024
ccb6ba3
Fix
robinholzi Aug 13, 2024
46f52d3
Move averaging logic into detector
robinholzi Aug 13, 2024
a2392ad
Merge branch 'main' into robinholzi/feat/more-powerful-drift-windows-…
robinholzi Aug 13, 2024
ea036c1
Final adjustments
robinholzi Aug 14, 2024
9594453
Small fix wrt interval tests
robinholzi Aug 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions modyn/config/schema/pipeline/trigger/drift/alibi_detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
from modyn.config.schema.base_model import ModynBaseModel
from pydantic import Field, model_validator

from modyn.config.schema.pipeline.trigger.drift.metric import BaseMetric

class _AlibiDetectBaseDriftMetric(ModynBaseModel):

class _AlibiDetectBaseDriftMetric(BaseMetric):
p_val: float = Field(0.05, description="The p-value threshold for the drift detection.")
x_ref_preprocessed: bool = Field(False)

Expand Down Expand Up @@ -51,7 +53,7 @@ def validate_threshold_permutations(self) -> "AlibiDetectMmdDriftMetric":
if self.threshold is not None and self.num_permutations is not None:
raise ValueError(
"threshold and num_permutations are mutually exclusive."
+ "Please specify whether you want to use hypothosis testing "
+ "Please specify whether you want to use hypothesis testing "
+ "or threshold comparison for making drift decisions."
)

Expand Down
46 changes: 12 additions & 34 deletions modyn/config/schema/pipeline/trigger/drift/config.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
from __future__ import annotations

from functools import cached_property
from typing import Annotated, ForwardRef, Literal, Optional, Union

from modyn.config.schema.base_model import ModynBaseModel
from modyn.const.regex import REGEX_TIME_UNIT
from modyn.utils.utils import SECONDS_PER_UNIT
from pydantic import Field

from modyn.config.schema.base_model import ModynBaseModel
from modyn.config.schema.pipeline.trigger.drift.detection_window import AmountWindowingStrategy, DriftWindowingStrategy

from .aggregation import DriftAggregationStrategy, MajorityVoteDriftAggregationStrategy
from .alibi_detect import AlibiDetectDriftMetric
from .evidently import EvidentlyDriftMetric
Expand All @@ -23,52 +22,31 @@
]


class AmountWindowingStrategy(ModynBaseModel):
id: Literal["AmountWindowingStrategy"] = Field("AmountWindowingStrategy")
amount: int = Field(1000, description="How many data points should fit in the window")


class TimeWindowingStrategy(ModynBaseModel):
id: Literal["TimeWindowingStrategy"] = Field("TimeWindowingStrategy")
limit: str = Field(
description="Window size as an integer followed by a time unit: s, m, h, d, w, y",
pattern=rf"^\d+{REGEX_TIME_UNIT}$",
)

@cached_property
def limit_seconds(self) -> int:
unit = str(self.limit)[-1:]
num = int(str(self.limit)[:-1])
return num * SECONDS_PER_UNIT[unit]


DriftWindowingStrategy = Annotated[
Union[
AmountWindowingStrategy,
TimeWindowingStrategy,
],
Field(discriminator="id"),
]


class DataDriftTriggerConfig(ModynBaseModel):
id: Literal["DataDriftTrigger"] = Field("DataDriftTrigger")

detection_interval: Optional[__TriggerConfig] = Field( # type: ignore[valid-type]
None, description="The Trigger policy to determine the interval at which drift detection is performed."
) # currently not used

detection_interval_data_points: int = Field(
1000, description="The number of samples in the interval after which drift detection is performed.", ge=1
)

windowing_strategy: DriftWindowingStrategy = Field(
AmountWindowingStrategy(), description="Which windowing strategy to use for current and reference data"
)
warmup_intervals: int | None = Field(
None,
description=(
"The number of intervals before starting to use the drift detection. Some "
"`DecisionCriteria` use this to calibrate the threshold. During the warmup, every interval will cause "
robinholzi marked this conversation as resolved.
Show resolved Hide resolved
"a trigger."
),
)

reset_current_window_on_trigger: bool = Field(
False, description="Whether the current window should be reset on trigger or rather be extended."
)

metrics: dict[str, DriftMetric] = Field(
min_length=1, description="The metrics used for drift detection keyed by a reference."
)
Expand Down
71 changes: 71 additions & 0 deletions modyn/config/schema/pipeline/trigger/drift/detection_window.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from __future__ import annotations

from functools import cached_property
from typing import Annotated, Literal, Union

from pydantic import Field

from modyn.config.schema.base_model import ModynBaseModel
from modyn.const.regex import REGEX_TIME_UNIT
from modyn.utils.utils import SECONDS_PER_UNIT


class _BaseWindowingStrategy(ModynBaseModel):
allow_overlap: bool = Field(
False, description="Whether the windows are allowed to overlap. This is useful for time-based windows."
)


class AmountWindowingStrategy(_BaseWindowingStrategy):
id: Literal["AmountWindowingStrategy"] = Field("AmountWindowingStrategy")
amount_ref: int = Field(1000, description="How many data points should fit in the reference window")
amount_cur: int = Field(1000, description="How many data points should fit in the current window")

@property
def current_buffer_size(self) -> int | None:
return self.amount_ref

@property
def reference_buffer_size(self) -> int | None:
return self.amount_cur


class TimeWindowingStrategy(_BaseWindowingStrategy):
id: Literal["TimeWindowingStrategy"] = Field("TimeWindowingStrategy")
limit_ref: str = Field(
description="Window size as an integer followed by a time unit: s, m, h, d, w, y",
pattern=rf"^\d+{REGEX_TIME_UNIT}$",
)
limit_cur: str = Field(
description="Window size as an integer followed by a time unit: s, m, h, d, w, y",
pattern=rf"^\d+{REGEX_TIME_UNIT}$",
)

@cached_property
def limit_seconds_ref(self) -> int:
unit = str(self.limit_ref)[-1:]
num = int(str(self.limit_ref)[:-1])
return num * SECONDS_PER_UNIT[unit]

@cached_property
def limit_seconds_cur(self) -> int:
unit = str(self.limit_cur)[-1:]
num = int(str(self.limit_cur)[:-1])
return num * SECONDS_PER_UNIT[unit]

@property
def current_buffer_size(self) -> int | None:
return None

@property
def reference_buffer_size(self) -> int | None:
return None


DriftWindowingStrategy = Annotated[
Union[
AmountWindowingStrategy,
TimeWindowingStrategy,
],
Field(discriminator="id"),
]
5 changes: 3 additions & 2 deletions modyn/config/schema/pipeline/trigger/drift/evidently.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from typing import Annotated, Literal, Union

from modyn.config.schema.base_model import ModynBaseModel
from pydantic import Field

from modyn.config.schema.pipeline.trigger.drift.metric import BaseMetric

class _EvidentlyBaseDriftMetric(ModynBaseModel):

class _EvidentlyBaseDriftMetric(BaseMetric):
num_pca_component: int | None = Field(None)


Expand Down
48 changes: 48 additions & 0 deletions modyn/config/schema/pipeline/trigger/drift/metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import Annotated, Literal, Union

from pydantic import Field

from modyn.config.schema.base_model import ModynBaseModel


class HypothesisTestDecisionCriterion(ModynBaseModel):
id: Literal["HypothesisTestDecisionCriterion"] = "HypothesisTestDecisionCriterion"

@property
def needs_calibration(self) -> bool:
MaxiBoether marked this conversation as resolved.
Show resolved Hide resolved
return False


class ThresholdDecisionCriterion(ModynBaseModel):
id: Literal["ThresholdDecisionCriterion"] = "ThresholdDecisionCriterion"
threshold: float

@property
def needs_calibration(self) -> bool:
return False


class DynamicThresholdCriterion(ModynBaseModel):
id: Literal["DynamicThresholdCriterion"] = "DynamicThresholdCriterion"
window_size: int
percentile_threshold: float = Field(
0.05, description="The percentile that a threshold has to be in to trigger a drift event."
)

@property
def needs_calibration(self) -> bool:
return True


DecisionCriterion = Annotated[
Union[
HypothesisTestDecisionCriterion,
ThresholdDecisionCriterion,
DynamicThresholdCriterion,
],
Field(discriminator="id"),
]


class BaseMetric(ModynBaseModel):
decision_criterion: DecisionCriterion
Loading
Loading