Commit
chore: Miscellaneous updates mostly wrt. offline drift exploration (#592)
robinholzi authored Aug 19, 2024
1 parent 5d1b088 commit 7130d6b
Showing 10 changed files with 229 additions and 135 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -79,10 +79,10 @@ cmake-build-debug/
clang-tidy-build/
libbuild/


# Data & config files

.data/
.debug/
.env

exploration/
110 changes: 0 additions & 110 deletions benchmark/huffpost_kaggle/analytics.ipynb

This file was deleted.

6 changes: 2 additions & 4 deletions docker-compose.yml
@@ -1,5 +1,3 @@
- version: "3.8"

# By default, we disable mounting the current directory under /modyn_host. However, this might be helpful for local development.
# For the trainer server, you might additionally want to enable the runtime and deployment options to make the GPU available in the container.
# For the storage, you probably want to mount some volume containing the datasets.
@@ -8,7 +6,7 @@ version: "3.8"

services:
metadata-db:
- image: postgres:15.2-alpine
+ image: postgres:16.4-alpine
restart: always
environment:
POSTGRES_USER: postgres
@@ -28,7 +26,7 @@ services:
timeout: 5s
retries: 20
storage-db:
- image: postgres:15.2-alpine
+ image: postgres:16.4-alpine
restart: always
environment:
POSTGRES_USER: postgres
84 changes: 84 additions & 0 deletions modyn/config/examples/modyn_config.yaml
@@ -47,6 +47,24 @@ storage:
selector_batch_size: 2000000,
},
# ---------------------------------- YEARBOOK ---------------------------------- #
{
name: "yearbook_all",
description: "Yearbook Dataset from Wild-Time (full set)",
version: "0.0.1",
base_path: "/datasets/yearbook/all",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "BinaryFileWrapper",
file_wrapper_config:
{
byteorder: "big",
record_size: 12292,
label_size: 4,
file_extension: ".bin",
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 256,
},
{
name: "yearbook_train",
description: "Yearbook Dataset from Wild-Time (training set)",
@@ -114,6 +132,39 @@ storage:
file_watcher_interval: 5,
selector_batch_size: 4096,
},
# ------------------------------- HUFFPOST KAGGLE ------------------------------ #
{
name: "huffpost_kaggle_train",
description: "Original Huffpost Dataset from Kaggle (train)",
version: "0.0.1",
base_path: "/datasets/huffpost_kaggle/train",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config: {
file_extension: ".csv",
separator: "\t", # TSV is the best option here since headlines contain commas and semicolons
label_index: 1,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,
},
{
name: "huffpost_kaggle_test",
description: "Original Huffpost Dataset from Kaggle (test)",
version: "0.0.1",
base_path: "/datasets/huffpost_kaggle/test",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config: {
file_extension: ".csv",
separator: "\t", # TSV is the best option here since headlines contain commas and semicolons
label_index: 1,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,
},
# ------------------------------------ ARXIV ----------------------------------- #
{
name: "arxiv_train",
@@ -147,6 +198,39 @@ storage:
file_watcher_interval: 5,
selector_batch_size: 4096,
},
# -------------------------------- ARXIV KAGGLE -------------------------------- #
{
name: "arxiv_kaggle_train",
description: "Original Arxiv Dataset from Kaggle (training set)",
version: "0.0.1",
base_path: "/datasets/arxiv_kaggle/train",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config: {
file_extension: ".csv",
separator: "\t", # TSV is the best option here since sentences contain commas and semicolons
label_index: 1,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,
},
{
name: "arxiv_kaggle_test",
description: "Original Arxiv Dataset from Kaggle (test set)",
version: "0.0.1",
base_path: "/datasets/arxiv_kaggle/test",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config: {
file_extension: ".csv",
separator: "\t", # TSV is the best option here since sentences contain commas and semicolons
label_index: 1,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,
},
# ------------------------------------ CLOC ------------------------------------ #
{
name: "cloc",
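A note on the new dataset entries above. For yearbook_all, record_size 12292 with label_size 4 and byteorder "big" implies fixed-size records of a 4-byte big-endian label followed by 12288 payload bytes, which would match a 32x32, 3-channel float32 image; that shape is an inference from the numbers, not stated in the diff. A minimal parsing sketch under that assumption (the actual BinaryFileWrapper implementation is not part of this commit):

import struct

import numpy as np

RECORD_SIZE = 12292  # record_size from the yearbook_all config
LABEL_SIZE = 4       # label_size from the yearbook_all config

def parse_record(record: bytes) -> tuple[int, np.ndarray]:
    assert len(record) == RECORD_SIZE
    # byteorder "big": read the 4-byte label as a big-endian unsigned int
    (label,) = struct.unpack(">I", record[:LABEL_SIZE])
    # The remaining 12288 bytes hold 32 * 32 * 3 big-endian float32 values
    # (dtype and shape are assumptions, not specified by this commit).
    features = np.frombuffer(record[LABEL_SIZE:], dtype=">f4")
    return label, features

For the huffpost_kaggle and arxiv_kaggle entries, the CsvFileWrapper is configured with a tab separator (the texts contain commas and semicolons) and the label in column 1. A rough sketch of reading such a file; header handling and label typing in the real CsvFileWrapper are not shown here:

import csv

def read_tsv_samples(path: str, label_index: int = 1) -> list[tuple[str, str]]:
    samples = []
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.reader(f, delimiter="\t"):
            # column `label_index` holds the label; the rest is the sample text
            label = row[label_index]
            text = "\t".join(col for i, col in enumerate(row) if i != label_index)
            samples.append((text, label))
    return samples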
11 changes: 11 additions & 0 deletions modyn/config/schema/pipeline/trigger/drift/alibi_detect.py
@@ -8,11 +8,15 @@

from modyn.config.schema.base_model import ModynBaseModel
from modyn.config.schema.pipeline.trigger.drift.metric import BaseMetric
from modyn.config.schema.pipeline.trigger.drift.preprocess.alibi_detect import (
AlibiDetectNLPreprocessor,
)


class _AlibiDetectBaseDriftMetric(BaseMetric):
p_val: float = Field(0.05, description="The p-value threshold for the drift detection.")
x_ref_preprocessed: bool = Field(False)
preprocessor: AlibiDetectNLPreprocessor | None = Field(None, description="Preprocessor function.")


class AlibiDetectDeviceMixin(ModynBaseModel):
@@ -65,6 +69,13 @@ def validate_threshold_permutations(self) -> "AlibiDetectMmdDriftMetric":
return self


class AlibiDetectClassifierDriftMetric(_AlibiDetectBaseDriftMetric, AlibiDetectDeviceMixin):
id: Literal["AlibiDetectClassifierDriftMetric"] = Field("AlibiDetectClassifierDriftMetric")
classifier_id: str = Field(
description="The model to use for classification; has to be registered in alibi_detector.py"
)


class AlibiDetectKSDriftMetric(
_AlibiDetectBaseDriftMetric,
_AlibiDetectAlternativeMixin,
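To illustrate how the new preprocessor field slots in, here is a hedged sketch of constructing an MMD drift metric with an NLP preprocessor. Only p_val, x_ref_preprocessed, and preprocessor are visible in this diff; the remaining AlibiDetectMmdDriftMetric fields are assumed to have defaults:

from modyn.config.schema.pipeline.trigger.drift.alibi_detect import AlibiDetectMmdDriftMetric
from modyn.config.schema.pipeline.trigger.drift.preprocess.alibi_detect import AlibiDetectNLPreprocessor

metric = AlibiDetectMmdDriftMetric(
    p_val=0.05,
    preprocessor=AlibiDetectNLPreprocessor(
        tokenizer_model="bert-base-cased",  # example name from the field description
        n_layers=8,   # default number of hidden layers to embed
        max_len=64,   # required: maximum token sequence length
        batch_size=32,
    ),
)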
30 changes: 30 additions & 0 deletions modyn/config/schema/pipeline/trigger/drift/preprocess/alibi_detect.py
@@ -0,0 +1,30 @@
from collections.abc import Callable
from functools import partial

from alibi_detect.cd.pytorch import preprocess_drift
from alibi_detect.models.pytorch import TransformerEmbedding
from pydantic import Field
from transformers import AutoTokenizer

from modyn.config.schema.base_model import ModynBaseModel


class AlibiDetectNLPreprocessor(ModynBaseModel):
tokenizer_model: str = Field(description="AutoTokenizer pretrained model name, e.g. bert-base-cased.")
n_layers: int = Field(8)
max_len: int = Field(..., description="Maximum length of input token sequences.")
batch_size: int = Field(32, description="Batch size for tokenization.")

def gen_preprocess_fn(self, device: str | None) -> Callable:
tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_model)
emb_type = "hidden_state"
# select the last n_layers hidden states via negative layer indices
layers = [-i for i in range(1, self.n_layers + 1)]

embedding = TransformerEmbedding(self.tokenizer_model, emb_type, layers)
if device:
embedding = embedding.to(device)
embedding = embedding.eval()

return partial(
preprocess_drift, model=embedding, tokenizer=tokenizer, max_len=self.max_len, batch_size=self.batch_size
)
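For usage context: the generated callable is what alibi-detect expects as a preprocess_fn, mapping raw strings to embeddings. A small sketch (this downloads the Hugging Face model on first use):

preprocessor = AlibiDetectNLPreprocessor(
    tokenizer_model="bert-base-cased", n_layers=8, max_len=64, batch_size=32
)
preprocess_fn = preprocessor.gen_preprocess_fn(device=None)
# preprocess_fn can be passed as `preprocess_fn` to detectors such as
# alibi_detect.cd.MMDDrift; here we just call it directly on raw strings.
embeddings = preprocess_fn(["a headline", "another headline"])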
5 changes: 5 additions & 0 deletions modyn/supervisor/internal/triggers/drift/classifier_models/__init__.py
@@ -0,0 +1,5 @@
from modyn.supervisor.internal.triggers.drift.classifier_models.ybnet_classifier import YearbookNetDriftDetector

alibi_classifier_models = {
"ybnet": YearbookNetDriftDetector(3),
}
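The registry above maps the classifier_id values accepted by AlibiDetectClassifierDriftMetric to concrete models. A hedged lookup sketch; the registry's module path is inferred from the import above, and the consuming code in alibi_detector.py is not part of this commit:

from modyn.supervisor.internal.triggers.drift.classifier_models import alibi_classifier_models

classifier_id = "ybnet"  # value carried by AlibiDetectClassifierDriftMetric.classifier_id
model = alibi_classifier_models[classifier_id]  # -> YearbookNetDriftDetector(3)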
36 changes: 36 additions & 0 deletions modyn/supervisor/internal/triggers/drift/classifier_models/ybnet_classifier.py
@@ -0,0 +1,36 @@
import torch
from torch import nn

from modyn.models.coreset_methods_support import CoresetSupportingModule


class YearbookNetDriftDetector(CoresetSupportingModule):
def __init__(self, num_input_channels: int) -> None:
super().__init__()
self.enc = nn.Sequential(
self.conv_block(num_input_channels, 32),
self.conv_block(32, 32),
self.conv_block(32, 32),
self.conv_block(32, 32),
)
self.hid_dim = 32
# Binary classifier for drift detection
# see: https://docs.seldon.io/projects/alibi-detect/en/latest/cd/methods/classifierdrift.html
self.classifier = nn.Sequential(nn.Flatten(), nn.Linear(32, 2))

def conv_block(self, in_channels: int, out_channels: int) -> nn.Module:
return nn.Sequential(
nn.Conv2d(in_channels, out_channels, 3, padding=1),
nn.BatchNorm2d(out_channels),
nn.ReLU(),
nn.MaxPool2d(2),
)

def forward(self, data: torch.Tensor) -> torch.Tensor:
data = self.enc(data)
data = torch.mean(data, dim=(2, 3))
data = self.classifier(data)
return data

def get_last_layer(self) -> nn.Module:
return self.classifier
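For context, a minimal sketch of how a module like this can drive alibi-detect's classifier-based detector; the detector wiring itself is not part of this commit, and the data below is random placeholder input:

import numpy as np
from alibi_detect.cd import ClassifierDrift

model = YearbookNetDriftDetector(num_input_channels=3)

x_ref = np.random.randn(64, 3, 32, 32).astype(np.float32)  # reference window (dummy)
x_cur = np.random.randn(64, 3, 32, 32).astype(np.float32)  # current window (dummy)

cd = ClassifierDrift(x_ref, model, backend="pytorch", p_val=0.05, preds_type="logits")
result = cd.predict(x_cur)
print(result["data"]["is_drift"])  # 1 if the classifier can separate the two windows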