Commit
chore: Miscellaneous updates mostly wrt. offline drift exploration (#592)
robinholzi authored Aug 19, 2024
1 parent 5d1b088 commit 7130d6b
Showing 10 changed files with 229 additions and 135 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -79,10 +79,10 @@ cmake-build-debug/
clang-tidy-build/
libbuild/


# Data & config files

.data/
.debug/
.env

exploration/
110 changes: 0 additions & 110 deletions benchmark/huffpost_kaggle/analytics.ipynb

This file was deleted.

6 changes: 2 additions & 4 deletions docker-compose.yml
@@ -1,5 +1,3 @@
- version: "3.8"

# By default, we disable mounting the current directory under /modyn_host. However, this might be helpful for local development.
# For the trainer server, you might additionally want to enable the runtime and deployment options to make the GPU available in the container.
# For the storage, you probably want to mount some volume containing the datasets.
@@ -8,7 +6,7 @@ version: "3.8"

services:
metadata-db:
- image: postgres:15.2-alpine
+ image: postgres:16.4-alpine
restart: always
environment:
POSTGRES_USER: postgres
@@ -28,7 +26,7 @@ services:
timeout: 5s
retries: 20
storage-db:
- image: postgres:15.2-alpine
+ image: postgres:16.4-alpine
restart: always
environment:
POSTGRES_USER: postgres
84 changes: 84 additions & 0 deletions modyn/config/examples/modyn_config.yaml
@@ -47,6 +47,24 @@ storage:
selector_batch_size: 2000000,
},
# ---------------------------------- YEARBOOK ---------------------------------- #
{
name: "yearbook_all",
description: "Yearbook Dataset from Wild-Time (full set)",
version: "0.0.1",
base_path: "/datasets/yearbook/all",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "BinaryFileWrapper",
file_wrapper_config:
{
byteorder: "big",
record_size: 12292,
label_size: 4,
file_extension: ".bin",
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 256,
},
{
name: "yearbook_train",
description: "Yearbook Dataset from Wild-Time (training set)",
@@ -114,6 +132,39 @@ storage:
file_watcher_interval: 5,
selector_batch_size: 4096,
},
# ------------------------------- HUFFPOST KAGGLE ------------------------------ #
{
name: "huffpost_kaggle_train",
description: "Original Huffpost Dataset from Kaggle (train)",
version: "0.0.1",
base_path: "/datasets/huffpost_kaggle/train",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config: {
file_extension: ".csv",
separator: "\t", # TSV is the best option here since headlines contain commas and semicolons
label_index: 1,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,
},
{
name: "huffpost_kaggle_test",
description: "Original Huffpost Dataset from Kaggle (test)",
version: "0.0.1",
base_path: "/datasets/huffpost_kaggle/test",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config: {
file_extension: ".csv",
separator: "\t", # TSV is the best option here since headlines contain commas and semicolons
label_index: 1,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,
},
# ------------------------------------ ARXIV ----------------------------------- #
{
name: "arxiv_train",
@@ -147,6 +198,39 @@ storage:
file_watcher_interval: 5,
selector_batch_size: 4096,
},
# -------------------------------- ARXIV KAGGLE -------------------------------- #
{
name: "arxiv_kaggle_train",
description: "Original Arxiv Dataset from Kaggle (training set)",
version: "0.0.1",
base_path: "/datasets/arxiv_kaggle/train",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config: {
file_extension: ".csv",
separator: "\t", # TSV is the best option here since sentences contain commas and semicolons
label_index: 1,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,
},
{
name: "arxiv_kaggle_test",
description: "Original Arxiv Dataset from Kaggle (test set)",
version: "0.0.1",
base_path: "/datasets/arxiv_kaggle/test",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config: {
file_extension: ".csv",
separator: "\t", # TSV is the best option here since sentences contain commas and semicolons
label_index: 1,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,
},
# ------------------------------------ CLOC ------------------------------------ #
{
name: "cloc",
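A note on the new dataset entries above. For yearbook_all, record_size 12292 with label_size 4 and byteorder "big" implies fixed-size records of a 4-byte big-endian label followed by 12288 payload bytes, which would match a 32x32, 3-channel float32 image; that shape is an inference from the numbers, not stated in the diff. A minimal parsing sketch under that assumption (the actual BinaryFileWrapper implementation is not part of this commit):

import struct

import numpy as np

RECORD_SIZE = 12292  # record_size from the yearbook_all config
LABEL_SIZE = 4       # label_size from the yearbook_all config

def parse_record(record: bytes) -> tuple[int, np.ndarray]:
    assert len(record) == RECORD_SIZE
    # byteorder "big": read the 4-byte label as a big-endian unsigned int
    (label,) = struct.unpack(">I", record[:LABEL_SIZE])
    # The remaining 12288 bytes hold 32 * 32 * 3 big-endian float32 values
    # (dtype and shape are assumptions, not specified by this commit).
    features = np.frombuffer(record[LABEL_SIZE:], dtype=">f4")
    return label, features

For the huffpost_kaggle and arxiv_kaggle entries, the CsvFileWrapper is configured with a tab separator (the texts contain commas and semicolons) and the label in column 1. A rough sketch of reading such a file; header handling and label typing in the real CsvFileWrapper are not shown here:

import csv

def read_tsv_samples(path: str, label_index: int = 1) -> list[tuple[str, str]]:
    samples = []
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.reader(f, delimiter="\t"):
            # column `label_index` holds the label; the rest is the sample text
            label = row[label_index]
            text = "\t".join(col for i, col in enumerate(row) if i != label_index)
            samples.append((text, label))
    return samples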
11 changes: 11 additions & 0 deletions modyn/config/schema/pipeline/trigger/drift/alibi_detect.py
@@ -8,11 +8,15 @@

from modyn.config.schema.base_model import ModynBaseModel
from modyn.config.schema.pipeline.trigger.drift.metric import BaseMetric
from modyn.config.schema.pipeline.trigger.drift.preprocess.alibi_detect import (
AlibiDetectNLPreprocessor,
)


class _AlibiDetectBaseDriftMetric(BaseMetric):
p_val: float = Field(0.05, description="The p-value threshold for the drift detection.")
x_ref_preprocessed: bool = Field(False)
preprocessor: AlibiDetectNLPreprocessor | None = Field(None, description="Preprocessor function.")


class AlibiDetectDeviceMixin(ModynBaseModel):
@@ -65,6 +69,13 @@ def validate_threshold_permutations(self) -> "AlibiDetectMmdDriftMetric":
return self


class AlibiDetectClassifierDriftMetric(_AlibiDetectBaseDriftMetric, AlibiDetectDeviceMixin):
id: Literal["AlibiDetectClassifierDriftMetric"] = Field("AlibiDetectClassifierDriftMetric")
classifier_id: str = Field(
description="The model to use for classification; has to be registered in alibi_detector.py"
)


class AlibiDetectKSDriftMetric(
_AlibiDetectBaseDriftMetric,
_AlibiDetectAlternativeMixin,
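To illustrate how the new preprocessor field slots in, here is a hedged sketch of constructing an MMD drift metric with an NLP preprocessor. Only p_val, x_ref_preprocessed, and preprocessor are visible in this diff; the remaining AlibiDetectMmdDriftMetric fields are assumed to have defaults:

from modyn.config.schema.pipeline.trigger.drift.alibi_detect import AlibiDetectMmdDriftMetric
from modyn.config.schema.pipeline.trigger.drift.preprocess.alibi_detect import AlibiDetectNLPreprocessor

metric = AlibiDetectMmdDriftMetric(
    p_val=0.05,
    preprocessor=AlibiDetectNLPreprocessor(
        tokenizer_model="bert-base-cased",  # example name from the field description
        n_layers=8,   # default number of hidden layers to embed
        max_len=64,   # required: maximum token sequence length
        batch_size=32,
    ),
)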
30 changes: 30 additions & 0 deletions modyn/config/schema/pipeline/trigger/drift/preprocess/alibi_detect.py
@@ -0,0 +1,30 @@
from collections.abc import Callable
from functools import partial

from alibi_detect.cd.pytorch import preprocess_drift
from alibi_detect.models.pytorch import TransformerEmbedding
from pydantic import Field
from transformers import AutoTokenizer

from modyn.config.schema.base_model import ModynBaseModel


class AlibiDetectNLPreprocessor(ModynBaseModel):
tokenizer_model: str = Field(description="AutoTokenizer pretrained model name, e.g. bert-base-cased.")
n_layers: int = Field(8)
max_len: int = Field(..., description="Maximum length of input token sequences.")
batch_size: int = Field(32, description="Batch size for tokenization.")

def gen_preprocess_fn(self, device: str | None) -> Callable:
tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_model)
emb_type = "hidden_state"
# select the last n_layers hidden states via negative layer indices
layers = [-i for i in range(1, self.n_layers + 1)]

embedding = TransformerEmbedding(self.tokenizer_model, emb_type, layers)
if device:
embedding = embedding.to(device)
embedding = embedding.eval()

return partial(
preprocess_drift, model=embedding, tokenizer=tokenizer, max_len=self.max_len, batch_size=self.batch_size
)
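For usage context: the generated callable is what alibi-detect expects as a preprocess_fn, mapping raw strings to embeddings. A small sketch (this downloads the Hugging Face model on first use):

preprocessor = AlibiDetectNLPreprocessor(
    tokenizer_model="bert-base-cased", n_layers=8, max_len=64, batch_size=32
)
preprocess_fn = preprocessor.gen_preprocess_fn(device=None)
# preprocess_fn can be passed as `preprocess_fn` to detectors such as
# alibi_detect.cd.MMDDrift; here we just call it directly on raw strings.
embeddings = preprocess_fn(["a headline", "another headline"])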
5 changes: 5 additions & 0 deletions modyn/supervisor/internal/triggers/drift/classifier_models/__init__.py
@@ -0,0 +1,5 @@
from modyn.supervisor.internal.triggers.drift.classifier_models.ybnet_classifier import YearbookNetDriftDetector

alibi_classifier_models = {
"ybnet": YearbookNetDriftDetector(3),
}
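The registry above maps the classifier_id values accepted by AlibiDetectClassifierDriftMetric to concrete models. A hedged lookup sketch; the registry's module path is inferred from the import above, and the consuming code in alibi_detector.py is not part of this commit:

from modyn.supervisor.internal.triggers.drift.classifier_models import alibi_classifier_models

classifier_id = "ybnet"  # value carried by AlibiDetectClassifierDriftMetric.classifier_id
model = alibi_classifier_models[classifier_id]  # -> YearbookNetDriftDetector(3)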
36 changes: 36 additions & 0 deletions modyn/supervisor/internal/triggers/drift/classifier_models/ybnet_classifier.py
@@ -0,0 +1,36 @@
import torch
from torch import nn

from modyn.models.coreset_methods_support import CoresetSupportingModule


class YearbookNetDriftDetector(CoresetSupportingModule):
def __init__(self, num_input_channels: int) -> None:
super().__init__()
self.enc = nn.Sequential(
self.conv_block(num_input_channels, 32),
self.conv_block(32, 32),
self.conv_block(32, 32),
self.conv_block(32, 32),
)
self.hid_dim = 32
# Binary classifier for drift detection
# see: https://docs.seldon.io/projects/alibi-detect/en/latest/cd/methods/classifierdrift.html
self.classifier = nn.Sequential(nn.Flatten(), nn.Linear(32, 2))

def conv_block(self, in_channels: int, out_channels: int) -> nn.Module:
return nn.Sequential(
nn.Conv2d(in_channels, out_channels, 3, padding=1),
nn.BatchNorm2d(out_channels),
nn.ReLU(),
nn.MaxPool2d(2),
)

def forward(self, data: torch.Tensor) -> torch.Tensor:
data = self.enc(data)
data = torch.mean(data, dim=(2, 3))
data = self.classifier(data)
return data

def get_last_layer(self) -> nn.Module:
return self.classifier
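For context, a minimal sketch of how a module like this can drive alibi-detect's classifier-based detector; the detector wiring itself is not part of this commit, and the data below is random placeholder input:

import numpy as np
from alibi_detect.cd import ClassifierDrift

model = YearbookNetDriftDetector(num_input_channels=3)

x_ref = np.random.randn(64, 3, 32, 32).astype(np.float32)  # reference window (dummy)
x_cur = np.random.randn(64, 3, 32, 32).astype(np.float32)  # current window (dummy)

cd = ClassifierDrift(x_ref, model, backend="pytorch", p_val=0.05, preds_type="logits")
result = cd.predict(x_cur)
print(result["data"]["is_drift"])  # 1 if the classifier can separate the two windows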