243 changes: 243 additions & 0 deletions benchmark_utils/models.py
@@ -1,4 +1,10 @@
from torch import nn
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm


class ARModel(nn.Module):
@@ -122,3 +128,240 @@ def forward(self, x):
x, (_, _) = self.decoder(x)

return x


class SlidingWindowDataset(Dataset):
def __init__(self, data, window_size):
self.data = data
self.window_size = window_size

def __len__(self):
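# Number of overlapping windows, e.g. 100 samples with window_size=10
# yield 91 windows.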
return len(self.data) - self.window_size + 1

def __getitem__(self, idx):
window = self.data[idx:idx + self.window_size]
return window # Input and target are the same for autoencoder


class Autoencoder(nn.Module):
def __init__(
self,
input_size=32,
hidden_size=32,
latent_size=16,
sliding_window=10
):
super(Autoencoder, self).__init__()

self.sliding_window = sliding_window
self.decision_scores_ = None

# Encoder
self.encoder = nn.Sequential(
nn.Linear(input_size, hidden_size),
nn.ReLU(),
nn.BatchNorm1d(hidden_size),
nn.Linear(hidden_size, latent_size),
nn.ReLU(),
nn.BatchNorm1d(latent_size),
)

# Decoder
self.decoder = nn.Sequential(
nn.Linear(latent_size, hidden_size),
nn.ReLU(),
nn.Linear(hidden_size, input_size),
nn.ReLU(),
)

def forward(self, x):
# Flatten input if needed
x = x.view(x.size(0), -1)

# Encode
encoded = self.encoder(x)

# Decode
decoded = self.decoder(encoded)

return decoded

def encode(self, x):
x = x.view(x.size(0), -1)
return self.encoder(x)

def _create_sliding_windows(self, X):
"""Create sliding windows from input data"""
if isinstance(X, np.ndarray):
X = torch.from_numpy(X).float()

# If X is 1D, reshape to 2D
if X.dim() == 1:
X = X.unsqueeze(1)

windows = []
for i in range(len(X) - self.sliding_window + 1):
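# Each window spans sliding_window consecutive rows of X and is
# flattened to length sliding_window * n_features.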
window = X[i:i + self.sliding_window].flatten()
windows.append(window)

return torch.stack(windows)

def fit(
self,
X,
num_epochs=50,
learning_rate=1e-3,
device="cuda",
batch_size=32
):
"""
Train the autoencoder on the provided data.

Args:
X: Input data tensor or numpy array of shape (n_samples, n_features)
num_epochs: Number of training epochs
learning_rate: Learning rate for optimizer
device: Device to train on ('cuda' or 'cpu')
batch_size: Batch size for training

Returns:
List of training losses per epoch
"""
# Convert to tensor if numpy array
if isinstance(X, np.ndarray):
X = torch.from_numpy(X).float()

# Ensure X is 2D
if X.dim() == 1:
X = X.unsqueeze(1)
if X.dim() == 3:
# Collapse (n_samples, n_timesteps, n_features) into a single column
X = X.view(-1, 1)

# Create sliding windows
windowed_data = self._create_sliding_windows(X)

# Create dataset and dataloader
# window_size=1 since we already created windows
dataset = SlidingWindowDataset(windowed_data, window_size=1)
dataloader = DataLoader(
dataset, batch_size=batch_size, shuffle=True, drop_last=True)

self.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(self.parameters(), lr=learning_rate)

self.train()
losses = []

# Progress bar for epochs
epoch_pbar = tqdm(range(num_epochs), desc="Training", unit="epoch")

for epoch in epoch_pbar:
epoch_loss = 0.0

# Progress bar for batches
batch_pbar = tqdm(
dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

for data in batch_pbar:
# Each dataset item has shape (1, D), so a batch is (batch, 1, D);
# flatten it to (batch, D) so the target matches the reconstruction
# returned by forward().
data = data.to(device).view(data.size(0), -1)

# Forward pass
output = self(data)
loss = criterion(output, data)

# Backward pass
optimizer.zero_grad()
loss.backward()
optimizer.step()

epoch_loss += loss.item()

# Update batch progress bar
batch_pbar.set_postfix({"Batch Loss": f"{loss.item():.4f}"})

avg_loss = epoch_loss / len(dataloader)
losses.append(avg_loss)

# Update epoch progress bar
epoch_pbar.set_postfix({"Avg Loss": f"{avg_loss:.4f}"})

return losses

def predict(self, X_test, X_dirty=None, device="cuda"):
"""
Predict anomaly scores for time series data.

Args:
X_test: Test data for reconstruction
X_dirty: Original dirty data (if None, uses X_test)
device: Device to run inference on

Returns:
Reconstructed windows as a numpy array; also sets the
decision_scores_ attribute with per-sample anomaly scores
"""
self.eval()
self.to(device)

# Create sliding windows for test data
if isinstance(X_test, np.ndarray):
X_test = torch.from_numpy(X_test).float()

windowed_test = self._create_sliding_windows(X_test)
windowed_test = windowed_test.to(device)

with torch.no_grad():
test_predict = self(windowed_test).cpu().numpy()

# Calculate MAE loss
test_mae_loss = np.mean(
np.abs(test_predict - windowed_test.cpu().numpy()), axis=1)

# Normalize MAE loss
nor_test_mae_loss = MinMaxScaler().fit_transform(
test_mae_loss.reshape(-1, 1)).ravel()

# Use X_dirty if provided, otherwise use original X_test
if X_dirty is None:
if isinstance(X_test, torch.Tensor):
X_dirty = X_test.cpu().numpy()
else:
X_dirty = X_test

# Initialize score array
score = np.zeros(len(X_dirty))

# Map window scores back to sample positions: each window's score goes
# to its center sample; the borders reuse the nearest window's score.
half_window = self.sliding_window // 2
score[half_window:half_window + len(test_mae_loss)] = nor_test_mae_loss
score[:half_window] = nor_test_mae_loss[0]
score[half_window + len(test_mae_loss):] = nor_test_mae_loss[-1]

# Store decision scores
self.decision_scores_ = score

return test_predict

def encode_data(self, x, device="cuda"):
"""
Encode input data to latent representation.

Args:
x: Input tensor or numpy array
device: Device to run inference on

Returns:
Encoded data as numpy array
"""
self.eval()
self.to(device)

# Convert to tensor if numpy array
if isinstance(x, np.ndarray):
x = torch.from_numpy(x).float()
x = x.to(device)
with torch.no_grad():
encoded = self.encode(x)
return encoded.cpu().numpy()
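For reviewers, a minimal usage sketch of the new Autoencoder API on a univariate series (the variable names and hyperparameters below are illustrative, not part of the diff). Note that input_size must equal sliding_window times the number of features per sample:

# Hypothetical usage, not part of this PR: univariate series, window of 10
model = Autoencoder(input_size=10, hidden_size=32, latent_size=16,
                    sliding_window=10)
losses = model.fit(series_train, num_epochs=20, device="cpu", batch_size=32)
model.predict(series_test, device="cpu")
scores = model.decision_scores_  # per-sample anomaly scores scaled to [0, 1]

fit returns the per-epoch training losses, while predict returns the reconstructed windows and stores the per-sample scores in decision_scores_.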
124 changes: 124 additions & 0 deletions datasets/daphnet.py
@@ -0,0 +1,124 @@
from benchopt import BaseDataset, safe_import_context, config

with safe_import_context() as import_ctx:
from pathlib import Path
import numpy as np
import pandas as pd

PATH = config.get_data_path("DAPHNET")


def load_data(db_path, record_ids=None):
"""
Load data from the database path for specified record IDs.

Args:
db_path: Path to the database directory
record_ids: List of record IDs to load.
If None, loads all available records.

Returns:
tuple: (X, y_true) where:
- X: numpy array of shape (num_records, num_samples)
- y_true: numpy array of shape (num_records, num_samples)
"""
db_path = Path(db_path)

if record_ids is None:
# Discover record ids from files named <record_id>.test.csv@<n>.out
record_files = list(db_path.glob("*.test.csv@*.out"))
record_ids = sorted({f.name.split(".test.csv@")[0] for f in record_files})

data_list = []
labels_list = []
for record_id in record_ids:
# Find all files matching the pattern for the given record_id
record_files = list(db_path.glob(f"{record_id}.test.csv@*.out"))

if not record_files:
print(f"No record files found for ID: {record_id}")
continue

for record_file in record_files:
print(f"Loading record file: {record_file}")
# Load the record data
record_data = pd.read_csv(
record_file, header=None).dropna().to_numpy()
# Assuming first column is the data, second column is labels
if record_data.shape[1] >= 2:
data_list.append(record_data[:, 0].astype(float))
labels_list.append(record_data[:, 1].astype(int))
else:
print(
f"Insufficient columns for record file {record_file.name}")

if not data_list:
raise ValueError("No valid data found")

# Find maximum length for padding
max_length = max(len(data) for data in data_list)

# Pad all sequences to the same length
padded_data = []
padded_labels = []
for data, labels in zip(data_list, labels_list):
if len(data) < max_length:
# Pad with last value for data and 0 for labels
padded_data.append(
np.pad(
data,
(0, max_length - len(data)),
mode="constant",
constant_values=data[-1],
)
)
padded_labels.append(
np.pad(
labels,
(0, max_length - len(labels)),
mode="constant",
constant_values=0,
)
)
else:
padded_data.append(data[:max_length])
padded_labels.append(labels[:max_length])

return np.array(padded_data), np.array(padded_labels)


class Dataset(BaseDataset):
name = "DAPHNET"

parameters = {
"recordings_id": [["S01R02E0"]],
"debug": [False],
}

def get_data(self):
"""Load the DAPHNET dataset."""

# X shape (n_recordings, n_samples)
# y shape (n_recordings, n_samples)
X, y_true = load_data(PATH, self.recordings_id)

X_test = X.copy()
y_test = y_true.copy()

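# Keep the first 10% of each recording as the (unsupervised) training split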
X_train = X[:, :int(X.shape[1] * 0.1)]

if self.debug:
X_train = X_train[:, :1000]
X_test = X_test[:, :1000]
y_test = y_test[:, :1000]

# Reshaping data to (n_samples, n_features)
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

return dict(
X_train=X_train,
y_test=y_test,
X_test=X_test
)
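As a reference, a small sketch of how the loader is expected to be used, assuming the DAPHNET data directory contains files named like S01R02E0.test.csv@<n>.out with two columns (value, label); the record id and the shape comments below are illustrative:

# Hypothetical check, not part of this PR: load one recording directly
X, y_true = load_data(PATH, record_ids=["S01R02E0"])
# X:      (n_recordings, n_samples) float values, padded to the longest file
# y_true: (n_recordings, n_samples) integer labels, padded with zeros
# Dataset.get_data() then keeps the first 10% of each recording as X_train
# and reshapes X_train, X_test and y_test to (n_samples, 1).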