Update for running code: lb2 (openproblems-bio#23)
* update lb2_data

* hardcoded dims

* refactor config & scripts

* add method to wf

* clean config

---------

Co-authored-by: Robrecht Cannoodt <rcannood@gmail.com>
HelloWorldLTY and rcannood authored May 18, 2024
1 parent ba4429d commit b4afd62
Showing 9 changed files with 1,064 additions and 1 deletion.
55 changes: 55 additions & 0 deletions src/task/methods/transformer_ensemble/config.vsh.yaml
@@ -0,0 +1,55 @@
__merge__: ../../api/comp_method.yaml

functionality:
  name: transformer_ensemble
  info:
    label: Transformer Ensemble
    rank: 2
    summary: An ensemble of four transformer models, trained on diverse feature sets, with a cluster-based sampling strategy and robust validation for optimal performance.
    description: |
      This method employs an ensemble of four transformer models,
      each with different weights and trained on slightly varying feature sets.
      The feature engineering process involved one-hot encoding of categorical labels,
      target encoding using mean and standard deviation, and enriching the feature set
      with the standard deviation of target variables. Additionally, the dataset was
      carefully examined to ensure data cleanliness. A sophisticated sampling strategy
      based on K-Means clustering was employed to partition the data into training and
      validation sets, ensuring a representative distribution. The model architecture
      leveraged sparse and dense feature encoding, along with a transformer for effective
      learning.
    documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458738
    repository_url: https://github.com/Eliorkalfon/single_cell_pb
  arguments:
    - name: --num_train_epochs
      type: integer
      default: 20000
      description: "Number of training epochs."
      info:
        test_default: 10
  resources:
    - type: python_script
      path: script.py
    - path: models.py
    - path: utils.py
    - path: train.py
    - path: predict.py
    - path: seq.py

platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4
    setup:
      - type: python
        packages:
          - fastparquet
          - pyarrow
          - pandas~=2.0.3
          - scikit-learn~=1.0.1
          - tqdm~=4.66.1
          - numpy~=1.23
          - matplotlib~=3.5.0
          - PyYAML~=6.0.1
          - lion-pytorch
  - type: nextflow
    directives:
      label: [ midtime, highmem, highcpu, gpu ]
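
The description above mentions a K-Means based sampling strategy for splitting training and validation data. The split itself is implemented in train.py, which is not among the files shown on this page; the following is only a minimal sketch of the idea, with hypothetical helper and parameter names.

# Minimal sketch of a cluster-based train/validation split (assumption: the
# actual logic lives in train.py, which is not shown in this commit view).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

def kmeans_split(features, targets, n_clusters=10, val_fraction=0.2, seed=0):
    # Cluster the samples, then stratify the split on the cluster labels so the
    # validation set covers every region of the feature space.
    cluster_labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=seed).fit_predict(features)
    train_idx, val_idx = train_test_split(
        np.arange(len(features)),
        test_size=val_fraction,
        stratify=cluster_labels,
        random_state=seed,
    )
    return (features[train_idx], targets[train_idx]), (features[val_idx], targets[val_idx])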
128 changes: 128 additions & 0 deletions src/task/methods/transformer_ensemble/models.py
@@ -0,0 +1,128 @@
# Model Architecture
import torch
import torch.nn as nn
import torch.optim


class CustomTransformer(nn.Module):
    def __init__(self, num_features, num_labels, d_model=128, num_heads=8, num_layers=6):  # num_heads=8
        super(CustomTransformer, self).__init__()
        self.embedding = nn.Linear(num_features, d_model)
        # Embedding layer for sparse features
        # self.embedding = nn.Embedding(num_features, d_model)

        # self.norm = nn.BatchNorm1d(d_model, affine=True)
        self.norm = nn.LayerNorm(d_model)
        # self.transformer = nn.Transformer(d_model=d_model, nhead=num_heads, num_encoder_layers=num_layers,
        #                                   dropout=0.1, device='cuda')
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, device='cuda', dropout=0.3,
                                       activation=nn.GELU(),
                                       batch_first=True),
            enable_nested_tensor=True, num_layers=num_layers
        )
        # Dropout layer for regularization
        # self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(d_model, num_labels)

    def forward(self, x):
        x = self.embedding(x)

        # x = (self.transformer(x,x))
        x = self.transformer(x)
        x = self.norm(x)
        # x = self.fc(self.dropout(x))
        x = self.fc(x)
        return x


class CustomTransformer_mean_std(nn.Module):  # mean + std
    def __init__(self, num_features, num_targets, num_labels, d_model=128, num_heads=8, num_layers=6, dropout=0.3):
        super(CustomTransformer_mean_std, self).__init__()
        self.num_target_encodings = num_targets * 4
        self.num_sparse_features = num_features - self.num_target_encodings

        self.sparse_feature_embedding = nn.Linear(self.num_sparse_features, d_model)
        self.target_encoding_embedding = nn.Linear(self.num_target_encodings, d_model)
        self.norm = nn.LayerNorm(d_model)

        self.concatenation_layer = nn.Linear(2 * d_model, d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout, activation=nn.GELU(),
                                       batch_first=True),
            num_layers=num_layers
        )
        self.fc = nn.Linear(d_model, num_labels)

    def forward(self, x):
        sparse_features = x[:, :self.num_sparse_features]
        target_encodings = x[:, self.num_sparse_features:]

        sparse_features = self.sparse_feature_embedding(sparse_features)
        target_encodings = self.target_encoding_embedding(target_encodings)

        combined_features = torch.cat((sparse_features, target_encodings), dim=1)
        combined_features = self.concatenation_layer(combined_features)
        combined_features = self.norm(combined_features)

        x = self.transformer(combined_features)
        x = self.norm(x)

        x = self.fc(x)
        return x

class CustomTransformer_mean(nn.Module):  # mean only
    def __init__(self, num_features, num_targets, num_labels, d_model=128, num_heads=8, num_layers=6, dropout=0.3):
        super(CustomTransformer_mean, self).__init__()
        self.num_target_encodings = num_targets * 2
        self.num_sparse_features = num_features - self.num_target_encodings

        self.sparse_feature_embedding = nn.Linear(self.num_sparse_features, d_model)
        self.target_encoding_embedding = nn.Linear(self.num_target_encodings, d_model)
        self.norm = nn.LayerNorm(d_model)

        self.concatenation_layer = nn.Linear(2 * d_model, d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout, activation=nn.GELU(),
                                       batch_first=True),
            num_layers=num_layers
        )
        self.fc = nn.Linear(d_model, num_labels)

    def forward(self, x):
        sparse_features = x[:, :self.num_sparse_features]
        target_encodings = x[:, self.num_sparse_features:]

        sparse_features = self.sparse_feature_embedding(sparse_features)
        target_encodings = self.target_encoding_embedding(target_encodings)

        combined_features = torch.cat((sparse_features, target_encodings), dim=1)
        combined_features = self.concatenation_layer(combined_features)
        combined_features = self.norm(combined_features)

        x = self.transformer(combined_features)
        x = self.norm(x)

        x = self.fc(x)
        return x

class CustomMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=6, dropout=0.3, layer_norm=True):
        super(CustomMLP, self).__init__()
        layers = []

        for _ in range(num_layers):
            if layer_norm:
                layers.append(nn.LayerNorm(input_dim))
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            if dropout > 0:
                layers.append(nn.Dropout(p=dropout))
            input_dim = hidden_dim

        self.model = nn.Sequential(*layers)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.model(x)
        x = self.fc(x)
        return x
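
As a rough usage sketch (dimensions below are made up for illustration): the *_mean_std variant assumes the last num_targets * 4 columns of the input are the target-encoding statistics and everything before them is the sparse one-hot block.

# Illustrative only; feature and target counts are hypothetical.
import torch

num_sparse, num_targets = 152, 512
num_features = num_sparse + num_targets * 4   # one-hot block + mean/std target encodings
model = CustomTransformer_mean_std(num_features=num_features, num_targets=num_targets, num_labels=num_targets)
x = torch.randn(8, num_features)              # a mini-batch of 8 augmented feature vectors
y = model(x)                                  # -> tensor of shape (8, num_targets)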
84 changes: 84 additions & 0 deletions src/task/methods/transformer_ensemble/predict.py
@@ -0,0 +1,84 @@
import torch
import torch.optim
import copy
import pandas as pd

from utils import load_transformer_model, prepare_augmented_data, prepare_augmented_data_mean_only


@torch.no_grad()
def predict_test(par, data, models, n_components_list, d_list, batch_size, device='cpu', outname='traineddata'):
    num_samples = len(data)
    de_train = pd.read_parquet(par["de_train"])
    id_map = pd.read_csv(par["id_map"])
    gene_names = [col for col in de_train.columns if col not in {"cell_type", "sm_name", "sm_lincs_id", "SMILES", "split", "control", "index"}]

    for i, n_components in enumerate(n_components_list):
        for j, d_model in enumerate(d_list):
            combined_outputs = []
            label_reducer, scaler, transformer_model = models[f'{n_components},{d_model}']
            transformer_model.eval()
            for i in range(0, num_samples, batch_size):
                batch_unseen_data = data[i:i + batch_size]
                transformed_data = transformer_model(batch_unseen_data)
                if scaler:
                    transformed_data = torch.tensor(scaler.inverse_transform(
                        label_reducer.inverse_transform(transformed_data.cpu().detach().numpy()))).to(device)
                # print(transformed_data.shape)
                combined_outputs.append(transformed_data)

            # Stack the combined outputs
            combined_outputs = torch.vstack(combined_outputs)

            submission_df = pd.DataFrame(
                combined_outputs.cpu().detach().numpy(),
                index=id_map["id"],
                columns=gene_names
            ).reset_index()
            submission_df.to_csv(f"{outname}_output.csv")
            # only one d_model and n_components combination is run at a time
            return


def predict_main(
        par,
        n_components_list,
        model_dir,
        d_models_list=[128],
        batch_size=32,
        device='cpu',
        mean_std='mean_std',
        uncommon=False,
):
    data_file = par['de_train']
    id_map_file = par['id_map']

    # Prepare augmented data
    if mean_std == "mean_std":
        one_hot_encode_features, targets, one_hot_test = prepare_augmented_data(
            data_file=data_file,
            id_map_file=id_map_file,
            uncommon=uncommon
        )
    else:
        one_hot_encode_features, targets, one_hot_test = prepare_augmented_data_mean_only(
            data_file=data_file,
            id_map_file=id_map_file
        )
    unseen_data = torch.tensor(one_hot_test, dtype=torch.float32).to(device)  # Replace X_unseen with your new data
    transformer_models = {}
    for n_components in n_components_list:
        for d_model in d_models_list:
            label_reducer, scaler, transformer_model = load_transformer_model(
                n_components,
                input_features=one_hot_encode_features.shape[1],
                num_targets=targets.shape[1],
                d_model=d_model,
                models_folder=f'{model_dir}',
                device=device,
                mean_std=mean_std
            )
            transformer_model.eval()
            transformer_models[f'{n_components},{d_model}'] = (
                copy.deepcopy(label_reducer), copy.deepcopy(scaler), copy.deepcopy(transformer_model))
    predict_test(par, unseen_data, transformer_models, n_components_list, d_models_list, batch_size, device=device, outname=model_dir)
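
load_transformer_model and prepare_augmented_data come from utils.py, which is not shown on this page. Judging from predict_test, the returned label_reducer and scaler map model outputs back to the original target space; a minimal sketch of that round trip, assuming scikit-learn objects, would look like this.

# Hypothetical sketch of the target-space round trip implied by predict_test;
# the concrete objects returned by load_transformer_model are assumptions here.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

targets = np.random.rand(64, 512)                              # hypothetical target matrix
scaler = StandardScaler().fit(targets)
label_reducer = TruncatedSVD(n_components=50).fit(scaler.transform(targets))

reduced = label_reducer.transform(scaler.transform(targets))   # what the model would predict
restored = scaler.inverse_transform(label_reducer.inverse_transform(reduced))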

87 changes: 87 additions & 0 deletions src/task/methods/transformer_ensemble/script.py
@@ -0,0 +1,87 @@
import os
import sys
import tempfile
import shutil

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## VIASH START
par = {
    "de_train": "resources/neurips-2023-data/de_train.parquet",
    "de_test": "resources/neurips-2023-data/de_test.parquet",
    "id_map": "resources/neurips-2023-data/id_map.csv",
    "output": "output.parquet",
}
meta = {
    "resources_dir": "src/task/methods/lb2",
}
## VIASH END

sys.path.append(meta['resources_dir'])

from train import train_main
from predict import predict_main
from seq import seq_main

# determine n_components_list
import pandas as pd
de_train = pd.read_parquet(par["de_train"])
de_train.drop(columns=["cell_type", "sm_name", "sm_lincs_id", "SMILES", "split", "control"], inplace=True)
n_components_list = [de_train.shape[1]]
del de_train

# determine model dirs
output_model = par.get("output_model") or tempfile.TemporaryDirectory(dir=meta["temp_dir"]).name
if not os.path.exists(output_model):
    os.makedirs(output_model, exist_ok=True)
if not par.get("output_model"):
    import atexit
    atexit.register(lambda: shutil.rmtree(output_model))

# train and predict models
argsets = [
    {
        "dir": f"{output_model}/trained_models_kmeans_mean_std",
        "mean_std": "mean_std",
        "uncommon": False,
        "sampling_strategy": "k-means",
        "weight": .4
    },
    {
        "dir": f"{output_model}/trained_models_kmeans_mean_std_trueuncommon",
        "mean_std": "mean_std",
        "uncommon": True,
        "sampling_strategy": "k-means",
        "weight": .1
    },
    {
        "dir": f"{output_model}/trained_models_kmeans_mean",
        "mean_std": "mean",
        "uncommon": False,
        "sampling_strategy": "k-means",
        "weight": .2
    },
    {
        "dir": f"{output_model}/trained_models_nonkmeans_mean",
        "mean_std": "mean",
        "uncommon": False,
        "sampling_strategy": "random",
        "weight": .3
    }
]

print(f"Train and predict models", flush=True)
for argset in argsets:
print(f"Generate model {argset['dir']}", flush=True)
train_main(par, n_components_list, argset['dir'], mean_std=argset['mean_std'], uncommon=argset['uncommon'], sampling_strategy=argset['sampling_strategy'])

print(f"Predict model {argset['dir']}", flush=True)
predict_main(par, n_components_list, argset['dir'], mean_std=argset['mean_std'], uncommon=argset['uncommon'])

print(f"Combine predictions", flush=True)
seq_main(
par,
model_dirs=[argset['dir'] for argset in argsets],
weights=[argset['weight'] for argset in argsets],
)
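
seq_main is defined in seq.py, which is also not shown on this page. The weights above (0.4, 0.1, 0.2, 0.3) sum to 1.0, so the combination is presumably a weighted average of the four per-model prediction matrices; a hedged sketch of that step:

# Hypothetical sketch of the weighted combination; the real implementation is
# in seq.py and may differ (file paths and column handling are assumptions).
import pandas as pd

def combine_predictions(csv_paths, weights):
    # Weighted sum of per-model prediction matrices indexed by id; with weights
    # that sum to 1.0 this is a weighted average.
    frames = [pd.read_csv(path, index_col="id") for path in csv_paths]
    return sum(weight * frame for weight, frame in zip(weights, frames))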