Skip to content

Commit

Permalink
Attempt fixing the transformer_ensemble method (openproblems-bio#64)
Browse files Browse the repository at this point in the history
* try not letting the weights sum up to one

* fix script

* add script

* add more values to args, directly use output_model
  • Loading branch information
rcannood authored Jun 4, 2024
1 parent 9313317 commit ca5de78
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 55 deletions.
28 changes: 28 additions & 0 deletions scripts/run_benchmark_tw_traens.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash

# todo: remove this before merging PR #64
#
# Launch the run_benchmark workflow through the `tw` CLI, with `method_ids`
# listing only the transformer_ensemble method. Parameters are written to
# /tmp/params.yaml and handed to `tw launch` via --params-file.

# Abort on the first failing command, on use of an unset variable, or on a
# failed pipeline stage. Without this, a failed write of the params file would
# still be followed by `tw launch`, possibly picking up a stale
# /tmp/params.yaml left behind by an earlier run.
set -euo pipefail

# Timestamped run id so that each launch gets its own publish directory.
RUN_ID="traens_$(date +%Y-%m-%d_%H-%M-%S)"
resources_dir="s3://openproblems-bio/public/neurips-2023-competition/workflow-resources"
publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/results/${RUN_ID}"

# The heredoc delimiter is intentionally unquoted: $resources_dir and
# $publish_dir are expanded by the shell before the file is written.
cat > /tmp/params.yaml << HERE
param_list:
- id: neurips-2023-data
de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad"
de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
id_map: "$resources_dir/neurips-2023-data/id_map.csv"
layer: clipped_sign_log10_pval
method_ids: [transformer_ensemble]
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

# NOTE(review): --revision points at the fix_trafo_ens_build branch; presumably
# this has to change (or the script be removed) once the PR is merged — confirm.
tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \
  --revision fix_trafo_ens_build \
  --pull-latest \
  --main-script target/nextflow/workflows/run_benchmark/main.nf \
  --workspace 53907369739130 \
  --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
  --params-file /tmp/params.yaml \
  --config src/common/nextflow_helpers/labels_tw.config
13 changes: 12 additions & 1 deletion src/task/methods/transformer_ensemble/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,24 @@ functionality:
description: "Number of training epochs."
info:
test_default: 10
- name: --d_model
type: integer
default: 128
description: "Dimensionality of the model."
- name: --batch_size
type: integer
default: 32
description: "Batch size."
- name: --early_stopping
type: integer
default: 5000
description: "Number of epochs to wait for early stopping."
resources:
- type: python_script
path: script.py
- path: models.py
- path: utils.py
- path: train.py
- path: ../../utils/anndata_to_dataframe.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4
Expand Down
81 changes: 47 additions & 34 deletions src/task/methods/transformer_ensemble/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,21 @@
import anndata as ad
import sys
import torch
import copy
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## VIASH START
par = {
"de_train_h5ad": "resources/neurips-2023-data/de_train.h5ad",
"de_train": "resources/neurips-2023-data/de_train.h5ad",
"id_map": "resources/neurips-2023-data/id_map.csv",
"output": "output.h5ad",
"de_train_h5ad": "resources/neurips-2023-kaggle/de_train.h5ad",
"id_map": "resources/neurips-2023-kaggle/id_map.csv",
"output": "output/prediction.h5ad",
"output_model": "output/model/",
"num_train_epochs": 10,
"layer": "clipped_sign_log10_pval"
"early_stopping": 5000,
"batch_size": 64,
"d_model": 128,
"layer": "sign_log10_pval"
}
meta = {
"resources_dir": "src/task/methods/transformer_ensemble",
Expand All @@ -22,55 +25,55 @@

sys.path.append(meta["resources_dir"])

# Fixed training params
d_model = 128
batch_size = 32
early_stopping = 5000

from anndata_to_dataframe import anndata_to_dataframe
from utils import prepare_augmented_data, prepare_augmented_data_mean_only
from train import train_k_means_strategy, train_non_k_means_strategy

# create output model directory if need be
if par["output_model"]:
os.makedirs(par["output_model"], exist_ok=True)

# read data
de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"])
de_train = anndata_to_dataframe(de_train_h5ad, par["layer"])
id_map = pd.read_csv(par["id_map"])

# convert .obs categoricals to string for ease of use
for col in de_train_h5ad.obs.select_dtypes(include=["category"]).columns:
de_train_h5ad.obs[col] = de_train_h5ad.obs[col].astype(str)
# reset index
de_train_h5ad.obs.reset_index(drop=True, inplace=True)

# determine other variables
gene_names = list(de_train_h5ad.var_names)
n_components = len(gene_names)

# train and predict models
# note, the weights intentionally don't add up to one
argsets = [
# Note by author - weight_df1: 0.5 (utilizing std, mean, and clustering sampling, yielding 0.551)
{
"name": "weight_df1",
"mean_std": "mean_std",
"uncommon": False,
"sampling_strategy": "random",
"weight": 0.5,
},
# Note by author - weight_df2: 0.25 (excluding uncommon elements, resulting in 0.559)
{
"name": "weight_df2",
"mean_std": "mean_std",
"uncommon": True,
"sampling_strategy": "random",
"weight": 0.25,
},
# Note by author - weight_df3: 0.25 (leveraging clustering sampling, achieving 0.575)
{
"name": "weight_df3",
"mean_std": "mean_std",
"uncommon": False, # should this be set to False or True?
"sampling_strategy": "k-means",
"weight": 0.25,
},
# Note by author - weight_df4: 0.3 (incorporating mean, random sampling, and excluding std, attaining 0.554)
{
"name": "weight_df4",
"mean_std": "mean",
"uncommon": False, # should this be set to False or True?
"uncommon": False,
"sampling_strategy": "random",
"weight": 0.3,
}
Expand All @@ -80,19 +83,22 @@
predictions = []

print(f"Train and predict models", flush=True)
for argset in argsets:
print(f"Train and predict model {argset['name']}", flush=True)
for i, argset in enumerate(argsets):
print(f"Train and predict model {i+1}/{len(argsets)}", flush=True)

print(f"> Prepare augmented data", flush=True)
if argset["mean_std"] == "mean_std":
one_hot_encode_features, targets, one_hot_test = prepare_augmented_data(
de_train=copy.deepcopy(de_train),
id_map=copy.deepcopy(id_map),
de_train_h5ad=de_train_h5ad,
id_map=id_map,
layer=par["layer"],
uncommon=argset["uncommon"],
)
elif argset["mean_std"] == "mean":
one_hot_encode_features, targets, one_hot_test = (
prepare_augmented_data_mean_only(de_train=de_train, id_map=id_map)
one_hot_encode_features, targets, one_hot_test = prepare_augmented_data_mean_only(
de_train_h5ad=de_train_h5ad,
id_map=id_map,
layer=par["layer"],
)
else:
raise ValueError("Invalid mean_std argument")
Expand All @@ -101,24 +107,24 @@
if argset["sampling_strategy"] == "k-means":
label_reducer, scaler, transformer_model = train_k_means_strategy(
n_components=n_components,
d_model=d_model,
d_model=par["d_model"],
one_hot_encode_features=one_hot_encode_features,
targets=targets,
num_epochs=par["num_train_epochs"],
early_stopping=early_stopping,
batch_size=batch_size,
early_stopping=par["early_stopping"],
batch_size=par["batch_size"],
device=device,
mean_std=argset["mean_std"],
)
elif argset["sampling_strategy"] == "random":
label_reducer, scaler, transformer_model = train_non_k_means_strategy(
n_components=n_components,
d_model=d_model,
d_model=par["d_model"],
one_hot_encode_features=one_hot_encode_features,
targets=targets,
num_epochs=par["num_train_epochs"],
early_stopping=early_stopping,
batch_size=batch_size,
early_stopping=par["early_stopping"],
batch_size=par["batch_size"],
device=device,
mean_std=argset["mean_std"],
)
Expand All @@ -138,8 +144,8 @@
print(f"Predict on test data", flush=True)
num_samples = len(unseen_data)
transformed_data = []
for i in range(0, num_samples, batch_size):
batch_result = transformer_model(unseen_data[i : i + batch_size])
for i in range(0, num_samples, par["batch_size"]):
batch_result = transformer_model(unseen_data[i : i + par["batch_size"]])
transformed_data.append(batch_result)
transformed_data = torch.vstack(transformed_data)
if scaler:
Expand All @@ -150,13 +156,20 @@
).to(device)

pred = transformed_data.cpu().detach().numpy()

if par["output_model"]:
model_path = f"{par['output_model']}/model_{i}.pt"
torch.save(transformer_model.state_dict(), model_path)
pred_path = f"{par['output_model']}/pred_{i}.csv"
pd.DataFrame(pred).to_csv(pred_path, index=False)

predictions.append(pred)

print(f"Combine predictions", flush=True)
# compute weighted sum
sum_weights = sum([argset["weight"] for argset in argsets])
# note, the weights intentionally don't add up to one
weighted_pred = sum([
pred * argset["weight"] / sum_weights
pred * argset["weight"]
for argset, pred in zip(argsets, predictions)
])

Expand Down
45 changes: 25 additions & 20 deletions src/task/methods/transformer_ensemble/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,29 +22,33 @@ def reduce_labels(Y, n_components):


def prepare_augmented_data(
de_train,
de_train_h5ad,
id_map,
layer,
uncommon=False
):
de_train = de_train.drop(columns = ['split'])
xlist = ['cell_type', 'sm_name']
_ylist = ['cell_type', 'sm_name', 'sm_lincs_id', 'SMILES', 'control']
y = de_train.drop(columns=_ylist)
y = pd.DataFrame(
de_train_h5ad.layers[layer],
columns=de_train_h5ad.var_names,
index=de_train_h5ad.obs.index
)

# Combine train and test data for one-hot encoding
combined_data = pd.concat([de_train[xlist], id_map[xlist]])
combined_data = pd.concat([de_train_h5ad.obs[xlist], id_map[xlist]])

dum_data = pd.get_dummies(combined_data, columns=xlist)

# Split the combined data back into train and test
train = dum_data.iloc[:len(de_train)]
test = dum_data.iloc[len(de_train):]
train = dum_data.iloc[:de_train_h5ad.n_obs]
test = dum_data.iloc[de_train_h5ad.n_obs:]
if uncommon:
uncommon = [f for f in train if f not in test]
X = train.drop(columns=uncommon)
X = train
de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))]
de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))]

de_cell_type = pd.concat([de_train_h5ad.obs[['cell_type']], y], axis=1)
de_sm_name = pd.concat([de_train_h5ad.obs[['sm_name']], y], axis=1)

mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
std_cell_type = de_cell_type.groupby('cell_type').std().reset_index().fillna(0)
Expand Down Expand Up @@ -107,29 +111,30 @@ def prepare_augmented_data(


def prepare_augmented_data_mean_only(
de_train,
de_train_h5ad,
layer,
id_map
):
de_train = de_train.drop(columns = ['split'])
xlist = ['cell_type', 'sm_name']
_ylist = ['cell_type', 'sm_name', 'sm_lincs_id', 'SMILES', 'control']
y = de_train.drop(columns=_ylist)
# train = pd.get_dummies(de_train[xlist], columns=xlist)
# test = pd.get_dummies(id_map[xlist], columns=xlist)
y = pd.DataFrame(
de_train_h5ad.layers[layer],
columns=de_train_h5ad.var_names,
index=de_train_h5ad.obs.index
)
# Combine train and test data for one-hot encoding
combined_data = pd.concat([de_train[xlist], id_map[xlist]])
combined_data = pd.concat([de_train_h5ad.obs[xlist], id_map[xlist]])

dum_data = pd.get_dummies(combined_data, columns=xlist)

# Split the combined data back into train and test
train = dum_data.iloc[:len(de_train)]
test = dum_data.iloc[len(de_train):]
train = dum_data.iloc[:de_train_h5ad.n_obs]
test = dum_data.iloc[de_train_h5ad.n_obs:]
# uncommon = [f for f in train if f not in test]
# X = train.drop(columns=uncommon)

X = train
de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))]
de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))]
de_cell_type = pd.concat([de_train_h5ad.obs[['cell_type']], y], axis=1)
de_sm_name = pd.concat([de_train_h5ad.obs[['sm_name']], y], axis=1)
mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()
rows = []
Expand Down

0 comments on commit ca5de78

Please sign in to comment.