Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding AmbrosM's solution to methods #31

Merged
merged 37 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
278da98
converted AmbrosM's notebook into viash script
andrew-benz May 15, 2024
491e8ee
fixed parameters
andrew-benz May 15, 2024
1938415
modified config
andrew-benz May 16, 2024
5cb25bf
method now runs successfully (in native mode with correct packages in…
andrew-benz May 16, 2024
4bc7739
Merge remote-tracking branch 'origin/main' into ambros_m_judges_third…
rcannood May 17, 2024
700a3a4
remove ipynbcheckpoints
rcannood May 17, 2024
4e563ab
add ipynb_checkpoints to gitignore
rcannood May 17, 2024
f42ed40
remove ipynbcheckpoints
rcannood May 17, 2024
44fdce0
Merge remote-tracking branch 'origin/main' into ambros_m_judges_third…
rcannood May 18, 2024
c1e5427
Add longer description
andrew-benz May 18, 2024
05ea0c8
Clean up viash arguments
andrew-benz May 18, 2024
4dfa504
Fix Viash start/end tags
andrew-benz May 18, 2024
095800f
fixed script parameters to match task/api/comp_method.yaml
andrew-benz May 18, 2024
30333a0
modified config
andrew-benz May 18, 2024
898420c
bug fixes
andrew-benz May 20, 2024
7379998
switching to nvidia docker image
andrew-benz May 20, 2024
5a2d80d
changed nvidia docker container version
andrew-benz May 20, 2024
bec35ae
adding colorama to list of required packages
andrew-benz May 20, 2024
db1dd4a
using older version of nvidia pytorch package that uses CUDA 11
andrew-benz May 20, 2024
138c705
trying removing cupy-cuda11x==12.2.0 from package requirements to avo…
andrew-benz May 20, 2024
9dc8e7a
trying to fix cupy to cuda 11.8
andrew-benz May 20, 2024
2cfb118
trying now to install just cuda11x (without pinning version)
andrew-benz May 20, 2024
fa0a410
fix config
rcannood May 20, 2024
1bd6e3a
Merge branch 'ambros_m_judges_third_place' of github.com:openproblems…
rcannood May 20, 2024
68f40ab
fix config
rcannood May 20, 2024
7ff1a0f
remove unused imports
rcannood May 20, 2024
35296e6
minor changes to script
rcannood May 20, 2024
cef37f3
move helper functions to separate file
rcannood May 20, 2024
83f6d9d
fix missing variables
rcannood May 20, 2024
7fdd3aa
Merge remote-tracking branch 'origin/main' into ambros_m_judges_third…
rcannood May 20, 2024
bc5cd75
remove cross validation
rcannood May 20, 2024
5a1948a
rename method
rcannood May 20, 2024
b09cdff
add to wf
rcannood May 20, 2024
41927bb
remove gpus from config
rcannood May 20, 2024
70e8327
move imports to individual models
rcannood May 20, 2024
c9b1fed
Delete .attach_pid21979
rcannood May 20, 2024
5684429
fix path in resources dir
rcannood May 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix missing variables
  • Loading branch information
rcannood committed May 20, 2024
commit 83f6d9d7e05c1ca2dad02c5740d2efdbd442beda
58 changes: 28 additions & 30 deletions src/task/methods/ambros_m_judges_third_place/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ def t_score_to_de(t_score):
p_value = p_value.clip(1e-180, None)
return - np.log10(p_value) * np.sign(t_score)


def fit_predict_py_boost(de_tr, id_map):
def fit_predict_py_boost(de_tr, id_map, train_sm_names, genes, cell_type_ratio):
"""Fit the model and predict.

Parameters:
Expand Down Expand Up @@ -84,20 +83,20 @@ def fit_predict_py_boost(de_tr, id_map):
# sm_mean has shape (143, 18211) and contains the means of 3 or 4 values each
sm_mean = Yt_train_red[X_train_categorical['cell_type'].isin(cell_types_tr)].groupby('sm_name').mean()
X_train_encoded = np.hstack([ct_mean.reindex(X_train_categorical['cell_type']).values,
sm_mean.reindex(X_train_categorical['sm_name']).values])
sm_mean.reindex(X_train_categorical['sm_name']).values])
X_test_encoded = np.hstack([ct_mean.reindex(id_map['cell_type']).values,
sm_mean.reindex(id_map['sm_name']).values])
sm_mean.reindex(id_map['sm_name']).values])

# Fit the model
model = GradientBoosting('mse',
ntrees=ntrees,
lr=lr,
max_depth=max_depth,
subsample=subsample,
colsample=colsample,
min_data_in_leaf=1,
min_gain_to_split=0,
verbose=10000)
ntrees=ntrees,
lr=lr,
max_depth=max_depth,
subsample=subsample,
colsample=colsample,
min_data_in_leaf=1,
min_gain_to_split=0,
verbose=10000)
model.fit(X_train_encoded, Yt_train_red)

# Predict
Expand All @@ -107,7 +106,7 @@ def fit_predict_py_boost(de_tr, id_map):

return de_pred

def fit_predict_ridge_recommender(de_tr, id_map):
def fit_predict_ridge_recommender(de_tr, id_map, train_sm_names, genes, cell_type_ratio):
"""Fit the model and predict.

Parameters:
Expand All @@ -116,8 +115,8 @@ def fit_predict_ridge_recommender(de_tr, id_map):

Returns:
de_pred: prediction dataframes of shape (n_samples, 18211), double index matching id_map
If a compound occurs in id_map but not in de_tr, the corresponding row
of de_pred will be filled with np.nan
If a compound occurs in id_map but not in de_tr, the corresponding row
of de_pred will be filled with np.nan
"""
# Hyperparameters
n_components_in, n_components_out = 7, 70
Expand Down Expand Up @@ -181,7 +180,7 @@ def fit_predict_ridge_recommender(de_tr, id_map):
de_pred = pd.DataFrame(Y_test_pred, index=pd.MultiIndex.from_frame(id_map), columns=genes)
return de_pred

def fit_predict_knn_recommender(de_tr, id_map):
def fit_predict_knn_recommender(de_tr, id_map, train_sm_names, genes, cell_type_ratio):
"""Fit the model and predict.

Parameters:
Expand All @@ -190,8 +189,8 @@ def fit_predict_knn_recommender(de_tr, id_map):

Returns:
de_pred: prediction dataframes of shape (n_samples, 18211), double index matching id_map
If a compound occurs in id_map but not in de_tr, the corresponding row
of de_pred will be filled with np.nan
If a compound occurs in id_map but not in de_tr, the corresponding row
of de_pred will be filled with np.nan
"""
# Hyperparameters
n_components_in, n_components_out = 7, 70
Expand Down Expand Up @@ -226,12 +225,12 @@ def fit_predict_knn_recommender(de_tr, id_map):

for sm1, sm2 in combinations(train_sm_names, 2):
a = (2 * Yt_train_red.query("sm_name == @sm1").reset_index('sm_name', drop=True)
+ Yt_train_red.query("sm_name == @sm2").reset_index('sm_name', drop=True)) / 3
+ Yt_train_red.query("sm_name == @sm2").reset_index('sm_name', drop=True)) / 3
a.dropna(inplace=True)
a = pd.concat([a], keys=[f"{sm1}+{sm2} a"], names=['sm_name'])
a = a.reorder_levels(['cell_type', 'sm_name'])
b = (Yt_train_red.query("sm_name == @sm1").reset_index('sm_name', drop=True)
+ 2 * Yt_train_red.query("sm_name == @sm2").reset_index('sm_name', drop=True)) / 3
+ 2 * Yt_train_red.query("sm_name == @sm2").reset_index('sm_name', drop=True)) / 3
b.dropna(inplace=True)
b = pd.concat([b], keys=[f"{sm1}+{sm2} b"], names=['sm_name'])
b = b.reorder_levels(['cell_type', 'sm_name'])
Expand Down Expand Up @@ -288,7 +287,7 @@ def fit_predict_knn_recommender(de_tr, id_map):
de_pred = pd.DataFrame(Y_test_pred, index=pd.MultiIndex.from_frame(id_map), columns=genes)
return de_pred

def fit_predict_extratrees(de_tr, id_map):
def fit_predict_extratrees(de_tr, id_map, train_sm_names, genes, cell_type_ratio):
"""Fit the model and predict.

Parameters:
Expand All @@ -297,8 +296,8 @@ def fit_predict_extratrees(de_tr, id_map):

Returns:
de_pred: prediction dataframes of shape (n_samples, 18211), double index matching id_map
If a compound occurs in id_map but not in de_tr, the corresponding row
of de_pred will be filled with np.nan
If a compound occurs in id_map but not in de_tr, the corresponding row
of de_pred will be filled with np.nan
"""
# Hyperparameters
n_components_in, n_components_out = 35, 200
Expand All @@ -323,12 +322,12 @@ def fit_predict_extratrees(de_tr, id_map):
ct_mean = Yt_train_red_in[X_train_categorical['sm_name'].isin(train_sm_names)].groupby('cell_type').mean() # shape (6, n_components), means of 13 or 14 values each
sm_mean = Yt_train_red_in[X_train_categorical['cell_type'].isin(cell_types_tr)].groupby('sm_name').mean() # shape (143, n_components), means of 3 or 4 values each
X_train_encoded = np.hstack([ct_mean.reindex(X_train_categorical['cell_type']).values,
sm_mean.reindex(X_train_categorical['sm_name']).values,
sm_mean.reindex(X_train_categorical['sm_name']).values * np.sqrt(cell_type_ratio.reindex(X_train_categorical['cell_type']).values.reshape(-1, 1))
sm_mean.reindex(X_train_categorical['sm_name']).values,
sm_mean.reindex(X_train_categorical['sm_name']).values * np.sqrt(cell_type_ratio.reindex(X_train_categorical['cell_type']).values.reshape(-1, 1))
])
X_test_encoded = np.hstack([ct_mean.reindex(id_map['cell_type']).values,
sm_mean.reindex(id_map['sm_name']).values,
sm_mean.reindex(id_map['sm_name']).values * np.sqrt(cell_type_ratio.reindex(id_map['cell_type']).values.reshape(-1, 1))
sm_mean.reindex(id_map['sm_name']).values,
sm_mean.reindex(id_map['sm_name']).values * np.sqrt(cell_type_ratio.reindex(id_map['cell_type']).values.reshape(-1, 1))
])

# Train the model
Expand All @@ -345,8 +344,7 @@ def fit_predict_extratrees(de_tr, id_map):
de_pred = de_pred.reindex(id_map)
return de_pred


def cross_val_log10pvalue(predictor, noise=0):
def cross_val_log10pvalue(train_sm_names, genes, cell_type_ratio, train_cell_types, de_train, de_train_indexed, de_oof_dict, mrrmse_noise_list, removed_compounds, predictor, noise=0):
"""Cross-validate a machine-learning model

Parameters:
Expand Down Expand Up @@ -388,7 +386,7 @@ def cross_val_log10pvalue(predictor, noise=0):
de_tr = t_score_to_de(de_to_t_score(de_tr) + rng.normal(scale=noise, size=de_tr.shape))

# Fit the model and predict validation log10pvalues
de_pred = predictor(de_tr, de_va.index.to_frame())
de_pred = predictor(de_tr, de_va.index.to_frame(), train_sm_names, genes, cell_type_ratio)

# Update out-of-fold predictions and score
de_oof_list.append(de_pred)
Expand Down
7 changes: 5 additions & 2 deletions src/task/methods/ambros_m_judges_third_place/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
sys.path.append(meta["resources_dir"])
from helper import fit_predict_py_boost, fit_predict_ridge_recommender, fit_predict_knn_recommender, fit_predict_extratrees, cross_val_log10pvalue, mean_rowwise_rmse



## Loading data

de_train = pd.read_parquet(par['de_train'])
Expand Down Expand Up @@ -78,11 +80,12 @@
removed_compounds = []

# Cross-validate the four models (saving the oof predictions)

predictors = [fit_predict_py_boost, fit_predict_ridge_recommender, fit_predict_knn_recommender, fit_predict_extratrees]
de_oof_dict, mrrmse_noise_list = {}, []
for predictor in predictors:
print(fit_predict_py_boost)
cross_val_log10pvalue(predictor)
cross_val_log10pvalue(train_sm_names, genes, cell_type_ratio, train_cell_types, de_train, de_train_indexed, de_oof_dict, mrrmse_noise_list, removed_compounds, predictor)

# Ensemble the oof predictions
de_oof = sum(de_oof_dict.values()) / len(de_oof_dict)
Expand All @@ -96,7 +99,7 @@
de_tr = de_train_indexed.query("~sm_name.isin(@removed_compounds)")

# Fit all models and average their predictions
pred_list = [fit_predict(de_tr, id_map) for fit_predict in predictors]
pred_list = [fit_predict(de_tr, id_map, train_sm_names, genes, cell_type_ratio) for fit_predict in predictors]
de_pred = sum(pred_list) / len(pred_list)

# Test for missing values
Expand Down