From 437af829d18d586ca00ffcf124eb165130780ffd Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 30 May 2024 13:02:24 +0200 Subject: [PATCH] apply patch to fix lgc_ensemble on bootstrapped data Co-authored-by: ttunja <60556758+ttunja@users.noreply.github.com> --- src/task/methods/lgc_ensemble_helpers/prepare_data.py | 3 ++- src/task/methods/lgc_ensemble_prepare/script.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/task/methods/lgc_ensemble_helpers/prepare_data.py b/src/task/methods/lgc_ensemble_helpers/prepare_data.py index 4d948210..481f0a63 100644 --- a/src/task/methods/lgc_ensemble_helpers/prepare_data.py +++ b/src/task/methods/lgc_ensemble_helpers/prepare_data.py @@ -19,6 +19,7 @@ def prepare_data(par, paths): mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index() std_cell_type = de_cell_type.groupby('cell_type').std().reset_index() std_sm_name = de_sm_name.groupby('sm_name').std().reset_index() + std_sm_name_filled = std_sm_name.fillna(0) cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type'] # This is just to get cell types in the right order for the next line quantiles_cell_type = pd.concat([pd.DataFrame(cell_types)]+[de_cell_type.groupby('cell_type')[col]\ .quantile([0.25, 0.50, 0.75], interpolation='linear').unstack().reset_index(drop=True) for col in list(de_train.columns)[5:]], axis=1) @@ -30,7 +31,7 @@ def prepare_data(par, paths): mean_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/mean_cell_type.csv', index=False) std_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/std_cell_type.csv', index=False) mean_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/mean_sm_name.csv', index=False) - std_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False) + std_sm_name_filled.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False) quantiles_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/quantiles_cell_type.csv', index=False) ## Create one hot encoding features one_hot_encode(de_train[["cell_type", "sm_name"]], id_map[["cell_type", "sm_name"]], out_dir=paths["train_data_aug_dir"]) diff --git a/src/task/methods/lgc_ensemble_prepare/script.py b/src/task/methods/lgc_ensemble_prepare/script.py index c4fd164c..53f68309 100644 --- a/src/task/methods/lgc_ensemble_prepare/script.py +++ b/src/task/methods/lgc_ensemble_prepare/script.py @@ -60,6 +60,7 @@ mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index() std_cell_type = de_cell_type.groupby('cell_type').std().reset_index() std_sm_name = de_sm_name.groupby('sm_name').std().reset_index() +std_sm_name_filled = std_sm_name.fillna(0) cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type'] # This is just to get cell types in the right order for the next line quantiles_cell_type = pd.concat( [pd.DataFrame(cell_types)] + @@ -74,7 +75,7 @@ mean_cell_type.to_csv(f'{par["train_data_aug_dir"]}/mean_cell_type.csv', index=False) std_cell_type.to_csv(f'{par["train_data_aug_dir"]}/std_cell_type.csv', index=False) mean_sm_name.to_csv(f'{par["train_data_aug_dir"]}/mean_sm_name.csv', index=False) -std_sm_name.to_csv(f'{par["train_data_aug_dir"]}/std_sm_name.csv', index=False) +std_sm_name_filled.to_csv(f'{par["train_data_aug_dir"]}/std_sm_name.csv', index=False) quantiles_cell_type.to_csv(f'{par["train_data_aug_dir"]}/quantiles_cell_type.csv', index=False) with open(f'{par["train_data_aug_dir"]}/gene_names.json', 'w') as f: json.dump(gene_names, f)