diff --git a/src/task/methods/lgc_ensemble_helpers/prepare_data.py b/src/task/methods/lgc_ensemble_helpers/prepare_data.py index 4d948210..481f0a63 100644 --- a/src/task/methods/lgc_ensemble_helpers/prepare_data.py +++ b/src/task/methods/lgc_ensemble_helpers/prepare_data.py @@ -19,6 +19,7 @@ def prepare_data(par, paths): mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index() std_cell_type = de_cell_type.groupby('cell_type').std().reset_index() std_sm_name = de_sm_name.groupby('sm_name').std().reset_index() + std_sm_name_filled = std_sm_name.fillna(0) cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type'] # This is just to get cell types in the right order for the next line quantiles_cell_type = pd.concat([pd.DataFrame(cell_types)]+[de_cell_type.groupby('cell_type')[col]\ .quantile([0.25, 0.50, 0.75], interpolation='linear').unstack().reset_index(drop=True) for col in list(de_train.columns)[5:]], axis=1) @@ -30,7 +31,7 @@ def prepare_data(par, paths): mean_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/mean_cell_type.csv', index=False) std_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/std_cell_type.csv', index=False) mean_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/mean_sm_name.csv', index=False) - std_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False) + std_sm_name_filled.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False) quantiles_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/quantiles_cell_type.csv', index=False) ## Create one hot encoding features one_hot_encode(de_train[["cell_type", "sm_name"]], id_map[["cell_type", "sm_name"]], out_dir=paths["train_data_aug_dir"])