Skip to content

Commit

Permalink
Merge branch 'nazar-experiment' into feature/adapt_pipeline_for_existing_nulls
Browse files Browse the repository at this point in the history
  • Loading branch information
proc1v authored May 13, 2024
2 parents 36a9446 + e9450b1 commit 58ff135
Show file tree
Hide file tree
Showing 7 changed files with 567 additions and 27 deletions.
6 changes: 3 additions & 3 deletions cluster/run_exp1/run-exp1-imputers-template.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#SBATCH --cpus-per-task=24
#SBATCH --time=48:00:00
#SBATCH --mail-type=END
#SBATCH --mail-user=dh3553@nyu.edu
#SBATCH --mail-user=np2969@nyu.edu
#SBATCH --job-name=<DATASET>_<NULL_IMPUTER>_<EVALUATION_SCENARIO>
#SBATCH --output=%j_<DATASET>_<NULL_IMPUTER>_<EVALUATION_SCENARIO>.out
#SBATCH --error=%j_<DATASET>_<NULL_IMPUTER>_<EVALUATION_SCENARIO>.err
Expand All @@ -17,12 +17,12 @@ python ../scripts/impute_nulls_with_predictor.py \
--dataset <DATASET> \
--null_imputers \[\"'<NULL_IMPUTER>'\"] \
--run_nums \[<RUN_NUMS>\] \
--tune_imputers true \
--tune_imputers false \
--save_imputed_datasets true \
--evaluation_scenarios \[\"'<EVALUATION_SCENARIO>'\"\]
"

singularity exec \
--overlay /scratch/dh3553/ml_life_cycle_project/vldb_env.ext3:ro \
--overlay /scratch/np2969/ml_life_cycle_project/vldb_env.ext3:ro \
/scratch/work/public/singularity/ubuntu-20.04.1.sif \
/bin/bash -c "source /ext3/env.sh; ${EXECUTION_COMMAND}"
45 changes: 27 additions & 18 deletions cluster/run_exp1/run-exp1-imputers.sh
Original file line number Diff line number Diff line change
@@ -1,23 +1,32 @@
# Build the list of job configs; each entry is a 4-field record:
# "<dataset> <null_imputer> <evaluation_scenario> <run_nums>"
declare -a job_configs=()

# heart/automl jobs: one run number per job (1..6) for every missingness scenario.
for scenario in exp1_mcar3 exp1_mar3 exp1_mnar3; do
    for run in 1 2 3 4 5 6; do
        job_configs+=("heart automl ${scenario} ${run}")
    done
done

# k-means-clustering jobs: runs are paired (two per job) for each dataset/scenario.
for dataset in law_school german diabetes; do
    for scenario in exp1_mcar3 exp1_mar3 exp1_mnar3; do
        for runs in 1,2 3,4 5,6; do
            job_configs+=("${dataset} k_means_clustering ${scenario} ${runs}")
        done
    done
done

TEMPLATE_FILE="../cluster/run_exp1/run-exp1-imputers-template.sbatch"
Expand Down
25 changes: 24 additions & 1 deletion configs/null_imputers_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,27 @@
ErrorRepairMethod.boost_clean.value: {"method": prepare_boostclean, "kwargs": {}}
}

NULL_IMPUTERS_HYPERPARAMS = {}
# Pre-tuned hyper-parameters keyed by imputer name -> dataset -> injection strategy.
# All four datasets currently share the same k-means setting (2 clusters), so the
# nested mapping is generated rather than written out per dataset.
NULL_IMPUTERS_HYPERPARAMS = {
    ErrorRepairMethod.k_means_clustering.value: {
        dataset: {strategy: {"n_clusters": 2} for strategy in ("MCAR3", "MAR3", "MNAR3")}
        for dataset in (
            ACS_INCOME_DATASET,
            GERMAN_CREDIT_DATASET,
            DIABETES_DATASET,
            LAW_SCHOOL_DATASET,
        )
    },
}
496 changes: 496 additions & 0 deletions notebooks/cluster_analysis/clustering_silhoette.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion source/custom_classes/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ def inject_and_impute_nulls(self, data_loader, null_imputer_name: str, evaluatio
imputation_runtime=imputation_runtime,
null_imputer_name=null_imputer_name,
evaluation_scenario=evaluation_scenario,
experiment_seed=experiment_seed)
experiment_seed=experiment_seed,
null_imputer_params_dct=null_imputer_params_dct)

if save_imputed_datasets:
self._save_imputed_datasets_to_fs(X_train_val=X_train_val_imputed_wo_sensitive_attrs,
Expand Down
7 changes: 6 additions & 1 deletion source/custom_classes/ml_lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ def _impute_nulls(self, X_train_with_nulls, X_tests_with_nulls_lst, null_imputer
train_injection_strategy, _ = get_injection_scenarios(evaluation_scenario)
hyperparams = NULL_IMPUTERS_HYPERPARAMS.get(null_imputer_name, {}).get(self.dataset_name, {}).get(train_injection_strategy, {})

self._logger.info('Impute nulls hyper-params: {}'.format(hyperparams))
self._logger.info('tune_imputers: {}'.format(tune_imputers))

# Use a method, kwargs, and hyperparams from NULL_IMPUTERS_CONFIG
imputation_method = NULL_IMPUTERS_CONFIG[null_imputer_name]["method"]
imputation_kwargs = NULL_IMPUTERS_CONFIG[null_imputer_name]["kwargs"]
Expand Down Expand Up @@ -302,7 +305,8 @@ def _evaluate_imputation(self, real, imputed, corrupted, numerical_columns, null
return metrics_df

def _save_imputation_metrics_to_db(self, train_imputation_metrics_df: pd.DataFrame, test_imputation_metrics_dfs_lst: list,
imputation_runtime: float, null_imputer_name: str, evaluation_scenario: str, experiment_seed: int):
imputation_runtime: float, null_imputer_name: str, evaluation_scenario: str,
experiment_seed: int, null_imputer_params_dct: dict):
train_imputation_metrics_df['Imputation_Guid'] = train_imputation_metrics_df.apply(
lambda row: generate_guid(ordered_hierarchy_lst=[self.dataset_name, null_imputer_name,
evaluation_scenario, experiment_seed,
Expand All @@ -320,6 +324,7 @@ def _save_imputation_metrics_to_db(self, train_imputation_metrics_df: pd.DataFra
'dataset_part': 'X_train_val',
'runtime_in_mins': imputation_runtime,
'record_create_date_time': datetime.now(timezone.utc),
'null_imputer_params_dct': null_imputer_params_dct
})

# Save imputation results into a database for each test set from the evaluation scenario
Expand Down
12 changes: 9 additions & 3 deletions source/null_imputers/kmeans_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ def fit(self, X, cat_vars, y=None):

pred_clusters = self.model.predict(X_observed, categorical=self.cat_vars_)
self._calculate_cluster_stats(X, pred_clusters)
# save percentage of clusters
self.cluster_percentages_ = {str(cluster): len(np.where(pred_clusters == cluster)[0]) / len(pred_clusters) for cluster in set(pred_clusters)}
print(f"Cluster percentages: {self.cluster_percentages_}")

return self

Expand Down Expand Up @@ -191,6 +194,9 @@ def fit_transform(self, df, target_column: str = None, **fit_params):

def get_predictors_params(self):
    """Return the hyper-parameters used by this imputer.

    Returns:
        dict: the externally supplied hyper-parameters when they were given;
        otherwise the parameters selected during tuning (``self.best_params_``).
    """
    # Note: the diff artifact here previously interleaved the old early-return
    # body with the new branch/return-output body, which is invalid Python;
    # this is the reconstructed post-merge version.
    if self.hyperparameters is None:
        # No hyper-parameters were provided up-front, so tuning ran and
        # best_params_ holds the selected configuration.
        output = self.best_params_
    else:
        output = self.hyperparameters

    # Cluster percentages are deliberately not exported for now; re-enable
    # if they should be persisted alongside the run metadata.
    # output['cluster_percentages_'] = self.cluster_percentages_
    return output

0 comments on commit 58ff135

Please sign in to comment.