Skip to content

Commit

Permalink
Merge branch 'nazar-experiment' into feature/adapt_pipeline_for_existing_nulls
Browse files Browse the repository at this point in the history
  • Loading branch information
proc1v authored May 13, 2024
2 parents 36a9446 + e9450b1 commit 58ff135
Show file tree
Hide file tree
Showing 7 changed files with 567 additions and 27 deletions.
6 changes: 3 additions & 3 deletions cluster/run_exp1/run-exp1-imputers-template.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#SBATCH --cpus-per-task=24
#SBATCH --time=48:00:00
#SBATCH --mail-type=END
#SBATCH --mail-user=dh3553@nyu.edu
#SBATCH --mail-user=np2969@nyu.edu
#SBATCH --job-name=<DATASET>_<NULL_IMPUTER>_<EVALUATION_SCENARIO>
#SBATCH --output=%j_<DATASET>_<NULL_IMPUTER>_<EVALUATION_SCENARIO>.out
#SBATCH --error=%j_<DATASET>_<NULL_IMPUTER>_<EVALUATION_SCENARIO>.err
Expand All @@ -17,12 +17,12 @@ python ../scripts/impute_nulls_with_predictor.py \
--dataset <DATASET> \
--null_imputers \[\"'<NULL_IMPUTER>'\"] \
--run_nums \[<RUN_NUMS>\] \
--tune_imputers true \
--tune_imputers false \
--save_imputed_datasets true \
--evaluation_scenarios \[\"'<EVALUATION_SCENARIO>'\"\]
"

singularity exec \
--overlay /scratch/dh3553/ml_life_cycle_project/vldb_env.ext3:ro \
--overlay /scratch/np2969/ml_life_cycle_project/vldb_env.ext3:ro \
/scratch/work/public/singularity/ubuntu-20.04.1.sif \
/bin/bash -c "source /ext3/env.sh; ${EXECUTION_COMMAND}"
45 changes: 27 additions & 18 deletions cluster/run_exp1/run-exp1-imputers.sh
Original file line number Diff line number Diff line change
@@ -1,23 +1,32 @@
# Build the list of job configs; each entry is a 4-field record:
# "<dataset> <null_imputer> <evaluation_scenario> <run_nums>"
declare -a job_configs=()

# heart/automl jobs: one run number per job (1..6) for every missingness scenario.
for scenario in exp1_mcar3 exp1_mar3 exp1_mnar3; do
    for run in 1 2 3 4 5 6; do
        job_configs+=("heart automl ${scenario} ${run}")
    done
done

# k-means-clustering jobs: runs are paired (two per job) for each dataset/scenario.
for dataset in law_school german diabetes; do
    for scenario in exp1_mcar3 exp1_mar3 exp1_mnar3; do
        for runs in 1,2 3,4 5,6; do
            job_configs+=("${dataset} k_means_clustering ${scenario} ${runs}")
        done
    done
done

TEMPLATE_FILE="../cluster/run_exp1/run-exp1-imputers-template.sbatch"
Expand Down
25 changes: 24 additions & 1 deletion configs/null_imputers_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,27 @@
ErrorRepairMethod.boost_clean.value: {"method": prepare_boostclean, "kwargs": {}}
}

NULL_IMPUTERS_HYPERPARAMS = {}
# Pre-tuned hyper-parameters keyed by imputer name -> dataset -> injection strategy.
# All four datasets currently share the same k-means setting (2 clusters), so the
# nested mapping is generated rather than written out per dataset.
NULL_IMPUTERS_HYPERPARAMS = {
    ErrorRepairMethod.k_means_clustering.value: {
        dataset: {strategy: {"n_clusters": 2} for strategy in ("MCAR3", "MAR3", "MNAR3")}
        for dataset in (
            ACS_INCOME_DATASET,
            GERMAN_CREDIT_DATASET,
            DIABETES_DATASET,
            LAW_SCHOOL_DATASET,
        )
    },
}
496 changes: 496 additions & 0 deletions notebooks/cluster_analysis/clustering_silhoette.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion source/custom_classes/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ def inject_and_impute_nulls(self, data_loader, null_imputer_name: str, evaluatio
imputation_runtime=imputation_runtime,
null_imputer_name=null_imputer_name,
evaluation_scenario=evaluation_scenario,
experiment_seed=experiment_seed)
experiment_seed=experiment_seed,
null_imputer_params_dct=null_imputer_params_dct)

if save_imputed_datasets:
self._save_imputed_datasets_to_fs(X_train_val=X_train_val_imputed_wo_sensitive_attrs,
Expand Down
7 changes: 6 additions & 1 deletion source/custom_classes/ml_lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ def _impute_nulls(self, X_train_with_nulls, X_tests_with_nulls_lst, null_imputer
train_injection_strategy, _ = get_injection_scenarios(evaluation_scenario)
hyperparams = NULL_IMPUTERS_HYPERPARAMS.get(null_imputer_name, {}).get(self.dataset_name, {}).get(train_injection_strategy, {})

self._logger.info('Impute nulls hyper-params: {}'.format(hyperparams))
self._logger.info('tune_imputers: {}'.format(tune_imputers))

# Use a method, kwargs, and hyperparams from NULL_IMPUTERS_CONFIG
imputation_method = NULL_IMPUTERS_CONFIG[null_imputer_name]["method"]
imputation_kwargs = NULL_IMPUTERS_CONFIG[null_imputer_name]["kwargs"]
Expand Down Expand Up @@ -302,7 +305,8 @@ def _evaluate_imputation(self, real, imputed, corrupted, numerical_columns, null
return metrics_df

def _save_imputation_metrics_to_db(self, train_imputation_metrics_df: pd.DataFrame, test_imputation_metrics_dfs_lst: list,
imputation_runtime: float, null_imputer_name: str, evaluation_scenario: str, experiment_seed: int):
imputation_runtime: float, null_imputer_name: str, evaluation_scenario: str,
experiment_seed: int, null_imputer_params_dct: dict):
train_imputation_metrics_df['Imputation_Guid'] = train_imputation_metrics_df.apply(
lambda row: generate_guid(ordered_hierarchy_lst=[self.dataset_name, null_imputer_name,
evaluation_scenario, experiment_seed,
Expand All @@ -320,6 +324,7 @@ def _save_imputation_metrics_to_db(self, train_imputation_metrics_df: pd.DataFra
'dataset_part': 'X_train_val',
'runtime_in_mins': imputation_runtime,
'record_create_date_time': datetime.now(timezone.utc),
'null_imputer_params_dct': null_imputer_params_dct
})

# Save imputation results into a database for each test set from the evaluation scenario
Expand Down
12 changes: 9 additions & 3 deletions source/null_imputers/kmeans_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ def fit(self, X, cat_vars, y=None):

pred_clusters = self.model.predict(X_observed, categorical=self.cat_vars_)
self._calculate_cluster_stats(X, pred_clusters)
# save percentage of clusters
self.cluster_percentages_ = {str(cluster): len(np.where(pred_clusters == cluster)[0]) / len(pred_clusters) for cluster in set(pred_clusters)}
print(f"Cluster percentages: {self.cluster_percentages_}")

return self

Expand Down Expand Up @@ -191,6 +194,9 @@ def fit_transform(self, df, target_column: str = None, **fit_params):

def get_predictors_params(self):
    """Return the hyper-parameters used by this imputer.

    Returns:
        dict: the externally supplied hyper-parameters when they were given;
        otherwise the parameters selected during tuning (``self.best_params_``).
    """
    # Note: the diff artifact here previously interleaved the old early-return
    # body with the new branch/return-output body, which is invalid Python;
    # this is the reconstructed post-merge version.
    if self.hyperparameters is None:
        # No hyper-parameters were provided up-front, so tuning ran and
        # best_params_ holds the selected configuration.
        output = self.best_params_
    else:
        output = self.hyperparameters

    # Cluster percentages are deliberately not exported for now; re-enable
    # if they should be persisted alongside the run metadata.
    # output['cluster_percentages_'] = self.cluster_percentages_
    return output

0 comments on commit 58ff135

Please sign in to comment.