Added saving of intermediate imputed datasets to the file system

FalaahArifKhan · Apr 19, 2024 · fe39a4d
1 parent 2ccc1b9
commit fe39a4d
Show file tree

Hide file tree

Showing 6 changed files with 191 additions and 153 deletions.
diff --git a/cluster/tests.txt b/cluster/tests.txt
@@ -38,6 +38,27 @@ python ../scripts/impute_nulls_with_predictor.py \
     --save_imputed_datasets true \
     --evaluation_scenarios \[\"mcar_mcar1\"\]
 
+-- Test load_imputed_datasets
+python ./scripts/evaluate_models.py \
+    --dataset folk \
+    --null_imputers \[\"median-mode\"\] \
+    --models \[\"lr_clf\"\] \
+    --run_nums \[1\] \
+    --tune_imputers true \
+    --ml_impute true \
+    --save_imputed_datasets true \
+    --evaluation_scenarios \[\"mcar2\",\"mar3\"\]
+
+python ./scripts/evaluate_models.py \
+    --dataset folk \
+    --null_imputers \[\"median-mode\"\] \
+    --models \[\"lr_clf\"\] \
+    --run_nums \[1\] \
+    --tune_imputers true \
+    --ml_impute false \
+    --evaluation_scenarios \[\"mcar2\",\"mar3\"\]
+
+
 # =======================================================================================
 # Cluster
 # =======================================================================================
@@ -66,7 +87,8 @@ python build_space.py --data_dir data/datasets --dataset Puma --mv_type systemat
 
 python cpclean.py --space_dir space_dir/Puma --dataset Puma --mv_type systematic --result_dir result_dir
 
-Example 1
-X_test_imputed_values = kmeans_imputer.transform(X_test_imputed)
-X_test_imputed = pd.DataFrame(X_test_imputed_values, columns=X_test_imputed.columns, index=X_test_imputed.index)
-X_test_imputed[categorical_columns_with_nulls] = X_test_imputed[categorical_columns_with_nulls].astype(int).astype('str')
+
+# =======================================================================================
+# Experiments
+# =======================================================================================
+
diff --git a/configs/models_config_for_tuning.py b/configs/models_config_for_tuning.py
@@ -24,8 +24,8 @@ def get_models_params_for_tuning(models_tuning_seed):
             'model': LogisticRegression(random_state=models_tuning_seed, max_iter=1000),
             'params': {
                 'penalty': ['l1', 'l2'],
-                'C' : [0.001, 0.01, 0.1, 1],
-                'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
+                # 'C' : [0.001, 0.01, 0.1, 1],
+                # 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
             }
         },
         MLModels.lgbm_clf.value: {

diff --git a/configs/yaml_files/acs_income_GA_2018_config.yaml b/configs/yaml_files/acs_income_GA_2018_config.yaml
@@ -1,5 +1,6 @@
 dataset_name: folk
 bootstrap_fraction: 0.8
-n_estimators: 50
+#n_estimators: 50
+n_estimators: 5
 computation_mode: error_analysis
 sensitive_attributes_dct: {'SEX': '2', 'RAC1P': ['2', '3', '4', '5', '6', '7', '8', '9'], 'SEX & RAC1P': None}
diff --git a/scripts/evaluate_models.py b/scripts/evaluate_models.py
@@ -38,6 +38,8 @@ def parse_input_args():
     parser.add_argument("--run_nums", type=str, help="a list of experiment run numbers", required=True)
     parser.add_argument("--tune_imputers", type=str2bool, required=True,
                         help="True -- tune null imputers, False -- take hyper-params of null imputers from configs/null_imputers_config.py")
+    parser.add_argument("--save_imputed_datasets", type=str2bool, required=False, default=False,
+                        help="True -- save imputed train and test sets, False -- do not save train and test sets")
     parser.add_argument("--ml_impute", type=str2bool, required=True,
                         help="True -- apply ML-oriented imputers, False -- use pre-computed imputed datasets")
     parser.add_argument("--evaluation_scenarios", type=str, help="a list of evaluation scenarios",
@@ -69,7 +71,8 @@ def parse_input_args():
                              evaluation_scenarios=args.evaluation_scenarios,
                              model_names=args.models,
                              tune_imputers=args.tune_imputers,
-                             ml_impute=args.ml_impute)
+                             ml_impute=args.ml_impute,
+                             save_imputed_datasets=args.save_imputed_datasets)
 
     end_time = datetime.now()
     print(f'The script is successfully executed. Run time: {end_time - start_time}')