Skip to content

Commit

Permalink
Added saving intermediate imputed sets to fs
Browse files: browse the repository at this point in the history
  • Loading branch information
denysgerasymuk799 committed Apr 19, 2024
1 parent 2ccc1b9 commit fe39a4d
Show file tree
Hide file tree
Showing 6 changed files with 191 additions and 153 deletions.
30 changes: 26 additions & 4 deletions cluster/tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,27 @@ python ../scripts/impute_nulls_with_predictor.py \
--save_imputed_datasets true \
--evaluation_scenarios \[\"mcar_mcar1\"\]

-- Test load_imputed_datasets
python ./scripts/evaluate_models.py \
--dataset folk \
--null_imputers \[\"median-mode\"\] \
--models \[\"lr_clf\"\] \
--run_nums \[1\] \
--tune_imputers true \
--ml_impute true \
--save_imputed_datasets true \
--evaluation_scenarios \[\"mcar2\",\"mar3\"\]

python ./scripts/evaluate_models.py \
--dataset folk \
--null_imputers \[\"median-mode\"\] \
--models \[\"lr_clf\"\] \
--run_nums \[1\] \
--tune_imputers true \
--ml_impute false \
--evaluation_scenarios \[\"mcar2\",\"mar3\"\]


# =======================================================================================
# Cluster
# =======================================================================================
Expand Down Expand Up @@ -66,7 +87,8 @@ python build_space.py --data_dir data/datasets --dataset Puma --mv_type systemat

python cpclean.py --space_dir space_dir/Puma --dataset Puma --mv_type systematic --result_dir result_dir

Example 1
X_test_imputed_values = kmeans_imputer.transform(X_test_imputed)
X_test_imputed = pd.DataFrame(X_test_imputed_values, columns=X_test_imputed.columns, index=X_test_imputed.index)
X_test_imputed[categorical_columns_with_nulls] = X_test_imputed[categorical_columns_with_nulls].astype(int).astype('str')

# =======================================================================================
# Experiments
# =======================================================================================

4 changes: 2 additions & 2 deletions configs/models_config_for_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def get_models_params_for_tuning(models_tuning_seed):
'model': LogisticRegression(random_state=models_tuning_seed, max_iter=1000),
'params': {
'penalty': ['l1', 'l2'],
'C' : [0.001, 0.01, 0.1, 1],
'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
# 'C' : [0.001, 0.01, 0.1, 1],
# 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
}
},
MLModels.lgbm_clf.value: {
Expand Down
3 changes: 2 additions & 1 deletion configs/yaml_files/acs_income_GA_2018_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
dataset_name: folk
bootstrap_fraction: 0.8
n_estimators: 50
#n_estimators: 50
n_estimators: 5
computation_mode: error_analysis
sensitive_attributes_dct: {'SEX': '2', 'RAC1P': ['2', '3', '4', '5', '6', '7', '8', '9'], 'SEX & RAC1P': None}
5 changes: 4 additions & 1 deletion scripts/evaluate_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def parse_input_args():
parser.add_argument("--run_nums", type=str, help="a list of experiment run numbers", required=True)
parser.add_argument("--tune_imputers", type=str2bool, required=True,
help="True -- tune null imputers, False -- take hyper-params of null imputers from configs/null_imputers_config.py")
parser.add_argument("--save_imputed_datasets", type=str2bool, required=False, default=False,
help="True -- save imputed train and test sets, False -- do not save train and test sets")
parser.add_argument("--ml_impute", type=str2bool, required=True,
help="True -- apply ML-oriented imputers, False -- use pre-computed imputed datasets")
parser.add_argument("--evaluation_scenarios", type=str, help="a list of evaluation scenarios",
Expand Down Expand Up @@ -69,7 +71,8 @@ def parse_input_args():
evaluation_scenarios=args.evaluation_scenarios,
model_names=args.models,
tune_imputers=args.tune_imputers,
ml_impute=args.ml_impute)
ml_impute=args.ml_impute,
save_imputed_datasets=args.save_imputed_datasets)

end_time = datetime.now()
print(f'The script is successfully executed. Run time: {end_time - start_time}')
Expand Down
Loading

0 comments on commit fe39a4d

Please sign in to comment.