diff --git a/README.md b/README.md index 95ecfab0..142f532c 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,12 @@ # data-cleaning-stability + Studying the impact of data cleaning techniques on fairness and stability + + +## Setup + +Install datawig: +```shell +pip install mxnet-cu110 +pip install datawig --no-deps +``` diff --git a/requirements.txt b/requirements.txt index 815780a2..d427c7bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ virny -mxnet-cu110 -datawig==0.2.0 +lightgbm==4.3.0 numpy==1.23.5 matplotlib~=3.6.2 pandas~=1.5.2 diff --git a/scripts/run-script.sbatch b/scripts/run-script.sbatch index d5531682..2c9025d4 100644 --- a/scripts/run-script.sbatch +++ b/scripts/run-script.sbatch @@ -15,16 +15,3 @@ python ./scripts/evaluate_models.py \ --models \[\"lr_clf\"\] \ --run_nums \[1,2,3\] \ --ml_impute True --evaluation_scenarios \[\"mcar_mcar1\"\] - -python ./scripts/evaluate_models.py \ - --dataset folk \ - --null_imputers \[\"median-mode\"\] \ - --models \[\"lr_clf\"\] \ - --run_nums \[1\] \ - --ml_impute True --evaluation_scenarios \[\"mcar_mcar1\"\] - -python ./scripts/impute_nulls_with_predictor.py \ - --dataset folk \ - --null_imputers \[\"median-mode\"\] \ - --run_nums \[1,2\] \ - --save_imputed_datasets True --evaluation_scenarios \[\"mcar_mcar1\",\"mcar_mar1\"\] diff --git a/scripts/tests.txt b/scripts/tests.txt new file mode 100644 index 00000000..95dc46df --- /dev/null +++ b/scripts/tests.txt @@ -0,0 +1,18 @@ +python ./scripts/evaluate_models.py \ + --dataset folk \ + --null_imputers \[\"median-mode\"\] \ + --models \[\"lr_clf\"\] \ + --run_nums \[1\] \ + --ml_impute True --evaluation_scenarios \[\"mcar_mcar1\"\] + +python ./scripts/impute_nulls_with_predictor.py \ + --dataset folk \ + --null_imputers \[\"median-mode\"\] \ + --run_nums \[1,2\] \ + --save_imputed_datasets True --evaluation_scenarios \[\"mcar_mcar1\",\"mcar_mar1\"\] + +python ./scripts/impute_nulls_with_predictor.py \ + --dataset folk \ + --null_imputers 
\[\"datawig\"\] \ + --run_nums \[1\] \ + --save_imputed_datasets True --evaluation_scenarios \[\"mcar_mcar1\"\] diff --git a/source/custom_classes/benchmark.py b/source/custom_classes/benchmark.py index 66765918..1c373c4d 100644 --- a/source/custom_classes/benchmark.py +++ b/source/custom_classes/benchmark.py @@ -14,6 +14,7 @@ from virny.user_interfaces.multiple_models_with_db_writer_api import compute_metrics_with_db_writer import source.null_imputers.simple_imputer as simple_imputer +import source.null_imputers.datawig_imputer as datawig_imputer from configs.models_config_for_tuning import get_models_params_for_tuning from configs.constants import (EXP_COLLECTION_NAME, MODEL_HYPER_PARAMS_COLLECTION_NAME, IMPUTATION_PERFORMANCE_METRICS_COLLECTION_NAME, EXPERIMENT_RUN_SEEDS, NUM_FOLDS_FOR_TUNING, ErrorRepairMethod, ErrorInjectionStrategy) @@ -121,6 +122,17 @@ def _impute_nulls(self, X_train_with_nulls, X_test_with_nulls, null_imputer_name train_numerical_null_columns=train_numerical_null_columns, train_categorical_null_columns=train_categorical_null_columns)) + elif null_imputer_name == ErrorRepairMethod.datawig.value: + X_train_imputed, X_test_imputed, null_imputer_params = ( + datawig_imputer.complete(X_train_with_nulls=X_train_with_nulls, + X_test_with_nulls=X_test_with_nulls, + numeric_columns_with_nulls=train_numerical_null_columns, + categorical_columns_with_nulls=train_categorical_null_columns, + hpo=False, + output_path=pathlib.Path(__file__).parent.parent.parent.joinpath('results'))) + numerical_null_imputer_params = null_imputer_params + categorical_null_imputer_params = null_imputer_params + else: raise ValueError(f'{null_imputer_name} null imputer is not implemented') @@ -147,11 +159,11 @@ def _evaluate_imputation(self, real, imputed, corrupted, numerical_columns, null recall = None f1 = None if column_type == 'numerical': - null_imputer_params = numerical_null_imputer_params + null_imputer_params = numerical_null_imputer_params[column_name] if 
numerical_null_imputer_params is not None else None rmse = mean_squared_error(true, pred, squared=False) print('RMSE for {}: {:.2f}'.format(column_name, rmse)) else: - null_imputer_params = categorical_null_imputer_params + null_imputer_params = categorical_null_imputer_params[column_name] if categorical_null_imputer_params is not None else None precision, recall, f1, _ = precision_recall_fscore_support(true, pred, average="micro") print('Precision for {}: {:.2f}'.format(column_name, precision)) print('Recall for {}: {:.2f}'.format(column_name, recall)) diff --git a/source/null_imputers/datawig_imputer.py b/source/null_imputers/datawig_imputer.py index 7e6ac65d..acd78641 100644 --- a/source/null_imputers/datawig_imputer.py +++ b/source/null_imputers/datawig_imputer.py @@ -3,11 +3,12 @@ import pandas as pd -def complete(data_frame: pd.DataFrame, +def complete(X_train_with_nulls: pd.DataFrame, + X_test_with_nulls: pd.DataFrame, + numeric_columns_with_nulls: list, + categorical_columns_with_nulls : list, precision_threshold: float = 0.0, - inplace: bool = False, hpo: bool = False, - verbose: int = 0, num_epochs: int = 100, iterations: int = 1, output_path: str = "."): @@ -33,45 +34,48 @@ def complete(data_frame: pd.DataFrame, :return: dataframe with imputations """ - missing_mask = data_frame.copy().isnull() + # Import datawig inside a function to avoid its installation to use other null imputers + import datawig - if inplace is False: - data_frame = data_frame.copy() + train_missing_mask = X_train_with_nulls.copy().isnull() + test_missing_mask = X_test_with_nulls.copy().isnull() + X_train_imputed = X_train_with_nulls.copy() + X_test_imputed = X_test_with_nulls.copy() - numeric_columns = [c for c in data_frame.columns if is_numeric_dtype(data_frame[c])] - string_columns = list(set(data_frame.columns) - set(numeric_columns)) - logger.debug("Assuming numerical columns: {}".format(", ".join(numeric_columns))) - - col_set = set(numeric_columns + string_columns) - - 
categorical_columns = [col for col in string_columns if SimpleImputer._is_categorical(data_frame[col])] - logger.debug("Assuming categorical columns: {}".format(", ".join(categorical_columns))) + col_set = set(X_train_imputed.columns) + null_imputer_params = dict() for _ in range(iterations): - for output_col in set(numeric_columns) | set(categorical_columns): + for output_col in set(numeric_columns_with_nulls) | set(categorical_columns_with_nulls): # train on all input columns but the to-be-imputed one input_cols = list(col_set - set([output_col])) # train on all observed values - idx_missing = missing_mask[output_col] + train_idx_missing = train_missing_mask[output_col] - imputer = SimpleImputer(input_columns=input_cols, - output_column=output_col, - output_path=os.path.join(output_path, output_col)) + imputer = datawig.SimpleImputer(input_columns=input_cols, + output_column=output_col, + output_path=os.path.join(output_path, output_col)) if hpo: - imputer.fit_hpo(data_frame.loc[~idx_missing, :], - patience=5 if output_col in categorical_columns else 20, + imputer.fit_hpo(X_train_imputed.loc[~train_idx_missing, :], + patience=5 if output_col in categorical_columns_with_nulls else 20, num_epochs=100, final_fc_hidden_units=[[0], [10], [50], [100]]) else: - imputer.fit(data_frame.loc[~idx_missing, :], - patience=5 if output_col in categorical_columns else 20, + imputer.fit(X_train_imputed.loc[~train_idx_missing, :], + patience=5 if output_col in categorical_columns_with_nulls else 20, num_epochs=num_epochs, calibrate=False) - tmp = imputer.predict(data_frame, precision_threshold=precision_threshold) - data_frame.loc[idx_missing, output_col] = tmp[output_col + "_imputed"] + tmp_train = imputer.predict(X_train_imputed, precision_threshold=precision_threshold) + X_train_imputed.loc[train_idx_missing, output_col] = tmp_train[output_col + "_imputed"] + + test_idx_missing = test_missing_mask[output_col] + tmp_test = imputer.predict(X_test_imputed, 
precision_threshold=precision_threshold) + X_test_imputed.loc[test_idx_missing, output_col] = tmp_test[output_col + "_imputed"] + + null_imputer_params[output_col] = {k: v for k, v in imputer.__dict__.items() if k not in ['imputer']} # remove the directory with logfiles for this column shutil.rmtree(os.path.join(output_path, output_col)) - return data_frame + return X_train_imputed, X_test_imputed, null_imputer_params