Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
denysgerasymuk799 committed Apr 1, 2024
1 parent ba59fc9 commit 8f78223
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 43 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
# data-cleaning-stability

Studying the impact of data cleaning techniques on fairness and stability


## Setup

Install datawig:
```shell
pip install mxnet-cu110
pip install datawig --no-deps
```
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
virny
mxnet-cu110
datawig==0.2.0
lightgbm==4.3.0
numpy==1.23.5
matplotlib~=3.6.2
pandas~=1.5.2
Expand Down
13 changes: 0 additions & 13 deletions scripts/run-script.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,3 @@ python ./scripts/evaluate_models.py \
--models \[\"lr_clf\"\] \
--run_nums \[1,2,3\] \
--ml_impute True --evaluation_scenarios \[\"mcar_mcar1\"\]

python ./scripts/evaluate_models.py \
--dataset folk \
--null_imputers \[\"median-mode\"\] \
--models \[\"lr_clf\"\] \
--run_nums \[1\] \
--ml_impute True --evaluation_scenarios \[\"mcar_mcar1\"\]

python ./scripts/impute_nulls_with_predictor.py \
--dataset folk \
--null_imputers \[\"median-mode\"\] \
--run_nums \[1,2\] \
--save_imputed_datasets True --evaluation_scenarios \[\"mcar_mcar1\",\"mcar_mar1\"\]
18 changes: 18 additions & 0 deletions scripts/tests.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
python ./scripts/evaluate_models.py \
--dataset folk \
--null_imputers \[\"median-mode\"\] \
--models \[\"lr_clf\"\] \
--run_nums \[1\] \
--ml_impute True --evaluation_scenarios \[\"mcar_mcar1\"\]

python ./scripts/impute_nulls_with_predictor.py \
--dataset folk \
--null_imputers \[\"median-mode\"\] \
--run_nums \[1,2\] \
--save_imputed_datasets True --evaluation_scenarios \[\"mcar_mcar1\",\"mcar_mar1\"\]

python ./scripts/impute_nulls_with_predictor.py \
--dataset folk \
--null_imputers \[\"datawig\"\] \
--run_nums \[1\] \
--save_imputed_datasets True --evaluation_scenarios \[\"mcar_mcar1\"\]
16 changes: 14 additions & 2 deletions source/custom_classes/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from virny.user_interfaces.multiple_models_with_db_writer_api import compute_metrics_with_db_writer

import source.null_imputers.simple_imputer as simple_imputer
import source.null_imputers.datawig_imputer as datawig_imputer
from configs.models_config_for_tuning import get_models_params_for_tuning
from configs.constants import (EXP_COLLECTION_NAME, MODEL_HYPER_PARAMS_COLLECTION_NAME, IMPUTATION_PERFORMANCE_METRICS_COLLECTION_NAME,
EXPERIMENT_RUN_SEEDS, NUM_FOLDS_FOR_TUNING, ErrorRepairMethod, ErrorInjectionStrategy)
Expand Down Expand Up @@ -121,6 +122,17 @@ def _impute_nulls(self, X_train_with_nulls, X_test_with_nulls, null_imputer_name
train_numerical_null_columns=train_numerical_null_columns,
train_categorical_null_columns=train_categorical_null_columns))

elif null_imputer_name == ErrorRepairMethod.datawig.value:
X_train_imputed, X_test_imputed, null_imputer_params = (
datawig_imputer.complete(X_train_with_nulls=X_train_with_nulls,
X_test_with_nulls=X_test_with_nulls,
numeric_columns_with_nulls=train_numerical_null_columns,
categorical_columns_with_nulls=train_categorical_null_columns,
hpo=False,
output_path=pathlib.Path(__file__).parent.parent.parent.joinpath('results')))
numerical_null_imputer_params = null_imputer_params
categorical_null_imputer_params = null_imputer_params

else:
raise ValueError(f'{null_imputer_name} null imputer is not implemented')

Expand All @@ -147,11 +159,11 @@ def _evaluate_imputation(self, real, imputed, corrupted, numerical_columns, null
recall = None
f1 = None
if column_type == 'numerical':
null_imputer_params = numerical_null_imputer_params
null_imputer_params = numerical_null_imputer_params[column_name] if numerical_null_imputer_params is not None else None
rmse = mean_squared_error(true, pred, squared=False)
print('RMSE for {}: {:.2f}'.format(column_name, rmse))
else:
null_imputer_params = categorical_null_imputer_params
null_imputer_params = categorical_null_imputer_params[column_name] if categorical_null_imputer_params is not None else None
precision, recall, f1, _ = precision_recall_fscore_support(true, pred, average="micro")
print('Precision for {}: {:.2f}'.format(column_name, precision))
print('Recall for {}: {:.2f}'.format(column_name, recall))
Expand Down
56 changes: 30 additions & 26 deletions source/null_imputers/datawig_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
import pandas as pd


def complete(data_frame: pd.DataFrame,
def complete(X_train_with_nulls: pd.DataFrame,
X_test_with_nulls: pd.DataFrame,
numeric_columns_with_nulls: list,
categorical_columns_with_nulls : list,
precision_threshold: float = 0.0,
inplace: bool = False,
hpo: bool = False,
verbose: int = 0,
num_epochs: int = 100,
iterations: int = 1,
output_path: str = "."):
Expand All @@ -33,45 +34,48 @@ def complete(data_frame: pd.DataFrame,
:return: dataframe with imputations
"""
missing_mask = data_frame.copy().isnull()
# Import datawig lazily inside the function so that it only has to be installed when this imputer is actually used
import datawig

if inplace is False:
data_frame = data_frame.copy()
train_missing_mask = X_train_with_nulls.copy().isnull()
test_missing_mask = X_test_with_nulls.copy().isnull()
X_train_imputed = X_train_with_nulls.copy()
X_test_imputed = X_test_with_nulls.copy()

numeric_columns = [c for c in data_frame.columns if is_numeric_dtype(data_frame[c])]
string_columns = list(set(data_frame.columns) - set(numeric_columns))
logger.debug("Assuming numerical columns: {}".format(", ".join(numeric_columns)))

col_set = set(numeric_columns + string_columns)

categorical_columns = [col for col in string_columns if SimpleImputer._is_categorical(data_frame[col])]
logger.debug("Assuming categorical columns: {}".format(", ".join(categorical_columns)))
col_set = set(X_train_imputed.columns)
null_imputer_params = dict()
for _ in range(iterations):
for output_col in set(numeric_columns) | set(categorical_columns):
for output_col in set(numeric_columns_with_nulls) | set(categorical_columns_with_nulls):
# train on all input columns but the to-be-imputed one
input_cols = list(col_set - set([output_col]))

# train on all observed values
idx_missing = missing_mask[output_col]
train_idx_missing = train_missing_mask[output_col]

imputer = SimpleImputer(input_columns=input_cols,
output_column=output_col,
output_path=os.path.join(output_path, output_col))
imputer = datawig.SimpleImputer(input_columns=input_cols,
output_column=output_col,
output_path=os.path.join(output_path, output_col))
if hpo:
imputer.fit_hpo(data_frame.loc[~idx_missing, :],
patience=5 if output_col in categorical_columns else 20,
imputer.fit_hpo(X_train_imputed.loc[~train_idx_missing, :],
patience=5 if output_col in categorical_columns_with_nulls else 20,
num_epochs=100,
final_fc_hidden_units=[[0], [10], [50], [100]])
else:
imputer.fit(data_frame.loc[~idx_missing, :],
patience=5 if output_col in categorical_columns else 20,
imputer.fit(X_train_imputed.loc[~train_idx_missing, :],
patience=5 if output_col in categorical_columns_with_nulls else 20,
num_epochs=num_epochs,
calibrate=False)

tmp = imputer.predict(data_frame, precision_threshold=precision_threshold)
data_frame.loc[idx_missing, output_col] = tmp[output_col + "_imputed"]
tmp_train = imputer.predict(X_train_imputed, precision_threshold=precision_threshold)
X_train_imputed.loc[train_idx_missing, output_col] = tmp_train[output_col + "_imputed"]

test_idx_missing = test_missing_mask[output_col]
tmp_test = imputer.predict(X_test_imputed, precision_threshold=precision_threshold)
X_test_imputed.loc[test_idx_missing, output_col] = tmp_test[output_col + "_imputed"]

null_imputer_params[output_col] = {k: v for k, v in imputer.__dict__.items() if k not in ['imputer']}

# remove the directory with logfiles for this column
shutil.rmtree(os.path.join(output_path, output_col))

return data_frame
return X_train_imputed, X_test_imputed, null_imputer_params

0 comments on commit 8f78223

Please sign in to comment.