Skip to content

Commit

Permalink
Successfully tested NOMI, TDM, and GAIN
Browse files Browse the repository at this point in the history
  • Loading branch information
denysgerasymuk799 committed Dec 25, 2024
1 parent 89bdf58 commit 172db32
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 15 deletions.
3 changes: 2 additions & 1 deletion source/null_imputers/gain_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,8 @@ def _set_hyperparameters_for_optimization(self, trial: optuna.trial.Trial) -> No
"noise": trial.suggest_discrete_uniform("noise", 0, 1, 1),

# training
"batch_size": trial.suggest_discrete_uniform("batch_size", 0, 1024, 1),
# "batch_size": trial.suggest_discrete_uniform("batch_size", 0, 1024, 1),
"batch_size": trial.suggest_discrete_uniform("batch_size", 0, 512, 1),
"max_epochs": trial.suggest_discrete_uniform("max_epochs", 0, 10000, 1),
"early_stop": trial.suggest_discrete_uniform("early_stop", 0, 1000, 1),

Expand Down
15 changes: 3 additions & 12 deletions source/null_imputers/imputation_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,9 @@ def impute_with_tdm(X_train_with_nulls: pd.DataFrame, X_tests_with_nulls_lst: li
hyperparams: dict, **kwargs):
dataset_name = kwargs['dataset_name']
seed = kwargs['experiment_seed']
torch.manual_seed(seed) # Set the random seed for reproducibility
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

X_train_encoded, cat_encoders, _ = encode_dataset_for_missforest(df=X_train_with_nulls,
dataset_name=dataset_name,
Expand Down Expand Up @@ -286,10 +288,6 @@ def impute_with_nomi(X_train_with_nulls: pd.DataFrame, X_tests_with_nulls_lst: l
np.random.seed(seed)
torch.manual_seed(seed)

print("X_train_with_nulls.shape:", X_train_with_nulls.shape)
print("X_train_with_nulls.dtypes:", X_train_with_nulls.dtypes)
print("X_train_with_nulls.head():", X_train_with_nulls.head())

num_indices_with_nulls = [X_train_with_nulls.columns.get_loc(col) for col in numeric_columns_with_nulls]
cat_indices_with_nulls = [X_train_with_nulls.columns.get_loc(col) for col in categorical_columns_with_nulls]

Expand All @@ -304,39 +302,32 @@ def impute_with_nomi(X_train_with_nulls: pd.DataFrame, X_tests_with_nulls_lst: l
categorical_columns_with_nulls=categorical_columns_with_nulls)[0]
for X_test_with_nulls in X_tests_with_nulls_lst
]
print("X_train_encoded.head():", X_train_encoded.head())

# Apply an imputer
imputer = NOMIImputer(k_neighbors=kwargs['k_neighbors'],
similarity_metric=kwargs['similarity_metric'],
max_iterations=kwargs['max_iterations'],
tau=kwargs['tau'],
beta=kwargs['beta'])
print("np.sum(X_train_encoded mask):", np.sum(np.isnan(X_train_encoded.to_numpy())))
X_train_imputed_np = imputer.fit_transform(X_train_encoded.to_numpy(), num_indices_with_nulls, cat_indices_with_nulls)
X_tests_imputed_np_lst = list(map(lambda X_test_encoded:
imputer.transform(X_test_encoded.to_numpy(), num_indices_with_nulls, cat_indices_with_nulls),
X_tests_encoded_lst)
)
print("X_train_imputed_np[:5]:", X_train_imputed_np[:5])

# Convert numpy arrays back to DataFrames
X_train_imputed = pd.DataFrame(X_train_imputed_np, columns=X_train_with_nulls.columns, index=X_train_with_nulls.index)
X_tests_imputed_lst = [
pd.DataFrame(X_test, columns=X_test_with_nulls.columns, index=X_test_with_nulls.index)
for X_test, X_test_with_nulls in zip(X_tests_imputed_np_lst, X_tests_with_nulls_lst)
]
print("X_train_imputed.head():", X_train_imputed.head())

# Decode categories back
X_train_imputed = decode_dataset_for_missforest(X_train_imputed, cat_encoders, dataset_name=dataset_name)
X_tests_imputed_lst = [
decode_dataset_for_missforest(X_test_imputed, cat_encoders, dataset_name=dataset_name)
for X_test_imputed in X_tests_imputed_lst
]
print("X_train_imputed.shape:", X_train_imputed.shape)
print("X_train_imputed.dtypes:", X_train_imputed.dtypes)
print("X_train_imputed.head():", X_train_imputed.head())

hyperparams = {
"k_neighbors": imputer.k_neighbors,
Expand Down
9 changes: 7 additions & 2 deletions source/null_imputers/nomi_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@
import torch
import numpy as np
import hnswlib
import neural_tangents as nt
import tensorflow as tf
from distutils.version import LooseVersion
from tqdm import tqdm
from neural_tangents import stax

# Import neural_tangents only when the installed TensorFlow version is at least 2.16
if LooseVersion(tf.__version__) >= LooseVersion("2.16"):
import neural_tangents as nt
from neural_tangents import stax


def sample_batch_index(total, batch_size):
Expand Down

0 comments on commit 172db32

Please sign in to comment.