Added an init for NOMIImputer
denysgerasymuk799 committed Dec 24, 2024
1 parent 0bbf9da commit 30363cd
Showing 3 changed files with 628 additions and 0 deletions.
2 changes: 2 additions & 0 deletions requirements.txt
@@ -7,6 +7,8 @@ numpy==1.23.5
 lightgbm==4.3.0
 POT~=0.9.5
 FrEIA~=0.2
+hnswlib~=0.8.0
+neural-tangents~=0.6.5
 matplotlib~=3.6.2
 pandas~=1.5.2
 altair~=4.2.0
349 changes: 349 additions & 0 deletions source/null_imputers/nomi_imputer.py
@@ -0,0 +1,349 @@
"""
The code below is adapted from the original NOMI paper:
- GitHub: https://github.com/guaiyoui/NOMI
- Citation:
@article{wang2024uncertainty,
title={Missing Data Imputation with Uncertainty-Driven Network},
author={Wang, Jianwei and Zhang, Ying and Wang, Kai and Lin, Xuemin and Zhang, Wenjie},
journal={Proceedings of the ACM on Management of Data},
volume={2},
number={3},
pages={1--25},
year={2024},
publisher={ACM New York, NY, USA}
}
"""
import torch
import numpy as np
import hnswlib
import neural_tangents as nt
from tqdm import tqdm
from neural_tangents import stax


def sample_batch_index(total, batch_size):
"""
Sample index of the mini-batch.
Args:
- total: total number of samples
- batch_size: batch size
Returns:
- batch_idx: batch index
"""
total_idx = np.random.permutation(total)
batch_idx = total_idx[:batch_size]
return batch_idx


def dist2sim(neigh_dist):
    """Convert neighbor distances into normalized inverse-distance weights."""
    if torch.is_tensor(neigh_dist):
        neigh_dist = neigh_dist.cpu().detach().numpy()
    with np.errstate(divide="ignore"):
        dist = 1.0 / neigh_dist

    # Where a distance is exactly zero, its inverse is inf: give those
    # neighbors all of the weight and the remaining neighbors none.
    inf_mask = np.isinf(dist)
    inf_row = np.any(inf_mask, axis=1)
    dist[inf_row] = inf_mask[inf_row]

    # Normalize each row so the weights sum to one.
    denom = np.sum(dist, axis=1).reshape((-1, 1))
    return dist / denom
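
# Illustrative example (not part of the original module): for distances
# [[1.0, 3.0]] the inverse distances are [1.0, 0.333...], which normalize
# to weights [[0.75, 0.25]]; a zero distance would receive all the weight.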


def prediction(pred_fn, X_test, kernel_type="nngp", compute_cov=True):
    pred_mean, pred_cov = pred_fn(x_test=X_test, get=kernel_type,
                                  compute_cov=compute_cov)
    return pred_mean, pred_cov


def normalization_std(data):
    """Min-max normalize, then shift to [1, 2] so the result is safe to use
    as a denominator when weighting predictions by uncertainty."""
    min_vals = np.min(data, axis=0)
    max_vals = np.max(data, axis=0)

    normalized_data = (data - min_vals) / (max_vals - min_vals)
    return normalized_data + 1


def normalization(data, parameters=None):
"""
Normalize data in [0, 1] range.
Args:
- data: original data
Returns:
- norm_data: normalized data
- norm_parameters: min_val, max_val for each feature for renormalization
"""

# Parameters
_, dim = data.shape
norm_data = data.copy()

if parameters is None:

        # Min-max normalization
min_val = np.zeros(dim)
max_val = np.zeros(dim)

# For each dimension
for i in range(dim):
min_val[i] = np.nanmin(norm_data[:,i])
norm_data[:,i] = norm_data[:,i] - np.nanmin(norm_data[:,i])
max_val[i] = np.nanmax(norm_data[:,i])
norm_data[:,i] = norm_data[:,i] / (np.nanmax(norm_data[:,i]) + 1e-6)

# Return norm_parameters for renormalization
norm_parameters = {'min_val': min_val,
'max_val': max_val}

else:
min_val = parameters['min_val']
max_val = parameters['max_val']

# For each dimension
for i in range(dim):
norm_data[:,i] = norm_data[:,i] - min_val[i]
norm_data[:,i] = norm_data[:,i] / (max_val[i] + 1e-6)

norm_parameters = parameters

return norm_data, norm_parameters


def renormalization(norm_data, norm_parameters):
"""
Renormalize data from [0, 1] range to the original range.
Args:
- norm_data: normalized data
- norm_parameters: min_val, max_val for each feature for renormalization
Returns:
- renorm_data: renormalized original data
"""

min_val = norm_parameters['min_val']
max_val = norm_parameters['max_val']

_, dim = norm_data.shape
renorm_data = norm_data.copy()

for i in range(dim):
renorm_data[:,i] = renorm_data[:,i] * (max_val[i] + 1e-6)
renorm_data[:,i] = renorm_data[:,i] + min_val[i]

return renorm_data
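
# Illustrative round trip (not part of the original module): for a column with
# values [0, 5, 10], normalization maps it to roughly [0, 0.5, 1] and stores
# min_val=0, max_val=10; renormalization with those parameters recovers the
# original values up to the 1e-6 stabilizer in the denominator.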


def rounding(imputed_data, data_x):
"""
Round imputed data for categorical variables.
Args:
- imputed_data: imputed data
- data_x: original data with missing values
Returns:
- rounded_data: rounded imputed data
"""

_, dim = data_x.shape
rounded_data = imputed_data.copy()

for i in range(dim):
temp = data_x[~np.isnan(data_x[:, i]), i]
        # Heuristic: treat columns with fewer than 20 unique observed values as categorical
if len(np.unique(temp)) < 20:
rounded_data[:, i] = np.round(rounded_data[:, i])

return rounded_data
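
# Illustrative example (assumption, not in the original module): if a column's
# observed values are only {0, 1}, an imputed 0.73 is rounded to 1.0, while a
# continuous column with 20 or more unique values is left untouched.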


class NOMIImputer:
def __init__(self, k_neighbors=10, similarity_metric="l2", max_iterations=3, tau=1.0, beta=1.0):
self.k_neighbors = k_neighbors
self.similarity_metric = similarity_metric
self.max_iterations = max_iterations
self.tau = tau
self.beta = beta

self.index_dct = dict()
self.predict_fn_dct = dict()
self.Y_train_dct = dict()
self.is_fitted = False

    def fit_transform(self, X):
        """
        Fit the NOMI imputer on the provided data and impute its missing values.
        Parameters:
        - X: numpy array of shape (n_samples, n_features)
            The dataset containing missing values.
        Returns:
        - numpy array of shape (n_samples, n_features)
            Dataset with imputed values.
        """
        data_x = X
        # Observed-value mask: True where a value is present, False where missing
        data_m = ~np.isnan(data_x)
        norm_data, norm_parameters = normalization(data_x)
        norm_data_x = np.nan_to_num(norm_data, nan=0.0)

        num, dims = norm_data_x.shape
        imputed_X = norm_data_x.copy()
        data_m_imputed = data_m.copy()

# Step 1: Model Initialization
_, _, kernel_fn = stax.serial(
stax.Dense(2 * dims), stax.Relu(),
stax.Dense(dims), stax.Relu(),
stax.Dense(1), stax.Sigmoid_like()
)
# Step 2: Iterative Imputation Process
for iteration in range(self.max_iterations):
# Iterates over each dimension of the dataset
for dim in tqdm(range(dims)):
                # Drop the current column; the remaining columns are the regression inputs
                X_wo_dim = np.delete(imputed_X, dim, 1)
                # True where the current column is observed (or confidently imputed earlier)
                i_not_nan_index = data_m_imputed[:, dim].astype(bool)

# Create training and test sets (X_train, Y_train for observed, X_test for missing)
X_train = X_wo_dim[i_not_nan_index]
Y_train = imputed_X[i_not_nan_index, dim]

X_test = X_wo_dim[~i_not_nan_index]
true_indices = np.where(~i_not_nan_index)[0]

if X_test.shape[0] == 0:
continue

no, d = X_train.shape

                # Build an hnswlib index over X_train for approximate nearest-neighbor
                # search; the self.k_neighbors nearest neighbors supply distances and weights.
index = hnswlib.Index(space=self.similarity_metric, dim=d)
index.init_index(max_elements=no, ef_construction=200, M=16)
index.add_items(X_train)
index.set_ef(int(self.k_neighbors * 1.2))

                batch_size = min(X_train.shape[0], 300)
                batch_idx = sample_batch_index(X_train.shape[0], batch_size)

X_batch = X_train[batch_idx,:]
Y_batch = Y_train[batch_idx]

                neigh_ind, neigh_dist = index.knn_query(X_batch, k=self.k_neighbors, filter=None)
                # hnswlib's "l2" space returns squared distances, so take the square root
                neigh_dist = np.sqrt(neigh_dist)

                # The nearest neighbor of each batch point is the point itself
                # (distance zero), so drop column 0 before computing weights.
                weights = dist2sim(neigh_dist[:, 1:])

                # Train inputs: each row holds the neighbors' target values scaled by weight
                y_neighbors = Y_train[neigh_ind[:, 1:]]
                train_input = weights * y_neighbors

neigh_ind_test, neigh_dist_test = index.knn_query(X_test, k=self.k_neighbors, filter=None)
neigh_dist_test = np.sqrt(neigh_dist_test)

                # Test points are not in the index, so keep the k-1 nearest
                # neighbors by dropping the last column instead of the first
                weights_test = dist2sim(neigh_dist_test[:, :-1])
                y_neighbors_test = Y_train[neigh_ind_test[:, :-1]]
                test_input = weights_test * y_neighbors_test

                # Fit an infinite-width NNGP regression (nt.predict.gradient_descent_mse_ensemble)
                # on the weighted neighbor values and use it to predict the missing entries.
                predict_fn = nt.predict.gradient_descent_mse_ensemble(
                    kernel_fn, train_input, Y_batch.reshape(-1, 1), diag_reg=1e-4)

                y_pred, pred_cov = prediction(predict_fn, test_input, kernel_type="nngp")

if iteration == 0:
# Replace missing values directly with predictions
imputed_X[~i_not_nan_index, dim] = y_pred.reshape(-1)
                elif iteration <= 3:
                    # Normalize the predictive uncertainty into [1, 2]
                    pred_std = np.sqrt(np.diag(pred_cov))
                    pred_std = np.ravel(np.array(pred_std))
                    pred_std = normalization_std(pred_std)
                    pred_std = np.nan_to_num(pred_std, nan=1.0)

                    # Mark low-uncertainty imputations (pred_std <= tau) as observed
                    # so later iterations can treat them as training data
                    confident_indices = np.where(pred_std <= self.tau)[0]
                    data_m_imputed[true_indices[confident_indices], dim] = 1

                    # Blend the prior imputation with the new prediction, trusting
                    # the prediction more where its uncertainty is low
                    blend = self.beta / pred_std
                    imputed_X[~i_not_nan_index, dim] = (
                        (1 - blend) * imputed_X[~i_not_nan_index, dim]
                        + blend * y_pred.reshape(-1)
                    )
                else:
                    imputed_X[~i_not_nan_index, dim] = y_pred.reshape(-1)

# Save fitted variables for the transform method
if iteration + 1 == self.max_iterations:
self.index_dct[dim] = index
self.predict_fn_dct[dim] = predict_fn
self.Y_train_dct[dim] = Y_train

# Step 3: Post-Processing
imputed_data = renormalization(imputed_X, norm_parameters) # Renormalize the imputed data
imputed_data = rounding(imputed_data, data_x) # Round values to match the original format
self.is_fitted = True

return imputed_data

def transform(self, X):
"""
Impute missing values in the provided dataset using the trained model.
Parameters:
- X: numpy array of shape (n_samples, n_features)
Dataset with missing values to impute.
Returns:
- numpy array of shape (n_samples, n_features)
Dataset with imputed values.
"""
if not self.is_fitted:
raise RuntimeError("The NOMIImputer must be fitted before calling transform.")

        data_x = X
        # Observed-value mask: True where a value is present, False where missing
        data_m = ~np.isnan(data_x)
        norm_data, norm_parameters = normalization(data_x)
        norm_data_x = np.nan_to_num(norm_data, nan=0.0)

num, dims = norm_data_x.shape
imputed_X = norm_data_x.copy()
data_m_imputed = data_m.copy()

for dim in tqdm(range(dims)):
index = self.index_dct[dim]
predict_fn = self.predict_fn_dct[dim]
Y_train = self.Y_train_dct[dim]

X_wo_dim = np.delete(imputed_X, dim, 1)
i_not_nan_index = data_m_imputed[:, dim].astype(bool)

X_test = X_wo_dim[~i_not_nan_index]
if X_test.shape[0] == 0:
continue

neigh_ind_test, neigh_dist_test = index.knn_query(X_test, k=self.k_neighbors, filter=None)
neigh_dist_test = np.sqrt(neigh_dist_test)

weights_test = dist2sim(neigh_dist_test[:, :-1])
y_neighbors_test = Y_train[neigh_ind_test[:, :-1]]
test_input = weights_test * y_neighbors_test

y_pred, pred_cov = prediction(predict_fn, test_input, kernel_type="nngp")
imputed_X[~i_not_nan_index, dim] = y_pred.reshape(-1)

imputed_data = renormalization(imputed_X, norm_parameters)
imputed_data = rounding(imputed_data, data_x)

return imputed_data
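
# Minimal usage sketch (illustrative; the synthetic data, missingness rate, and
# hyperparameters below are assumptions, not part of the upstream NOMI code):
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    X_full = rng.normal(size=(200, 4))

    # Hide ~20% of the entries completely at random
    X_missing = X_full.copy()
    mask = rng.random(X_missing.shape) < 0.2
    X_missing[mask] = np.nan

    imputer = NOMIImputer(k_neighbors=10, max_iterations=3)
    X_imputed = imputer.fit_transform(X_missing)

    rmse = np.sqrt(np.mean((X_imputed[mask] - X_full[mask]) ** 2))
    print(f"RMSE on the masked entries: {rmse:.4f}")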