Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ outputs/*
local_tests/**
.ipynb_checkpoints/*
test/__pychache__/*
src/sygnet/__pychache__/*
src/sygnet/__pycache__/*
.DS_Store
16 changes: 12 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,25 @@ To install via pip, you can run the following command at the command line:

You can find a demonstration of **sygnet** under [examples/basic_example](examples/basic_example.ipynb).

### Current version: 0.0.8 (alpha release)
### Current version: 0.0.9 (alpha release)

**Alpha release**: You should expect both functionality and pipelines to change (rapidly). Comments and bug reports are very welcome!
**Alpha release**: You should expect both functionality and pipelines to change (rapidly and without warning). Comments and bug reports are very welcome!

Changes:

* Update `tune()` to provide no k-fold cross validation as default
* Update numpy dependency to fix pre-processing bug
* Rewrite of main interface and underlying functions
* Model building is now structured in terms of hidden "blocks"
* Added self-attention mechanism

### Previous releases

**0.0.8**

Changes:

* Update `tune()` to provide no k-fold cross validation as default
* Update numpy dependency to fix pre-processing bug

**0.0.7**
* Update internal `train_*` functions to return losses and improve logging
* Update `tune()` function
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ maintainers = [
{name = "Maksim Zubok"},
]

version = "0.0.8"
version = "0.0.9"
description = "Synthetic data using Generative Adversarial Networks"
keywords = [
"synthetic data",
Expand All @@ -34,7 +34,7 @@ classifiers = [
dependencies = [
"numpy>=1.21",
"torch>=1.10.0",
"scikit-learn>=1.0",
"scikit-learn>=1.2",
"pandas>=1.4",
"datetime",
"tqdm",
Expand Down
Binary file added src/.DS_Store
Binary file not shown.
Binary file modified src/sygnet/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/dataloaders.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/interface.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/loader.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/models.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/requirements.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/train.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/tune.cpython-39.pyc
Binary file not shown.
92 changes: 92 additions & 0 deletions src/sygnet/blocks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from .requirements import *

class gHead(nn.Module):
    '''
    Single self-attention head.

    Args:
        head_size (int): Width of the key/query/value projections.
        n_lin (int): Input feature dimension.
        d_p (float): Dropout probability applied to the attention weights.
    '''
    def __init__(self, head_size, n_lin, d_p):
        super().__init__()
        self.key = nn.Linear(n_lin, head_size, bias=False)
        self.query = nn.Linear(n_lin, head_size, bias=False)
        self.value = nn.Linear(n_lin, head_size, bias=False)
        self.dropout = nn.Dropout(d_p)

    def forward(self, x):
        T,C = x.shape
        k = self.key(x)    # (T, head_size)
        q = self.query(x)
        # Scaled dot-product attention: divide scores by sqrt(C).
        # BUG FIX: the exponent was -5 (a typo for -0.5), which shrank the
        # scores towards zero and flattened the softmax to near-uniform.
        # NOTE(review): head_size**-0.5 would be the textbook scale; C
        # (= n_lin) is kept as the base to stay closest to original intent.
        att_score = q @ k.T * C**-0.5 # (T,hs) @ (hs,T) -> (T,T)
        att_score = F.softmax(att_score, dim=-1)
        att_score = self.dropout(att_score)
        v = self.value(x)
        out = att_score @ v # (T,T) @ (T,hs) = (T,hs)
        return out

class gMultiHeadAttention(nn.Module):
    '''
    Runs several gHead attention heads in parallel, concatenates their
    outputs, and projects the result back to the input width.

    Args:
        n_heads (int): Number of parallel attention heads.
        head_size (int): Output width of each head.
        n_lin (int): Input (and output) feature dimension.
        d_p (float): Dropout probability applied after the projection.
    '''
    def __init__(self, n_heads, head_size, n_lin, d_p):
        super().__init__()
        self.heads = nn.ModuleList(
            gHead(head_size, n_lin, d_p) for _ in range(n_heads)
        )
        self.proj = nn.Linear(n_heads * head_size, n_lin)
        self.dropout = nn.Dropout(d_p)

    def forward(self, x):
        # Concatenate per-head outputs along the feature axis, then mix
        # them back down to n_lin features and regularise.
        per_head = [head(x) for head in self.heads]
        return self.dropout(self.proj(torch.cat(per_head, dim=-1)))

class gLN1(nn.Module):
    '''
    SyGNet hidden block without self-attention:
    Linear -> LayerNorm -> LeakyReLU -> Dropout.

    Args:
        n_lin (int): Input/output feature dimension.
        d_p (float): Dropout probability.
        r_a (float): Negative slope of the LeakyReLU.
    '''
    def __init__(self, n_lin, d_p, r_a):
        super().__init__()
        # Submodules keep their original names (and creation order) so
        # existing state_dict checkpoints still load.
        self.lin = nn.Linear(n_lin, n_lin)
        self.ln = nn.LayerNorm(n_lin)
        self.relu = nn.LeakyReLU(r_a)
        self.dp = nn.Dropout(d_p)

    def forward(self, x):
        # Apply the pipeline in one pass: affine map, normalise,
        # activate, then drop units.
        return self.dp(self.relu(self.ln(self.lin(x))))



class LgBlock(nn.Module):
    '''
    Generator hidden block: multi-head self-attention with a residual
    (skip) connection, followed by BatchNorm1d and LeakyReLU.

    NOTE(review): the original docstring described this as a "no residual
    connection" block, but the forward pass does add x back to the
    attention output.

    Args:
        n_heads (int): Number of attention heads; should divide n_lin.
        n_lin (int): Input/output feature dimension.
        d_p (float): Dropout probability passed to the attention block.
    '''
    def __init__(self, n_heads, n_lin, d_p):
        super().__init__()
        # Per-head width chosen so the concatenated heads are n_lin wide.
        head_size = n_lin // n_heads
        self.sa = gMultiHeadAttention(n_heads, head_size, n_lin, d_p)
        self.norm1 = nn.BatchNorm1d(n_lin)
        self.relu = nn.LeakyReLU()

    def forward(self, x):
        residual = x + self.sa(x)  # skip connection around attention
        return self.relu(self.norm1(residual))

class LcBlock(nn.Module):
    '''
    Critic hidden block with no residual connection:
    Linear -> LeakyReLU -> Dropout.

    Args:
        n_lin (int): Input/output feature dimension.
        d_p (float): Dropout probability.
        r_a (float): Negative slope of the LeakyReLU.
    '''
    def __init__(self, n_lin, d_p, r_a):
        super().__init__()
        # Creation order (lin, dp, relu) preserved so seeded parameter
        # initialisation matches the original.
        self.lin = nn.Linear(n_lin, n_lin)
        self.dp = nn.Dropout(d_p)
        self.relu = nn.LeakyReLU(r_a)

    def forward(self, x):
        # Activate the affine output, then drop units.
        return self.dp(self.relu(self.lin(x)))
78 changes: 33 additions & 45 deletions src/sygnet/dataloaders.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .requirements import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -40,16 +40,16 @@ def __init__(self, real_data, conditional = False, cond_cols= None):
data_in.drop(cond_cols, axis = 1, inplace = True)

# Process latent data
self.x, self.x_indxs, self.x_funcs, self.x_OHE, self.colnames = _preprocess_df(data_in)
self.x, self.x_indxs, self.x_funcs, self.x_transformers, self.colnames = _preprocess_df(data_in)
self.x = torch.from_numpy(self.x)

# Process conditional labels (no need to save funcs as won't be fed to activation)
self.labels,_,_,self.labels_OHE, label_names = _preprocess_df(cond_labels)
self.labels,_,_,self.labels_transformers, label_names = _preprocess_df(cond_labels)
self.labels = torch.from_numpy(self.labels)
self.colnames += label_names

else:
self.x, self.x_indxs, self.x_funcs, self.x_OHE, self.colnames = _preprocess_df(data_in)
self.x, self.x_indxs, self.x_funcs, self.x_transformers, self.colnames = _preprocess_df(data_in)
self.x = torch.from_numpy(self.x)
self.labels = torch.ones(self.n_samples, 1)

Expand All @@ -63,70 +63,57 @@ def __len__(self):
def _preprocess_df(df):
'''
Sort and arrange columns for managing mixed activation

Args:
df(pd.Dataframe): The input data

Returns:
df (np.array)
col_idx (list): Tuples with 'column name' and list of one-hot indices for that column plus all numeric columns)
col_fs (list): List of functions for each column in data
OHE (Encoder): sklearn OneHotEncoding object that can be used to inverse transform synthetic data
transformers (tuple): OHE object and min-max scaler for inverse data transformation
df_cols (list): List of column names after column sorting but before one-hot encoding
'''
# 1. get categorical colum names
num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] # numeric columns
str_type = ['O','category','string'] # select desired data type: 'O' - string
str_type = 'O' # select desired data type: 'O' - string
categorical_cols = []
binary_cols = []
positive_cols = []
numeric_cols = []
dtypes = df.dtypes.to_dict()

for colname, data_type in dtypes.items():
if data_type in str_type:
if data_type == str_type:
categorical_cols.append(colname)
elif data_type in num_type:
if set(df[colname].unique()) == {1,0}:
binary_cols.append(colname)
elif df[colname].min() >= 0:
positive_cols.append(colname)
else:
numeric_cols.append(colname)
numeric_cols.append(colname)
else:
pass

# 2. one-hot encoding, puts categorical columns at the end of the df
OHE = OneHotEncoder(sparse=False)
cat_df = OHE.fit_transform(df[categorical_cols])
df.drop(categorical_cols, axis=1, inplace=True)

df_cols = df.columns.tolist() + categorical_cols


OHE = OneHotEncoder(sparse_output=False)
scaler = MinMaxScaler()
# fill missing categorical columns as nan
df_cat = df[categorical_cols].fillna('nan')
# OHe transform
df_cat = OHE.fit_transform(df_cat)
df_num = df.drop(categorical_cols, axis=1)
# fill missing numeric values
df_num = df_num.fillna(df_num.median())

# get ordered list of column names
df_cols = df_num.columns.tolist() + categorical_cols
if df_num.shape[1] > 0:
df_num = scaler.fit_transform(df_num)
transformers = (OHE, scaler)

# 3. finding idx for each original categorical column
col_idx = []
col_fs = []
col_idx, col_fs = [], []

# Numeric cols indx
# Numeric cols idx
if len(numeric_cols) != 0:
col_idx_tensor = torch.Tensor([df.columns.get_loc(c) for c in numeric_cols])
col_idx_tensor = torch.Tensor([c for c in range(len(numeric_cols))])
col_idx.append(col_idx_tensor)
col_fs.append('identity')

# Positive cols
if len(positive_cols) != 0:
col_idx_tensor = torch.Tensor([df.columns.get_loc(c) for c in positive_cols])
col_idx.append(col_idx_tensor)
col_fs.append('relu')

# Binary cols
if len(binary_cols) != 0:
col_idx_tensor = torch.Tensor([df.columns.get_loc(c) for c in binary_cols])
col_idx.append(col_idx_tensor)
col_fs.append('sigmoid')

# Categorical cols
n_numeric = df.shape[1]
# Categorical cols idx
n_numeric = df_num.shape[1]
cat_current_count = 0
for var in OHE.categories_:
one_hot_cols = var.tolist()
Expand All @@ -136,8 +123,9 @@ def _preprocess_df(df):
col_idx.append(col_idx_tensor)
col_fs.append('softmax')

df = np.concatenate((df, cat_df), axis = 1, dtype=np.float32)
return df, col_idx, col_fs, OHE, df_cols
df = np.concatenate((df_num, df_cat), axis = 1, dtype=np.float32)

return df, col_idx, col_fs, transformers, df_cols

def _ohe_colnames(OHE):
cat_cols = []
Expand Down
Loading