Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ outputs/*
local_tests/**
.ipynb_checkpoints/*
test/__pychache__/*
src/sygnet/__pychache__/*
src/sygnet/__pycache__/*
.DS_Store
16 changes: 12 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,25 @@ To install via pip, you can run the following command at the command line:

You can find a demonstration of **sygnet** under [examples/basic_example](examples/basic_example.ipynb).

### Current version: 0.0.8 (alpha release)
### Current version: 0.0.9 (alpha release)

**Alpha release**: You should expect both functionality and pipelines to change (rapidly). Comments and bug reports are very welcome!
**Alpha release**: You should expect both functionality and pipelines to change (rapidly and without warning). Comments and bug reports are very welcome!

Changes:

* Update `tune()` to provide no k-fold cross validation as default
* Update numpy dependency to fix pre-processing bug
* Rewrite of main interface and underlying functions
* Model building is now structured in terms of hidden "blocks"
* Added self-attention mechanism

### Previous releases

**0.0.8**

Changes:

* Update `tune()` to provide no k-fold cross validation as default
* Update numpy dependency to fix pre-processing bug

**0.0.7**
* Update internal `train_*` functions to return losses and improve logging
* Update `tune()` function
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ maintainers = [
{name = "Maksim Zubok"},
]

version = "0.0.8"
version = "0.0.9"
description = "Synthetic data using Generative Adversarial Networks"
keywords = [
"synthetic data",
Expand All @@ -34,7 +34,7 @@ classifiers = [
dependencies = [
"numpy>=1.21",
"torch>=1.10.0",
"scikit-learn>=1.0",
"scikit-learn>=1.2",
"pandas>=1.4",
"datetime",
"tqdm",
Expand Down
Binary file added src/.DS_Store
Binary file not shown.
Binary file modified src/sygnet/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/dataloaders.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/interface.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/loader.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/models.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/requirements.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/train.cpython-39.pyc
Binary file not shown.
Binary file modified src/sygnet/__pycache__/tune.cpython-39.pyc
Binary file not shown.
92 changes: 92 additions & 0 deletions src/sygnet/blocks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from .requirements import *

class gHead(nn.Module):
    '''
    Single self-attention head.

    Args:
        head_size (int): Width of the key/query/value projections.
        n_lin (int): Input feature dimension.
        d_p (float): Dropout probability applied to the attention weights.
    '''
    def __init__(self, head_size, n_lin, d_p):
        super().__init__()
        self.key = nn.Linear(n_lin, head_size, bias=False)
        self.query = nn.Linear(n_lin, head_size, bias=False)
        self.value = nn.Linear(n_lin, head_size, bias=False)
        self.dropout = nn.Dropout(d_p)

    def forward(self, x):
        T,C = x.shape
        k = self.key(x)    # (T, head_size)
        q = self.query(x)
        # Scaled dot-product attention: divide scores by sqrt(C).
        # BUG FIX: the exponent was -5 (a typo for -0.5), which shrank the
        # scores towards zero and flattened the softmax to near-uniform.
        # NOTE(review): head_size**-0.5 would be the textbook scale; C
        # (= n_lin) is kept as the base to stay closest to original intent.
        att_score = q @ k.T * C**-0.5 # (T,hs) @ (hs,T) -> (T,T)
        att_score = F.softmax(att_score, dim=-1)
        att_score = self.dropout(att_score)
        v = self.value(x)
        out = att_score @ v # (T,T) @ (T,hs) = (T,hs)
        return out

class gMultiHeadAttention(nn.Module):
    '''
    Runs several gHead attention heads in parallel, concatenates their
    outputs, and projects the result back to the input width.

    Args:
        n_heads (int): Number of parallel attention heads.
        head_size (int): Output width of each head.
        n_lin (int): Input (and output) feature dimension.
        d_p (float): Dropout probability applied after the projection.
    '''
    def __init__(self, n_heads, head_size, n_lin, d_p):
        super().__init__()
        self.heads = nn.ModuleList(
            gHead(head_size, n_lin, d_p) for _ in range(n_heads)
        )
        self.proj = nn.Linear(n_heads * head_size, n_lin)
        self.dropout = nn.Dropout(d_p)

    def forward(self, x):
        # Concatenate per-head outputs along the feature axis, then mix
        # them back down to n_lin features and regularise.
        per_head = [head(x) for head in self.heads]
        return self.dropout(self.proj(torch.cat(per_head, dim=-1)))

class gLN1(nn.Module):
    '''
    SyGNet hidden block without self-attention:
    Linear -> LayerNorm -> LeakyReLU -> Dropout.

    Args:
        n_lin (int): Input/output feature dimension.
        d_p (float): Dropout probability.
        r_a (float): Negative slope of the LeakyReLU.
    '''
    def __init__(self, n_lin, d_p, r_a):
        super().__init__()
        # Submodules keep their original names (and creation order) so
        # existing state_dict checkpoints still load.
        self.lin = nn.Linear(n_lin, n_lin)
        self.ln = nn.LayerNorm(n_lin)
        self.relu = nn.LeakyReLU(r_a)
        self.dp = nn.Dropout(d_p)

    def forward(self, x):
        # Apply the pipeline in one pass: affine map, normalise,
        # activate, then drop units.
        return self.dp(self.relu(self.ln(self.lin(x))))



class LgBlock(nn.Module):
    '''
    Generator hidden block: multi-head self-attention with a residual
    (skip) connection, followed by BatchNorm1d and LeakyReLU.

    NOTE(review): the original docstring described this as a "no residual
    connection" block, but the forward pass does add x back to the
    attention output.

    Args:
        n_heads (int): Number of attention heads; should divide n_lin.
        n_lin (int): Input/output feature dimension.
        d_p (float): Dropout probability passed to the attention block.
    '''
    def __init__(self, n_heads, n_lin, d_p):
        super().__init__()
        # Per-head width chosen so the concatenated heads are n_lin wide.
        head_size = n_lin // n_heads
        self.sa = gMultiHeadAttention(n_heads, head_size, n_lin, d_p)
        self.norm1 = nn.BatchNorm1d(n_lin)
        self.relu = nn.LeakyReLU()

    def forward(self, x):
        residual = x + self.sa(x)  # skip connection around attention
        return self.relu(self.norm1(residual))

class LcBlock(nn.Module):
    '''
    Critic hidden block with no residual connection:
    Linear -> LeakyReLU -> Dropout.

    Args:
        n_lin (int): Input/output feature dimension.
        d_p (float): Dropout probability.
        r_a (float): Negative slope of the LeakyReLU.
    '''
    def __init__(self, n_lin, d_p, r_a):
        super().__init__()
        # Creation order (lin, dp, relu) preserved so seeded parameter
        # initialisation matches the original.
        self.lin = nn.Linear(n_lin, n_lin)
        self.dp = nn.Dropout(d_p)
        self.relu = nn.LeakyReLU(r_a)

    def forward(self, x):
        # Activate the affine output, then drop units.
        return self.dp(self.relu(self.lin(x)))
78 changes: 33 additions & 45 deletions src/sygnet/dataloaders.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .requirements import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -40,16 +40,16 @@ def __init__(self, real_data, conditional = False, cond_cols= None):
data_in.drop(cond_cols, axis = 1, inplace = True)

# Process latent data
self.x, self.x_indxs, self.x_funcs, self.x_OHE, self.colnames = _preprocess_df(data_in)
self.x, self.x_indxs, self.x_funcs, self.x_transformers, self.colnames = _preprocess_df(data_in)
self.x = torch.from_numpy(self.x)

# Process conditional labels (no need to save funcs as won't be fed to activation)
self.labels,_,_,self.labels_OHE, label_names = _preprocess_df(cond_labels)
self.labels,_,_,self.labels_transformers, label_names = _preprocess_df(cond_labels)
self.labels = torch.from_numpy(self.labels)
self.colnames += label_names

else:
self.x, self.x_indxs, self.x_funcs, self.x_OHE, self.colnames = _preprocess_df(data_in)
self.x, self.x_indxs, self.x_funcs, self.x_transformers, self.colnames = _preprocess_df(data_in)
self.x = torch.from_numpy(self.x)
self.labels = torch.ones(self.n_samples, 1)

Expand All @@ -63,70 +63,57 @@ def __len__(self):
def _preprocess_df(df):
'''
Sort and arrange columns for managing mixed activation

Args:
df(pd.Dataframe): The input data

Returns:
df (np.array)
col_idx (list): Tuples with 'column name' and list of one-hot indices for that column plus all numeric columns)
col_fs (list): List of functions for each column in data
OHE (Encoder): sklearn OneHotEncoding object that can be used to inverse transform synthetic data
transformers (tuple): OHE object and min-max scaler for inverse data transformation
df_cols (list): List of column names after column sorting but before one-hot encoding
'''
# 1. get categorical colum names
num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] # numeric columns
str_type = ['O','category','string'] # select desired data type: 'O' - string
str_type = 'O' # select desired data type: 'O' - string
categorical_cols = []
binary_cols = []
positive_cols = []
numeric_cols = []
dtypes = df.dtypes.to_dict()

for colname, data_type in dtypes.items():
if data_type in str_type:
if data_type == str_type:
categorical_cols.append(colname)
elif data_type in num_type:
if set(df[colname].unique()) == {1,0}:
binary_cols.append(colname)
elif df[colname].min() >= 0:
positive_cols.append(colname)
else:
numeric_cols.append(colname)
numeric_cols.append(colname)
else:
pass

# 2. one-hot encoding, puts categorical columns at the end of the df
OHE = OneHotEncoder(sparse=False)
cat_df = OHE.fit_transform(df[categorical_cols])
df.drop(categorical_cols, axis=1, inplace=True)

df_cols = df.columns.tolist() + categorical_cols


OHE = OneHotEncoder(sparse_output=False)
scaler = MinMaxScaler()
# fill missing categorical columns as nan
df_cat = df[categorical_cols].fillna('nan')
# OHe transform
df_cat = OHE.fit_transform(df_cat)
df_num = df.drop(categorical_cols, axis=1)
# fill missing numeric values
df_num = df_num.fillna(df_num.median())

# get ordered list of column names
df_cols = df_num.columns.tolist() + categorical_cols
if df_num.shape[1] > 0:
df_num = scaler.fit_transform(df_num)
transformers = (OHE, scaler)

# 3. finding idx for each original categorical column
col_idx = []
col_fs = []
col_idx, col_fs = [], []

# Numeric cols indx
# Numeric cols idx
if len(numeric_cols) != 0:
col_idx_tensor = torch.Tensor([df.columns.get_loc(c) for c in numeric_cols])
col_idx_tensor = torch.Tensor([c for c in range(len(numeric_cols))])
col_idx.append(col_idx_tensor)
col_fs.append('identity')

# Positive cols
if len(positive_cols) != 0:
col_idx_tensor = torch.Tensor([df.columns.get_loc(c) for c in positive_cols])
col_idx.append(col_idx_tensor)
col_fs.append('relu')

# Binary cols
if len(binary_cols) != 0:
col_idx_tensor = torch.Tensor([df.columns.get_loc(c) for c in binary_cols])
col_idx.append(col_idx_tensor)
col_fs.append('sigmoid')

# Categorical cols
n_numeric = df.shape[1]
# Categorical cols idx
n_numeric = df_num.shape[1]
cat_current_count = 0
for var in OHE.categories_:
one_hot_cols = var.tolist()
Expand All @@ -136,8 +123,9 @@ def _preprocess_df(df):
col_idx.append(col_idx_tensor)
col_fs.append('softmax')

df = np.concatenate((df, cat_df), axis = 1, dtype=np.float32)
return df, col_idx, col_fs, OHE, df_cols
df = np.concatenate((df_num, df_cat), axis = 1, dtype=np.float32)

return df, col_idx, col_fs, transformers, df_cols

def _ohe_colnames(OHE):
cat_cols = []
Expand Down
Loading