Showing 14 changed files with 27,695 additions and 1 deletion.
README.md
@@ -1,2 +1,26 @@
# timegan-pytorch
This repository holds the code for the reimplementation of TimeGAN (Yoon et al., NIPS2019) using PyTorch
This repository holds the code for the reimplementation of TimeGAN ([Yoon et al., NIPS2019](https://papers.nips.cc/paper/8789-time-series-generative-adversarial-networks)) using PyTorch. Some of the code was derived from the original implementation [here](https://github.com/jsyoon0823/TimeGAN).

## Getting Started

### Installing Requirements
This implementation assumes Python 3.8 and a Linux environment with a GPU.
```bash
cat requirements.txt | xargs -n 1 pip install --upgrade
```

### Directory Hierarchy
```bash
data/                     # the folder holding the datasets and preprocessing files
├ data_preprocessing.py   # the data preprocessing functions
└ stock.csv               # the example stock data derived from the original repo
metrics/                  # the folder holding the metric functions for evaluating the model
├ dataset.py              # the dataset class for feature prediction and one-step-ahead prediction
├ general_rnn.py          # the model for fitting the dataset during TSTR evaluation
├ metric_utils.py         # the main function for evaluating TSTR
└ visualization.py        # PCA and t-SNE implementation for time series taken from the original repo
models/                   # the folder holding the model implementations
main.py                   # the main code for training the model and evaluating TSTR
requirements.txt          # requirements for running the code
run.sh                    # the bash script for running the model
visualization.ipynb       # Jupyter notebook for visualizing the original and synthetic data
```
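
For orientation, below is a minimal sketch of how the preprocessing entry point added in `data/data_preprocessing.py` (shown in the next diff) might be called on the bundled stock data. The import path, the `max_seq_len` value, and the working-directory assumption are illustrative, not something this commit prescribes:

```python
# Hypothetical usage sketch -- assumes data/ is importable as a package and that
# the script is run from the repository root.
from data.data_preprocessing import data_preprocess

processed_data, time, params, max_seq_len, padding_value = data_preprocess(
    file_name="data/stock.csv",   # example stock data shipped in data/
    max_seq_len=100,              # illustrative cap on sequence length
    padding_value=-1.0,           # value used to pad shorter sequences
    impute_method="mode",         # "median" or "mode"
    scaling_method="minmax",      # "standard" or "minmax"
)
print(processed_data.shape)       # (num_ids, max_seq_len, num_features)
```

The returned `params` hold the scaler statistics needed to rescale generated sequences back to the original value range.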
data/data_preprocessing.py
@@ -0,0 +1,180 @@
"""Hide-and-Seek Privacy Challenge Codebase. | ||
Reference: James Jordon, Daniel Jarrett, Jinsung Yoon, Ari Ercole, Cheng Zhang, Danielle Belgrave, Mihaela van der Schaar, | ||
"Hide-and-Seek Privacy Challenge: Synthetic Data Generation vs. Patient Re-identification with Clinical Time-series Data," | ||
Neural Information Processing Systems (NeurIPS) Competition, 2020. | ||
Link: https://www.vanderschaar-lab.com/announcing-the-neurips-2020-hide-and-seek-privacy-challenge/ | ||
Last updated Date: Oct 17th 2020 | ||
Code author: Jinsung Yoon, Evgeny Saveliev | ||
Contact: jsyoon0823@gmail.com, e.s.saveliev@gmail.com | ||
----------------------------- | ||
(1) data_preprocess: Load the data and preprocess into a 3d numpy array | ||
(2) imputater: Impute missing data | ||
""" | ||
# Local packages
import os
from typing import Union, Tuple, List
import warnings
warnings.filterwarnings("ignore")

# 3rd party modules
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def data_preprocess(
    file_name: str,
    max_seq_len: int,
    padding_value: float = -1.0,
    impute_method: str = "mode",
    scaling_method: str = "minmax",
) -> Tuple[np.ndarray, List[int], List, int, float]:
    """Load the data and preprocess it into a 3d numpy array.

    Preprocessing includes:
    1. Remove outliers
    2. Extract sequence length for each patient id
    3. Impute missing data
    4. Normalize data
    5. Pad sequences to `max_seq_len`

    Args:
        - file_name (str): CSV file name
        - max_seq_len (int): maximum sequence length
        - padding_value (float): value used to pad sequences shorter than `max_seq_len`
        - impute_method (str): the imputation method ("median" or "mode")
        - scaling_method (str): the scaler method ("standard" or "minmax")

    Returns:
        - loaded_data: preprocessed data of shape [no, max_seq_len, dim]
        - time: list of sequence lengths, one per id
        - params: the parameters to rescale the data
        - max_seq_len: the maximum sequence length
        - padding_value: the value used for padding
    """

    #########################
    # Load data
    #########################

    if file_name.endswith("stock.csv"):
        index = 'Idx'
    else:
        index = 'admissionid'

    # Load csv
    print("Loading data...\n")
    ori_data = pd.read_csv(file_name)

    # Remove spurious column, so that column 0 is now 'admissionid'.
    if ori_data.columns[0] == "Unnamed: 0":
        ori_data = ori_data.drop(["Unnamed: 0"], axis=1)

    #########################
    # Remove outliers from dataset
    #########################

    no = ori_data.shape[0]
    z_scores = stats.zscore(ori_data, axis=0, nan_policy='omit')
    z_filter = np.nanmax(np.abs(z_scores), axis=1) < 3
    ori_data = ori_data[z_filter]
    print(f"Dropped {no - ori_data.shape[0]} rows (outliers)\n")

    # Parameters
    uniq_id = np.unique(ori_data[index])
    no = len(uniq_id)
    dim = len(ori_data.columns) - 1

    #########################
    # Impute, scale and pad data
    #########################

    # Initialize scaler
    if scaling_method == "minmax":
        scaler = MinMaxScaler()
        scaler.fit(ori_data)
        params = [scaler.data_min_, scaler.data_max_]

    elif scaling_method == "standard":
        scaler = StandardScaler()
        scaler.fit(ori_data)
        params = [scaler.mean_, scaler.var_]

    # Imputation values
    if impute_method == "median":
        impute_vals = ori_data.median()
    elif impute_method == "mode":
        impute_vals = stats.mode(ori_data).mode[0]
    else:
        raise ValueError("Imputation method should be `median` or `mode`")

    # TODO: Sanity check for padding value
    # if np.any(ori_data == padding_value):
    #     print(f"Padding value `{padding_value}` found in data")
    #     padding_value = np.nanmin(ori_data.to_numpy()) - 1
    #     print(f"Changed padding value to: {padding_value}\n")

    # Output initialization
    loaded_data = np.empty([no, max_seq_len, dim])  # Shape: [no, max_seq_len, dim]
    loaded_data.fill(padding_value)
    time = []

    # For each uniq id
    for i in tqdm(range(no)):
        # Extract the time-series data with a certain admissionid
        curr_data = ori_data[ori_data[index] == uniq_id[i]].to_numpy()

        # Impute missing data
        curr_data = imputer(curr_data, impute_vals)

        # Normalize data
        curr_data = scaler.transform(curr_data)

        # Extract time and assign to the preprocessed data (excluding the id column)
        curr_no = len(curr_data)

        # Pad data to `max_seq_len`
        if curr_no >= max_seq_len:
            loaded_data[i, :, :] = curr_data[:max_seq_len, 1:]  # Shape: [max_seq_len, dim]
            time.append(max_seq_len)
        else:
            loaded_data[i, :curr_no, :] = curr_data[:, 1:]  # Shape: [curr_no, dim]
            time.append(curr_no)

    return loaded_data, time, params, max_seq_len, padding_value

def imputer(
    curr_data: np.ndarray,
    impute_vals: List,
    zero_fill: bool = True
) -> np.ndarray:
    """Impute missing data given values for each column.

    Args:
        curr_data (np.ndarray): Data before imputation.
        impute_vals (list): Values to be filled in for each column.
        zero_fill (bool, optional): Whether to fill with zeros the cases where
            `impute_vals` is NaN. Defaults to True.

    Returns:
        np.ndarray: Imputed data.
    """
    curr_data = pd.DataFrame(data=curr_data)
    impute_vals = pd.Series(impute_vals)

    # Impute data column-wise with the provided values
    imputed_data = curr_data.fillna(impute_vals)

    # Zero-fill, in case the `impute_vals` for a particular feature is `nan`
    if zero_fill:
        imputed_data = imputed_data.fillna(0.0)

    # Check for any remaining N/A values
    if imputed_data.isnull().any().any():
        raise ValueError("NaN values remain after imputation")

    return imputed_data.to_numpy()
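
As a quick illustration of the column-wise fill behaviour of `imputer` above, here is a small sketch; the values are made up for the example and the import path is an assumption:

```python
import numpy as np

from data.data_preprocessing import imputer  # assumed import path

# Two rows, three features; column 2 deliberately has no usable impute value.
curr_data = np.array([
    [1.0, np.nan, 3.0],
    [np.nan, 2.0, np.nan],
])
impute_vals = [10.0, 20.0, np.nan]  # per-column fill values

filled = imputer(curr_data, impute_vals)
# Columns 0 and 1 take their impute values; column 2 falls back to the zero fill.
# [[ 1. 20.  3.]
#  [10.  2.  0.]]
print(filled)
```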