Commit a2858bd

Added faster/simpler data pipeline

isamu-isozaki committed Apr 17, 2021
1 parent 2d16e60 commit a2858bd

Showing 5 changed files with 80 additions and 101 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,4 +1,7 @@
# Byte-compiled / optimized / DLL files
ptn11H_10/
ptn11H_1000/
ptn11H_1000.zip
__pycache__/
*.py[cod]
*$py.class
17 changes: 17 additions & 0 deletions data.py
@@ -0,0 +1,17 @@
from loadptn import train_data_loader, dataset_file, z_max, z_min, y_max, y_min, x_max, x_min, atom_pos, atom_type
import os
import numpy as np
def get_data(fdir):
    files = os.listdir(fdir)
    files.sort()

    # Load the cached feature set if it exists; otherwise build it once and save it.
    feature_set = None
    if os.path.isfile(dataset_file + '.npy'):
        feature_set = np.load(dataset_file + '.npy')
    else:
        feature_set = np.zeros(shape=(len(files), z_max - z_min, y_max - y_min, x_max - x_min, 1 + len(atom_type) + len(atom_pos)))
        train_data_loader(files, feature_set, fdir=fdir)
        np.save(dataset_file, feature_set)

    return feature_set
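For orientation (not part of the diff), a minimal usage sketch of the new cached loader; the directory name is taken from this commit, and the expectation that a second call loads ptn11H_10.npy instead of re-parsing the pickled entry objects follows from the code above:

    from data import get_data

    # First call builds the feature array and saves it as ptn11H_10.npy;
    # later calls load the cached .npy file directly.
    features = get_data('ptn11H_10/')
    print(features.shape)  # (n_files, z_max - z_min, y_max - y_min, x_max - x_min, 74)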
4 changes: 2 additions & 2 deletions gan.py
@@ -1,6 +1,7 @@
from tensorflow.keras import layers
import tensorflow as tf
from tensorflow import keras
from loadptn import x_min, y_min, z_min, x_max, y_max, z_max, atom_pos, atom_type
physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
@@ -10,7 +11,7 @@
# Create the discriminator
discriminator = keras.Sequential(
    [
        keras.Input(shape=(59, 46, 63, 74)),
        keras.Input(shape=(z_max-z_min, y_max-y_min, x_max-x_min, 1 + len(atom_type) + len(atom_pos))),
        layers.Conv3D(64, (3, 3, 3), strides=(2, 2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv3D(128, (3, 3, 3), strides=(2, 2, 2), padding="same"),
@@ -26,7 +27,6 @@
generator = keras.Sequential(
    [
        keras.Input(shape=(latent_dim,)),
        # We want to generate 128 coefficients to reshape into a 7x7x128 map
        layers.Dense(7 * 7 * 128),
        layers.LeakyReLU(alpha=0.2),
        layers.Reshape((7, 7, 128)),
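As a quick sanity check (editor's sketch, assuming the constants exported by loadptn.py in this commit), the new parameterized input shape should reproduce the previously hard-coded one:

    from loadptn import x_min, y_min, z_min, x_max, y_max, z_max, atom_pos, atom_type

    input_shape = (z_max - z_min, y_max - y_min, x_max - x_min, 1 + len(atom_type) + len(atom_pos))
    print(input_shape)  # expected to print (59, 46, 63, 74), the old keras.Input shape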
122 changes: 40 additions & 82 deletions loadptn.py
@@ -11,6 +11,20 @@
import random

import pickle
import os
from tqdm import tqdm

CUBIC_LENGTH_CONSTRAINT = 70
x_min, y_min, z_min, x_max, y_max, z_max = 1, 10, 4, 64, 56, 63

atom_type = ['C', 'N', 'O', 'S', 'None']
atom_type_data = pd.Series(atom_type)
atom_type_encoder = np.array(pd.get_dummies(atom_type_data))

atom_pos = ['O1', 'C9', 'O3', 'CZ2', 'CG2', 'CG', 'NE1', 'C1', 'C2', 'N3', 'CZ', 'OE2', 'SE', 'OE1', 'ND1', 'NH2', 'CE', 'C', 'OE21', 'OD2', 'OG', 'CH2', 'OXT', 'C5', 'ND2', 'C13', 'OE12', 'SD', 'C4', 'O', 'C6', 'C7', 'CE3', 'CH1', 'CA', 'C11', 'CB', 'CE1', 'NZ', 'C3', 'C12', 'OE11', 'NE', 'NE2', 'OG1', 'OH', 'N2', 'OT1', 'N1', 'O2', 'C14', 'C8', 'CD1', 'CG1', 'OD1', 'N', 'C10', 'CD2', 'CZ3', 'NH1', 'S', 'OT2', 'OE22', 'CD', 'SG', 'CE2', 'O4', 'None']
atom_pos_data = pd.Series(atom_pos)
atom_pos_encoder = np.array(pd.get_dummies(atom_pos_data))
dataset_file = 'ptn11H_10'

# Given a set of files storing entry objects and their directory location, return their feature dimensions such as the positional atom types and the bounds for the matrix.
def load_feature_dimensions(files, fdir = 'ptndata_10H/'):
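The module-level encoders added above feed the per-voxel features; a short illustration of what pd.get_dummies produces here (a reading of the added code, not new behaviour):

    import numpy as np
    import pandas as pd

    atom_type = ['C', 'N', 'O', 'S', 'None']
    atom_type_encoder = np.array(pd.get_dummies(pd.Series(atom_type)))
    # One row per entry of atom_type, columns ordered alphabetically by pandas,
    # so row 0 is the one-hot vector for 'C'.
    print(atom_type_encoder.shape)  # (5, 5)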
@@ -28,30 +42,6 @@ def load_feature_dimensions(files, fdir = 'ptndata_10H/'):
    return atom_pos, x_min, y_min, z_min, x_max, y_max, z_max


# This is a generator function for files containing entry objects in the given location. These objects, due to their large size, are fed into the CNN one at a time as a memory optimization step.
def sample_gen(files, feature_set, atom_type, atom_type_encoder, atom_pos, atom_pos_encoder, energy_scores, x_min, y_min, z_min, x_max, y_max, z_max, fdir='ptndata_10H/'):
    for q, file in enumerate(files):
        entry = pickle.load(open(fdir + file, 'rb'))
        a = grid2logical(entry.mat)
        b = grid2atomtype(entry.mat, atom_type, atom_type_encoder)
        c = grid2atom(entry.mat, atom_pos, atom_pos_encoder)
        dm_output = entry.dm
        # rosetta_score, mse_score

        y = dm_output[0].tolist()
        y = np.reshape(y, (1, len(y[0]), len(y[0])))
        y = y.astype(float)
        #y = energy_scores.loc['ptndata_10H/' + file]['mse_score']
        #y = np.array(y)
        #y = y.reshape(-1,1)
        for i in range(len(feature_set[0])):
            for j in range(len(feature_set[0][0])):
                for k in range(len(feature_set[0][0][0])):
                    feature_set[0][i][j][k] = [a[x_min + i][y_min + j][z_min + k]] + b[x_min + i][y_min + j][z_min + k].tolist() + c[x_min + i][y_min + j][z_min + k].tolist()

        yield (feature_set, y)


# This is almost like sample_gen, except it is a function instead of a generator function. This is used for generating the validation data before training the CNN. It generates the validation samples for all three of the metrics.
def sample_loader(files, feature_set_, atom_type, atom_type_encoder, atom_pos, atom_pos_encoder, energy_scores, x_min, y_min, z_min, x_max, y_max, z_max, fdir='ptndata_10H/'):
    #if True:
@@ -161,29 +151,6 @@ def find_bounds(mat):
    return x_min, y_min, z_min, x_max, y_max, z_max


def sample_gen(files, feature_set, atom_type, atom_type_encoder, atom_pos, atom_pos_encoder, energy_scores, x_min, y_min, z_min, x_max, y_max, z_max, fdir='ptndata_10H/'):
    for q, file in enumerate(files):
        entry = pickle.load(open(fdir + file, 'rb'))
        a = grid2logical(entry.mat)
        b = grid2atomtype(entry.mat, atom_type, atom_type_encoder)
        c = grid2atom(entry.mat, atom_pos, atom_pos_encoder)
        dm_output = entry.dm
        # rosetta_score, mse_score

        y = dm_output[0].tolist()
        y = np.reshape(y, (1, len(y[0]), len(y[0])))
        y = y.astype(float)
        #y = energy_scores.loc['ptndata_10H/' + file]['mse_score']
        #y = np.array(y)
        #y = y.reshape(-1,1)
        for i in range(len(feature_set[0])):
            for j in range(len(feature_set[0][0])):
                for k in range(len(feature_set[0][0][0])):
                    feature_set[0][i][j][k] = [a[x_min + i][y_min + j][z_min + k]] + b[x_min + i][y_min + j][z_min + k].tolist() + c[x_min + i][y_min + j][z_min + k].tolist()

        yield (feature_set, y)


# Given new bounds and old bounds, return the proper updated bounds.
def update_bounds(new_x_min, new_y_min, new_z_min, new_x_max, new_y_max, new_z_max, x_min, y_min, z_min, x_max, y_max, z_max):
    if new_x_min < x_min:
@@ -206,45 +173,36 @@ def update_bounds(new_x_min, new_y_min, new_z_min, new_x_max, new_y_max, new_z_m

    return x_min, y_min, z_min, x_max, y_max, z_max

if __name__ == "__main__":

    n = 10

    fdir='/Users/ethanmoyer/Projects/data/ptn/20210414_DAI_ptndata_11H/'

    files = getfileswithname(fdir, 'obj')
def train_data_loader(files, feature_set, fdir='ptndata_10H/'):
    global atom_type, atom_type_encoder, atom_pos, atom_pos_encoder, x_min, y_min, z_min, x_max, y_max, z_max
    for q, file in tqdm(enumerate(files)):
        entry = pickle.load(open(fdir + file, 'rb'))
        a = grid2logical(entry.mat)
        b = grid2atomtype(entry.mat, atom_type, atom_type_encoder)
        c = grid2atom(entry.mat, atom_pos, atom_pos_encoder)
        #y = energy_scores.loc['ptndata_10H/' + file]['mse_score']
        #y = np.array(y)
        #y = y.reshape(-1,1)
        for i in range(len(feature_set[0])):
            for j in range(len(feature_set[0][0])):
                for k in range(len(feature_set[0][0][0])):
                    feature_set[q][i][j][k] = [a[x_min + i][y_min + j][z_min + k]] + b[x_min + i][y_min + j][z_min + k].tolist() + c[x_min + i][y_min + j][z_min + k].tolist()

if __name__ == "__main__":
    fdir='ptn11H_10/'
    files = os.listdir(fdir)
    files.sort()

    files = files[:n]

    atom_type = ['C', 'N', 'O', 'S', 'None']
    atom_type_data = pd.Series(atom_type)
    atom_type_encoder = np.array(pd.get_dummies(atom_type_data))


    # Skip if dimensions are already found for the given data set
    # atom_pos, x_min, y_min, z_min, x_max, y_max, z_max = load_feature_dimensions(files, fdir)

    print(x_min, y_min, z_min, x_max, y_max, z_max)
    x_min = 1
    y_min = 10
    z_min = 4
    x_max = 64
    y_max = 56
    z_max = 63
    atom_pos = ['O1', 'C9', 'O3', 'CZ2', 'CG2', 'CG', 'NE1', 'C1', 'C2', 'N3', 'CZ', 'OE2', 'SE', 'OE1', 'ND1', 'NH2', 'CE', 'C', 'OE21', 'OD2', 'OG', 'CH2', 'OXT', 'C5', 'ND2', 'C13', 'OE12', 'SD', 'C4', 'O', 'C6', 'C7', 'CE3', 'CH1', 'CA', 'C11', 'CB', 'CE1', 'NZ', 'C3', 'C12', 'OE11', 'NE', 'NE2', 'OG1', 'OH', 'N2', 'OT1', 'N1', 'O2', 'C14', 'C8', 'CD1', 'CG1', 'OD1', 'N', 'C10', 'CD2', 'CZ3', 'NH1', 'S', 'OT2', 'OE22', 'CD', 'SG', 'CE2', 'O4', 'None']


    # Format the position specific atom list so it can be used as one-hot encoding in the network
    atom_pos_data = pd.Series(atom_pos)
    atom_pos_encoder = np.array(pd.get_dummies(atom_pos_data))

    #print(load_feature_dimensions(files, fdir))
    # Initialize the feature set
    feature_set = np.array([[[[ [0] * (1 + len(atom_type) + len(atom_pos)) for i in range(x_min, x_max)] for j in range(y_min, y_max)] for k in range(z_min, z_max)] for q in range(1)])

    feature_set_ = np.array([[[[ [0] * (1 + len(atom_type) + len(atom_pos)) for i in range(x_min, x_max)] for j in range(y_min, y_max)] for k in range(z_min, z_max)] for q in range(validation_samples)])

    feature_set = None
    if os.path.isfile(dataset_file+'.npy'):
        feature_set = np.load(dataset_file+'.npy')
    else:
        feature_set = np.zeros(shape=(len(files), z_max-z_min, y_max-y_min, x_max-x_min, 1 + len(atom_type) + len(atom_pos)))
        train_data_loader(files, feature_set, fdir=fdir)
        np.save(dataset_file, feature_set)
    # feature_set_ = np.array([[[[ [0] * (1 + len(atom_type) + len(atom_pos)) for i in range(x_min, x_max)] for j in range(y_min, y_max)] for k in range(z_min, z_max)] for q in range(validation_samples)])
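For readers tracing the channel count used throughout this commit: each voxel written by train_data_loader is one occupancy value followed by the atom-type and position-specific one-hot vectors, which is where 1 + len(atom_type) + len(atom_pos) comes from. A short illustration with made-up voxel values (it borrows the atom lists from loadptn.py):

    import numpy as np
    from loadptn import atom_type, atom_pos

    # Stand-ins for a[x][y][z], b[x][y][z] and c[x][y][z] in train_data_loader.
    occupancy = 1.0
    type_one_hot = np.zeros(len(atom_type))
    type_one_hot[0] = 1
    pos_one_hot = np.zeros(len(atom_pos))
    pos_one_hot[0] = 1

    voxel = [occupancy] + type_one_hot.tolist() + pos_one_hot.tolist()
    assert len(voxel) == 1 + len(atom_type) + len(atom_pos)  # 74 channels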



35 changes: 18 additions & 17 deletions main.py
@@ -2,21 +2,22 @@
import tensorflow as tf
from tensorflow import keras
import numpy as np
from data import get_data

# Prepare the dataset. We use both the training & test MNIST digits.
batch_size = 16
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
all_digits = np.concatenate([x_train, x_test])
all_digits = all_digits.astype("float32") / 255.0
all_digits = np.reshape(all_digits, (-1, 28, 28, 1))
dataset = tf.data.Dataset.from_tensor_slices(all_digits)
dataset = dataset.shuffle(buffer_size=1024).batch(batch_size)
gan = GAN(discriminator=discriminator, generator=generator, latent_dim=latent_dim)
gan.compile(
    d_optimizer=keras.optimizers.Adam(learning_rate=0.0003),
    g_optimizer=keras.optimizers.Adam(learning_rate=0.0003),
    loss_fn=keras.losses.BinaryCrossentropy(from_logits=True),
)
# To limit the execution time, we only train on 100 batches. You can train on
# the entire dataset. You will need about 20 epochs to get nice results.
gan.fit(dataset.take(100), epochs=1)
def main(batch_size, file_dir):
    # Prepare the dataset. We use both the training & test MNIST digits.
    (x_train, _), (x_test, _) = get_data(file_dir)
    all_digits = np.concatenate([x_train, x_test])
    all_digits = all_digits.astype("float32") / 255.0
    all_digits = np.reshape(all_digits, (-1, 28, 28, 1))
    dataset = tf.data.Dataset.from_tensor_slices(all_digits)
    dataset = dataset.shuffle(buffer_size=1024).batch(batch_size)
    gan = GAN(discriminator=discriminator, generator=generator, latent_dim=latent_dim)
    gan.compile(
        d_optimizer=keras.optimizers.Adam(learning_rate=0.0003),
        g_optimizer=keras.optimizers.Adam(learning_rate=0.0003),
        loss_fn=keras.losses.BinaryCrossentropy(from_logits=True),
    )
    # To limit the execution time, we only train on 100 batches. You can train on
    # the entire dataset. You will need about 20 epochs to get nice results.
    gan.fit(dataset.take(100), epochs=1)
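One caveat worth flagging: the new main() still unpacks get_data() like the Keras MNIST loader, while get_data() in data.py returns a single feature array. A possible follow-up under that assumption (build_dataset and its arguments are hypothetical, not part of this commit):

    import tensorflow as tf
    from data import get_data

    def build_dataset(file_dir, batch_size=16):
        # get_data() returns one array of shape (n, z, y, x, channels),
        # so there is no (train, test) pair to unpack or 28x28 reshape to apply.
        feature_set = get_data(file_dir).astype("float32")
        dataset = tf.data.Dataset.from_tensor_slices(feature_set)
        return dataset.shuffle(buffer_size=1024).batch(batch_size)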
