Commit a2858bd

Added faster/simpler data pipeline

isamu-isozaki committed Apr 17, 2021
1 parent 2d16e60 commit a2858bd

Showing 5 changed files with 80 additions and 101 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,4 +1,7 @@
# Byte-compiled / optimized / DLL files
ptn11H_10/
ptn11H_1000/
ptn11H_1000.zip
__pycache__/
*.py[cod]
*$py.class
17 changes: 17 additions & 0 deletions data.py
@@ -0,0 +1,17 @@
from loadptn import train_data_loader, dataset_file, z_max, z_min, y_max, y_min, x_max, x_min, atom_pos, atom_type
import os
import numpy as np
def get_data(fdir):
    files = os.listdir(fdir)
    files.sort()

    # Load the cached feature set if it exists; otherwise build it once and save it.
    feature_set = None
    if os.path.isfile(dataset_file + '.npy'):
        feature_set = np.load(dataset_file + '.npy')
    else:
        feature_set = np.zeros(shape=(len(files), z_max - z_min, y_max - y_min, x_max - x_min, 1 + len(atom_type) + len(atom_pos)))
        train_data_loader(files, feature_set, fdir=fdir)
        np.save(dataset_file, feature_set)

    return feature_set
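For orientation (not part of the diff), a minimal usage sketch of the new cached loader; the directory name is taken from this commit, and the expectation that a second call loads ptn11H_10.npy instead of re-parsing the pickled entry objects follows from the code above:

    from data import get_data

    # First call builds the feature array and saves it as ptn11H_10.npy;
    # later calls load the cached .npy file directly.
    features = get_data('ptn11H_10/')
    print(features.shape)  # (n_files, z_max - z_min, y_max - y_min, x_max - x_min, 74)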
4 changes: 2 additions & 2 deletions gan.py
@@ -1,6 +1,7 @@
from tensorflow.keras import layers
import tensorflow as tf
from tensorflow import keras
from loadptn import x_min, y_min, z_min, x_max, y_max, z_max, atom_pos, atom_type
physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
@@ -10,7 +11,7 @@
# Create the discriminator
discriminator = keras.Sequential(
    [
        keras.Input(shape=(59, 46, 63, 74)),
        keras.Input(shape=(z_max-z_min, y_max-y_min, x_max-x_min, 1 + len(atom_type) + len(atom_pos))),
        layers.Conv3D(64, (3, 3, 3), strides=(2, 2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv3D(128, (3, 3, 3), strides=(2, 2, 2), padding="same"),
@@ -26,7 +27,6 @@
generator = keras.Sequential(
    [
        keras.Input(shape=(latent_dim,)),
        # We want to generate 128 coefficients to reshape into a 7x7x128 map
        layers.Dense(7 * 7 * 128),
        layers.LeakyReLU(alpha=0.2),
        layers.Reshape((7, 7, 128)),
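As a quick sanity check (editor's sketch, assuming the constants exported by loadptn.py in this commit), the new parameterized input shape should reproduce the previously hard-coded one:

    from loadptn import x_min, y_min, z_min, x_max, y_max, z_max, atom_pos, atom_type

    input_shape = (z_max - z_min, y_max - y_min, x_max - x_min, 1 + len(atom_type) + len(atom_pos))
    print(input_shape)  # expected to print (59, 46, 63, 74), the old keras.Input shape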
122 changes: 40 additions & 82 deletions loadptn.py
@@ -11,6 +11,20 @@
import random

import pickle
import os
from tqdm import tqdm

CUBIC_LENGTH_CONSTRAINT = 70
x_min, y_min, z_min, x_max, y_max, z_max = 1, 10, 4, 64, 56, 63

atom_type = ['C', 'N', 'O', 'S', 'None']
atom_type_data = pd.Series(atom_type)
atom_type_encoder = np.array(pd.get_dummies(atom_type_data))

atom_pos = ['O1', 'C9', 'O3', 'CZ2', 'CG2', 'CG', 'NE1', 'C1', 'C2', 'N3', 'CZ', 'OE2', 'SE', 'OE1', 'ND1', 'NH2', 'CE', 'C', 'OE21', 'OD2', 'OG', 'CH2', 'OXT', 'C5', 'ND2', 'C13', 'OE12', 'SD', 'C4', 'O', 'C6', 'C7', 'CE3', 'CH1', 'CA', 'C11', 'CB', 'CE1', 'NZ', 'C3', 'C12', 'OE11', 'NE', 'NE2', 'OG1', 'OH', 'N2', 'OT1', 'N1', 'O2', 'C14', 'C8', 'CD1', 'CG1', 'OD1', 'N', 'C10', 'CD2', 'CZ3', 'NH1', 'S', 'OT2', 'OE22', 'CD', 'SG', 'CE2', 'O4', 'None']
atom_pos_data = pd.Series(atom_pos)
atom_pos_encoder = np.array(pd.get_dummies(atom_pos_data))
dataset_file = 'ptn11H_10'

# Given a set of files storing entry objects and their directory location, return their feature dimensions such as the positional atom types and the bounds for the matrix.
def load_feature_dimensions(files, fdir = 'ptndata_10H/'):
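The module-level encoders added above feed the per-voxel features; a short illustration of what pd.get_dummies produces here (a reading of the added code, not new behaviour):

    import numpy as np
    import pandas as pd

    atom_type = ['C', 'N', 'O', 'S', 'None']
    atom_type_encoder = np.array(pd.get_dummies(pd.Series(atom_type)))
    # One row per entry of atom_type, columns ordered alphabetically by pandas,
    # so row 0 is the one-hot vector for 'C'.
    print(atom_type_encoder.shape)  # (5, 5)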
@@ -28,30 +42,6 @@ def load_feature_dimensions(files, fdir = 'ptndata_10H/'):
    return atom_pos, x_min, y_min, z_min, x_max, y_max, z_max


# This is a generator function for files containing entry objects in the given location. These objects, due to their large size, are fed into the CNN one at a time as a memory optimization step.
def sample_gen(files, feature_set, atom_type, atom_type_encoder, atom_pos, atom_pos_encoder, energy_scores, x_min, y_min, z_min, x_max, y_max, z_max, fdir='ptndata_10H/'):
    for q, file in enumerate(files):
        entry = pickle.load(open(fdir + file, 'rb'))
        a = grid2logical(entry.mat)
        b = grid2atomtype(entry.mat, atom_type, atom_type_encoder)
        c = grid2atom(entry.mat, atom_pos, atom_pos_encoder)
        dm_output = entry.dm
        # rosetta_score, mse_score

        y = dm_output[0].tolist()
        y = np.reshape(y, (1, len(y[0]), len(y[0])))
        y = y.astype(float)
        #y = energy_scores.loc['ptndata_10H/' + file]['mse_score']
        #y = np.array(y)
        #y = y.reshape(-1,1)
        for i in range(len(feature_set[0])):
            for j in range(len(feature_set[0][0])):
                for k in range(len(feature_set[0][0][0])):
                    feature_set[0][i][j][k] = [a[x_min + i][y_min + j][z_min + k]] + b[x_min + i][y_min + j][z_min + k].tolist() + c[x_min + i][y_min + j][z_min + k].tolist()

        yield (feature_set, y)


# This is almost like sample_gen, except it is a function instead of a generator function. This is used for generating the validation data before training the CNN. It generates the validation samples for all three of the metrics.
def sample_loader(files, feature_set_, atom_type, atom_type_encoder, atom_pos, atom_pos_encoder, energy_scores, x_min, y_min, z_min, x_max, y_max, z_max, fdir='ptndata_10H/'):
    #if True:
@@ -161,29 +151,6 @@ def find_bounds(mat):
    return x_min, y_min, z_min, x_max, y_max, z_max


def sample_gen(files, feature_set, atom_type, atom_type_encoder, atom_pos, atom_pos_encoder, energy_scores, x_min, y_min, z_min, x_max, y_max, z_max, fdir='ptndata_10H/'):
    for q, file in enumerate(files):
        entry = pickle.load(open(fdir + file, 'rb'))
        a = grid2logical(entry.mat)
        b = grid2atomtype(entry.mat, atom_type, atom_type_encoder)
        c = grid2atom(entry.mat, atom_pos, atom_pos_encoder)
        dm_output = entry.dm
        # rosetta_score, mse_score

        y = dm_output[0].tolist()
        y = np.reshape(y, (1, len(y[0]), len(y[0])))
        y = y.astype(float)
        #y = energy_scores.loc['ptndata_10H/' + file]['mse_score']
        #y = np.array(y)
        #y = y.reshape(-1,1)
        for i in range(len(feature_set[0])):
            for j in range(len(feature_set[0][0])):
                for k in range(len(feature_set[0][0][0])):
                    feature_set[0][i][j][k] = [a[x_min + i][y_min + j][z_min + k]] + b[x_min + i][y_min + j][z_min + k].tolist() + c[x_min + i][y_min + j][z_min + k].tolist()

        yield (feature_set, y)


# Given new bounds and old bounds, return the proper updated bounds.
def update_bounds(new_x_min, new_y_min, new_z_min, new_x_max, new_y_max, new_z_max, x_min, y_min, z_min, x_max, y_max, z_max):
    if new_x_min < x_min:
@@ -206,45 +173,36 @@ def update_bounds(new_x_min, new_y_min, new_z_min, new_x_max, new_y_max, new_z_m

    return x_min, y_min, z_min, x_max, y_max, z_max

if __name__ == "__main__":

    n = 10

    fdir='/Users/ethanmoyer/Projects/data/ptn/20210414_DAI_ptndata_11H/'

    files = getfileswithname(fdir, 'obj')
def train_data_loader(files, feature_set, fdir='ptndata_10H/'):
    global atom_type, atom_type_encoder, atom_pos, atom_pos_encoder, x_min, y_min, z_min, x_max, y_max, z_max
    for q, file in tqdm(enumerate(files)):
        entry = pickle.load(open(fdir + file, 'rb'))
        a = grid2logical(entry.mat)
        b = grid2atomtype(entry.mat, atom_type, atom_type_encoder)
        c = grid2atom(entry.mat, atom_pos, atom_pos_encoder)
        #y = energy_scores.loc['ptndata_10H/' + file]['mse_score']
        #y = np.array(y)
        #y = y.reshape(-1,1)
        for i in range(len(feature_set[0])):
            for j in range(len(feature_set[0][0])):
                for k in range(len(feature_set[0][0][0])):
                    feature_set[q][i][j][k] = [a[x_min + i][y_min + j][z_min + k]] + b[x_min + i][y_min + j][z_min + k].tolist() + c[x_min + i][y_min + j][z_min + k].tolist()

if __name__ == "__main__":
    fdir='ptn11H_10/'
    files = os.listdir(fdir)
    files.sort()

    files = files[:n]

    atom_type = ['C', 'N', 'O', 'S', 'None']
    atom_type_data = pd.Series(atom_type)
    atom_type_encoder = np.array(pd.get_dummies(atom_type_data))


    # Skip if dimensions are already found for the given data set
    # atom_pos, x_min, y_min, z_min, x_max, y_max, z_max = load_feature_dimensions(files, fdir)

    print(x_min, y_min, z_min, x_max, y_max, z_max)
    x_min = 1
    y_min = 10
    z_min = 4
    x_max = 64
    y_max = 56
    z_max = 63
    atom_pos = ['O1', 'C9', 'O3', 'CZ2', 'CG2', 'CG', 'NE1', 'C1', 'C2', 'N3', 'CZ', 'OE2', 'SE', 'OE1', 'ND1', 'NH2', 'CE', 'C', 'OE21', 'OD2', 'OG', 'CH2', 'OXT', 'C5', 'ND2', 'C13', 'OE12', 'SD', 'C4', 'O', 'C6', 'C7', 'CE3', 'CH1', 'CA', 'C11', 'CB', 'CE1', 'NZ', 'C3', 'C12', 'OE11', 'NE', 'NE2', 'OG1', 'OH', 'N2', 'OT1', 'N1', 'O2', 'C14', 'C8', 'CD1', 'CG1', 'OD1', 'N', 'C10', 'CD2', 'CZ3', 'NH1', 'S', 'OT2', 'OE22', 'CD', 'SG', 'CE2', 'O4', 'None']


    # Format the position specific atom list so it can be used as one-hot encoding in the network
    atom_pos_data = pd.Series(atom_pos)
    atom_pos_encoder = np.array(pd.get_dummies(atom_pos_data))

    #print(load_feature_dimensions(files, fdir))
    # Initialize the feature set
    feature_set = np.array([[[[ [0] * (1 + len(atom_type) + len(atom_pos)) for i in range(x_min, x_max)] for j in range(y_min, y_max)] for k in range(z_min, z_max)] for q in range(1)])

    feature_set_ = np.array([[[[ [0] * (1 + len(atom_type) + len(atom_pos)) for i in range(x_min, x_max)] for j in range(y_min, y_max)] for k in range(z_min, z_max)] for q in range(validation_samples)])

    feature_set = None
    if os.path.isfile(dataset_file+'.npy'):
        feature_set = np.load(dataset_file+'.npy')
    else:
        feature_set = np.zeros(shape=(len(files), z_max-z_min, y_max-y_min, x_max-x_min, 1 + len(atom_type) + len(atom_pos)))
        train_data_loader(files, feature_set, fdir=fdir)
        np.save(dataset_file, feature_set)
    # feature_set_ = np.array([[[[ [0] * (1 + len(atom_type) + len(atom_pos)) for i in range(x_min, x_max)] for j in range(y_min, y_max)] for k in range(z_min, z_max)] for q in range(validation_samples)])
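For readers tracing the channel count used throughout this commit: each voxel written by train_data_loader is one occupancy value followed by the atom-type and position-specific one-hot vectors, which is where 1 + len(atom_type) + len(atom_pos) comes from. A short illustration with made-up voxel values (it borrows the atom lists from loadptn.py):

    import numpy as np
    from loadptn import atom_type, atom_pos

    # Stand-ins for a[x][y][z], b[x][y][z] and c[x][y][z] in train_data_loader.
    occupancy = 1.0
    type_one_hot = np.zeros(len(atom_type))
    type_one_hot[0] = 1
    pos_one_hot = np.zeros(len(atom_pos))
    pos_one_hot[0] = 1

    voxel = [occupancy] + type_one_hot.tolist() + pos_one_hot.tolist()
    assert len(voxel) == 1 + len(atom_type) + len(atom_pos)  # 74 channels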



35 changes: 18 additions & 17 deletions main.py
@@ -2,21 +2,22 @@
import tensorflow as tf
from tensorflow import keras
import numpy as np
from data import get_data

# Prepare the dataset. We use both the training & test MNIST digits.
batch_size = 16
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
all_digits = np.concatenate([x_train, x_test])
all_digits = all_digits.astype("float32") / 255.0
all_digits = np.reshape(all_digits, (-1, 28, 28, 1))
dataset = tf.data.Dataset.from_tensor_slices(all_digits)
dataset = dataset.shuffle(buffer_size=1024).batch(batch_size)
gan = GAN(discriminator=discriminator, generator=generator, latent_dim=latent_dim)
gan.compile(
    d_optimizer=keras.optimizers.Adam(learning_rate=0.0003),
    g_optimizer=keras.optimizers.Adam(learning_rate=0.0003),
    loss_fn=keras.losses.BinaryCrossentropy(from_logits=True),
)
# To limit the execution time, we only train on 100 batches. You can train on
# the entire dataset. You will need about 20 epochs to get nice results.
gan.fit(dataset.take(100), epochs=1)
def main(batch_size, file_dir):
    # Prepare the dataset. We use both the training & test MNIST digits.
    (x_train, _), (x_test, _) = get_data(file_dir)
    all_digits = np.concatenate([x_train, x_test])
    all_digits = all_digits.astype("float32") / 255.0
    all_digits = np.reshape(all_digits, (-1, 28, 28, 1))
    dataset = tf.data.Dataset.from_tensor_slices(all_digits)
    dataset = dataset.shuffle(buffer_size=1024).batch(batch_size)
    gan = GAN(discriminator=discriminator, generator=generator, latent_dim=latent_dim)
    gan.compile(
        d_optimizer=keras.optimizers.Adam(learning_rate=0.0003),
        g_optimizer=keras.optimizers.Adam(learning_rate=0.0003),
        loss_fn=keras.losses.BinaryCrossentropy(from_logits=True),
    )
    # To limit the execution time, we only train on 100 batches. You can train on
    # the entire dataset. You will need about 20 epochs to get nice results.
    gan.fit(dataset.take(100), epochs=1)
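One caveat worth flagging: the new main() still unpacks get_data() like the Keras MNIST loader, while get_data() in data.py returns a single feature array. A possible follow-up under that assumption (build_dataset and its arguments are hypothetical, not part of this commit):

    import tensorflow as tf
    from data import get_data

    def build_dataset(file_dir, batch_size=16):
        # get_data() returns one array of shape (n, z, y, x, channels),
        # so there is no (train, test) pair to unpack or 28x28 reshape to apply.
        feature_set = get_data(file_dir).astype("float32")
        dataset = tf.data.Dataset.from_tensor_slices(feature_set)
        return dataset.shuffle(buffer_size=1024).batch(batch_size)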
