Commit
Moved Loading of Dataset & Printing for results.csv to data.py
SuyashLakhotia committed Jan 14, 2018
1 parent 8e149cd commit 0dddf0c
Showing 7 changed files with 60 additions and 74 deletions.
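
The practical upshot of the commit, condensed from the diffs below into one sketch (names taken from baseline.py and data.py as of this commit; the accuracy value passed to print_result is illustrative):

import numpy as np

import data  # the repository's data.py, as of this commit

# One call now loads and preprocesses both subsets
# (previously ~7 lines repeated in every training script).
dataset = "20 Newsgroups"
train, test = data.load_dataset(dataset, out="tfidf", norm="l1")
x_train = train.data_tfidf.astype(np.float32)
x_test = test.data_tfidf.astype(np.float32)

# One call now prints the quoted record for results.csv;
# hyperparams, timestamp, and notes default to "-".
data.print_result(dataset, "Linear SVC", 0.852)  # accuracy illustrative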
17 changes: 8 additions & 9 deletions baseline.py
@@ -8,13 +8,8 @@
 # Data Preparation
 # ==================================================
 
-print("Loading training data...")
-train = data.Text20News(subset="train")
-train.preprocess_train(out="tfidf", norm="l1")
-
-print("Loading test data...")
-test = data.Text20News(subset="test")
-test.preprocess_test(train_vocab=train.vocab, out="tfidf", norm="l1")
+dataset = "20 Newsgroups"
+train, test = data.load_dataset(dataset, out="tfidf", norm="l1")
 
 x_train = train.data_tfidf.astype(np.float32)
 x_test = test.data_tfidf.astype(np.float32)
@@ -41,10 +36,14 @@
 svm_clf = LinearSVC()
 svm_clf.fit(x_train, y_train)
 predicted = svm_clf.predict(x_test)
-print("Linear SVC Accuracy: {:.4f}".format(np.mean(predicted == y_test)))
+svm_acc = np.mean(predicted == y_test)
 
 # Multinomial Naive Bayes Classifier
 bayes_clf = MultinomialNB(alpha=0.01)
 bayes_clf.fit(x_train, y_train)
 predicted = bayes_clf.predict(x_test)
-print("Multinomial Naive Bayes Accuracy: {:.4f}".format(np.mean(predicted == y_test)))
+bayes_acc = np.mean(predicted == y_test)
+
+# Output for results.csv
+data.print_result(dataset, "Linear SVC", svm_acc)
+data.print_result(dataset, "Multinomial Naive Bayes", bayes_acc)
14 changes: 3 additions & 11 deletions cnn_ykim_train.py
@@ -1,6 +1,5 @@
 import os
 import time
-import subprocess
 
 import numpy as np
 import tensorflow as tf
@@ -42,13 +41,8 @@
 # Data Preparation
 # ==================================================
 
-print("Loading training data...")
-train = data.Text20News(subset="train")
-train.preprocess_train(out="word2ind", maxlen=seq_len)
-
-print("Loading test data...")
-test = data.Text20News(subset="test")
-test.preprocess_test(train_vocab=train.vocab, out="word2ind", maxlen=seq_len)
+dataset = "20 Newsgroups"
+train, test = data.load_dataset(dataset, out="word2ind", maxlen=seq_len)
 
 x_train = train.data_word2ind.astype(np.int32)
 x_test = test.data_word2ind.astype(np.int32)
@@ -105,6 +99,4 @@
 # Output for results.csv
 hyperparams = "{{seq_len: {}, filter_heights: {}, num_features: {}}}".format(
     seq_len, filter_heights, num_features)
-latest_git = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
-print("\"{}\",\"{}\",\"{:.9f}\",\"{}\",\"{}\"".format(model_name, hyperparams, max_accuracy,
-                                                      latest_git, timestamp))
+data.print_result(dataset, model_name, max_accuracy, hyperparams, timestamp)
47 changes: 37 additions & 10 deletions data.py
@@ -1,5 +1,6 @@
 import re
 import collections
+import subprocess
 
 import numpy as np
 import sklearn.datasets
@@ -133,7 +134,7 @@ def generate_word2ind(self, maxlen=None, padding="post", truncating="post"):
 
 class Text20News(TextDataset):
     """
-    20 Newsgroups Dataset
+    20 Newsgroups dataset.
     http://scikit-learn.org/stable/datasets/twenty_newsgroups.html
     """
 
@@ -177,18 +178,20 @@ def preprocess_test(self, train_vocab, out, **params):
         self.generate_word2ind(**params)
 
 
-def one_hot_labels(num_labels, labels):
+def load_dataset(dataset, out, **params):
     """
-    Generate one-hot encoded label arrays.
+    Returns the train & test datasets for a chosen dataset.
     """
-    labels_arr = []
-    for i in range(len(labels)):
-        label = [0 for j in range(num_labels)]
-        label[labels[i]] = 1
-        labels_arr.append(label)
-    y = np.array(labels_arr)
+    if dataset == "20 Newsgroups":
+        print("Loading training data...")
+        train = Text20News(subset="train")
+        train.preprocess_train(out=out, **params)
 
-    return y
+        print("Loading test data...")
+        test = Text20News(subset="test")
+        test.preprocess_test(train_vocab=train.vocab, out=out, **params)
+
+        return train, test
 
 
 def load_word2vec(filepath, vocabulary, embedding_dim):
@@ -243,3 +246,27 @@ def batch_iter(data, batch_size, num_epochs, shuffle=True):
         indices.extend(np.arange(data_size))
         idx = [indices.popleft() for i in range(batch_size)]
         yield data[idx]
+
+
+def one_hot_labels(num_labels, labels):
+    """
+    Generate one-hot encoded label arrays.
+    """
+    labels_arr = []
+    for i in range(len(labels)):
+        label = [0 for j in range(num_labels)]
+        label[labels[i]] = 1
+        labels_arr.append(label)
+    y = np.array(labels_arr)
+
+    return y
+
+
+def print_result(dataset, model_name, acc, hyperparams="-", timestamp="-", notes="-"):
+    """
+    Prints the record for results.csv.
+    """
+    latest_git = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
+    print("")
+    print("\"{}\",\"{}\",\"{}\",\"{:.9f}\",\"{}\",\"{}\",\"{}\"".format(dataset, model_name, hyperparams,
+                                                                        acc, notes, latest_git, timestamp))
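
One thing the new load_dataset leaves open: for any name other than "20 Newsgroups" it falls off the end of the if block and implicitly returns None, so callers hit a TypeError when unpacking train, test. A defensive variant (a sketch, not part of this commit) would fail fast with a clearer message:

def load_dataset(dataset, out, **params):
    """
    Returns the train & test datasets for a chosen dataset.
    """
    if dataset == "20 Newsgroups":
        print("Loading training data...")
        train = Text20News(subset="train")
        train.preprocess_train(out=out, **params)

        print("Loading test data...")
        test = Text20News(subset="test")
        test.preprocess_test(train_vocab=train.vocab, out=out, **params)

        return train, test

    # Hypothetical guard, not in the commit: surface unsupported names early.
    raise ValueError("Unknown dataset: {}".format(dataset))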
14 changes: 3 additions & 11 deletions gcnn_fourier_train.py
@@ -1,6 +1,5 @@
 import os
 import time
-import subprocess
 
 import numpy as np
 import tensorflow as tf
@@ -48,13 +47,8 @@
 # Data Preparation
 # ==================================================
 
-print("Loading training data...")
-train = data.Text20News(subset="train")
-train.preprocess_train(out="tfidf", norm="l1")
-
-print("Loading test data...")
-test = data.Text20News(subset="test")
-test.preprocess_test(train_vocab=train.vocab, out="tfidf", norm="l1")
+dataset = "20 Newsgroups"
+train, test = data.load_dataset(dataset, out="tfidf", norm="l1")
 
 x_train = train.data_tfidf.astype(np.float32)
 x_test = test.data_tfidf.astype(np.float32)
@@ -135,6 +129,4 @@
 # Output for results.csv
 hyperparams = "{{num_edges: {}, coarsening_levels: {}, filter_sizes: {}, num_features: {}, pooling_sizes: {}, fc_layers: {}, dropout: {}}}".format(
     num_edges, coarsening_levels, filter_sizes, num_features, pooling_sizes, fc_layers, dropout_keep_prob)
-latest_git = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
-print("\"{}\",\"{}\",\"{:.9f}\",\"{}\",\"{}\"".format(model_name, hyperparams, max_accuracy,
-                                                      latest_git, timestamp))
+data.print_result(dataset, model_name, max_accuracy, hyperparams, timestamp)
14 changes: 3 additions & 11 deletions gcnn_mdeff_train.py
@@ -1,6 +1,5 @@
 import os
 import time
-import subprocess
 
 import numpy as np
 import tensorflow as tf
@@ -49,13 +48,8 @@
 # Data Preparation
 # ==================================================
 
-print("Loading training data...")
-train = data.Text20News(subset="train")
-train.preprocess_train(out="tfidf", norm="l1")
-
-print("Loading test data...")
-test = data.Text20News(subset="test")
-test.preprocess_test(train_vocab=train.vocab, out="tfidf", norm="l1")
+dataset = "20 Newsgroups"
+train, test = data.load_dataset(dataset, out="tfidf", norm="l1")
 
 x_train = train.data_tfidf.astype(np.float32)
 x_test = test.data_tfidf.astype(np.float32)
@@ -135,6 +129,4 @@
 # Output for results.csv
 hyperparams = "{{num_edges: {}, coarsening_levels: {}, polynomial_orders: {}, num_features: {}, pooling_sizes: {}, fc_layers: {}, dropout: {}}}".format(
     num_edges, coarsening_levels, polynomial_orders, num_features, pooling_sizes, fc_layers, dropout_keep_prob)
-latest_git = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
-print("\"{}\",\"{}\",\"{:.9f}\",\"{}\",\"{}\"".format(model_name, hyperparams, max_accuracy,
-                                                      latest_git, timestamp))
+data.print_result(dataset, model_name, max_accuracy, hyperparams, timestamp)
14 changes: 3 additions & 11 deletions gcnn_spline_train.py
@@ -1,6 +1,5 @@
 import os
 import time
-import subprocess
 
 import numpy as np
 import tensorflow as tf
@@ -49,13 +48,8 @@
 # Data Preparation
 # ==================================================
 
-print("Loading training data...")
-train = data.Text20News(subset="train")
-train.preprocess_train(out="tfidf", norm="l1")
-
-print("Loading test data...")
-test = data.Text20News(subset="test")
-test.preprocess_test(train_vocab=train.vocab, out="tfidf", norm="l1")
+dataset = "20 Newsgroups"
+train, test = data.load_dataset(dataset, out="tfidf", norm="l1")
 
 x_train = train.data_tfidf.astype(np.float32)
 x_test = test.data_tfidf.astype(np.float32)
@@ -135,6 +129,4 @@
 # Output for results.csv
 hyperparams = "{{num_edges: {}, coarsening_levels: {}, filter_sizes: {}, num_features: {}, pooling_sizes: {}, fc_layers: {}, dropout: {}}}".format(
     num_edges, coarsening_levels, filter_sizes, num_features, pooling_sizes, fc_layers, dropout_keep_prob)
-latest_git = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
-print("\"{}\",\"{}\",\"{:.9f}\",\"{}\",\"{}\"".format(model_name, hyperparams, max_accuracy,
-                                                      latest_git, timestamp))
+data.print_result(dataset, model_name, max_accuracy, hyperparams, timestamp)
14 changes: 3 additions & 11 deletions mlp_train.py
@@ -1,6 +1,5 @@
 import os
 import time
-import subprocess
 
 import numpy as np
 import tensorflow as tf
@@ -36,13 +35,8 @@
 # Data Preparation
 # ==================================================
 
-print("Loading training data...")
-train = data.Text20News(subset="train")
-train.preprocess_train(out="tfidf", norm="l1")
-
-print("Loading test data...")
-test = data.Text20News(subset="test")
-test.preprocess_test(train_vocab=train.vocab, out="tfidf", norm="l1")
+dataset = "20 Newsgroups"
+train, test = data.load_dataset(dataset, out="tfidf", norm="l1")
 
 x_train = train.data_tfidf.astype(np.float32)
 x_test = test.data_tfidf.astype(np.float32)
@@ -89,6 +83,4 @@
 
 # Output for results.csv
 hyperparams = "{{layers: {}}}".format(layers)
-latest_git = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
-print("\"{}\",\"{}\",\"{:.9f}\",\"{}\",\"{}\"".format(model_name, hyperparams,
-                                                      max_accuracy, latest_git, timestamp))
+data.print_result(dataset, model_name, max_accuracy, hyperparams, timestamp)
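
For reference, a hypothetical call to the new helper and the record it would print, assuming the seven-field format in data.py above (all values illustrative; the git hash is whatever git rev-parse --short HEAD returns at run time):

import data

data.print_result("20 Newsgroups", "MLP", 0.812345678,
                  hyperparams="{layers: [2500, 500, 20]}", timestamp="1515907200")
# Prints a blank line, then:
# "20 Newsgroups","MLP","{layers: [2500, 500, 20]}","0.812345678","-","0dddf0c","1515907200"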
