Commit 8643fb3 (0 parents): upload
emanuelegiona committed Apr 5, 2018
Showing 7 changed files with 3,223,124 additions and 0 deletions.
183 changes: 183 additions & 0 deletions data.py
@@ -0,0 +1,183 @@
'''
Contains various utility functions to manipulate
training data and query data and their specifications.
'''

from model import *
import random
import spacy
import unicodedata
import string
import time
import math
import copy
from tqdm import tqdm

def custom_pipeline(nlp):
return (nlp.tagger, nlp.parser)

nlp = spacy.load('en', create_pipeline=custom_pipeline)

def init_dictionary(dictionary_dim):
word_to_ix = {'$' : 0, 'PAD' : 1, 'UNK' : 2}
ix_to_word = ["$", "PAD", "UNK"]

with open("training/dictionary.txt") as f_in:
counter = 0
for line in f_in:
if counter == dictionary_dim:
break

w = unicode_to_ascii(line)
if w not in word_to_ix:
word_to_ix[w] = len(word_to_ix)
ix_to_word.append(w)
counter += 1

return (word_to_ix, ix_to_word)

def split_to_sentences(lines):
sentences = []
for line in lines:
if line.startswith("TITLE:") or line.startswith("=="):
continue

tokens = nlp(line)
for sent in tokens.sents:
words = []
for word in sent:
words.append(unicode_to_ascii(word.text))
sentences.append(' '.join(words).strip())

return sentences

def _split_to_sentences(filename):
print("- Reading file...")
sentences = []
with open(filename) as f_in:
text = f_in.read()
lines = text.split("\n")
print("- Splitting sentences...")
for line in tqdm(lines[:32]):
if line.startswith("TITLE:") or line.startswith("=="):
continue

tokens = nlp(line)
for sent in tokens.sents:
words = []
for word in sent:
words.append(unicode_to_ascii(word.text))
sentences.append(' '.join(words).strip())

return sentences

def mask_words(sent):
sent_list = sent.split()
if len(sent_list) <= 0:
return None, None

sents = list()
targets = list()
for i in range(len(sent_list)):
new_list = copy.deepcopy(sent_list)
new_list[i] = "$"
sents.append(new_list)
targets.append(sent_list[i])
return sents, targets

def prepare_sequence(sent_list, word, to_ix, window_dim, training=True):
'''
sent_list = sent.split()
if len(sent_list) <= 0:
return None, None
word_id = random.choice(range(len(sent_list)))
word = sent_list[word_id]
with open("training/file1.txt", 'a') as f_out:
f_out.write(sent + "\t" + word + "\n")
f_out.flush()
'''

'''
if training:
sent_list[word_id] = "$"
'''
m = len(sent_list) // 2 #integer division, so m can be used as a slice index below
#uniform sentence length
sent_list = sent_list[max(0, m - window_dim) : min(len(sent_list), m + window_dim)]

#pad sentence if necessary
if len(sent_list) < (2 * window_dim + 1):
sent_list += ['PAD'] * ((2 * window_dim + 1) - len(sent_list))

sent_tensor = prepare_tensor(sent_list, to_ix)
if sent_tensor is None:
return None, None

target = to_ix[word] if word in to_ix else to_ix['UNK']

'''
with open("training/file2.txt", 'a') as f_out:
f_out.write(str(sent_tensor) + "\t" + str(target) + "\n")
f_out.flush()
'''

return sent_tensor, target

def prepare_tensor(seq, to_ix):
#map each word to its dictionary index, falling back to UNK for out-of-dictionary words
try:
ids = [to_ix[w] if w in to_ix else to_ix['UNK'] for w in seq]
except KeyError:
ids = None
return ids

def _prepare_sequence(seq, to_ix):
#same word-to-index mapping as prepare_tensor, with an UNK fallback
try:
ids = [to_ix[w] if w in to_ix else to_ix['UNK'] for w in seq]
except KeyError:
ids = None
return ids

all_letters = string.ascii_letters + "1234567890-.,;'"
n_letters = len(all_letters)

def unicode_to_ascii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
and c in all_letters
)

def elapsed(start):
now = time.time()
s = now - start
m = math.floor(s/60)
s -= m*60
s = float('%.3f' % (s))
if s < 10:
s = ('0%.3f' % (s))
ret = ('Time elapsed: %sm %ss' % (m, s))
return ret

def format_date(t):
date = time.localtime(t)
m = int(date.tm_mon)
if m < 10:
m = ('0%d' % (m))
d = int(date.tm_mday)
if d < 10:
d = ('0%d' % (d))
hr = int(date.tm_hour)
if hr < 10:
hr = ('0%d' % (hr))
mn = int(date.tm_min)
if mn < 10:
mn = ('0%d' % (mn))
sc = int(date.tm_sec)
if sc < 10:
sc = ('0%d' % (sc))
ret = ('%s/%s/%s %s:%s:%s' % (date.tm_year, m, d, hr, mn, sc))
return ret
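A minimal usage sketch (not part of this commit) of how the helpers above fit together; the dictionary size and window width are assumptions borrowed from query.py further down, and it presumes training/dictionary.txt and the spaCy 'en' model are available:

import data

word_to_ix, ix_to_word = data.init_dictionary(400000)
sentence = "The film was also intended to be the first in a trilogy."

masked_sents, targets = data.mask_words(sentence)
for masked, target in zip(masked_sents, targets):
    # masked is the token list with one word replaced by '$'; target is that word
    ids, target_ix = data.prepare_sequence(masked, target, word_to_ix, window_dim=20)
    # ids is a fixed-length (2 * window_dim + 1) list of word indices, padded with 'PAD';
    # target_ix is the dictionary index of the held-out word (UNK if out of dictionary)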
43 changes: 43 additions & 0 deletions model.py
@@ -0,0 +1,43 @@
'''
Contains the language model to predict a held-out word
given the surrounding context of a sentence.
'''

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

class WordGuesser(nn.Module):
def __init__(self, hidden_dim, context_dim, embedding_dim, vocabulary_dim, batch_dim, window_dim):
super(WordGuesser, self).__init__()
self.hidden_dim = hidden_dim
self.batch_dim = batch_dim
self.window_dim = window_dim
self.word_embeddings = nn.Embedding(vocabulary_dim, embedding_dim)
self.lstm = nn.LSTM(embedding_dim, hidden_dim)
self.extract_context = nn.Linear((2 * window_dim + 1) * hidden_dim, context_dim)
self.predict = nn.Linear(context_dim, vocabulary_dim)
self.hidden = self.init_hidden()

def init_hidden(self):
return (autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()),
autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()))

def forward(self, sentence):
#0 during training, the target word w is replaced with $ in the input
#1 consume all the words of the sentence
embeddings = self.word_embeddings(sentence)
lstm_out, self.hidden = self.lstm(embeddings.permute(1, 0, 2), self.hidden)
lstm_out = lstm_out.view(-1, (2 * self.window_dim + 1) * self.hidden_dim)

#2 extract_context --> context vector c
context = self.extract_context(lstm_out)

#3 softmax to predict the word w from the context c
prediction = self.predict(context)
#out = F.softmax(prediction, dim=1)
return prediction, context
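A minimal sketch (not part of this commit) of constructing and querying the model above. The hidden, context and embedding sizes are assumptions (the training script is not shown on this page); vocabulary_dim is taken as the 400,000-word dictionary plus the three special tokens from data.init_dictionary, and a CUDA device is required because init_hidden() allocates its state on the GPU:

window_dim = 20
model = WordGuesser(hidden_dim=256, context_dim=128, embedding_dim=100,
                    vocabulary_dim=400003, batch_dim=1, window_dim=window_dim).cuda()

# a single padded sentence of 2 * window_dim + 1 word indices (batch size 1)
sentence = autograd.Variable(torch.zeros(1, 2 * window_dim + 1).long().cuda())
scores, context = model(sentence)
# scores: (1, vocabulary_dim) unnormalized predictions; context: (1, context_dim)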
104 changes: 104 additions & 0 deletions query.py
@@ -0,0 +1,104 @@
#!/home/emanuele/anaconda3/bin/python3.6
'''
Executes queries on a trained model to gather its accuracy.
'''

import sys
import random
from model import *
import data
import time
import numpy

# Hyper-parameters
dictionary_dim = 400000
window_dim = 20
batch_dim = 32
print_every = 10000
start = time.time()
modelfile = 'word_guesser.pt'
# ---

if len(sys.argv) < 2:
print("Error | Argument missing: testing corpus needed.")
sys.exit(1)

if sys.argv[1].find(".txt") != len(sys.argv[1])-4:
print("Error | Bad argument: textual (.txt) corpora only.")
sys.exit(1)

print("Initializing...")
word_to_ix, ix_to_word = data.init_dictionary(dictionary_dim)
if len(sys.argv) == 3:
modelfile = sys.argv[2]
model = torch.load(modelfile)
model.batch_dim = 1
model.hidden = model.init_hidden()
#model = torch.load('word_guesser.pt')
model = model.cuda()
#sent_count = 0
test_sentences = []

with open(sys.argv[1]) as f_in:
for line in f_in:
ss = line.strip().split("\t")
if len(ss[0]) <= 0:
continue
test_sentences.append(ss)
print("Done.")

'''
print("Starting querying...")
for sent in test_sentences:
sent_list = sent[0].split()
if len(sent_list) <= 0:
continue
word = sent[1]
sent_tensor = data.prepare_tensor(sent_list, word_to_ix)
if sent_tensor is None:
print('Sentence: %s\nFound word out of dictionary\n' % (sent[0]))
continue
sent_tensor = torch.LongTensor(sent_tensor)
sent_tensor = sent_tensor.cuda()
sent_tensor = autograd.Variable(sent_tensor)
prediction, c = model(sent_tensor)
prediction = F.softmax(prediction)
word_ids = prediction.data.topk(10)[1][0].cpu().numpy()
word_predictions = []
for i in word_ids:
word_predictions.append(ix_to_word[i])
print('Sentence: %s | Word: %s' % (sent[0], word))
print("Predictions:\n", word_predictions)
print("")
print("Done.")
'''

print("Querying...")
for t_sent in test_sentences:
sent = t_sent[0]
target = t_sent[1]
sent_tensor, _ = data.prepare_sequence(sent.split(), target, word_to_ix, window_dim, False)
sent_tensor = numpy.array([sent_tensor])

input_tensor = torch.LongTensor(sent_tensor)
input_tensor = input_tensor.cuda()
input_tensor = autograd.Variable(input_tensor)

prediction, context = model(input_tensor)
prediction = F.softmax(prediction, dim=1)

word_ids = prediction.data.topk(10)[1][0].cpu().numpy()
word_predictions = []
for i in word_ids:
word_predictions.append(ix_to_word[i])

print('Sentence: %s | Word: %s' % (sent, target))
print("Predictions:\n", word_predictions)
print("")

print("Done.")
4 changes: 4 additions & 0 deletions testing/overfit.txt
@@ -0,0 +1,4 @@
The film was also intended to $ the first in a trilogy. be
The film $ also intended to be the first in a trilogy. was
$ film was also intended to be the first in a trilogy. The

2 comments on commit 8643fb3

@SandipSPatil
Very Good work

@emanuelegiona
Owner Author
I appreciate your comment, although I would like to point out that this work is very outdated now and most of the parallel programming stuff can be leveraged automatically from within the PyTorch library itself.
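For illustration only (not code from this repository): a minimal sketch of the built-in data parallelism referred to above, using PyTorch's stock nn.DataParallel wrapper and assuming more than one CUDA device is available:

import torch
import torch.nn as nn

model = torch.load('word_guesser.pt')   # the trained WordGuesser checkpoint
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)       # replicate the module and split each batch across GPUs
model = model.cuda()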
