commit 8643fb3 (0 parents): showing 7 changed files with 3,223,124 additions and 0 deletions.
@@ -0,0 +1,183 @@
'''
Contains various utility functions to manipulate
training data and query data and their specifications.
'''

from model import *
import random
import spacy
import unicodedata
import string
import time
import math
import copy
from tqdm import tqdm

# spaCy 1.x-style pipeline: keep only the tagger and the parser
# (the parser is needed for sentence segmentation below).
def custom_pipeline(nlp):
    return (nlp.tagger, nlp.parser)

nlp = spacy.load('en', create_pipeline=custom_pipeline)

def init_dictionary(dictionary_dim):
    word_to_ix = {'$': 0, 'PAD': 1, 'UNK': 2}
    ix_to_word = ["$", "PAD", "UNK"]

    with open("training/dictionary.txt") as f_in:
        counter = 0
        for line in f_in:
            if counter == dictionary_dim:
                break

            w = unicode_to_ascii(line)
            if w not in word_to_ix:
                word_to_ix[w] = len(word_to_ix)
                ix_to_word.append(w)
                counter += 1

    return (word_to_ix, ix_to_word)

def split_to_sentences(lines):
    sentences = []
    for line in lines:
        if line.startswith("TITLE:") or line.startswith("=="):
            continue

        tokens = nlp(line)
        for sent in tokens.sents:
            words = []
            for word in sent:
                words.append(unicode_to_ascii(word.text))
            sentences.append(' '.join(words).strip())

    return sentences

def _split_to_sentences(filename):
    print("- Reading file...")
    sentences = []
    with open(filename) as f_in:
        text = f_in.read()
    lines = text.split("\n")
    print("- Splitting sentences...")
    # NOTE: only the first 32 lines of the file are processed here
    for line in tqdm(lines[:32]):
        if line.startswith("TITLE:") or line.startswith("=="):
            continue

        tokens = nlp(line)
        for sent in tokens.sents:
            words = []
            for word in sent:
                words.append(unicode_to_ascii(word.text))
            sentences.append(' '.join(words).strip())

    return sentences

def mask_words(sent):
    sent_list = sent.split()
    if len(sent_list) <= 0:
        return None, None

    sents = list()
    targets = list()
    for i in range(len(sent_list)):
        new_list = copy.deepcopy(sent_list)
        new_list[i] = "$"
        sents.append(new_list)
        targets.append(sent_list[i])
    return sents, targets

def prepare_sequence(sent_list, word, to_ix, window_dim, training=True):
    '''
    sent_list = sent.split()
    if len(sent_list) <= 0:
        return None, None
    word_id = random.choice(range(len(sent_list)))
    word = sent_list[word_id]
    with open("training/file1.txt", 'a') as f_out:
        f_out.write(sent + "\t" + word + "\n")
        f_out.flush()
    '''

    '''
    if training:
        sent_list[word_id] = "$"
    '''
    m = len(sent_list) // 2  # integer division: index of the central word
    # uniform sentence length: keep a window of words around the centre
    sent_list = sent_list[max(0, m - window_dim) : min(len(sent_list), m + window_dim)]

    # pad sentence if necessary
    if len(sent_list) < (2 * window_dim + 1):
        sent_list += ['PAD'] * ((2 * window_dim + 1) - len(sent_list))

    sent_tensor = prepare_tensor(sent_list, to_ix)
    if sent_tensor is None:
        return None, None

    target = to_ix[word] if word in to_ix else to_ix['UNK']

    '''
    with open("training/file2.txt", 'a') as f_out:
        f_out.write(str(sent_tensor) + "\t" + str(target) + "\n")
        f_out.flush()
    '''

    return sent_tensor, target

def prepare_tensor(seq, to_ix):
    # map each word to its index, falling back to UNK for out-of-dictionary words
    try:
        ids = [to_ix[w] if w in to_ix else to_ix['UNK'] for w in seq]
    except Exception:
        ids = None
    finally:
        return ids

def _prepare_sequence(seq, to_ix):
    try:
        ids = [to_ix[w] if w in to_ix else to_ix['UNK'] for w in seq]
    except Exception:
        ids = None
    finally:
        return ids

all_letters = string.ascii_letters + "1234567890-.,;'"
n_letters = len(all_letters)

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def elapsed(start):
    now = time.time()
    s = now - start
    m = math.floor(s / 60)
    s -= m * 60
    s = float('%.3f' % (s))
    if s < 10:
        s = ('0%.3f' % (s))
    ret = ('Time elapsed: %sm %ss' % (m, s))
    return ret

def format_date(t):
    date = time.localtime(t)
    m = int(date.tm_mon)
    if m < 10:
        m = ('0%d' % (m))
    d = int(date.tm_mday)
    if d < 10:
        d = ('0%d' % (d))
    hr = int(date.tm_hour)
    if hr < 10:
        hr = ('0%d' % (hr))
    mn = int(date.tm_min)
    if mn < 10:
        mn = ('0%d' % (mn))
    sc = int(date.tm_sec)
    if sc < 10:
        sc = ('0%d' % (sc))
    ret = ('%s/%s/%s %s:%s:%s' % (date.tm_year, m, d, hr, mn, sc))
    return ret
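
As a rough sketch of how these utilities fit together (not part of the commit itself), the snippet below masks every word of a sentence with mask_words and turns each masked sentence into a fixed-length index sequence with prepare_sequence. The module name data and the values 400000 and 20 mirror the imports and hyper-parameters of the query script further down; everything else is illustrative.

# Illustrative usage sketch only (not part of the original commit).
import data

# 400000 and 20 mirror dictionary_dim and window_dim from the query script.
word_to_ix, ix_to_word = data.init_dictionary(400000)

sents, targets = data.mask_words("The film was also intended to be the first in a trilogy .")
for sent_list, target in zip(sents, targets):
    # sent_tensor: list of 2 * 20 + 1 word indices (PAD-extended);
    # target_ix: index of the held-out word (UNK if out of dictionary)
    sent_tensor, target_ix = data.prepare_sequence(sent_list, target, word_to_ix, 20)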
@@ -0,0 +1,43 @@
'''
Contains the language model to predict a held-out word
given the surrounding context of a sentence.
'''

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

class WordGuesser(nn.Module):
    def __init__(self, hidden_dim, context_dim, embedding_dim, vocabulary_dim, batch_dim, window_dim):
        super(WordGuesser, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_dim = batch_dim
        self.window_dim = window_dim
        self.word_embeddings = nn.Embedding(vocabulary_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # maps the concatenated LSTM outputs of the whole window to a context vector
        self.extract_context = nn.Linear((2 * window_dim + 1) * hidden_dim, context_dim)
        # maps the context vector to a score for every word in the vocabulary
        self.predict = nn.Linear(context_dim, vocabulary_dim)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # (h_0, c_0) for the LSTM, allocated on the GPU
        return (autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()),
                autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()))

    def forward(self, sentence):
        #0 the held-out word w is replaced with $ (done during training)
        #1 consume all the words of the sentence
        embeddings = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeddings.permute(1, 0, 2), self.hidden)
        lstm_out = lstm_out.view(-1, (2 * self.window_dim + 1) * self.hidden_dim)

        #2 extract_context --> context c
        context = self.extract_context(lstm_out)

        #3 softmax to predict the word w from the context c
        prediction = self.predict(context)
        #out = F.softmax(prediction, dim=1)
        return prediction, context
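
A minimal sketch of driving the model (not part of the commit), written in the same old-style Variable API as the rest of the code. All layer sizes are placeholder values, and a CUDA device is assumed because init_hidden allocates the hidden state on the GPU.

# Illustrative sketch only; all dimensions are placeholders and a GPU is assumed.
import torch
import torch.autograd as autograd
from model import WordGuesser

window_dim = 20
model = WordGuesser(hidden_dim=100, context_dim=64, embedding_dim=50,
                    vocabulary_dim=1000, batch_dim=1, window_dim=window_dim).cuda()

# one dummy sentence of 2 * window_dim + 1 word indices
sentence = autograd.Variable(torch.LongTensor(1, 2 * window_dim + 1).zero_().cuda())
prediction, context = model(sentence)  # vocabulary scores and the context vector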
@@ -0,0 +1,104 @@
#!/home/emanuele/anaconda3/bin/python3.6
'''
Executes queries on a trained model to gather its accuracy.
'''

import sys
import random
from model import *
import data
import time
import numpy

# Hyper-parameters
dictionary_dim = 400000
window_dim = 20
batch_dim = 32
print_every = 10000
start = time.time()
modelfile = 'word_guesser.pt'
# ---

if len(sys.argv) < 2:
    print("Error | Argument missing: testing corpus needed.")
    sys.exit(1)

if sys.argv[1].find(".txt") != len(sys.argv[1]) - 4:
    print("Error | Bad argument: textual (.txt) corpora only.")
    sys.exit(1)

print("Initializing...") | ||
word_to_ix, ix_to_word = data.init_dictionary(dictionary_dim) | ||
if len(sys.argv) == 3: | ||
modelfile = sys.argv[2] | ||
model = torch.load(modelfile) | ||
model.batch_dim = 1 | ||
model.hidden = model.init_hidden() | ||
#model = torch.load('word_guesser.pt') | ||
model = model.cuda() | ||
#sent_count = 0 | ||
test_sentences = [] | ||
|
||
with open(sys.argv[1]) as f_in: | ||
for line in f_in: | ||
ss = line.strip().split("\t") | ||
if len(ss[0]) <= 0: | ||
continue | ||
test_sentences.append(ss) | ||
print("Done.") | ||
|
||
'''
print("Starting querying...")
for sent in test_sentences:
    sent_list = sent[0].split()
    if len(sent_list) <= 0:
        continue
    word = sent[1]
    sent_tensor = data.prepare_tensor(sent_list, word_to_ix)
    if sent_tensor is None:
        print('Sentence: %s\nFound word out of dictionary\n' % (sent[0]))
        continue
    sent_tensor = torch.LongTensor(sent_tensor)
    sent_tensor = sent_tensor.cuda()
    sent_tensor = autograd.Variable(sent_tensor)
    prediction, c = model(sent_tensor)
    prediction = F.softmax(prediction)
    word_ids = prediction.data.topk(10)[1][0].cpu().numpy()
    word_predictions = []
    for i in word_ids:
        word_predictions.append(ix_to_word[i])
    print('Sentence: %s | Word: %s' % (sent[0], word))
    print("Predictions:\n", word_predictions)
    print("")
print("Done.")
'''

print("Querying...") | ||
for t_sent in test_sentences: | ||
sent = t_sent[0] | ||
target = t_sent[1] | ||
sent_tensor, _ = data.prepare_sequence(sent.split(), target, word_to_ix, window_dim, False) | ||
sent_tensor = numpy.array([sent_tensor]) | ||
|
||
input_tensor = torch.LongTensor(sent_tensor) | ||
input_tensor = input_tensor.cuda() | ||
input_tensor = autograd.Variable(input_tensor) | ||
|
||
prediction, context = model(input_tensor) | ||
prediction = F.softmax(prediction, dim=1) | ||
|
||
word_ids = prediction.data.topk(10)[1][0].cpu().numpy() | ||
word_predictions = [] | ||
for i in word_ids: | ||
word_predictions.append(ix_to_word[i]) | ||
|
||
print('Sentence: %s | Word: %s' % (sent, target)) | ||
print("Predictions:\n", word_predictions) | ||
print("") | ||
|
||
print("Done.") |
@@ -0,0 +1,4 @@
The film was also intended to $ the first in a trilogy. be
The film $ also intended to be the first in a trilogy. was
$ film was also intended to be the first in a trilogy. The
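
Each of these lines pairs a sentence whose held-out word has been replaced by $ with the word itself, separated by a tab; the query script splits every line on the tab to recover the masked sentence and its target.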
Very good work.
I appreciate your comment, although I would like to point out that this work is very outdated now and most of the parallel programming stuff can be leveraged automatically from within the PyTorch library itself.