final release
emanuelegiona committed Apr 26, 2018
1 parent dcb50df commit 23797fc
Showing 5 changed files with 187 additions and 293 deletions.
36 changes: 32 additions & 4 deletions README.md
@@ -1,5 +1,3 @@
### Still work in progress

# Neural WSD

This project aims to replicate Google's ["Semi-supervised Word Sense Disambiguation with Neural Models"](https://research.google.com/pubs/pub45729.html?authuser=0).
@@ -10,11 +8,19 @@ Dictionary built using [Google English One Million 1-grams](http://storage.googl

## How to use

### Setup

- Python 3.6.3 (Anaconda custom 64-bit)
- PyTorch 0.3.1 (0.4.0 might not work due to ["torch.Tensor and autograd.Variable changes"](https://github.com/pytorch/pytorch/releases/tag/v0.4.0))
- CUDA 8
- spaCy v2.0 with English models (more ["here"](https://spacy.io/usage/))
- the project folder must contain a folder named `batches` in the same directory as `train.py` (a quick environment check covering these requirements is sketched below)
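
Purely as a convenience (not part of the repository), a minimal sanity check of the environment described above could look like the following; paths are assumed to be relative to the project root:

```python
import os

import spacy
import torch

# PyTorch 0.3.1 is expected; 0.4.0 changed torch.Tensor/autograd.Variable semantics
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

# spaCy English models must be installed beforehand (e.g. python -m spacy download en)
nlp = spacy.load('en')
print("spaCy 'en' model loaded")

# train.py expects a `batches` folder next to it and does not create it (see Known bugs)
if not os.path.isdir("batches"):
    print("Missing `batches` folder: create it before running train.py")
```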

### Training

Start training by using this command:

`./train.py <path/to/training_set> <path/to/model>`
`python train.py <path/to/training_set> <path/to/model>`

where:
- the training set file is a UTF-8 encoded .txt file;
@@ -28,11 +34,33 @@ the training will retrain the model (for example to resume training).

Start querying the model by using this command:

`./query.py <path/to/test_set> <path/to/model>`
`python query.py <path/to/test_set> <path/to/model>`

where:
- the test set file is a UTF-8 encoded .txt file;
- model file: same as for training.

The model file is optional: if it is not specified, the script assumes a model is stored in `word_guesser.pt`;
if a model file is specified, the model stored in that file is used for predictions.
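
For reference, loading a trained model for querying boils down to the following, condensed from the `query.py` shown later in this diff (hyper-parameter values are the ones set at the top of that script):

```python
import torch
import data
from model import WordGuesser

# hyper-parameters as set in query.py
hidden_units, context_dim, embedding_dim = 2048, 512, 512
dictionary_dim = 100000

# dictionary built from training/dictionary.txt
word_to_ix, ix_to_word = data.init_dictionary(dictionary_dim)

# batch dimension of 1, since queries are processed one sentence at a time
model = WordGuesser(hidden_units, context_dim, embedding_dim, len(word_to_ix), 1)
model.load_state_dict(torch.load('word_guesser.pt'))
model.train(False)                  # inference mode
model.hidden = model.init_hidden()
model = model.cuda()
```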

## Features

- Multi-threaded operation: reading the training file, splitting it into sentences, batching, and training all happen simultaneously (producer-consumer pattern)
- Low RAM usage thanks to bounded queues between threads and periodic dumps of created batches
- Sentences are never padded; instead they are grouped by length and batches are built from sentences of the same length (see the sketch after this list)
- Dynamic batch size: batches of the maximal size (hyper-parameter `batch_dim`) are created whenever possible, and smaller batches are not padded
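
A minimal sketch of this length-grouping idea; the function name and the plain-string input format are illustrative assumptions, not code from this repository:

```python
from collections import defaultdict

def make_batches(sentences, batch_dim):
    """Group sentences by token count, then slice each group into batches of at most batch_dim."""
    by_length = defaultdict(list)
    for sent in sentences:
        tokens = sent.split()
        if tokens:
            by_length[len(tokens)].append(tokens)

    batches = []
    for length in sorted(by_length):
        group = by_length[length]
        for i in range(0, len(group), batch_dim):
            # every sentence in the batch has the same length, so no padding is needed
            batches.append(group[i:i + batch_dim])
    return batches

# toy example: the two 3-token sentences end up in one batch, the 2-token sentence in another
print(make_batches(["the cat sat", "a dog ran", "hello world"], batch_dim=32))
```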

## Known bugs

- The `batches` folder is not created automatically if it does not exist
- The only accepted training corpus format is UTF-8 encoded plain text
- Slow on large training corpora; could be made faster by implementing hierarchical softmax or negative sampling (a rough sketch follows this list)
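
Purely as an illustration of the negative sampling idea mentioned above (nothing like this exists in the repository, and it is written against a more recent PyTorch API than the 0.3.1 version used here), a sampled loss over a context vector could look like:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SampledLoss(nn.Module):
    """Negative-sampling style loss: score the true word against k random words
    instead of computing a softmax over the whole vocabulary."""
    def __init__(self, context_dim, vocabulary_dim, k=10):
        super(SampledLoss, self).__init__()
        self.out_embed = nn.Embedding(vocabulary_dim, context_dim)
        self.vocabulary_dim = vocabulary_dim
        self.k = k

    def forward(self, context, target):
        # context: (batch, context_dim), target: (batch,) word indices
        pos = self.out_embed(target)                                   # (batch, context_dim)
        neg_ids = torch.randint(0, self.vocabulary_dim,
                                (target.size(0), self.k),
                                device=target.device)
        neg = self.out_embed(neg_ids)                                  # (batch, k, context_dim)
        pos_score = F.logsigmoid((pos * context).sum(dim=1))
        neg_score = F.logsigmoid(-(neg * context.unsqueeze(1)).sum(dim=2)).sum(dim=1)
        return -(pos_score + neg_score).mean()
```

Here `context` would be the context vector produced by `WordGuesser` and `target` the index of the masked word; the full-vocabulary `predict` layer would then only be needed at query time.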

## Consulted resources

- ["PyTorch Tutorials"](http://pytorch.org/tutorials/)
- ["Practical PyTorch"](https://github.com/spro/practical-pytorch)
- ["The Incredible PyTorch"](https://github.com/ritchieng/the-incredible-pytorch)
- ["Optimizing PyTorch training code"](https://www.sagivtech.com/2017/09/19/optimizing-pytorch-training-code/)
- ["Word Sense Disambiguation with LSTM: Do We Really Need 100 Billion Words?"](https://github.com/cltl/wsd-dynamic-sense-vector)

82 changes: 13 additions & 69 deletions data.py
@@ -1,6 +1,5 @@
'''
Contains various utility functions to manipulate
training data and query data and their specifications.
Contains various utility functions to manipulate training and query data.
'''

from model import *
@@ -17,10 +16,19 @@ def custom_pipeline(nlp):
    return (nlp.tagger, nlp.parser)

nlp = spacy.load('en', create_pipeline=custom_pipeline)
all_letters = string.ascii_letters + "1234567890-.,;'"
n_letters = len(all_letters)

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def init_dictionary(dictionary_dim):
    word_to_ix = {'$' : 0, 'PAD' : 1, 'UNK' : 2}
    ix_to_word = ["$", "PAD", "UNK"]
    word_to_ix = {'PAD' : 0, '$' : 1, 'UNK' : 2}
    ix_to_word = ["PAD", "$", "UNK"]

    with open("training/dictionary.txt") as f_in:
        counter = 0
@@ -51,26 +59,6 @@ def split_to_sentences(lines):

    return sentences

def _split_to_sentences(filename):
    print("- Reading file...")
    sentences = []
    with open(filename) as f_in:
        text = f_in.read()
        lines = text.split("\n")
        print("- Splitting sentences...")
        for line in tqdm(lines[:32]):
            if line.startswith("TITLE:") or line.startswith("=="):
                continue

            tokens = nlp(line)
            for sent in tokens.sents:
                words = []
                for word in sent:
                    words.append(unicode_to_ascii(word.text))
                sentences.append(' '.join(words).strip())

    return sentences

def mask_words(sent):
    sent_list = sent.split()
    if len(sent_list) <= 0:
@@ -85,47 +73,13 @@ def mask_words(sent):
        targets.append(sent_list[i])
    return sents, targets

def prepare_sequence(sent_list, word, to_ix, window_dim, training=True):
    '''
    sent_list = sent.split()
    if len(sent_list) <= 0:
        return None, None
    word_id = random.choice(range(len(sent_list)))
    word = sent_list[word_id]
    with open("training/file1.txt", 'a') as f_out:
        f_out.write(sent + "\t" + word + "\n")
        f_out.flush()
    '''

    '''
    if training:
        sent_list[word_id] = "$"
    '''

    '''
    m = len(sent_list) / 2
    #uniform sentence length
    sent_list = sent_list[max(0, m - window_dim) : min(len(sent_list), m + window_dim)]
    #pad sentence if necessary
    if len(sent_list) < (2 * window_dim + 1):
        sent_list += ['PAD'] * ((2 * window_dim + 1) - len(sent_list))
    '''

def prepare_sequence(sent_list, word, to_ix):
    sent_tensor = prepare_tensor(sent_list, to_ix)
    if sent_tensor is None:
        return None, None

    target = to_ix[word] if word in to_ix else to_ix['UNK']

    '''
    with open("training/file2.txt", 'a') as f_out:
        f_out.write(str(sent_tensor) + "\t" + str(target) + "\n")
        f_out.flush()
    '''

    return sent_tensor, target

def prepare_tensor(seq, to_ix):
@@ -144,16 +98,6 @@ def _prepare_sequence(seq, to_ix):
    finally:
        return ids

all_letters = string.ascii_letters + "1234567890-.,;'"
n_letters = len(all_letters)

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def elapsed(start):
    now = time.time()
    s = now - start
20 changes: 6 additions & 14 deletions model.py
@@ -8,18 +8,18 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

torch.manual_seed(1)

class WordGuesser(nn.Module):
    def __init__(self, hidden_dim, context_dim, embedding_dim, vocabulary_dim, batch_dim, window_dim):
    def __init__(self, hidden_dim, context_dim, embedding_dim, vocabulary_dim, batch_dim):
        super(WordGuesser, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_dim = batch_dim
        self.window_dim = window_dim
        self.word_embeddings = nn.Embedding(vocabulary_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        #self.extract_context = nn.Linear((2 * window_dim + 1) * hidden_dim, context_dim)
        self.extract_context = nn.Linear(hidden_dim, context_dim)
        self.predict = nn.Linear(context_dim, vocabulary_dim)
        self.hidden = self.init_hidden()
@@ -28,19 +28,11 @@ def init_hidden(self):
        return (autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()),
                autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()))

    def forward(self, sentence, hidden):
        #0 replace word w with $ --> during training
        #1 consume all the words of the sentence
    def forward(self, sentence):
        embeddings = self.word_embeddings(sentence)
        out, self.hidden = self.lstm(embeddings.permute(1, 0, 2), hidden)
        packed = embeddings.permute(1, 0, 2)
        out, self.hidden = self.lstm(packed, self.hidden)
        lstm_out = out[-1]
        #print(lstm_out)
        #lstm_out = lstm_out.view(-1, (2 * self.window_dim + 1) * self.hidden_dim)

        #2 extract_context --> context c
        context = self.extract_context(lstm_out)

        #3 softmax to predict word w from context c
        prediction = self.predict(context)
        #out = F.softmax(prediction, dim=1)
        return prediction, context
113 changes: 15 additions & 98 deletions query.py
@@ -1,4 +1,3 @@
#!/home/emanuele/anaconda3/bin/python3.6
'''
Executes queries on a trained model to gather its accuracy.
'''
@@ -11,13 +10,14 @@
import numpy

# Hyper-parameters
dictionary_dim = 400000
window_dim = 20
hidden_units = 2048
context_dim = 512
embedding_dim = 512
dictionary_dim = 100000
window_dim = 10
batch_dim = 32
print_every = 10000
start = time.time()
modelfile = 'word_guesser.pt'
train2 = False
# ---

if len(sys.argv) < 2:
@@ -29,55 +29,16 @@
    sys.exit(1)

print("Initializing...")
#word_to_ix, ix_to_word = data.init_dictionary(dictionary_dim)
#'''
if train2:
    lines = open('training/overfit.txt').readlines()
    sentences = data.split_to_sentences(lines)
    word_to_ix = dict()
    ix_to_word = dict()
    i = 0
    for s in sentences:
        for t in s.strip().split(' '):
            if t in word_to_ix:
                continue
            word_to_ix[t] = i
            ix_to_word[i] = t
            i += 1
    word_to_ix['$'] = i
    ix_to_word[i] = '$'
else:
    word_to_ix = {"$":0, "PAD":1, "UNK":2}
    ix_to_word = ["$", "PAD", "UNK"]

    with open("training/overfit.txt") as f_in:
        for l in f_in.readlines():
            for w in l.strip().split(' '):
                if w not in word_to_ix:
                    word_to_ix[w] = len(word_to_ix)
                    ix_to_word.append(w)
#'''
word_to_ix, ix_to_word = data.init_dictionary(dictionary_dim)

if len(sys.argv) == 3:
    modelfile = sys.argv[2]
'''
model = torch.load(modelfile)
model.train(False)
'''

hidden_units = 512
context_dim = 256
embedding_dim = 256
if train2:
    model = WordGuesser(hidden_units, context_dim, embedding_dim, len(word_to_ix), 1, 13)
else:
    model = WordGuesser(hidden_units, context_dim, embedding_dim, len(word_to_ix), 1, window_dim)
model = WordGuesser(hidden_units, context_dim, embedding_dim, len(word_to_ix), 1)
model.load_state_dict(torch.load(modelfile))
model.train(False)
model.hidden = model.init_hidden()
#model = torch.load('word_guesser.pt')
model = model.cuda()
#sent_count = 0
test_sentences = []

with open(sys.argv[1]) as f_in:
@@ -88,39 +49,10 @@
        test_sentences.append(ss)
print("Done.")

'''
print("Starting querying...")
for sent in test_sentences:
    sent_list = sent[0].split()
    if len(sent_list) <= 0:
        continue
    word = sent[1]
    sent_tensor = data.prepare_tensor(sent_list, word_to_ix)
    if sent_tensor is None:
        print('Sentence: %s\nFound word out of dictionary\n' % (sent[0]))
        continue
    sent_tensor = torch.LongTensor(sent_tensor)
    sent_tensor = sent_tensor.cuda()
    sent_tensor = autograd.Variable(sent_tensor)
    prediction, c = model(sent_tensor)
    prediction = F.softmax(prediction)
    word_ids = prediction.data.topk(10)[1][0].cpu().numpy()
    word_predictions = []
    for i in word_ids:
        word_predictions.append(ix_to_word[i])
    print('Sentence: %s | Word: %s' % (sent[0], word))
    print("Predictions:\n", word_predictions)
    print("")
print("Done.")
'''

print("Querying...")
warm_up = 3

#warming up internal gradients before model.eval()
while warm_up != 0:
    warm_up -= 1
    if warm_up == 0:
@@ -129,39 +61,24 @@
    for t_sent in test_sentences:
        sent = t_sent[0]
        target = t_sent[1]
        #sent_tensor, _ = data.prepare_sequence(sent.split(), target, word_to_ix, window_dim, False)
        #sent_tensor = numpy.array([sent_tensor])

        sent_tensor = [word_to_ix[w] for w in sent.split()]
        sent_tensor, _ = data.prepare_sequence(sent.split(), target, word_to_ix)

        input_tensor = torch.LongTensor([sent_tensor])
        input_tensor = input_tensor.cuda()
        input_tensor = autograd.Variable(input_tensor)

        hidden = model.init_hidden()
        predictions, context = model(input_tensor, hidden)
        #predictions = F.softmax(predictions, dim=1)
        model.zero_grad()
        model.hidden = model.init_hidden()
        predictions, context = model(input_tensor)

        #'''
        if warm_up == 0:
        #'''
            print('Sentence: %s\nTarget: %s' % (sent, target))
            for i, prediction in enumerate(predictions):
                sorted_val = sorted(enumerate(numpy.array(prediction.data)), key=lambda x : x[1], reverse=True)
                print([(ix_to_word[x[0]], x[1]) for x in sorted_val[:5]]) #, ix_to_word[Y[i]]
            print("")
        #'''

            '''
            word_ids = predictions.data.topk(10)[1][0].cpu().numpy()
            word_ids = predictions.data.topk(5)[1][0].cpu().numpy()
            word_predictions = []
            for i in word_ids:
                word_predictions.append(ix_to_word[i])

            print('Sentence: %s | Word: %s' % (sent, target))
            print('Sentence: %s\nWord: %s' % (sent, target))
            print("Predictions:\n", word_predictions)
            print("")
            #'''
            #'''

print("Done.")
