final release
emanuelegiona committed Apr 26, 2018
1 parent dcb50df commit 23797fc
Showing 5 changed files with 187 additions and 293 deletions.
36 changes: 32 additions & 4 deletions README.md
@@ -1,5 +1,3 @@
### Still work in progress

# Neural WSD

This project aims to replicate Google's ["Semi-supervised Word Sense Disambiguation with Neural Models"](https://research.google.com/pubs/pub45729.html?authuser=0).
@@ -10,11 +8,19 @@ Dictionary built using [Google English One Million 1-grams](http://storage.googl

## How to use

### Setup

- Python 3.6.3 (Anaconda custom 64-bit)
- PyTorch 0.3.1 (0.4.0 might not work due to ["torch.Tensor and autograd.Variable changes"](https://github.com/pytorch/pytorch/releases/tag/v0.4.0))
- CUDA 8
- spaCy v2.0 with English models (more ["here"](https://spacy.io/usage/))
- the project folder must contain a folder named `batches` in the same directory as `train.py` (a quick environment check covering these requirements is sketched below)
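
Purely as a convenience (not part of the repository), a minimal sanity check of the environment described above could look like the following; paths are assumed to be relative to the project root:

```python
import os

import spacy
import torch

# PyTorch 0.3.1 is expected; 0.4.0 changed torch.Tensor/autograd.Variable semantics
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

# spaCy English models must be installed beforehand (e.g. python -m spacy download en)
nlp = spacy.load('en')
print("spaCy 'en' model loaded")

# train.py expects a `batches` folder next to it and does not create it (see Known bugs)
if not os.path.isdir("batches"):
    print("Missing `batches` folder: create it before running train.py")
```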

### Training

Start training by using this command:

`./train.py <path/to/training_set> <path/to/model>`
`python train.py <path/to/training_set> <path/to/model>`

where:
- the training set file is a UTF-8 encoded .txt file;
@@ -28,11 +34,33 @@ the training will retrain the model (for example to resume training).

Start querying the model by using this command:

`./query.py <path/to/test_set> <path/to/model>`
`python query.py <path/to/test_set> <path/to/model>`

where:
- the test set file is a UTF-8 encoded .txt file;
- model file: same as for training.

The model file is optional: if it is not specified, the script assumes a model is stored in `word_guesser.pt`;
if a model file is specified, the model stored in that file is used for predictions.
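
For reference, loading a trained model for querying boils down to the following, condensed from the `query.py` shown later in this diff (hyper-parameter values are the ones set at the top of that script):

```python
import torch
import data
from model import WordGuesser

# hyper-parameters as set in query.py
hidden_units, context_dim, embedding_dim = 2048, 512, 512
dictionary_dim = 100000

# dictionary built from training/dictionary.txt
word_to_ix, ix_to_word = data.init_dictionary(dictionary_dim)

# batch dimension of 1, since queries are processed one sentence at a time
model = WordGuesser(hidden_units, context_dim, embedding_dim, len(word_to_ix), 1)
model.load_state_dict(torch.load('word_guesser.pt'))
model.train(False)                  # inference mode
model.hidden = model.init_hidden()
model = model.cuda()
```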

## Features

- Multi-threaded operation: reading the training file, splitting it into sentences, batching, and training all happen simultaneously (producer-consumer pattern)
- Low RAM usage thanks to bounded queues between threads and periodic dumps of created batches
- Sentences are never padded; instead they are grouped by length and batches are built from sentences of the same length (see the sketch after this list)
- Dynamic batch size: batches of the maximal size (hyper-parameter `batch_dim`) are created whenever possible, and smaller batches are not padded
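
A minimal sketch of this length-grouping idea; the function name and the plain-string input format are illustrative assumptions, not code from this repository:

```python
from collections import defaultdict

def make_batches(sentences, batch_dim):
    """Group sentences by token count, then slice each group into batches of at most batch_dim."""
    by_length = defaultdict(list)
    for sent in sentences:
        tokens = sent.split()
        if tokens:
            by_length[len(tokens)].append(tokens)

    batches = []
    for length in sorted(by_length):
        group = by_length[length]
        for i in range(0, len(group), batch_dim):
            # every sentence in the batch has the same length, so no padding is needed
            batches.append(group[i:i + batch_dim])
    return batches

# toy example: the two 3-token sentences end up in one batch, the 2-token sentence in another
print(make_batches(["the cat sat", "a dog ran", "hello world"], batch_dim=32))
```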

## Known bugs

- The `batches` folder is not created automatically if it does not exist
- The only accepted training corpus format is UTF-8 encoded plain text
- Slow on large training corpora; could be made faster by implementing hierarchical softmax or negative sampling (a rough sketch follows this list)
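
Purely as an illustration of the negative sampling idea mentioned above (nothing like this exists in the repository, and it is written against a more recent PyTorch API than the 0.3.1 version used here), a sampled loss over a context vector could look like:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SampledLoss(nn.Module):
    """Negative-sampling style loss: score the true word against k random words
    instead of computing a softmax over the whole vocabulary."""
    def __init__(self, context_dim, vocabulary_dim, k=10):
        super(SampledLoss, self).__init__()
        self.out_embed = nn.Embedding(vocabulary_dim, context_dim)
        self.vocabulary_dim = vocabulary_dim
        self.k = k

    def forward(self, context, target):
        # context: (batch, context_dim), target: (batch,) word indices
        pos = self.out_embed(target)                                   # (batch, context_dim)
        neg_ids = torch.randint(0, self.vocabulary_dim,
                                (target.size(0), self.k),
                                device=target.device)
        neg = self.out_embed(neg_ids)                                  # (batch, k, context_dim)
        pos_score = F.logsigmoid((pos * context).sum(dim=1))
        neg_score = F.logsigmoid(-(neg * context.unsqueeze(1)).sum(dim=2)).sum(dim=1)
        return -(pos_score + neg_score).mean()
```

Here `context` would be the context vector produced by `WordGuesser` and `target` the index of the masked word; the full-vocabulary `predict` layer would then only be needed at query time.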

## Consulted resources

- ["PyTorch Tutorials"](http://pytorch.org/tutorials/)
- ["Practical PyTorch"](https://github.com/spro/practical-pytorch)
- ["The Incredible PyTorch"](https://github.com/ritchieng/the-incredible-pytorch)
- ["Optimizing PyTorch training code"](https://www.sagivtech.com/2017/09/19/optimizing-pytorch-training-code/)
- ["Word Sense Disambiguation with LSTM: Do We Really Need 100 Billion Words?"](https://github.com/cltl/wsd-dynamic-sense-vector)

82 changes: 13 additions & 69 deletions data.py
@@ -1,6 +1,5 @@
'''
Contains various utility functions to manipulate
training data and query data and their specifications.
Contains various utility functions to manipulate training and query data.
'''

from model import *
@@ -17,10 +16,19 @@ def custom_pipeline(nlp):
    return (nlp.tagger, nlp.parser)

nlp = spacy.load('en', create_pipeline=custom_pipeline)
all_letters = string.ascii_letters + "1234567890-.,;'"
n_letters = len(all_letters)

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def init_dictionary(dictionary_dim):
    word_to_ix = {'$' : 0, 'PAD' : 1, 'UNK' : 2}
    ix_to_word = ["$", "PAD", "UNK"]
    word_to_ix = {'PAD' : 0, '$' : 1, 'UNK' : 2}
    ix_to_word = ["PAD", "$", "UNK"]

    with open("training/dictionary.txt") as f_in:
        counter = 0
@@ -51,26 +59,6 @@ def split_to_sentences(lines):

    return sentences

def _split_to_sentences(filename):
    print("- Reading file...")
    sentences = []
    with open(filename) as f_in:
        text = f_in.read()
        lines = text.split("\n")
        print("- Splitting sentences...")
        for line in tqdm(lines[:32]):
            if line.startswith("TITLE:") or line.startswith("=="):
                continue

            tokens = nlp(line)
            for sent in tokens.sents:
                words = []
                for word in sent:
                    words.append(unicode_to_ascii(word.text))
                sentences.append(' '.join(words).strip())

    return sentences

def mask_words(sent):
    sent_list = sent.split()
    if len(sent_list) <= 0:
@@ -85,47 +73,13 @@ def mask_words(sent):
        targets.append(sent_list[i])
    return sents, targets

def prepare_sequence(sent_list, word, to_ix, window_dim, training=True):
    '''
    sent_list = sent.split()
    if len(sent_list) <= 0:
        return None, None
    word_id = random.choice(range(len(sent_list)))
    word = sent_list[word_id]
    with open("training/file1.txt", 'a') as f_out:
        f_out.write(sent + "\t" + word + "\n")
        f_out.flush()
    '''

    '''
    if training:
        sent_list[word_id] = "$"
    '''

    '''
    m = len(sent_list) / 2
    #uniform sentence length
    sent_list = sent_list[max(0, m - window_dim) : min(len(sent_list), m + window_dim)]
    #pad sentence if necessary
    if len(sent_list) < (2 * window_dim + 1):
        sent_list += ['PAD'] * ((2 * window_dim + 1) - len(sent_list))
    '''

def prepare_sequence(sent_list, word, to_ix):
    sent_tensor = prepare_tensor(sent_list, to_ix)
    if sent_tensor is None:
        return None, None

    target = to_ix[word] if word in to_ix else to_ix['UNK']

    '''
    with open("training/file2.txt", 'a') as f_out:
        f_out.write(str(sent_tensor) + "\t" + str(target) + "\n")
        f_out.flush()
    '''

    return sent_tensor, target

def prepare_tensor(seq, to_ix):
@@ -144,16 +98,6 @@ def _prepare_sequence(seq, to_ix):
    finally:
        return ids

all_letters = string.ascii_letters + "1234567890-.,;'"
n_letters = len(all_letters)

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def elapsed(start):
    now = time.time()
    s = now - start
20 changes: 6 additions & 14 deletions model.py
@@ -8,18 +8,18 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

torch.manual_seed(1)

class WordGuesser(nn.Module):
    def __init__(self, hidden_dim, context_dim, embedding_dim, vocabulary_dim, batch_dim, window_dim):
    def __init__(self, hidden_dim, context_dim, embedding_dim, vocabulary_dim, batch_dim):
        super(WordGuesser, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_dim = batch_dim
        self.window_dim = window_dim
        self.word_embeddings = nn.Embedding(vocabulary_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        #self.extract_context = nn.Linear((2 * window_dim + 1) * hidden_dim, context_dim)
        self.extract_context = nn.Linear(hidden_dim, context_dim)
        self.predict = nn.Linear(context_dim, vocabulary_dim)
        self.hidden = self.init_hidden()
@@ -28,19 +28,11 @@ def init_hidden(self):
        return (autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()),
                autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()))

    def forward(self, sentence, hidden):
        #0 replace word w with $ --> during training
        #1 consume all the words of the sentence
    def forward(self, sentence):
        embeddings = self.word_embeddings(sentence)
        out, self.hidden = self.lstm(embeddings.permute(1, 0, 2), hidden)
        packed = embeddings.permute(1, 0, 2)
        out, self.hidden = self.lstm(packed, self.hidden)
        lstm_out = out[-1]
        #print(lstm_out)
        #lstm_out = lstm_out.view(-1, (2 * self.window_dim + 1) * self.hidden_dim)

        #2 extract_context --> context c
        context = self.extract_context(lstm_out)

        #3 softmax to predict word w from context c
        prediction = self.predict(context)
        #out = F.softmax(prediction, dim=1)
        return prediction, context
113 changes: 15 additions & 98 deletions query.py
@@ -1,4 +1,3 @@
#!/home/emanuele/anaconda3/bin/python3.6
'''
Executes queries on a trained model to gather its accuracy.
'''
@@ -11,13 +10,14 @@
import numpy

# Hyper-parameters
dictionary_dim = 400000
window_dim = 20
hidden_units = 2048
context_dim = 512
embedding_dim = 512
dictionary_dim = 100000
window_dim = 10
batch_dim = 32
print_every = 10000
start = time.time()
modelfile = 'word_guesser.pt'
train2 = False
# ---

if len(sys.argv) < 2:
@@ -29,55 +29,16 @@
    sys.exit(1)

print("Initializing...")
#word_to_ix, ix_to_word = data.init_dictionary(dictionary_dim)
#'''
if train2:
    lines = open('training/overfit.txt').readlines()
    sentences = data.split_to_sentences(lines)
    word_to_ix = dict()
    ix_to_word = dict()
    i = 0
    for s in sentences:
        for t in s.strip().split(' '):
            if t in word_to_ix:
                continue
            word_to_ix[t] = i
            ix_to_word[i] = t
            i += 1
    word_to_ix['$'] = i
    ix_to_word[i] = '$'
else:
    word_to_ix = {"$":0, "PAD":1, "UNK":2}
    ix_to_word = ["$", "PAD", "UNK"]

    with open("training/overfit.txt") as f_in:
        for l in f_in.readlines():
            for w in l.strip().split(' '):
                if w not in word_to_ix:
                    word_to_ix[w] = len(word_to_ix)
                    ix_to_word.append(w)
#'''
word_to_ix, ix_to_word = data.init_dictionary(dictionary_dim)

if len(sys.argv) == 3:
    modelfile = sys.argv[2]
'''
model = torch.load(modelfile)
model.train(False)
'''

hidden_units = 512
context_dim = 256
embedding_dim = 256
if train2:
    model = WordGuesser(hidden_units, context_dim, embedding_dim, len(word_to_ix), 1, 13)
else:
    model = WordGuesser(hidden_units, context_dim, embedding_dim, len(word_to_ix), 1, window_dim)
model = WordGuesser(hidden_units, context_dim, embedding_dim, len(word_to_ix), 1)
model.load_state_dict(torch.load(modelfile))
model.train(False)
model.hidden = model.init_hidden()
#model = torch.load('word_guesser.pt')
model = model.cuda()
#sent_count = 0
test_sentences = []

with open(sys.argv[1]) as f_in:
@@ -88,39 +49,10 @@
        test_sentences.append(ss)
print("Done.")

'''
print("Starting querying...")
for sent in test_sentences:
    sent_list = sent[0].split()
    if len(sent_list) <= 0:
        continue
    word = sent[1]
    sent_tensor = data.prepare_tensor(sent_list, word_to_ix)
    if sent_tensor is None:
        print('Sentence: %s\nFound word out of dictionary\n' % (sent[0]))
        continue
    sent_tensor = torch.LongTensor(sent_tensor)
    sent_tensor = sent_tensor.cuda()
    sent_tensor = autograd.Variable(sent_tensor)
    prediction, c = model(sent_tensor)
    prediction = F.softmax(prediction)
    word_ids = prediction.data.topk(10)[1][0].cpu().numpy()
    word_predictions = []
    for i in word_ids:
        word_predictions.append(ix_to_word[i])
    print('Sentence: %s | Word: %s' % (sent[0], word))
    print("Predictions:\n", word_predictions)
    print("")
print("Done.")
'''

print("Querying...")
warm_up = 3

#warming up internal gradients before model.eval()
while warm_up != 0:
    warm_up -= 1
    if warm_up == 0:
@@ -129,39 +61,24 @@
    for t_sent in test_sentences:
        sent = t_sent[0]
        target = t_sent[1]
        #sent_tensor, _ = data.prepare_sequence(sent.split(), target, word_to_ix, window_dim, False)
        #sent_tensor = numpy.array([sent_tensor])

        sent_tensor = [word_to_ix[w] for w in sent.split()]
        sent_tensor, _ = data.prepare_sequence(sent.split(), target, word_to_ix)

        input_tensor = torch.LongTensor([sent_tensor])
        input_tensor = input_tensor.cuda()
        input_tensor = autograd.Variable(input_tensor)

        hidden = model.init_hidden()
        predictions, context = model(input_tensor, hidden)
        #predictions = F.softmax(predictions, dim=1)
        model.zero_grad()
        model.hidden = model.init_hidden()
        predictions, context = model(input_tensor)

        #'''
        if warm_up == 0:
        #'''
            print('Sentence: %s\nTarget: %s' % (sent, target))
            for i, prediction in enumerate(predictions):
                sorted_val = sorted(enumerate(numpy.array(prediction.data)), key=lambda x : x[1], reverse=True)
                print([(ix_to_word[x[0]], x[1]) for x in sorted_val[:5]]) #, ix_to_word[Y[i]]
            print("")
        #'''

            '''
            word_ids = predictions.data.topk(10)[1][0].cpu().numpy()
            word_ids = predictions.data.topk(5)[1][0].cpu().numpy()
            word_predictions = []
            for i in word_ids:
                word_predictions.append(ix_to_word[i])

            print('Sentence: %s | Word: %s' % (sent, target))
            print('Sentence: %s\nWord: %s' % (sent, target))
            print("Predictions:\n", word_predictions)
            print("")
            #'''
            #'''

print("Done.")
