Commit 8643fb3 (0 parents): upload
emanuelegiona committed Apr 5, 2018
Showing 7 changed files with 3,223,124 additions and 0 deletions.
183 changes: 183 additions & 0 deletions data.py
@@ -0,0 +1,183 @@
'''
Contains various utility functions to manipulate
training data and query data and their specifications.
'''

from model import *
import random
import spacy
import unicodedata
import string
import time
import math
import copy
from tqdm import tqdm

def custom_pipeline(nlp):
return (nlp.tagger, nlp.parser)

nlp = spacy.load('en', create_pipeline=custom_pipeline)

def init_dictionary(dictionary_dim):
word_to_ix = {'$' : 0, 'PAD' : 1, 'UNK' : 2}
ix_to_word = ["$", "PAD", "UNK"]

with open("training/dictionary.txt") as f_in:
counter = 0
for line in f_in:
if counter == dictionary_dim:
break

w = unicode_to_ascii(line)
if w not in word_to_ix:
word_to_ix[w] = len(word_to_ix)
ix_to_word.append(w)
counter += 1

return (word_to_ix, ix_to_word)

def split_to_sentences(lines):
sentences = []
for line in lines:
if line.startswith("TITLE:") or line.startswith("=="):
continue

tokens = nlp(line)
for sent in tokens.sents:
words = []
for word in sent:
words.append(unicode_to_ascii(word.text))
sentences.append(' '.join(words).strip())

return sentences

def _split_to_sentences(filename):
print("- Reading file...")
sentences = []
with open(filename) as f_in:
text = f_in.read()
lines = text.split("\n")
print("- Splitting sentences...")
for line in tqdm(lines[:32]):
if line.startswith("TITLE:") or line.startswith("=="):
continue

tokens = nlp(line)
for sent in tokens.sents:
words = []
for word in sent:
words.append(unicode_to_ascii(word.text))
sentences.append(' '.join(words).strip())

return sentences

def mask_words(sent):
sent_list = sent.split()
if len(sent_list) <= 0:
return None, None

sents = list()
targets = list()
for i in range(len(sent_list)):
new_list = copy.deepcopy(sent_list)
new_list[i] = "$"
sents.append(new_list)
targets.append(sent_list[i])
return sents, targets

def prepare_sequence(sent_list, word, to_ix, window_dim, training=True):
'''
sent_list = sent.split()
if len(sent_list) <= 0:
return None, None
word_id = random.choice(range(len(sent_list)))
word = sent_list[word_id]
with open("training/file1.txt", 'a') as f_out:
f_out.write(sent + "\t" + word + "\n")
f_out.flush()
'''

'''
if training:
sent_list[word_id] = "$"
'''
m = len(sent_list) // 2 #integer division, so m can be used as a slice index below
#uniform sentence length
sent_list = sent_list[max(0, m - window_dim) : min(len(sent_list), m + window_dim)]

#pad sentence if necessary
if len(sent_list) < (2 * window_dim + 1):
sent_list += ['PAD'] * ((2 * window_dim + 1) - len(sent_list))

sent_tensor = prepare_tensor(sent_list, to_ix)
if sent_tensor is None:
return None, None

target = to_ix[word] if word in to_ix else to_ix['UNK']

'''
with open("training/file2.txt", 'a') as f_out:
f_out.write(str(sent_tensor) + "\t" + str(target) + "\n")
f_out.flush()
'''

return sent_tensor, target

def prepare_tensor(seq, to_ix):
#map each word to its dictionary index, falling back to UNK for out-of-dictionary words
try:
ids = [to_ix[w] if w in to_ix else to_ix['UNK'] for w in seq]
except KeyError:
ids = None
return ids

def _prepare_sequence(seq, to_ix):
#same word-to-index mapping as prepare_tensor, with an UNK fallback
try:
ids = [to_ix[w] if w in to_ix else to_ix['UNK'] for w in seq]
except KeyError:
ids = None
return ids

all_letters = string.ascii_letters + "1234567890-.,;'"
n_letters = len(all_letters)

def unicode_to_ascii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
and c in all_letters
)

def elapsed(start):
now = time.time()
s = now - start
m = math.floor(s/60)
s -= m*60
s = float('%.3f' % (s))
if s < 10:
s = ('0%.3f' % (s))
ret = ('Time elapsed: %sm %ss' % (m, s))
return ret

def format_date(t):
date = time.localtime(t)
m = int(date.tm_mon)
if m < 10:
m = ('0%d' % (m))
d = int(date.tm_mday)
if d < 10:
d = ('0%d' % (d))
hr = int(date.tm_hour)
if hr < 10:
hr = ('0%d' % (hr))
mn = int(date.tm_min)
if mn < 10:
mn = ('0%d' % (mn))
sc = int(date.tm_sec)
if sc < 10:
sc = ('0%d' % (sc))
ret = ('%s/%s/%s %s:%s:%s' % (date.tm_year, m, d, hr, mn, sc))
return ret
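A minimal usage sketch (not part of this commit) of how the helpers above fit together; the dictionary size and window width are assumptions borrowed from query.py further down, and it presumes training/dictionary.txt and the spaCy 'en' model are available:

import data

word_to_ix, ix_to_word = data.init_dictionary(400000)
sentence = "The film was also intended to be the first in a trilogy."

masked_sents, targets = data.mask_words(sentence)
for masked, target in zip(masked_sents, targets):
    # masked is the token list with one word replaced by '$'; target is that word
    ids, target_ix = data.prepare_sequence(masked, target, word_to_ix, window_dim=20)
    # ids is a fixed-length (2 * window_dim + 1) list of word indices, padded with 'PAD';
    # target_ix is the dictionary index of the held-out word (UNK if out of dictionary)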
43 changes: 43 additions & 0 deletions model.py
@@ -0,0 +1,43 @@
'''
Contains the language model to predict a held-out word
given the surrounding context of a sentence.
'''

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

class WordGuesser(nn.Module):
def __init__(self, hidden_dim, context_dim, embedding_dim, vocabulary_dim, batch_dim, window_dim):
super(WordGuesser, self).__init__()
self.hidden_dim = hidden_dim
self.batch_dim = batch_dim
self.window_dim = window_dim
self.word_embeddings = nn.Embedding(vocabulary_dim, embedding_dim)
self.lstm = nn.LSTM(embedding_dim, hidden_dim)
self.extract_context = nn.Linear((2 * window_dim + 1) * hidden_dim, context_dim)
self.predict = nn.Linear(context_dim, vocabulary_dim)
self.hidden = self.init_hidden()

def init_hidden(self):
return (autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()),
autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()))

def forward(self, sentence):
#0 during training, the target word w is replaced with $ in the input
#1 consume all the words of the sentence
embeddings = self.word_embeddings(sentence)
lstm_out, self.hidden = self.lstm(embeddings.permute(1, 0, 2), self.hidden)
lstm_out = lstm_out.view(-1, (2 * self.window_dim + 1) * self.hidden_dim)

#2 extract_context --> context vector c
context = self.extract_context(lstm_out)

#3 softmax to predict the word w from the context c
prediction = self.predict(context)
#out = F.softmax(prediction, dim=1)
return prediction, context
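A minimal sketch (not part of this commit) of constructing and querying the model above. The hidden, context and embedding sizes are assumptions (the training script is not shown on this page); vocabulary_dim is taken as the 400,000-word dictionary plus the three special tokens from data.init_dictionary, and a CUDA device is required because init_hidden() allocates its state on the GPU:

window_dim = 20
model = WordGuesser(hidden_dim=256, context_dim=128, embedding_dim=100,
                    vocabulary_dim=400003, batch_dim=1, window_dim=window_dim).cuda()

# a single padded sentence of 2 * window_dim + 1 word indices (batch size 1)
sentence = autograd.Variable(torch.zeros(1, 2 * window_dim + 1).long().cuda())
scores, context = model(sentence)
# scores: (1, vocabulary_dim) unnormalized predictions; context: (1, context_dim)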
104 changes: 104 additions & 0 deletions query.py
@@ -0,0 +1,104 @@
#!/home/emanuele/anaconda3/bin/python3.6
'''
Executes queries on a trained model to gather its accuracy.
'''

import sys
import random
from model import *
import data
import time
import numpy

# Hyper-parameters
dictionary_dim = 400000
window_dim = 20
batch_dim = 32
print_every = 10000
start = time.time()
modelfile = 'word_guesser.pt'
# ---

if len(sys.argv) < 2:
print("Error | Argument missing: testing corpus needed.")
sys.exit(1)

if sys.argv[1].find(".txt") != len(sys.argv[1])-4:
print("Error | Bad argument: textual (.txt) corpora only.")
sys.exit(1)

print("Initializing...")
word_to_ix, ix_to_word = data.init_dictionary(dictionary_dim)
if len(sys.argv) == 3:
modelfile = sys.argv[2]
model = torch.load(modelfile)
model.batch_dim = 1
model.hidden = model.init_hidden()
#model = torch.load('word_guesser.pt')
model = model.cuda()
#sent_count = 0
test_sentences = []

with open(sys.argv[1]) as f_in:
for line in f_in:
ss = line.strip().split("\t")
if len(ss[0]) <= 0:
continue
test_sentences.append(ss)
print("Done.")

'''
print("Starting querying...")
for sent in test_sentences:
sent_list = sent[0].split()
if len(sent_list) <= 0:
continue
word = sent[1]
sent_tensor = data.prepare_tensor(sent_list, word_to_ix)
if sent_tensor is None:
print('Sentence: %s\nFound word out of dictionary\n' % (sent[0]))
continue
sent_tensor = torch.LongTensor(sent_tensor)
sent_tensor = sent_tensor.cuda()
sent_tensor = autograd.Variable(sent_tensor)
prediction, c = model(sent_tensor)
prediction = F.softmax(prediction)
word_ids = prediction.data.topk(10)[1][0].cpu().numpy()
word_predictions = []
for i in word_ids:
word_predictions.append(ix_to_word[i])
print('Sentence: %s | Word: %s' % (sent[0], word))
print("Predictions:\n", word_predictions)
print("")
print("Done.")
'''

print("Querying...")
for t_sent in test_sentences:
sent = t_sent[0]
target = t_sent[1]
sent_tensor, _ = data.prepare_sequence(sent.split(), target, word_to_ix, window_dim, False)
sent_tensor = numpy.array([sent_tensor])

input_tensor = torch.LongTensor(sent_tensor)
input_tensor = input_tensor.cuda()
input_tensor = autograd.Variable(input_tensor)

prediction, context = model(input_tensor)
prediction = F.softmax(prediction, dim=1)

word_ids = prediction.data.topk(10)[1][0].cpu().numpy()
word_predictions = []
for i in word_ids:
word_predictions.append(ix_to_word[i])

print('Sentence: %s | Word: %s' % (sent, target))
print("Predictions:\n", word_predictions)
print("")

print("Done.")
4 changes: 4 additions & 0 deletions testing/overfit.txt
@@ -0,0 +1,4 @@
The film was also intended to $ the first in a trilogy. be
The film $ also intended to be the first in a trilogy. was
$ film was also intended to be the first in a trilogy. The

2 comments on commit 8643fb3

@SandipSPatil
Very Good work

@emanuelegiona
Owner Author
I appreciate your comment, although I would like to point out that this work is very outdated now and most of the parallel programming stuff can be leveraged automatically from within the PyTorch library itself.
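For illustration only (not code from this repository): a minimal sketch of the built-in data parallelism referred to above, using PyTorch's stock nn.DataParallel wrapper and assuming more than one CUDA device is available:

import torch
import torch.nn as nn

model = torch.load('word_guesser.pt')   # the trained WordGuesser checkpoint
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)       # replicate the module and split each batch across GPUs
model = model.cuda()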
