main.py


# coding: utf-8

# In[1]:


# Notebook-only cell: hands a JavaScript snippet to IPython's `javascript` cell
# magic. The snippet fetches a theme script with $.getScript -- the remote
# https://odhk.github.io/hyrule_theme/custom.js, or the local
# /files/theme/custom.js when the page is served from localhost/127.0.0.1 and
# load_remote_theme is false (it is set to true here, so the remote one is used).
# NOTE(review): get_ipython is not defined or imported in this file, so this
# line presumably only works inside an IPython/Jupyter session -- confirm.
get_ipython().run_cell_magic('javascript', '', '\nwindow.load_remote_theme = true\nvar theme_js = "https://odhk.github.io/hyrule_theme/custom.js";\n\nwindow.load_local_theme = function(){\n    var hostname = document.location.hostname\n    return ((hostname == "localhost" || hostname == \'127.0.0.1\') && !load_remote_theme)\n}\n\nvar url = load_local_theme() ? document.location.origin + "/files/theme/custom.js" : theme_js\n\n$.getScript(url)')


# In[2]:


from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from functools import reduce
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced again in the visible part of this
# file, so the model and tensors below stay on the default (CPU) device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# In[3]:


# Reserved vocabulary indices for the start-of-sentence and end-of-sentence
# markers; they match the "SOS"/"EOS" entries that Lang.index2word starts with.
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary of one language: word/index maps plus per-word counts.

    ``index2word`` starts out with three reserved entries -- 0: "SOS",
    1: "EOS" and 2: "<unk>" -- which are *not* mirrored in ``word2index``.
    Words added later therefore receive indices from 3 upwards, in the order
    in which they are first seen.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "<unk>"}
        # Next free index: one past the reserved SOS / EOS / <unk> entries.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every token of *sentence*, splitting on single spaces.

        Because the split is on ' ' exactly, consecutive, leading or trailing
        spaces yield empty-string tokens, which are registered like any word.
        """
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count *word*, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1


# In[4]:


# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return *s* with combining marks (accents, diacritics) removed.

    The string is decomposed with NFD normalization and every character in
    Unicode category 'Mn' is dropped. Characters that have no decomposition
    (for example CJK ideographs) are returned unchanged, so the result is not
    necessarily pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    """Lowercase *s* and reduce it to ASCII letters and ``. ! ?``.

    Steps, in order: lowercase and strip surrounding whitespace, drop
    combining marks via ``unicodeToAscii``, insert a space before each
    ``.``, ``!`` or ``?``, then replace every run of other characters with a
    single space. Anything that is not an ASCII letter or one of those three
    punctuation marks (digits, apostrophes, CJK text, ...) therefore becomes
    a space, and the result is not stripped again, so it may start or end
    with a space.
    """
    text = unicodeToAscii(s.lower().strip())
    # Detach sentence punctuation so it becomes a token of its own.
    text = re.sub(r"([.!?])", r" \1", text)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", text)


# In[5]:


def readLangs(lang1, lang2, reverse=False):
    """Read ``data/<lang1>-<lang2>.txt`` and return empty vocabularies plus pairs.

    The file is expected to be UTF-8 text with one entry per line and the
    fields of an entry separated by tabs. Every field is passed through
    ``normalizeString``.

    Args:
        lang1: Name of the language in the first column of the file.
        lang2: Name of the language in the second column of the file.
        reverse: When true, each entry's fields are reversed and the roles of
            the two languages are swapped (``lang2`` becomes the input).

    Returns:
        A tuple ``(input_lang, output_lang, pairs)`` where the two ``Lang``
        objects are freshly created (no words added yet) and ``pairs`` is a
        list with one list of normalized strings per line.

    Raises:
        IOError/OSError: If the data file cannot be opened.
    """
    print("Reading lines...")

    # Read the whole file inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Split every line into its tab-separated fields and normalize each one
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]

    # Reverse pairs if requested, and create the matching Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


# In[6]:


# Presumably a maximum sentence length. NOTE(review): it is not referenced
# anywhere else in the visible part of this file (filterPair only checks the
# prefix) -- confirm whether a length filter was intended.
MAX_LENGTH = 10

# Sentence openings accepted by filterPair. They are matched against text that
# has been through normalizeString, where an apostrophe has already been turned
# into a space ("i'm" -> "i m").
# NOTE(review): entries without a trailing space (e.g. "she s") also match
# longer words such as "she said" -- confirm this is intended.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    """Return True when the second sentence of pair *p* starts with one of
    the ``eng_prefixes``."""
    second_sentence = p[1]
    return second_sentence.startswith(eng_prefixes)


def filterPairs(pairs):
    """Return a new list with only the pairs accepted by ``filterPair``,
    in their original order."""
    return list(filter(filterPair, pairs))


# In[7]:


def prepareData(lang1, lang2, reverse=False):
    """Load, filter and index the sentence pairs of a language pair.

    Reads the pairs with ``readLangs``, keeps those accepted by
    ``filterPairs``, and adds the first sentence of every kept pair to the
    input vocabulary and the second one to the output vocabulary. Progress
    and vocabulary sizes are printed along the way.

    Returns:
        ``(input_lang, output_lang, pairs)`` with both vocabularies populated
        and ``pairs`` holding only the pairs that passed the filter.
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Counting words...")
    for entry in kept_pairs:
        input_lang.addSentence(entry[0])
        output_lang.addSentence(entry[1])

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)

    return input_lang, output_lang, kept_pairs


# Load data/eng-chn.txt with the pair order reversed, so 'chn' is the input
# language and 'eng' the output language, then show one randomly chosen pair
# from those that passed the prefix filter.
input_lang, output_lang, pairs = prepareData('eng', 'chn', True)
print(random.choice(pairs))


# In[8]:


# Bare expression left over from the notebook (it displayed how often 'i'
# occurs). In a script it has no visible effect, but it still raises KeyError
# if 'i' was never added to the output vocabulary.
output_lang.word2count['i']
eng = output_lang
eng_words = output_lang.index2word.values()
#print(list(eng_words))
# For every whitespace-separated token of every pair's second sentence, split
# the token on '.'. The result is one list of fragments per token, e.g.
# 'cold' -> ['cold'] and '.' -> ['', ''].
big_sentence_array = [
    token.split('.')
    for pair in pairs
    for token in pair[1].split()
]


# In[9]:


# Contraction remnants left behind by normalization ("i'm" -> "i m") mapped
# back to the full word.
_CONTRACTION_EXPANSIONS = {'m': 'am', 're': 'are', 's': 'is'}

# Flatten the per-token fragment lists into one long word sequence, dropping
# the empty strings produced by splitting on '.', and expand the contraction
# remnants on the way. A single pass replaces the former
# reduce(lambda x, y: x + y, ...) concatenation, which copied the growing list
# on every step (quadratic time) and raised TypeError on an empty corpus.
test_sentence = [
    _CONTRACTION_EXPANSIONS.get(word, word)
    for fragments in big_sentence_array
    for word in fragments
    if word != ''
]


# In[ ]:


CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# The training text is the flattened word sequence in `test_sentence`.
# Build a list of tuples; each tuple is ([word_i, word_i+1], word_i+2), i.e.
# two context words followed by the word to predict. The window of two is
# written out literally here, so it must be kept in step with CONTEXT_SIZE.
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
# Number the words in sorted order. Iterating the set directly gives an order
# that can change from one interpreter run to the next (string hashing is
# randomized), which would make the saved embedding rows impossible to map
# back to words after a restart.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}
# Extra row reserved for out-of-vocabulary words.
word_to_ix['<unk>'] = len(word_to_ix)

class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The context word indices are embedded, the embeddings are concatenated,
    passed through one ReLU hidden layer and projected to log-probabilities
    over the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and of output classes.
        embedding_dim: Size of each word embedding.
        context_size: Number of context words per example.
        hidden_dim: Width of the hidden layer (defaults to 128, the value that
            used to be hard-coded).
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of the next word.

        Args:
            inputs: Long tensor of word indices, either one context of shape
                ``(context_size,)`` or a batch of shape
                ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(rows, vocab_size)`` with one row per context
            (a single context yields one row, as before).
        """
        # One row per example: concatenate the embeddings of its context
        # words. Fixing the column count to linear1's input width keeps the
        # single-context result identical to view((1, -1)) while also
        # accepting batches.
        embeds = self.embeddings(inputs).view(-1, self.linear1.in_features)
        hidden = F.relu(self.linear1(embeds))
        scores = self.linear2(hidden)
        return F.log_softmax(scores, dim=1)
# `trigrams` is already built earlier in this cell from the same
# `test_sentence`; the identical second construction that used to sit here was
# redundant work and has been dropped.

# Vocabulary size includes the extra '<unk>' entry of word_to_ix.
model = NGramLanguageModeler(len(word_to_ix), EMBEDDING_DIM, CONTEXT_SIZE)


# In[ ]:


losses = []
loss_function = nn.NLLLoss()

optimizer = optim.SGD(model.parameters(), lr=0.001)

# Step 1. Turn every (context words, target word) example into index tensors.
# The tensors are the same in every epoch, so they are built once here
# instead of being re-created for each example in each of the 50 passes.
training_examples = [
    (torch.tensor([word_to_ix[w] for w in context], dtype=torch.long),
     torch.tensor([word_to_ix[target]], dtype=torch.long))
    for context, target in trigrams
]

for epoch in range(50):
    # Kept as a 1-element tensor so the printed and stored values have the
    # same form as before.
    total_loss = torch.Tensor([0])
    for context_idxs, target_idx in training_examples:

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute the loss against the index of the target word
        loss = loss_function(log_probs, target_idx)

        # Step 5. Do the backward pass and update the parameters
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
    print(total_loss)
# One summed loss (a 1-element tensor) per epoch; inspect the sequence to
# check that training is converging.
print(losses)


# In[ ]:


# Save and load only the model parameters (recommended).
# This writes the model's state_dict to 'lm-model.pkl'; the commented-out line
# below shows how to load it back into a model.
torch.save(model.state_dict(), 'lm-model.pkl')
#model.load_state_dict(torch.load('lm-model.pkl'))


# In[ ]:


# Keep a handle on the model's embedding layer (an nn.Embedding) so individual
# word vectors can be looked up by row index.
embed = model.embeddings


# In[ ]:


# NOTE: this cell used to be wrapped in Markdown code fences (```), which are
# not valid Python; the fences are removed so the file parses. The unused
# one-hot list named `input` (which shadowed the builtin) is dropped as well.


def _embedding_vector(word):
    """Return the learned embedding of *word* as a 1-D NumPy array.

    The row is looked up through ``word_to_ix`` -- the numbering the model was
    trained with -- and not through ``eng.word2index``, whose indices follow a
    different order and can exceed the size of the embedding table.

    Raises:
        KeyError: If *word* is not part of the training vocabulary.
    """
    row = torch.tensor(word_to_ix[word], dtype=torch.long)
    return embed(row).detach().numpy()


def _cosine_similarity(u, v):
    """Return the cosine of the angle between the 1-D vectors *u* and *v*."""
    # Normalize by the product of the vector lengths, i.e. the square root of
    # the product of the self dot products (not by the squared lengths).
    return u.dot(v) / ((u.dot(u) * v.dot(v)) ** 0.5)


print(embed(torch.tensor(word_to_ix['i'])))
print(embed(torch.tensor(word_to_ix['me'])))
print(embed(torch.tensor(word_to_ix['you'])))
a = _embedding_vector('i')
b = _embedding_vector('me')
c = _embedding_vector('man')
cosab = _cosine_similarity(a, b)
# Despite its name this compares `a` with `c` ('i' vs 'man'), exactly as the
# original expression did; the name is kept for compatibility.
cosbc = _cosine_similarity(a, c)
print(cosab)
print(cosbc)