Commit a95b3dd

committed
first commit
1 parent 6b16853 commit a95b3dd

File tree

10 files changed, +415 -0 lines changed


.idea/Transformer.iml

Lines changed: 11 additions & 0 deletions

.idea/misc.xml

Lines changed: 7 additions & 0 deletions

.idea/modules.xml

Lines changed: 8 additions & 0 deletions

.idea/vcs.xml

Lines changed: 6 additions & 0 deletions
EncoderDecoder.py

Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
import torch.nn.functional as F
import torch
import torch.nn as nn
import copy
import numpy as np


class EncoderDecoder(nn.Module):  # inherits from nn.Module
    """
    A standard Encoder-Decoder architecture. It is the basis of many models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()  # call the parent class's __init__
        # encoder and decoder are passed in at construction time, which keeps this class very flexible
        self.encoder = encoder
        self.decoder = decoder
        # embedding modules for the source and target languages, applied to src and tgt respectively
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        # generator (defined below) produces the word for the current step from the Decoder's hidden state:
        # essentially a fully connected layer whose output size is the vocabulary size,
        # followed by a softmax to turn the scores into probabilities.
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):  # nn.Module defines __call__(), which calls forward(), so forward() runs automatically when the module is called with arguments
        # first encode the input, then decode
        return self.decode(self.encode(src, src_mask), src_mask,
                           tgt, tgt_mask)

    def encode(self, src, src_mask):
        # run the encoder on the embedded src together with src_mask
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):  # memory is the information produced by encode
        # run the decoder
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)


class Generator(nn.Module):
    # Produces a word from the Decoder's hidden state: the final two Decoder steps (linear + softmax).
    # d_model is the Decoder output size, vocab is the vocabulary size.
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)  # fully connected layer for the linear transformation

    # the fully connected layer followed by a softmax
    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)  # log of the softmax along the given dimension


# m = nn.LogSoftmax(dim=1)
# criterion = nn.NLLLoss()
# x = torch.randn(1, 5)
# y = torch.empty(1, dtype=torch.long).random_(5)
# loss = criterion(m(x), y)
# print(loss)


def clones(module, N):
    # clone N identical SubLayers using copy.deepcopy
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
    # nn.ModuleList has no forward method of its own, so it is usually placed inside some Module.


class Encoder(nn.Module):
    "The Encoder is a stack of N SubLayers with a LayerNorm at the end."
    # Rather than saying there are multiple Encoders, say that the Encoder contains multiple SubLayers.
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        # layer is a single SubLayer; clone it N times
        self.layers = clones(layer, N)
        # plus a final LayerNorm layer
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        "Process the input layer by layer."
        for layer in self.layers:
            x = layer(x, mask)
        # finish with LayerNorm; why there is a final LayerNorm is explained later.
        return self.norm(x)


class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        # layer is a single SubLayer; clone it N times
        self.layers = clones(layer, N)
        # plus a final LayerNorm layer
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Process the input layer by layer; each DecoderLayer also needs the encoder memory and both masks (see DecoderLayer.forward below)."
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        # finish with LayerNorm, as in the Encoder.
        return self.norm(x)


# When features have different value ranges, the gradient updates oscillate back and forth and take a long
# time to reach a local or global optimum. To fix this we normalize the data so that all features share the
# same value range, which lets gradient descent converge much faster.
class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2


# Both the Self-Attention and the fully connected sublayers follow the same pattern:
# LayerNorm first, then Self-Attention/Dense, then Dropout, and finally a residual connection.
# This norm + dropout + add pattern contains a lot of reusable code, so it is wrapped in SublayerConnection.
class SublayerConnection(nn.Module):
    """
    LayerNorm + sublayer(Self-Attention/Dense) + dropout + residual connection.
    For simplicity the LayerNorm is applied first; this differs slightly from the original paper,
    where the LayerNorm comes last.
    """

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):  # takes two arguments: an input Tensor and a callable that accepts a single argument
        "sublayer is the passed-in callable (see DecoderLayer); it is called like a function of one argument."
        return x + self.dropout(sublayer(self.norm(x)))


# Wraps either Self-Attention or the Dense (feed-forward) sublayer
class EncoderLayer(nn.Module):
    "An EncoderLayer is made up of self-attn and feed forward."
    # For reuse, the self_attn and feed_forward layers are also passed in as parameters;
    # this class only constructs the two SublayerConnections.
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn  # self_attn takes 4 arguments (Query input, Key input, Value input and Mask)
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)  # both the self-attention and the feed-forward sublayers need norm + dropout + add
        self.size = size

    # def forward(self, x, mask):
    #     "Follow Figure 1 (left) for connections."
    #     x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))  # the lambda trick turns self_attn into a function of the single argument x (mask is treated as known).
    #     return self.sublayer[1](x, self.feed_forward)

    # Explanation:
    # self_attn takes 4 arguments, but in the Encoder the first three are all the input y and the fourth is the mask.
    # Since the mask is known here, the lambda trick z = lambda y: self.self_attn(y, y, y, mask)
    # turns it into a function of a single argument y.
    def forward(self, x, mask):
        z = lambda y: self.self_attn(y, y, y, mask)
        x = self.sublayer[0](x, z)  # z plays the role of the sublayer argument in SublayerConnection.forward
        # self.sublayer[0] is callable: self.sublayer[0](x, z) invokes its __call__, which calls
        # SublayerConnection.forward(x, z); that in turn calls sublayer(self.norm(x)), and since sublayer is the
        # passed-in z, this is z(self.norm(x)).
        return self.sublayer[1](x, self.feed_forward)


class DecoderLayer(nn.Module):
    "A DecoderLayer is made up of self-attn, src-attn and feed forward."

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn  # one more layer than EncoderLayer: src-attn.
        # This is where the Decoder attends to the Encoder output (memory). src-attn is implemented exactly like
        # self-attn; only the Query, Key and Value inputs differ.
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):  # takes the Encoder memory as an extra argument
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))  # self-attention
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))  # encoder-decoder attention
        return self.sublayer[2](x, self.feed_forward)


# A key difference between the Decoder and the Encoder: when decoding time step t, the Decoder may only use
# the inputs from steps 1..t, never those from step t+1 onwards. We therefore need a function that produces
# such a Mask matrix.
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)  # start from a matrix of all ones
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')  # keep the upper triangle (other entries become 0); k=1 moves the kept diagonal one above the main diagonal (k defaults to 0)
    return torch.from_numpy(subsequent_mask) == 0  # torch.from_numpy() converts the array to a tensor sharing the same memory; comparing == 0 turns 0s into 1s and 1s into 0s.

# print(subsequent_mask(5))
# Output:
# 1 0 0 0 0
# 1 1 0 0 0
# 1 1 1 0 0
# 1 1 1 1 0
# 1 1 1 1 1
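
A quick sanity check of these building blocks (a minimal sketch, assuming the file above is importable as EncoderDecoder.py; the tensor shapes are illustrative only):

import torch
from EncoderDecoder import LayerNorm, subsequent_mask

x = torch.randn(2, 5, 8)        # (batch, seq_len, d_model), made-up sizes
norm = LayerNorm(8)
print(norm(x).shape)            # torch.Size([2, 5, 8]): LayerNorm preserves the shape
print(subsequent_mask(5)[0])    # 5x5 lower-triangular True/False mask, matching the commented output above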

FeedForward.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
import torch.nn as nn
import torch.nn.functional as F


# Position-wise fully connected layer, applied to each position independently and in parallel:
# two linear transformations with a ReLU activation between them.
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):  # input and output are d_model-dimensional; the hidden layer has d_ff units
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))  # besides the ReLU, a Dropout is also applied between the two linear transformations.
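
As a small illustration (a sketch assuming this file is importable as FeedForward.py; the sizes are made up), the feed-forward block maps (batch, seq_len, d_model) back to the same shape and only expands to d_ff internally:

import torch
from FeedForward import PositionwiseFeedForward

ff = PositionwiseFeedForward(d_model=16, d_ff=64, dropout=0.1)
x = torch.randn(2, 5, 16)
print(ff(x).shape)  # torch.Size([2, 5, 16]); d_ff=64 only appears inside w_1 and w_2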

InputEmbedding.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
import torch.nn as nn
import math


class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)
    # forward embeds the input x with nn.Embedding and additionally multiplies it by √d_model.
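
A short sketch of the scaling behaviour (assuming this file is importable as InputEmbedding.py; the vocabulary size and token ids are made up):

import torch
from InputEmbedding import Embeddings

emb = Embeddings(d_model=16, vocab=100)
ids = torch.tensor([[1, 2, 3]])   # (batch, seq_len) of token ids
print(emb(ids).shape)             # torch.Size([1, 3, 16])
# each output vector is the raw nn.Embedding lookup multiplied by sqrt(16) = 4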

Model.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
from InputEmbedding import *
from EncoderDecoder import *
from MultiHeadedAttention import *
from PositionalEncoding import *
from FeedForward import *
import time


def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    c = copy.deepcopy  # alias copy.deepcopy as c to keep the code below a little shorter.
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(  # build the EncoderDecoder; it needs 5 arguments: Encoder, Decoder, src-embed, tgt-embed and Generator.
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),  # the Decoder is made of N DecoderLayers
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),  # src-embed is an Embeddings layer followed by a positional-encoding layer c(position)
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),  # tgt-embed is built the same way.
        Generator(d_model, tgt_vocab))  # turns the model's hidden units into output-word probabilities

    # Randomly initialize the parameters; this is very important.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)  # Xavier initialization (in-place, non-deprecated form)
    return model


class Batch:
    def __init__(self, src, trg=None, pad=0):  # the Batch constructor takes src and trg
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = \
                self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask so that we cannot attend to future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        # the Variable wrapper from older PyTorch is unnecessary (and was not imported), so it is dropped here
        tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)
        return tgt_mask


# Iterate over one epoch of data, call forward, then use loss_compute to compute the gradients,
# update the parameters and return the loss.
def run_epoch(data_iter, model, loss_compute):
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        # loss_compute is a function whose inputs are the model prediction out, the true target sequence batch.trg_y
        # and the number of tokens in the batch.
        # Computing the loss and updating the parameters would normally be simple, but supporting multi-GPU
        # training makes this part more involved.
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                  (i, loss / batch.ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens
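
An end-to-end sketch of how make_model and Batch fit together (it assumes the MultiHeadedAttention.py and PositionalEncoding.py files from this commit provide the MultiHeadedAttention and PositionalEncoding classes imported above; the toy vocabulary and data are invented):

import torch
from Model import make_model, Batch

V = 11                                 # toy vocabulary size shared by source and target
model = make_model(V, V, N=2)          # a smaller stack than the default N=6, just for a quick check

src = torch.randint(1, V, (4, 10))     # (batch, seq_len); 0 is reserved for padding
batch = Batch(src, src, pad=0)         # copy task: the target equals the source
out = model(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
log_probs = model.generator(out)
print(out.shape, log_probs.shape)      # torch.Size([4, 9, 512]) torch.Size([4, 9, 11])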
