In PaddlePaddle, a bidirectional encoder can be implemented simply by calling the relevant APIs:

```python
src_word_id = paddle.layer.data(
    name="source_language_word",
    type=paddle.data_type.integer_value_sequence(source_dict_dim))

# source embedding
src_embedding = paddle.layer.embedding(
    input=src_word_id, size=word_vector_dim)

# bidirectional GRU as encoder
encoded_vector = paddle.networks.bidirectional_gru(
    input=src_embedding,
    size=encoder_size,
    fwd_act=paddle.activation.Tanh(),
    fwd_gate_act=paddle.activation.Sigmoid(),
    bwd_act=paddle.activation.Tanh(),
    bwd_gate_act=paddle.activation.Sigmoid(),
    return_seq=True)
```

### Decoder without Attention Mechanism
The [Machine Translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md) chapter of PaddleBook already introduces the Encoder-Decoder architecture with an attention mechanism; this example instead presents the Encoder-Decoder architecture without attention. For more on the attention mechanism, readers can consult PaddleBook and reference \[[3](#参考文献)\].

PaddlePaddle ships solid implementations of the popular RNN units, and they can be called directly. To run custom operations at each time step of an RNN, use PaddlePaddle's `recurrent_layer_group`: first define the single-step logic as a function, then let `recurrent_group()` apply that step function repeatedly to process the whole sequence. The attention-free decoder in this example is implemented with `recurrent_layer_group`; its single-step function, `gru_decoder_without_attention()`, is shown below:

```python
# the initial state of the decoder GRU
encoder_last = paddle.layer.last_seq(input=encoded_vector)
encoder_last_projected = paddle.layer.fc(
    size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last)

# the step function of the decoder GRU
def gru_decoder_without_attention(enc_vec, current_word):
    '''
    Step function for gru decoder
    :param enc_vec: encoded vector of the source language
    :type enc_vec: layer object
    :param current_word: current input word of the decoder
    :type current_word: layer object
    '''
    decoder_mem = paddle.layer.memory(
        name="gru_decoder",
        size=decoder_size,
        boot_layer=encoder_last_projected)

    context = paddle.layer.last_seq(input=enc_vec)

    decoder_inputs = paddle.layer.fc(
        size=decoder_size * 3, input=[context, current_word])

    gru_step = paddle.layer.gru_step(
        name="gru_decoder",
        act=paddle.activation.Tanh(),
        gate_act=paddle.activation.Sigmoid(),
        input=decoder_inputs,
        output_mem=decoder_mem,
        size=decoder_size)

    out = paddle.layer.fc(
        size=target_dict_dim,
        bias_attr=True,
        act=paddle.activation.Softmax(),
        input=gru_step)
    return out
```
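
To see the `recurrent_group()` mechanism in isolation, here is a hypothetical toy step function that is not part of this example's model: `hidden_size` and `input_seq` are assumed to be defined, and state is carried across time steps by giving `paddle.layer.memory` the same name as the layer whose previous-step output it should read:

```python
def simple_step(current_input):
    # read this layer's output from the previous time step (zeros at t=0)
    prev_state = paddle.layer.memory(name="simple_state", size=hidden_size)
    # the fc layer must carry the same name so the memory can find it
    state = paddle.layer.fc(
        name="simple_state",
        input=[current_input, prev_state],
        size=hidden_size,
        act=paddle.activation.Tanh())
    return state

# apply simple_step to every time step of input_seq
output_seq = paddle.layer.recurrent_group(step=simple_step, input=input_seq)
```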

The decoder behaves very differently during training and during generation: at training time, the ground-truth target words are fed to the step function and the network returns a classification cost against the next-word labels; at generation time, there is no target sequence, so each predicted word is fed back as the next input and translations are found with beam search. The two code paths are implemented in the following `if-else` branches:

```python
group_input1 = paddle.layer.StaticInput(input=encoded_vector)
group_inputs = [group_input1]

decoder_group_name = "decoder_group"
if is_generating:
    trg_embedding = paddle.layer.GeneratedInput(
        size=target_dict_dim,
        embedding_name="_target_language_embedding",
        embedding_size=word_vector_dim)
    group_inputs.append(trg_embedding)

    beam_gen = paddle.layer.beam_search(
        name=decoder_group_name,
        step=gru_decoder_without_attention,
        input=group_inputs,
        bos_id=0,  # id of the start-of-sentence token <s>
        eos_id=1,  # id of the end-of-sentence token <e>
        beam_size=beam_size,
        max_length=max_length)

    return beam_gen
else:
    trg_embedding = paddle.layer.embedding(
        input=paddle.layer.data(
            name="target_language_word",
            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
        size=word_vector_dim,
        param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
    group_inputs.append(trg_embedding)

    decoder = paddle.layer.recurrent_group(
        name=decoder_group_name,
        step=gru_decoder_without_attention,
        input=group_inputs)

    lbl = paddle.layer.data(
        name="target_language_next_word",
        type=paddle.data_type.integer_value_sequence(target_dict_dim))
    cost = paddle.layer.classification_cost(input=decoder, label=lbl)

    return cost
```

## Data Preparation
The data used in this example comes from [WMT14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/), a parallel French-to-English corpus. The [bitexts](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz) portion serves as training data, and the [dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz) as validation and test data. PaddlePaddle already wraps a reading interface for this dataset; the program downloads the data automatically on first run, so no manual preparation is required.
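
For reference, the dataset readers can also be exercised directly. Below is a minimal sketch, assuming the standard `paddle.dataset.wmt14` interface and a 30000-word vocabulary; the variable names are illustrative only:

```python
import paddle.v2 as paddle

dict_size = 30000  # vocabulary size used throughout this example

# each training sample is a triple of id sequences:
# (source words, target words, target next-word labels)
sample = next(paddle.dataset.wmt14.train(dict_size)())
src_ids, trg_ids, trg_next_ids = sample

# id -> word dictionaries for the source and target languages
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
print(" ".join(src_dict[w] for w in src_ids))
```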

## Model Training and Testing

### Model Training

Launching model training is very simple: just run `python train.py` in a command-line window. During training, the `train()` function in the `train.py` script performs the following steps in order:

**a) Parse the network structure from the network definition and initialize the model parameters**

```python
# define the network topology.
cost = seq2seq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
```

**b) Set the optimization strategy for training and define the training data `reader`**

```python
# define the optimization method
optimizer = paddle.optimizer.RMSProp(
    learning_rate=1e-3,
    gradient_clipping_threshold=10.0,
    regularization=paddle.optimizer.L2Regularization(rate=8e-4))

# define the trainer instance
trainer = paddle.trainer.SGD(
    cost=cost, parameters=parameters, update_equation=optimizer)

# define the data reader
wmt14_reader = paddle.batch(
    paddle.reader.shuffle(
        paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
    batch_size=5)  # batch size here is an assumed value; see train.py for the exact setting
```

**c) Define the event handler to print intermediate training results and save model snapshots**

```python
# define the event_handler callback
# (gzip, os, save_path and logger are set up elsewhere in train.py)
def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if not event.batch_id % 100 and event.batch_id:
            with gzip.open(
                    os.path.join(save_path,
                                 "nmt_without_att_%05d_batch_%05d.tar.gz" %
                                 (event.pass_id, event.batch_id)), "w") as f:
                parameters.to_tar(f)

        if event.batch_id and not event.batch_id % 10:
            logger.info("Pass %d, Batch %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, event.cost, event.metrics))
```

**d) Start training**

```python
# start training
trainer.train(
    reader=wmt14_reader, event_handler=event_handler, num_passes=2)
```

The output looks like this:

```text
Pass 0, Batch 0, Cost 267.674663, {'classification_error_evaluator': 1.0}
.........
Pass 0, Batch 10, Cost 172.892294, {'classification_error_evaluator': 0.953895092010498}
.........
Pass 0, Batch 40, Cost 168.170543, {'classification_error_evaluator': 0.8348183631896973}
```

### Generating Translations
Generating translations with a trained model is just as simple.

1. First, modify the arguments that `main` in the `generate.py` script passes to the `generate` function, to choose which saved model is used for generation. The default arguments are:

    ```python
    generate(
        source_dict_dim=30000,
        target_dict_dim=30000,
        batch_size=20,
        beam_size=3,
        model_path="models/nmt_without_att_params_batch_00100.tar.gz")
    ```

2. Run `python generate.py` in a terminal. The `generate()` function in the script then performs the following steps in order:

    **a) Load the test samples**

    ```python
    # load data samples for generation
    gen_creator = paddle.dataset.wmt14.gen(source_dict_dim)
    gen_data = []
    for item in gen_creator():
        gen_data.append((item[0], ))
    ```

    **b) Initialize the model and call `infer()` to generate beam-search translations for every input sample**

    ```python
    beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
    with gzip.open(model_path) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)
    # prob is the prediction probabilities, and id is the prediction word.
    beam_result = paddle.infer(
        output_layer=beam_gen,
        parameters=parameters,
        input=gen_data,
        field=["prob", "id"])
    ```

    **c) Load the source- and target-language dictionaries and convert the sentences represented as `id` sequences back into words**

    ```python
    # load the dictionaries of the source and target languages;
    # np refers to numpy, which generate.py imports
    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)

    gen_sen_idx = np.where(beam_result[1] == -1)[0]
    assert len(gen_sen_idx) == len(gen_data) * beam_size

    start_pos, end_pos = 1, 0
    for i, sample in enumerate(gen_data):
        print(" ".join([
            src_dict[w] for w in sample[0][1:-1]
        ]))  # skip the start and end marks when printing the source sentence
        for j in xrange(beam_size):
            end_pos = gen_sen_idx[i * beam_size + j]
            print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
                trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
            start_pos = end_pos + 2
        print("\n")
    ```

With the beam search width set to 3 and a French sentence as input, translations are generated for the test data automatically, in the following format:

```text
Elles connaissent leur entreprise mieux que personne .
-3.754819	They know their business better than anyone . <e>
-4.445528	They know their businesses better than anyone . <e>
-5.026885	They know their business better than anybody . <e>
```

- The first line is the input source-language sentence.
- Lines 2 through beam_size + 1 are the `beam_size` translation results produced by beam search.
- Output lines for the same sample are split by "\t" into two columns: the first column is the log probability of the sentence, the second the text of the translation (a parsing sketch follows this list).
- The symbol `<s>` marks the beginning of a sentence, `<e>` marks the end of a sentence, and any word absent from the dictionary is replaced by `<unk>`.

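Since the output is plain tab-separated text, post-processing is easy. Here is a minimal, hypothetical sketch; this helper is not part of the example's scripts, and it assumes one source sentence followed by its `beam_size` candidates, exactly as printed above:

```python
def parse_beam_block(lines):
    # lines: the printed block for one sample, e.g.
    #   ["Elles connaissent ...", "-3.7548\tThey know ... <e>", ...]
    source, candidates = None, []
    for line in lines:
        line = line.rstrip("\n")
        if not line:
            continue
        if "\t" in line:  # a translation line: "<log prob>\t<text>"
            score, text = line.split("\t", 1)
            candidates.append((float(score), text))
        else:  # the line without a tab is the source sentence
            source = line
    return source, candidates
```

The candidates can then be ranked or filtered by their log probabilities.
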
At this point, we have implemented a first machine translation model on PaddlePaddle. As the example shows, PaddlePaddle provides a flexible and rich set of APIs to choose from, which makes configuring complex networks straightforward. Machine translation itself is a fast-moving field in which new methods and ideas keep emerging; after working through this example, readers with the interest and time can build more sophisticated, better-performing translation models on the PaddlePaddle platform.