//Model file that contains the parameters for the model
#ifndef MODEL_H
#define MODEL_H
#include <vector>
#include <Eigen/Dense>
#include "file_helper_decoder.h"
#include "fileHelper_source.h"
#include "decoder.h"
#include "LSTM.h"
#include "Eigen_Util.h"
//#include <boost/random/uniform_01.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real.hpp>
#include "softmax.h"
#include <math.h>
#include <limits>
#include "Input_To_Hidden_Layer.h"
#include "Hidden_To_Hidden_Layer.h"
#include "memory_util.h"
template<typename dType>
class Input_To_Hidden_Layer;
template<typename dType>
class Hidden_To_Hidden_Layer;
struct file_helper;
namespace debug_flag {
    bool flag = false;
}
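//Note: a non-inline global definition in a header can violate the
//one-definition rule if model.h is included from more than one translation unit.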
template<typename dType>
class neuralMT_model {
public:
    /////////////////////////////////Current minibatch info for the model///////////////////////////////////
    //loss layer for the model
    //softmax_layer<dType> *softmax;
    base_loss_layer<dType> *softmax;

    //First layer of the model, the input-to-hidden layer
    Input_To_Hidden_Layer<dType> input_layer_source;
    Input_To_Hidden_Layer<dType> input_layer_target;

    //Hidden layers of the model
    std::vector<Hidden_To_Hidden_Layer<dType>> source_hidden_layers;
    std::vector<Hidden_To_Hidden_Layer<dType>> target_hidden_layers;

    //extra source encoder for the bidirectional case; here the indices run in the forward direction
    Input_To_Hidden_Layer<dType> input_layer_source_bi;
    std::vector<Hidden_To_Hidden_Layer<dType>> source_hidden_layers_bi;
    bi_encoder<dType> bi_dir_source; //the bidirectional wrapper
    encoder_multi_source<dType> multi_source_layer; //for multiple source languages

    /////////////////////////////////Other random stuff//////////////////////////////////////////////////
    file_helper *file_info;
    softmax_layer_gpu_info s_layer_info;
    std::ifstream input;
    std::ofstream output;
    Eigen::Matrix<dType,Eigen::Dynamic, Eigen::Dynamic> zero_error; //passed in from imaginary softmax for source side
    std::string input_weight_file;
    std::string output_weight_file;
    bool debug;
    bool train_perplexity_mode;
    double train_perplexity = 0;
    bool truncated_softmax;
    bool LM; //true if language model only, i.e. no source side
    bool train = false; //for making sure dropout is not used at test time
    bool grad_check_flag = false;

    //for the attention model
    int source_length = -1;

    //for the bidirectional layer
    bool bi_dir = false;
    bool multi_source = false;

    //attention model
    attention_params attent_params;
    std::ofstream output_alignments;

    //for visualizing the RNN
    bool dump_LSTM;
    std::ofstream LSTM_stream_dump;

    //for decoding multilayer models, one index for each layer
    bool decode = false;
    std::vector<prev_source_state<dType>> previous_source_states;
    std::vector<prev_source_state<dType>> previous_source_states_bi; //for the bidirectional encoder
    std::vector<prev_target_state<dType>> previous_target_states;
    std::vector<dType*> top_source_states; //for the attention model in the decoder
    std::vector<dType*> top_source_states_v2; //for the attention model in the decoder
    attention_layer<dType> decoder_att_layer; //for decoding only
    bool multi_attention = false;
    bool multi_attention_v2 = false;

    file_helper_source src_fh; //for training
    file_helper_source *src_fh_test; //for testing
    std::string multisource_file; //for training and testing; the path to the multi-source data file
    bool char_cnn = false;
    char_cnn_params char_params;
    Timer timer;
    ///////////////////////////////////Methods for the class//////////////////////////////////////////////
    neuralMT_model() {}

    //Called once at the beginning of the program to initialize the weights
    void initModel(int LSTM_size,int minibatch_size,int source_vocab_size,int target_vocab_size,
        int longest_sent,bool debug,dType learning_rate,bool clip_gradients,dType norm_clip,
        std::string input_weight_file,std::string output_weight_file,bool scaled,bool train_perplexity,
        bool truncated_softmax,int shortlist_size,int sampled_size,bool LM,int num_layers,std::vector<int> gpu_indicies,
        bool dropout,dType dropout_rate,struct attention_params attent_params,global_params &params);
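
    //A minimal sketch of the expected startup call order (the global_params
    //fields named here are assumptions, not taken from this header):
    //
    //    global_params params;            //filled in from the command line
    //    neuralMT_model<float> model;
    //    model.initModel(params.LSTM_size,params.minibatch_size, /*...*/ params);
    //    model.initFileInfo(&file_info);  //attach the minibatch file helper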
    //For the decoder
    void initModel_decoding(int LSTM_size,int beam_size,int source_vocab_size,int target_vocab_size,
        int num_layers,std::string input_weight_file,int gpu_num,global_params &params,
        bool attention_model,bool feed_input,bool multi_source,bool combine_LSTM,bool char_cnn);

    //This initializes the streams, events, and cuBLAS handles, and sets the GPUs for the layers
    void init_GPUs();

    //Dumps all the GPU info
    void print_GPU_Info();

    //initialize the previous states for decoding
    void init_prev_states(int num_layers,int LSTM_size,int minibatch_size,int device_number,bool multi_source);
    //Gets called once a minibatch is formulated into a matrix
    //This matrix is then passed in and forward/back prop is done, then the gradients are updated
    template<typename Derived>
    void compute_gradients(const Eigen::MatrixBase<Derived> &source_input_minibatch_const,
        const Eigen::MatrixBase<Derived> &source_output_minibatch_const,const Eigen::MatrixBase<Derived> &target_input_minibatch_const,
        const Eigen::MatrixBase<Derived> &target_output_minibatch_const,int *h_input_vocab_indicies_source,
        int *h_output_vocab_indicies_source,int *h_input_vocab_indicies_target,int *h_output_vocab_indicies_target,
        int current_source_length,int current_target_length,int *h_output_vocab_indicies_source_Wgrad,
        int *h_input_vocab_indicies_target_Wgrad,int len_source_Wgrad,int len_target_Wgrad,int *h_sampled_indices,
        int len_unique_words_trunc_softmax,int *h_batch_info,file_helper *temp_fh);

    //Sets all gradient matrices to zero, called after a minibatch updates the gradients
    void clear_gradients();

    //Called after the gradients for the current minibatch have been computed
    void updateParameters();
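
    //A hedged sketch of one training step as implied by the comments above
    //(exact argument lists elided; the ordering is an assumption):
    //
    //    model.compute_gradients(src_in,src_out,tgt_in,tgt_out, /*...*/ );
    //    model.updateParameters();        //apply the gradient update
    //    model.clear_gradients();         //zero the gradients for the next minibatch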
    void check_all_gradients(dType epsilon);

    //Get the sum of all errors in the minibatch
    double getError(bool GPU);

    //Runs a gradient check on a parameter vector or matrix
    template<typename Derived,typename Derived3>
    void check_gradient(dType epsilon,const Eigen::MatrixBase<Derived3> &parameter_const,const Eigen::MatrixBase<Derived> &grad);
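
    //Conceptually, check_gradient compares each analytic gradient entry against
    //a symmetric finite difference (a sketch, not the exact implementation):
    //
    //    numeric_grad = (Error(param + epsilon) - Error(param - epsilon)) / (2*epsilon)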
    //Called after each minibatch, once the gradients are calculated
    void update_weights();
    void update_weights_OLD(); //per-matrix clipping

    //Output the weights to a file
    void dump_weights();
    void dump_best_model(std::string best_model_name,std::string const_model);

    //Read in the weights from a file
    void load_weights();

    void update_learning_rate(dType new_learning_rate);

    //Gets the perplexity of a file
    double get_perplexity(std::string test_file_name,int minibatch_size,int &test_num_lines_in_file,int longest_sent,
        int source_vocab_size,int target_vocab_size,bool load_weights_val,int &test_total_words,
        bool HPC_output_flag,bool force_decode,std::string fd_filename);
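
    //Example use (a sketch; the file name and flag values are hypothetical):
    //
    //    int test_lines = 0, test_words = 0;
    //    double perp = model.get_perplexity("dev.txt",minibatch_size,test_lines,longest_sent,
    //        source_vocab_size,target_vocab_size,true,test_words,false,false,"");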
    //Maps the file info pointer to the model
    void initFileInfo(struct file_helper *file_info);

    void stoicastic_generation(int length,std::string output_file_name,double temperature);

    void forward_prop_source(int *d_input_vocab_indicies_source,int *d_input_vocab_indicies_source_bi,int *d_ones,
        int source_length,int source_length_bi,int LSTM_size,int *d_char_cnn_indicies);

    void forward_prop_target(int curr_index,int *d_current_indicies,int *d_ones,int LSTM_size,int beam_size,
        int *d_char_cnn_indicies);

    template<typename Derived>
    void swap_decoding_states(const Eigen::MatrixBase<Derived> &indicies,int index,dType *d_temp_swap_vals);

    void target_copy_prev_states(int LSTM_size,int beam_size);
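
    //A hedged sketch of the beam-search decoding flow implied by the methods
    //above (the d_* device buffers are assumed to be allocated by the caller):
    //
    //    model.init_prev_states(num_layers,LSTM_size,beam_size,device_number,multi_source);
    //    model.forward_prop_source(d_src,d_src_bi,d_ones,src_len,src_len_bi,LSTM_size,d_char);
    //    for (int i = 0; i < max_decode_length; ++i) {
    //        model.forward_prop_target(i,d_current_indicies,d_ones,LSTM_size,beam_size,d_char);
    //        //prune/extend the beams, then reorder the recurrent states to match:
    //        model.swap_decoding_states(indicies,i,d_temp_swap_vals);
    //        model.target_copy_prev_states(LSTM_size,beam_size);
    //    }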
    void dump_alignments(int target_length,int minibatch_size,int *h_input_vocab_indicies_source,int *h_input_vocab_indicies_target,int *h_input_vocab_indicies_source_2);

    //for the fsa line
    void get_chts(std::vector<Eigen::Matrix<dType,Eigen::Dynamic,1>> &chts,int beam_index,int beam_size);
    void set_chts(const std::vector<Eigen::Matrix<dType,Eigen::Dynamic,1>> &chts,int beam_size);
};
#endif