From ef0a575a09bfe90fb2fa47cdcda3ad2090b0846f Mon Sep 17 00:00:00 2001 From: Shi Xing Date: Mon, 15 May 2017 10:49:03 -0700 Subject: [PATCH] add 'words_ensemble' model --- .gitignore | 7 ++ README_XING.md | 12 ++- executable/ZOPH_RNN_XING | 4 +- scripts/fsa/demo.sh | 12 ++- src/decoder_model_wrapper.h | 6 +- src/decoder_model_wrapper.hpp | 4 +- src/ensemble_factory.hpp | 141 ++++++++++++++++++++++++++++++---- 7 files changed, 165 insertions(+), 21 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4d903c7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.DS_Store +*~ +[#].[#] +.[#]* +*[#] + +*pyc diff --git a/README_XING.md b/README_XING.md index 13d9074..c2dba8e 100644 --- a/README_XING.md +++ b/README_XING.md @@ -134,11 +134,21 @@ $EXEC -k 10 best.nn kbest_fsa.txt --print-score 1 -b 5 --fsa fsa.txt --print-bea You can choose one of the following three commend to type in STDIN: 1. `source ` : process the source-side forward propagation. -2. `words word1 word2 word3` feed the target-side RNN with words sequence `word1 owrd2 word3`. This is supposed to be the line that human composed. +2. `words word1 word2 word3` feed the target-side RNN with words sequence `word1 word2 word3`. This is supposed to be the line that human composed. 3. `fsaline encourage_list_files:enc1.txt,enc2.txt encourage_weights:1.0,-1.0 repetition:0.0 alliteration:0.0 wordlen:0.0` Let the RNN to continue decode with FSA. Both step 2 and 3 will start from the previous hidden states and cell states of target-side RNN. +You can also ensemble two models `best.nn.1` and `best.nn.2` by: + +``` +$EXEC -k 10 best.nn.1 best.nn.2 kbest_fsa.txt --print-score 1 -b 5 --fsa fsa.txt --print-beam 1 --decode-main-data-files source.valid.txt source.valid.txt --interactive-line 1 --interactive 1 +``` + +and additionally, you can use `words_ensemble` option to provide two different human inputs for the two models: + +4. 
`words_ensemble word11 word12 word13 ___sep___ word21 word22 word23 ___sep___` feed the target-side RNN with words sequence `word11 owrd12 word13` for `best.nn.1` and `word21 word22 word23` for `best.nn.2` These are supposed to be the lines human composed. + # Decoding with Word Alignment Suppose we are translating from French to English, we could use the word alignment information to speed up the decoding. Please find details in 5. [Speeding up Neural Machine Translation Decoding by Shrinking Run-time Vocabulary](http://xingshi.me/data/pdf/ACL2017short.pdf). diff --git a/executable/ZOPH_RNN_XING b/executable/ZOPH_RNN_XING index 8590ee1..8b464d6 100644 --- a/executable/ZOPH_RNN_XING +++ b/executable/ZOPH_RNN_XING @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e95acec2d0089aee919025f7b7dcf93c399a0ec47cb6b006ade868a41984532 -size 126229976 +oid sha256:7528806c908a31c36956112822036ee93322e4c8581e2b3218bb2f4cc76bb444 +size 126230008 diff --git a/scripts/fsa/demo.sh b/scripts/fsa/demo.sh index 4e6c4e1..b83765c 100644 --- a/scripts/fsa/demo.sh +++ b/scripts/fsa/demo.sh @@ -41,10 +41,20 @@ $EXEC -k 10 best.nn kbest_fsa.txt --print-score 1 -b 5 --fsa fsa.txt --print-bea # the command line should contains --fsa and --decode-main-data-files , both fsa_file and source_file should exist and are valid fsa_file and source file, although you don't really use them in the interactive mode. # [Interactive-line mode] : --interactive 1 --interactive-line 1 -$EXEC -k 10 best.nn kbest_fsa.txt --print-score 1 -b 5 --fsa fsa.txt --print-beam 1 --decode-main-data-files source.valid.txt --interactive-line 1 --interactive-line 1 +$EXEC -k 10 best.nn kbest_fsa.txt --print-score 1 -b 5 --fsa fsa.txt --print-beam 1 --decode-main-data-files source.valid.txt --interactive-line 1 --interactive 1 +# 1. `source ` : process the source-side forward propagation. +# 2. `words word1 word2 word3` feed the target-side RNN with words sequence `word1 owrd2 word3`. 
This is supposed to be the line that human composed. +# 3. `fsaline encourage_list_files:enc1.txt,enc2.txt encourage_weights:1.0,-1.0 repetition:0.0 alliteration:0.0 wordlen:0.0` Let the RNN continue decoding with the FSA. +# [Interactive-line mode + ensemble ] : --interactive 1 --interactive-line 1 +$EXEC -k 10 best.nn best.nn kbest_fsa.txt --print-score 1 -b 5 --fsa fsa.txt --print-beam 1 --decode-main-data-files source.valid.txt source.valid.txt --interactive-line 1 --interactive 1 + +# 1. `source ` : process the source-side forward propagation. +# 2. `words word1 word2 word3` feed the target-side RNN with words sequence `word1 word2 word3`. This is supposed to be the line that human composed. +# 3. `words_ensemble word11 word12 word13 ___sep___ word21 word22 word23 ___sep___` feed the target-side RNN with words sequence `word11 word12 word13` for `best.nn.1` and `word21 word22 word23` for `best.nn.2`. These are supposed to be the lines that humans composed. +# 4. `fsaline encourage_list_files:enc1.txt,enc2.txt encourage_weights:1.0,-1.0 repetition:0.0 alliteration:0.0 wordlen:0.0` Let the RNN continue decoding with the FSA. 
diff --git a/src/decoder_model_wrapper.h b/src/decoder_model_wrapper.h index c516aa1..728b085 100644 --- a/src/decoder_model_wrapper.h +++ b/src/decoder_model_wrapper.h @@ -15,7 +15,11 @@ class decoder_model_wrapper { dType *h_outputdist; dType *d_temp_swap_vals; int *d_input_vocab_indicies_source; - int *d_current_indicies; + int *d_current_indicies; + + int *h_current_indices; // every model should have this vector for model ensemble; + + neuralMT_model *model; //This is the model diff --git a/src/decoder_model_wrapper.hpp b/src/decoder_model_wrapper.hpp index eb80127..4507124 100644 --- a/src/decoder_model_wrapper.hpp +++ b/src/decoder_model_wrapper.hpp @@ -71,7 +71,9 @@ decoder_model_wrapper::decoder_model_wrapper(int gpu_num,int beam_size, //allocate the current indicies CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_current_indicies,beam_size*sizeof(int)),"GPU memory allocation failed\n"); - + h_current_indices = (int *) malloc(beam_size*sizeof(int)); + + model = new neuralMT_model(); //initialize the model model->initModel_decoding(LSTM_size,beam_size,source_vocab_size,target_vocab_size, diff --git a/src/ensemble_factory.hpp b/src/ensemble_factory.hpp index c19ee21..2fb764c 100644 --- a/src/ensemble_factory.hpp +++ b/src/ensemble_factory.hpp @@ -77,16 +77,24 @@ void ensemble_factory::decode_file_interactive_line() { // both of the two funcs needs to prepare the follwing two things: // 1. init the pre_target_states.c_t_pre/h_t_pre as h_2 ( h2 = lstm(w2,h1) ) // 2. init the h_current_indicies = [w3] * beam_size; + // 3. 
Now, we can ensemble different models, so each models[i] has a h_current_indices: it will record the h_current_indices before and after each function call: + /* + {nothing} -> source -> {models[i].h_current_indices = model_decoder.h_current_indices} + {model_decoder.h_current_indices = models[i].h_current_indices } -> words -> {models[i].h_current_indices = model_decoder.h_current_indices} + {model_decoder.h_current_indices = models[i].h_current_indices } -> words_ensemble -> {models[i].h_current_indices = model_decoder.h_current_indices} + {model_decoder.h_current_indices = models[i].h_current_indices } -> fsaline -> {models[i].h_current_indices = model_decoder.h_current_indices} + */ while (true) { // 1. source -> [END] // 2. words -> [END] - // 3. fsa encourage_list_files:enc1.txt,enc2.txt encourage_weights:1.0,-1.0 repetition:0.0 alliteration:0.0 wordlen:0.0 -> [END] : as normal + // 2. words_ensemble ___sep___ ___sep___ -> [END] + // 3. (removed )fsa encourage_list_files:enc1.txt,enc2.txt encourage_weights:1.0,-1.0 repetition:0.0 alliteration:0.0 wordlen:0.0 -> [END] : as normal // 4. fsaline encourage_list_files:enc1.txt,enc2.txt encourage_weights:1.0,-1.0 repetition:0.0 alliteration:0.0 wordlen:0.0 -> [END]: as noraml, but at the end, move corresponding ct and ht to all beams. 
- std::cout<<"Please input \n"; + std::cout<<"Please input \n"; std::cout.flush(); // read input // input format: @@ -137,6 +145,13 @@ void ensemble_factory::decode_file_interactive_line() { for(int j=0; j < models.size(); j++) { models[j].forward_prop_source(); } + + //copy model_decoder->h_current_indicies to each model's h_current_indicies; + for(int j=0; j < models.size(); j++) { + for (int k = 0; k < model_decoder->beam_size; k += 1 ){ + models[j].h_current_indices[k] = model_decoder->h_current_indices[k]; + } + } std::cout<<"[END]\n"; std::cout.flush(); @@ -164,14 +179,23 @@ void ensemble_factory::decode_file_interactive_line() { word_indices.push_back(word_index); } - - - for (int i = 0; i< word_indices.size() ; i ++){ - + // init model_decoder->h_current_indices with each modle's h_current_indices; + for (int k = 0; k < model_decoder->beam_size; k += 1 ){ + model_decoder->h_current_indices[k] = models[0].h_current_indices[k] ; + } - std::cout<< "WI: "<< model_decoder->h_current_indices[0] << "\n"; - + for (int i = 0; i< word_indices.size() ; i ++){ + for(int j=0; j < models.size(); j++) { + if (i == 0){ + // for words_ensemble, different model have different h_current_indices; + for (int k = 0; k < model_decoder->beam_size; k += 1 ){ + model_decoder->h_current_indices[k] = models[j].h_current_indices[k] ; + } + } + + std::cout<< "WI["<h_current_indices[0] << "\n"; + models[j].forward_prop_target(curr_index+i,model_decoder->h_current_indices); models[j].target_copy_prev_states(); } @@ -182,16 +206,87 @@ void ensemble_factory::decode_file_interactive_line() { model_decoder->h_current_indices[j] = word_index; } - } + // update each modle's h_current_indices with model_decoder->h_current_indices; + for(int j=0; j < models.size(); j++) { + for (int k = 0; k < model_decoder->beam_size; k += 1 ){ + models[j].h_current_indices[k] = model_decoder->h_current_indices[k]; + } + } std::cout<<"[END]\n"; std::cout.flush(); right_after_encoding = false; - } else if (action 
== "fsa") { + } else if (action == "words_ensemble"){ + std::vector> word_indices_array; + for (int i = 0; i< models.size(); i++){ + std::vector temp; + word_indices_array.push_back(temp); + } + + int curr_index = 0; + + if (right_after_encoding){ + curr_index = 0; + } else { + curr_index = 1; + } + + int i_sentence = 0; + for (int i = 1; i < ll.size(); i +=1 ){ + std::string word = ll[i]; + if (word == "___sep___"){ + i_sentence+=1; + continue; + } + int word_index = 2; // + if (model_decoder->tgt_mapping.count(word) > 0){ + word_index = model_decoder->tgt_mapping[word]; + } + word_indices_array[i_sentence].push_back(word_index); + } + + for (int j=0; j < word_indices_array.size(); j += 1){ + std::vector & word_indices = word_indices_array[j]; + + // init model_decoder->h_current_indices with each modle's h_current_indices; + for (int k = 0; k < model_decoder->beam_size; k += 1 ){ + model_decoder->h_current_indices[k] = models[j].h_current_indices[k] ; + } + + for (int i = 0; i< word_indices.size() ; i ++){ + + + std::cout<< "WI["<h_current_indices[0] << "\n"; + + models[j].forward_prop_target(curr_index+i,model_decoder->h_current_indices); + models[j].target_copy_prev_states(); + + int word_index = word_indices[i]; + + for (int j=0 ; j< model_decoder->beam_size; j++){ + model_decoder->h_current_indices[j] = word_index; + } + + } + + // update each modle's h_current_indices with model_decoder->h_current_indices; + for (int k = 0; k < model_decoder->beam_size; k += 1 ){ + models[j].h_current_indices[k] = model_decoder->h_current_indices[k]; + } + + } + + + std::cout<<"[END]\n"; + std::cout.flush(); + + right_after_encoding = false; + + } /*else if (action == "fsa") { fsa_file = ll[1]; model_decoder->init_fsa_interactive(fsa_file); @@ -237,10 +332,8 @@ void ensemble_factory::decode_file_interactive_line() { //process wordlen weight model_decoder->wordlen_weight = wordlen_weight; - - decode_file_line(right_after_encoding,false); - + //read output and print into 
stdout; input_file_prep input_helper; input_helper.unint_file(p_params->model_names[0],p_params->decoder_output_file,p_params->decoder_final_file,false,true); @@ -264,7 +357,7 @@ void ensemble_factory::decode_file_interactive_line() { right_after_encoding = false; - } else if (action == "fsaline") { + } */ else if (action == "fsaline") { fsa_file = ll[1]; @@ -343,7 +436,7 @@ void ensemble_factory::decode_file_interactive_line() { template void ensemble_factory::decode_file_line(bool right_after_encoding, bool end_transfer) { - // right_after_encoding = true, means the system is never decoding a word, + // right_after_encoding = true, means the system hasn't decoded a word, // bool pre_end_transfer = model_decoder->end_transfer; model_decoder->end_transfer = end_transfer; @@ -363,7 +456,16 @@ void ensemble_factory::decode_file_line(bool right_after_encoding, bool e for(int j=0; j < models.size(); j++) { // curr_index: whether it's 0 or non-0. Doesn't matter if it's 1 or 2 or 3. // &c_t_pre = &pre_state ; c_t = f(c_t_pre) + + if (curr_index == 0){ + // for words_ensemble, different model have different h_current_indices; + for (int k = 0; k < model_decoder->beam_size; k += 1 ){ + model_decoder->h_current_indices[k] = models[j].h_current_indices[k] ; + } + } + models[j].forward_prop_target(curr_index+start_index,model_decoder->h_current_indices); + } @@ -403,6 +505,15 @@ void ensemble_factory::decode_file_line(bool right_after_encoding, bool e model_decoder->output_k_best_hypotheses(models[0].fileh->sentence_length); //model_decoder->print_current_hypotheses(); model_decoder->end_transfer = pre_end_transfer; + + // update each modle's h_current_indices with model_decoder->h_current_indices; + for(int j=0; j < models.size(); j++) { + for (int k = 0; k < model_decoder->beam_size; k += 1 ){ + models[j].h_current_indices[k] = model_decoder->h_current_indices[k]; + } + } + + }