Skip to content

Commit

Permalink
legacy mode fix + generate_decode_train.py
Browse files Browse the repository at this point in the history
  • Loading branch information
shixing committed Jan 30, 2017
1 parent 0e8f977 commit ee35277
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 64 deletions.
107 changes: 107 additions & 0 deletions scripts/generate_train_decode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import sys
import os

# PBS job-script template shared by the generated train and decode scripts.
# __PREFIX__ and __cmd__ are placeholders substituted in main(); all paths
# are hard-coded for the HPC cluster layout this sweep runs on.
head = """
#!/bin/bash
#PBS -q isi
#PBS -l walltime=300:00:00
#PBS -l gpus=2
PREFIX=__PREFIX__
model_folder=/home/nlg-05/xingshi/lstm/model/$PREFIX/
data_folder=/home/nlg-05/xingshi/lstm/ghdata/Eng_Uzb/
output_folder=/home/nlg-05/xingshi/lstm/syntaxNMT/decode/
EXEC=/home/nlg-05/xingshi/lstm/exec/ZOPH_RNN
PY_FORMAT=/home/nlg-05/xingshi/lstm/single_layer_gpu_google_model/Scripts/bleu_format_valid.py
PERL_BLEU=/home/nlg-05/xingshi/workspace/tools/mosesdecoder/scripts/generic/multi-bleu.perl
SRC_TRN=$data_folder/training.tok.lc.uzb
TGT_TRN=$data_folder/training.tok.lc.eng
SRC_DEV=$data_folder/dev.tok.lc.uzb
TGT_DEV=$data_folder/dev.tok.lc.eng
SRC_TST=$data_folder/test.tok.lc.uzb
TGT_TST=$data_folder/test.tok.lc.eng
OUTPUT=$output_folder/$PREFIX.kbest
REF=$output_folder/$PREFIX.ref
BLEU=$output_folder/$PREFIX.bleu
mkdir $model_folder
cd $model_folder
__cmd__
"""

# Base training command; per-experiment hyper-parameter flags (-H/-l/-d/-A)
# are appended to it by get_name_cmd() in main().
cmd_train = "$EXEC --logfile HPC_OUTPUT_NEW.txt -a $SRC_DEV $TGT_DEV -t $SRC_TRN $TGT_TRN model.nn -B best.nn -v 50000 -V 25000 --screen-print-rate 300 -N 2 -M 0 0 1 -n 40 -w 5 -L 200 --attention-model true --feed-input true -m 64"
# -A 0.9 -l 0.5 -d 0.5 -H 1000

# Decode pipeline: k-best decode with the best checkpoint, reformat the
# output for scoring, then compute lowercased BLEU against the reference.
cmd_decode = """ $EXEC -k 1 $model_folder/best.nn $OUTPUT --decode-main-data-files $SRC_TST -L 100 -b 12
python $PY_FORMAT $OUTPUT $TGT_TST $REF
perl $PERL_BLEU -lc $REF < $OUTPUT.bleu > $BLEU
"""

def main():
    """Generate PBS train and decode shell scripts for a hyper-parameter sweep.

    For every (H, l, d) combination, writes a training script
    ``../sh_ue/<name>.sh`` and a matching decode script
    ``../sh_ue/<name>.decode.sh``, where ``<name>`` encodes the
    hyper-parameter values. Takes no arguments and returns nothing.
    """
    # Each helper maps a value to (name-fragment, command-line flag).
    def A(val):
        return "A{}".format(val), "-A {}".format(val)
    def l(val):
        return "l{}".format(val), "-l {}".format(val)
    def d(val):
        return "d{}".format(val), "-d {}".format(val)
    def H(val):
        return "H{}".format(val), "-H {}".format(val)

    funcs = [H, l, d, A]
    # Default values per flag; index 3 (-A) is never overridden below,
    # so every experiment runs with -A 0.5.
    template = [300, 0.5, 0.5, 0.5]

    _Hs = [300, 500, 1000]
    _ls = [0.5, 1.0]
    _ds = [0.5, 0.8]

    # Full cross-product of the swept hyper-parameters.
    params = []
    for _H in _Hs:
        for _l in _ls:
            for _d in _ds:
                temp = list(template)
                temp[0], temp[1], temp[2] = _H, _l, _d
                params.append(temp)

    def get_name_cmd(paras):
        """Return (job_name, full_train_command) for one parameter tuple."""
        name = "Uz_En_"
        cmd = [cmd_train]
        for func, para in zip(funcs, paras):
            n, c = func(para)
            name += n
            cmd.append(c)
        # Drop dots so values like 0.5 yield filesystem/scheduler-safe names.
        name = name.replace(".", '')
        return name, " ".join(cmd)

    def write_script(path, content):
        """Write one generated script; 'with' guarantees the file is closed."""
        with open(path, 'w') as f:
            f.write(content)

    for para in params:
        name, cmd = get_name_cmd(para)
        # Training script: template with the full training command spliced in.
        write_script("../sh_ue/{}.sh".format(name),
                     head.replace("__cmd__", cmd).replace("__PREFIX__", name))
        # Decode script: same template, but with the fixed decode pipeline.
        write_script("../sh_ue/{}.decode.sh".format(name),
                     head.replace("__cmd__", cmd_decode).replace("__PREFIX__", name))


# Script entry point: generate the full set of sweep scripts when run directly.
if __name__ == "__main__":
    main()

74 changes: 17 additions & 57 deletions src/decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,6 @@ struct decoder {
float alliteration_weight = 0.0;
float wordlen_weight = 0.0;

// for encourage list
bool encourage = false;
std::unordered_map<int,float> *encourage_list = NULL;
//float encourage_weight = 0;

std::unordered_map<std::string,int> tgt_mapping;

std::vector<state*> current_states;
Expand Down Expand Up @@ -472,11 +467,8 @@ struct decoder {
for (int i = 0; i< fns.size(); i++){
std::string encourage_file = fns[i];
float weight = weights[i];
if (i == 0){
this->init_encourage_list(encourage_file, weight);
} else {
this->init_encourage_list_additional(encourage_file, weight);
}
this->init_encourage_list(encourage_file, weight);

}

CUDA_ERROR_WRAPPER(cudaMemcpy(d_encourage, h_encourage,
Expand All @@ -490,67 +482,35 @@ struct decoder {
void init_encourage_list(std::string fn, float weight){

// should call after init_fsa();
if (fn == ""){
encourage = false;
//encourage_weight = 0;
if (this->encourage_list != NULL){
delete this->encourage_list;
this->encourage_list = NULL;
}
return;
}

encourage = true;
//encourage_weight = weight;
if (this->encourage_list != NULL){
delete this->encourage_list;
}
this->encourage_list = new std::unordered_map<int,float>();

std::ifstream fin(fn.c_str());
std::string line;
int n_nounk = 0;
while(std::getline(fin,line)){
int index = 2 ; // <UNK>
if (this->tgt_mapping.count(line) > 0){
index = this->tgt_mapping[line];
}
if (index != 2){
(*(this->encourage_list))[index] = weight;
h_encourage[index] = weight;

std::vector<std::string> ll = split(line,' ');
float i_weight = 1.0;
std::string word = ll[0];
if (ll.size() == 2){
i_weight = std::stof(ll[1]);
}
}
fin.close();

BZ_CUDA::logger<< "Encourage Weight: " << weight <<"\n";
BZ_CUDA::logger<< "Encourage List Size: " <<(int)(encourage_list->size()) <<"\n";
}

void init_encourage_list_additional(std::string fn, float weight){
// if there's more than one encourage list, use this function to init the encourage_lists except the first one.
std::ifstream fin(fn.c_str());
std::string line;
while(std::getline(fin,line)){
int index = 2 ; // <UNK>
if (this->tgt_mapping.count(line) > 0){
index = this->tgt_mapping[line];
if (this->tgt_mapping.count(word) > 0){
index = this->tgt_mapping[word];
}

if (index != 2){
if (this->encourage_list->count(index) == 0){
(*(this->encourage_list))[index] = weight;
} else {
(*(this->encourage_list))[index] += weight;
}
h_encourage[index] += weight;
//std::cout << word << " " << index << " " << i_weight << "\n";
h_encourage[index] += weight * i_weight;
n_nounk += 1;
}

}
fin.close();
BZ_CUDA::logger<< "Encourage Weight: "<< weight <<"\n";
BZ_CUDA::logger<< "Encourage List Size: "<<(int)(encourage_list->size())<<"\n";

BZ_CUDA::logger<< "Encourage Weight: " << weight <<"\n";
BZ_CUDA::logger<< "Encourage List Size: " << n_nounk <<"\n";
}


// for single fsa file
void init_fsa_inner(global_params &params){
this->fsa_weight = params.fsa_weight;
Expand Down
16 changes: 9 additions & 7 deletions src/ensemble_factory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,12 @@ ensemble_factory<dType>::ensemble_factory(std::vector<std::string> weight_file_n

template<typename dType>
void ensemble_factory<dType>::decode_file() {

if (this->interactive){
decode_file_interactive();
} else if (this->interactive_line){
decode_file_interactive_line();
if (this->interactive_line){
decode_file_interactive_line();
} else {
decode_file_interactive();
}
} else {
decode_file_batch();
}
Expand Down Expand Up @@ -106,7 +107,7 @@ void ensemble_factory<dType>::decode_file_interactive_line() {

input_file_prep input_helper;
input_helper.integerize_file_kbest(p_params->model_names[0],source_file,p_params->decode_temp_files[0],
p_params->longest_sent,p_params->target_vocab_size,false,"NULL");
p_params->longest_sent,p_params->target_vocab_size,false,"NULL", p_params->legacy_model);

int num_lines_in_file = 1;

Expand Down Expand Up @@ -274,7 +275,7 @@ void ensemble_factory<dType>::decode_file_line(bool right_after_encoding, bool e
//run the forward prop of target
for(int curr_index=0; curr_index < std::min( (int)(max_decoding_ratio*models[0].fileh->sentence_length) , longest_sent-2 ); curr_index++) {

std::cout << "WI:" << model_decoder->h_current_indices[0]<<"\n";
//std::cout << "WI:" << model_decoder->h_current_indices[0]<<"\n";

for(int j=0; j < models.size(); j++) {
// curr_index: whether it's 0 or non-0. Doesn't matter if it's 1 or 2 or 3.
Expand Down Expand Up @@ -391,7 +392,7 @@ void ensemble_factory<dType>::decode_file_interactive() {
//
input_file_prep input_helper;
input_helper.integerize_file_kbest(p_params->model_names[0],source_file,p_params->decode_temp_files[0],
p_params->longest_sent,p_params->target_vocab_size,false,"NULL");
p_params->longest_sent,p_params->target_vocab_size,false,"NULL", p_params->legacy_model);

int num_lines_in_file = 1;

Expand Down Expand Up @@ -577,6 +578,7 @@ void ensemble_factory<dType>::decode_file_batch() {
}

models[0].model->timer.report();
models[0].model->timer.clear();

}

Expand Down

0 comments on commit ee35277

Please sign in to comment.