#coding:utf-8
import tensorflow as tf
import numpy as np
import cPickle
import Config

test_word = [u'FDA', u'menu']  # unused placeholder; main() builds its own keyword list

config_tf = tf.ConfigProto()
config_tf.gpu_options.allow_growth = True

# Pre-trained word vectors and vocabulary (Python 2 pickles).
word_vec = cPickle.load(open('word_vec.pkl', 'rb'))
vocab = cPickle.load(open('word_voc.pkl', 'rb'))

word_to_idx = {ch: i for i, ch in enumerate(vocab)}
idx_to_word = {i: ch for i, ch in enumerate(vocab)}

gen_config = Config.Config()
gen_config.vocab_size = len(vocab)

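# The Model below appears to implement an MTA-LSTM-style keyword-attention
# decoder: a multi-layer LSTM language model that attends over a fixed set of
# topic keywords at every step, with a coverage gate that spends each
# keyword's attention budget as it is used. All hyperparameters
# (hidden_size, num_keywords, word_embedding_size, keep_prob, num_layers,
# BeamSize, init_scale, model_path, save_time, len_of_generation, is_sample)
# are read from Config.Config.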
class Model(object):
    def __init__(self, is_training, config):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        self.size = size = config.hidden_size
        vocab_size = config.vocab_size

        # Placeholders for one batch: input word ids, targets, keyword ids,
        # the previous step's output state, a loss mask, and sequence length.
        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._input_word = tf.placeholder(tf.int32, [batch_size, config.num_keywords])
        self._init_output = tf.placeholder(tf.float32, [batch_size, size])
        self._mask = tf.placeholder(tf.float32, [batch_size, num_steps])
        self.seq_length = tf.placeholder(tf.float32, [batch_size, 1])

        # One LSTM cell per layer; reusing a single cell object across layers
        # would tie their weights in newer TF 1.x releases.
        def make_cell():
            cell = tf.nn.rnn_cell.LSTMCell(size, forget_bias=0.0, state_is_tuple=False)
            if is_training and config.keep_prob < 1:
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=config.keep_prob)
            return cell
        cell = tf.nn.rnn_cell.MultiRNNCell([make_cell() for _ in range(config.num_layers)],
                                           state_is_tuple=False)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                'word_embedding', [vocab_size, config.word_embedding_size],
                trainable=True, initializer=tf.constant_initializer(word_vec))
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)
            keyword_inputs = tf.nn.embedding_lookup(embedding, self._input_word)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

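        # Coverage gate: every keyword starts with an attention budget of 1.0
        # that is consumed as the decoder attends to it; `phi_res` below
        # rescales that budget by the expected output length.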
        self.initial_gate = tf.ones([batch_size, config.num_keywords])
        gate = self.initial_gate

        atten_sum = tf.zeros([batch_size, config.num_keywords])

        with tf.variable_scope("coverage"):
            u_f = tf.get_variable(
                "u_f", [config.num_keywords * config.word_embedding_size, config.num_keywords])
            res1 = tf.sigmoid(tf.matmul(tf.reshape(keyword_inputs, [batch_size, -1]), u_f))
            # phi: predicted per-keyword attention budget, scaled by sequence length.
            phi_res = self.seq_length * res1

        self.output1 = phi_res

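        # Unrolled decoding loop: at each step, score every keyword against
        # the previous output state (additive attention), weight the scores
        # by the remaining coverage gate, and feed the attention-weighted
        # keyword mixture into the LSTM together with the word embedding.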
        outputs = []
        output_state = self._init_output
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                vs = []
                for s2 in range(config.num_keywords):
                    with tf.variable_scope("RNN_attention"):
                        if time_step > 0 or s2 > 0:
                            tf.get_variable_scope().reuse_variables()
                        u = tf.get_variable("u", [size, 1])
                        w1 = tf.get_variable("w1", [size, size])
                        w2 = tf.get_variable("w2", [config.word_embedding_size, size])
                        b = tf.get_variable("b1", [size])

                        # Additive attention score for keyword s2, scaled by
                        # its remaining coverage budget.
                        vi = tf.matmul(tf.tanh(tf.add(tf.add(
                            tf.matmul(output_state, w1),
                            tf.matmul(keyword_inputs[:, s2, :], w2)), b)), u)
                        vs.append(vi * gate[:, s2:s2 + 1])

                self.attention_vs = tf.concat(vs, axis=1)
                prob_p = tf.nn.softmax(self.attention_vs)

                self.attention_weight = prob_p

                # Spend each keyword's budget in proportion to the attention
                # it just received.
                gate = gate - (prob_p / phi_res)
                self.output_gate = gate

                atten_sum += prob_p * self._mask[:, time_step:time_step + 1]

                # mt: attention-weighted mixture of the keyword embeddings.
                mt = tf.add_n([prob_p[:, i:i + 1] * keyword_inputs[:, i, :]
                               for i in range(config.num_keywords)])

                with tf.variable_scope("RNN_sentence"):
                    if time_step > 0:
                        tf.get_variable_scope().reuse_variables()
                    (cell_output, state) = cell(tf.concat([inputs[:, time_step, :], mt], axis=1), state)
                    outputs.append(cell_output)
                    output_state = cell_output

            self._end_output = cell_output

        self.output2 = atten_sum
        output = tf.reshape(tf.concat(outputs, axis=1), [-1, size])

        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(output, softmax_w) + softmax_b

        self._final_state = state
        self._prob = tf.nn.softmax(logits)

    @property
    def input_data(self):
        return self._input_data

    @property
    def end_output(self):
        return self._end_output

    @property
    def targets(self):
        return self._targets

    @property
    def initial_state(self):
        return self._initial_state

    @property
    def cost(self):
        # Note: self._cost is only set in the training graph, not here.
        return self._cost

    @property
    def final_state(self):
        return self._final_state


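# run_epoch performs a single decoding step. flag == 0 is the first step
# (fresh coverage gate, zero initial output) and also fetches phi; flag == 1
# continues from a previous step's state, output, and gate, as beam search
# requires.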
def run_epoch(session, m, data, eval_op, state=None, is_test=False, input_words=None,
              verbose=False, flag=1, last_output=None, last_gate=None, lens=None):
    """Runs one decoding step of the model on the given data."""
    x = data.reshape((1, 1))
    initial_output = np.zeros((m.batch_size, m.size))
    if flag == 0:
        prob, _state, _last_output, _last_gate, weight, _phi, _ = session.run(
            [m._prob, m.final_state, m.end_output, m.output_gate,
             m.attention_weight, m.output1, eval_op],
            {m.input_data: x,
             m._input_word: input_words,
             m.initial_state: state,
             m._init_output: initial_output,
             m.seq_length: [[lens]]})
        return prob, _state, _last_output, _last_gate, weight, _phi
    else:
        prob, _state, _last_output, _last_gate, weight, _ = session.run(
            [m._prob, m.final_state, m.end_output, m.output_gate,
             m.attention_weight, eval_op],
            {m.input_data: x,
             m._input_word: input_words,
             m.initial_state: state,
             m._init_output: last_output,
             m.seq_length: [[lens]],
             m.initial_gate: last_gate})
        return prob, _state, _last_output, _last_gate, weight

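# Generation entry point: restores a trained checkpoint and decodes
# len_of_generation words with beam search (or sampling, when
# gen_config.is_sample is set), conditioned on five topic keywords.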
def main(_):
    with tf.Graph().as_default(), tf.Session(config=config_tf) as session:

        gen_config.batch_size = 1
        gen_config.num_steps = 1

        beam_size = gen_config.BeamSize

        initializer = tf.random_uniform_initializer(-gen_config.init_scale,
                                                    gen_config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            mtest = Model(is_training=False, config=gen_config)

        tf.global_variables_initializer().run()

        model_saver = tf.train.Saver(tf.global_variables())
        print 'model loading ...'
        model_saver.restore(session, gen_config.model_path + '--%d' % gen_config.save_time)
        print 'Done!'

        # Topic keywords (Chinese): 'faith', 'life', 'loss', 'soul', 'unyielding'.
        test_word = [u'信念', u'人生', u'失落', u'心灵', u'不屈']
        len_of_sample = gen_config.len_of_generation

        _state = mtest.initial_state.eval()
        tmp = []
        # Each beam is (log-prob, generated words, last word index); index 1
        # is assumed to be the start-of-sequence token.
        beams = [(0.0, [idx_to_word[1]], 1)]
        for wd in test_word:
            tmp.append(word_to_idx[wd])
        _input_words = np.array([tmp], dtype=np.int32)
        test_data = np.int32([1])
        prob, _state, _last_output, _last_gate, weight, _phi = run_epoch(
            session, mtest, test_data, tf.no_op(), _state, True,
            input_words=_input_words, flag=0, lens=len_of_sample)
        y1 = np.log(1e-20 + prob.reshape(-1))
        if gen_config.is_sample:
            try:
                top_indices = np.random.choice(gen_config.vocab_size, beam_size,
                                               replace=False, p=prob.reshape(-1))
            except ValueError:
                # Fewer than beam_size words have non-zero probability.
                top_indices = np.random.choice(gen_config.vocab_size, beam_size,
                                               replace=True, p=prob.reshape(-1))
        else:
            top_indices = np.argsort(-y1)
        b = beams[0]
        beam_candidates = []
        for i in xrange(beam_size):
            wordix = top_indices[i]
            beam_candidates.append((b[0] + y1[wordix], b[1] + [idx_to_word[wordix]],
                                    wordix, _state, _last_output, _last_gate))
        beam_candidates.sort(key=lambda x: x[0], reverse=True)  # decreasing log-prob
        beams = beam_candidates[:beam_size]  # truncate to get new beams
        for xy in range(len_of_sample - 1):
            beam_candidates = []
            for b in beams:
                test_data = np.int32(b[2])
                prob, _state, _last_output, _last_gate, weight = run_epoch(
                    session, mtest, test_data, tf.no_op(), b[3], True,
                    input_words=_input_words, flag=1,
                    last_output=b[4], last_gate=b[5], lens=len_of_sample)
                y1 = np.log(1e-20 + prob.reshape(-1))
                if gen_config.is_sample:
                    try:
                        top_indices = np.random.choice(gen_config.vocab_size, beam_size,
                                                       replace=False, p=prob.reshape(-1))
                    except ValueError:
                        top_indices = np.random.choice(gen_config.vocab_size, beam_size,
                                                       replace=True, p=prob.reshape(-1))
                else:
                    top_indices = np.argsort(-y1)
                for i in xrange(beam_size):
                    wordix = top_indices[i]
                    beam_candidates.append((b[0] + y1[wordix], b[1] + [idx_to_word[wordix]],
                                            wordix, _state, _last_output, _last_gate))
            beam_candidates.sort(key=lambda x: x[0], reverse=True)  # decreasing log-prob
            beams = beam_candidates[:beam_size]  # truncate to get new beams

    print ' '.join(beams[0][1][1:]).encode('utf-8')
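
# Note: this script assumes Python 2 (cPickle, print statements, xrange),
# a TF 1.x-era graph API, the pickled 'word_vec.pkl' / 'word_voc.pkl'
# files, and a trained checkpoint at gen_config.model_path.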

if __name__ == "__main__":
    tf.app.run()