Skip to content

Commit

Permalink
Fix some changes
Browse files Browse the repository at this point in the history
  • Loading branch information
gy910210 committed May 5, 2015
1 parent abc598a commit 791ca42
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 68 deletions.
5 changes: 5 additions & 0 deletions dump_training_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ def dump_training_data(origin_file, format_file):
print 'cnt: ' + str(cnt)

data = json.loads(line.strip())
if (data['cor_type'] == 'cor') or (data['cor_type'] == 'spe_cor'):
continue

if data['match_type'] == 'precise':
outfile.write(data['word'] + '\t' + data['key'] + '\t' + str(data['cnt']) + '\n')
elif data['match_type'] == 'predict':
if data['cor_type'] == '':
outfile.write(data['word'] + '\t' + data['word'] + '\t' + str(data['cnt']) + '\n')
'''
elif data['cor_type'] == 'cor':
error_word = ''
for i in xrange(len(data['word'])):
Expand All @@ -28,6 +32,7 @@ def dump_training_data(origin_file, format_file):
else:
error_word = error_word + data['word'][i]
outfile.write(data['word'] + '\t' + error_word + '\t' + str(data['cnt']) + '\n')
'''
elif data['cor_type'] == 'spe':
error_word = data['spell_info']['spell_in']
for i in xrange(len(data['word'])):
Expand Down
24 changes: 0 additions & 24 deletions edit_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,29 +47,5 @@ def edit(string1, string2):

return record[m][n], pos_vector

'''
def edit_dl(string1, string2):
    """Return a Damerau-Levenshtein-style edit distance between two strings.

    Dynamic programming over an (m+1) x (n+1) table where record[i][j] is
    the distance between string1[:i] and string2[:j]. A mismatched character
    pair costs 0 (instead of 1) when it lines up as a swap of adjacent
    characters, i.e. string1[i-1] == string2[j-2] and string1[i-2] == string2[j-1].

    NOTE(review): this free-substitution shortcut approximates the optimal
    string alignment transposition step rather than implementing it exactly;
    preserved as-is. Fixed here: Python-2-only xrange -> range so the function
    also runs under Python 3 (iteration order and results are unchanged).
    """
    m = len(string1)
    n = len(string2)
    record = [[0] * (n + 1) for _ in range(m + 1)]
    # Base cases: distance from/to an empty prefix is the prefix length.
    for col in range(n + 1):
        record[0][col] = col
    for row in range(m + 1):
        record[row][0] = row
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            tmp = 0  # substitution cost for this cell
            if string1[i - 1] != string2[j - 1]:
                if i == 1 or j == 1:
                    tmp = 1
                elif string1[i - 1] != string2[j - 2] or string1[i - 2] != string2[j - 1]:
                    tmp = 1
                # else: swap-shaped mismatch -> substitution is free
            record[i][j] = min(record[i - 1][j] + 1,        # deletion
                               record[i][j - 1] + 1,        # insertion
                               record[i - 1][j - 1] + tmp)  # substitution
    return record[m][n]
'''

# Ad-hoc smoke run (Python 2 print syntax): prints whatever `edit` returns for
# a sample pair -- `edit` is defined earlier in this file and, per the visible
# return at line "return record[m][n], pos_vector", yields a (distance,
# positions) tuple.
if __name__ == '__main__':
print edit('you', 'bu')
45 changes: 1 addition & 44 deletions error_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ def split_error(word, key, context_num=2):
for pos in xrange(len(pos_vector)):
if pos_vector[pos][0] != pos_vector[pos][1]:
re_list.append(pos_vector[pos][0] + '|' + pos_vector[pos][1])
# print pos_vector[pos][0] + '\t' + pos_vector[pos][1]

for slide in xrange(1, context_num + 1):
head = max(0, pos - slide)
Expand All @@ -25,63 +24,21 @@ def split_error(word, key, context_num=2):
if pos_vector[i + ii][1] != '#':
slice_s += pos_vector[i + ii][1]
re_list.append(slice_w + '|' + slice_s)
# print slice_w + '\t' + slice_s
return re_list


def dump_error_data(train_file, error_file, context_num=2):
    """Split training rows into error fragments and write them to a file.

    Reads tab-separated rows from ``train_file`` -- fields used are
    (word, key, count) -- and writes one line per fragment produced by
    ``split_error(word, key, context_num)``, formatted '<fragment>\t<count>'.

    Rows whose word equals its key carry no error information and are skipped.

    Fixes vs. the original: removed the dead ``error_map`` local (its only
    consumer is commented out below), removed commented-out debug prints, and
    switched to ``with`` so both files are closed even if split_error raises.
    """
    print('read train data and split...')
    with open(train_file, 'r') as infile, open(error_file, 'w') as outfile:
        for line in infile:
            strings = line.strip().split('\t')
            # word == key means nothing was corrected: skip.
            if strings[0] == strings[1]:
                continue
            for pair in split_error(strings[0], strings[1], context_num):
                outfile.write(pair + '\t' + str(strings[2]) + '\n')

'''
print 'dump error data...'
# error_list = error_map.items()
# error_list.sort(key=lambda item: item[1])
for key in error_map.keys():
outfile.write(key + '\t' + str(error_map[key]) + '\n')
outfile.flush()
'''


'''
# NOTE(review): dead code, kept inside a triple-quoted string. It calls
# dump_error_data with five arguments (adding start/end line bounds) while
# the live dump_error_data above accepts only three parameters -- presumably
# this matched an older signature; confirm before resurrecting.
# Purpose: process train_file in split_file_num shards, writing each shard's
# errors to error_file + '_<i>.txt'.
def dump_error_data_split(train_file, error_file, split_file_num=100, context_num=2):
# First pass: count input lines so the work can be divided into even shards.
line_num = 0
infile = open(train_file, 'r')
for line in infile:
line_num += 1
infile.close()
print 'line num: ' + str(line_num)
line_num_each_file = int(line_num / split_file_num)
for i in xrange(split_file_num):
# print 'split file num: ' + str(i)
print str(i*line_num_each_file) + ' ~ ' + str((i+1)*line_num_each_file)
# The last shard absorbs the remainder lines (line_num may not divide evenly).
if i != split_file_num - 1:
dump_error_data(train_file, error_file+'_'+str(i)+'.txt', context_num,
i*line_num_each_file, (i+1)*line_num_each_file)
else:
dump_error_data(train_file, error_file+'_'+str(i)+'.txt', context_num,
i*line_num_each_file, line_num)
# Uses gc and time -- assumes 'import gc' / 'import time' at the top of the
# file, which is not visible in this chunk; confirm.
gc.collect()
time.sleep(5)
'''

# Script entry point: build error_data.txt from training_data.txt with the
# default context window. The commented lines are earlier debugging variants
# (the sharded dump and a single-pair split check).
if __name__ == '__main__':
# dump_error_data_split('training_data.txt', 'error\\error_data')
dump_error_data('training_data.txt', 'error_data.txt')
# print split_error('you', 'yiu')

0 comments on commit 791ca42

Please sign in to comment.