From 791ca42b6a1e98db4629e7062c5ad552956c8775 Mon Sep 17 00:00:00 2001
From: Yu Gong
Date: Tue, 5 May 2015 13:13:25 +0800
Subject: [PATCH] Fix some changes

---
 dump_training_data.py |  5 +++++
 edit_distance.py      | 24 ------------------------
 error_model.py        | 45 +--------------------------------------------
 3 files changed, 6 insertions(+), 68 deletions(-)

diff --git a/dump_training_data.py b/dump_training_data.py
index acc53cc..aabaa07 100644
--- a/dump_training_data.py
+++ b/dump_training_data.py
@@ -15,11 +15,15 @@ def dump_training_data(origin_file, format_file):
             print 'cnt: ' + str(cnt)
 
         data = json.loads(line.strip())
+        if (data['cor_type'] == 'cor') or (data['cor_type'] == 'spe_cor'):
+            continue
+
         if data['match_type'] == 'precise':
             outfile.write(data['word'] + '\t' + data['key'] + '\t' + str(data['cnt']) + '\n')
         elif data['match_type'] == 'predict':
             if data['cor_type'] == '':
                 outfile.write(data['word'] + '\t' + data['word'] + '\t' + str(data['cnt']) + '\n')
+            '''
             elif data['cor_type'] == 'cor':
                 error_word = ''
                 for i in xrange(len(data['word'])):
@@ -28,6 +32,7 @@
                     else:
                         error_word = error_word + data['word'][i]
                 outfile.write(data['word'] + '\t' + error_word + '\t' + str(data['cnt']) + '\n')
+            '''
             elif data['cor_type'] == 'spe':
                 error_word = data['spell_info']['spell_in']
                 for i in xrange(len(data['word'])):
diff --git a/edit_distance.py b/edit_distance.py
index 42e5e0e..430bb3b 100644
--- a/edit_distance.py
+++ b/edit_distance.py
@@ -47,29 +47,5 @@ def edit(string1, string2):
     return record[m][n], pos_vector
 
 
-'''
-def edit_dl(string1, string2):
-    m = len(string1)
-    n = len(string2)
-    record = [[0] * (n + 1) for i in xrange(m + 1)]
-
-    for ii in xrange(n + 1):
-        record[0][ii] = ii
-
-    for jj in xrange(m + 1):
-        record[jj][0] = jj
-
-    for i in xrange(1, m + 1):
-        for j in xrange(1, n + 1):
-            tmp = 0
-            if string1[i - 1] != string2[j - 1]:
-                if i == 1 or j == 1:
-                    tmp = 1
-                elif string1[i - 1] != string2[j - 2] or string1[i - 2] != string2[j - 1]:
-                    tmp = 1
-            record[i][j] = min(min(record[i - 1][j] + 1, record[i][j - 1] + 1), record[i - 1][j - 1] + tmp)
-    return record[m][n]
-'''
-
 if __name__ == '__main__':
     print edit('you', 'bu')
diff --git a/error_model.py b/error_model.py
index a44f962..88bce4b 100644
--- a/error_model.py
+++ b/error_model.py
@@ -10,7 +10,6 @@ def split_error(word, key, context_num=2):
     for pos in xrange(len(pos_vector)):
         if pos_vector[pos][0] != pos_vector[pos][1]:
             re_list.append(pos_vector[pos][0] + '|' + pos_vector[pos][1])
-            # print pos_vector[pos][0] + '\t' + pos_vector[pos][1]
 
             for slide in xrange(1, context_num + 1):
                 head = max(0, pos - slide)
@@ -25,63 +24,21 @@
                     if pos_vector[i + ii][1] != '#':
                         slice_s += pos_vector[i + ii][1]
                 re_list.append(slice_w + '|' + slice_s)
-                # print slice_w + '\t' + slice_s
 
     return re_list
 
 def dump_error_data(train_file, error_file, context_num=2):
     infile = open(train_file, 'r')
     outfile = open(error_file, 'w')
-    # error_map = {}
+    error_map = {}
 
-    print 'read train data and split...'
-    for line in infile:
-        strings = line.strip().split('\t')
-        if strings[0] == strings[1]:
-            continue
-        else:
-            # print strings[0], strings[1]
-            split_list = split_error(strings[0], strings[1], context_num)
-            for pair in split_list:
-                outfile.write(pair + '\t' + str(strings[2]) + '\n')
-            outfile.flush()
-    infile.close()
-    outfile.close()
-
-    '''
     print 'dump error data...'
     # error_list = error_map.items()
     # error_list.sort(key=lambda item: item[1])
     for key in error_map.keys():
         outfile.write(key + '\t' + str(error_map[key]) + '\n')
         outfile.flush()
-    '''
-
-
-'''
-def dump_error_data_split(train_file, error_file, split_file_num=100, context_num=2):
-    line_num = 0
-    infile = open(train_file, 'r')
-    for line in infile:
-        line_num += 1
-    infile.close()
-    print 'line num: ' + str(line_num)
-
-    line_num_each_file = int(line_num / split_file_num)
-    for i in xrange(split_file_num):
-        # print 'split file num: ' + str(i)
-        print str(i*line_num_each_file) + ' ~ ' + str((i+1)*line_num_each_file)
-        if i != split_file_num - 1:
-            dump_error_data(train_file, error_file+'_'+str(i)+'.txt', context_num,
-                            i*line_num_each_file, (i+1)*line_num_each_file)
-        else:
-            dump_error_data(train_file, error_file+'_'+str(i)+'.txt', context_num,
-                            i*line_num_each_file, line_num)
-        gc.collect()
-        time.sleep(5)
-'''
 
 if __name__ == '__main__':
-    # dump_error_data_split('training_data.txt', 'error\\error_data')
     dump_error_data('training_data.txt', 'error_data.txt')
     # print split_error('you', 'yiu')
\ No newline at end of file