Skip to content

Commit

Permalink
Fix some changes
Browse files Browse the repository at this point in the history
  • Loading branch information
gy910210 committed May 5, 2015
1 parent abc598a commit 791ca42
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 68 deletions.
5 changes: 5 additions & 0 deletions dump_training_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ def dump_training_data(origin_file, format_file):
print 'cnt: ' + str(cnt)

data = json.loads(line.strip())
if (data['cor_type'] == 'cor') or (data['cor_type'] == 'spe_cor'):
continue

if data['match_type'] == 'precise':
outfile.write(data['word'] + '\t' + data['key'] + '\t' + str(data['cnt']) + '\n')
elif data['match_type'] == 'predict':
if data['cor_type'] == '':
outfile.write(data['word'] + '\t' + data['word'] + '\t' + str(data['cnt']) + '\n')
'''
elif data['cor_type'] == 'cor':
error_word = ''
for i in xrange(len(data['word'])):
Expand All @@ -28,6 +32,7 @@ def dump_training_data(origin_file, format_file):
else:
error_word = error_word + data['word'][i]
outfile.write(data['word'] + '\t' + error_word + '\t' + str(data['cnt']) + '\n')
'''
elif data['cor_type'] == 'spe':
error_word = data['spell_info']['spell_in']
for i in xrange(len(data['word'])):
Expand Down
24 changes: 0 additions & 24 deletions edit_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,29 +47,5 @@ def edit(string1, string2):

return record[m][n], pos_vector

'''
def edit_dl(string1, string2):
    """Return a Damerau-Levenshtein-style edit distance between two strings.

    Dynamic programming over an (m+1) x (n+1) table where record[i][j] is
    the distance between string1[:i] and string2[:j]. A mismatched character
    pair costs 0 (instead of 1) when it lines up as a swap of adjacent
    characters, i.e. string1[i-1] == string2[j-2] and string1[i-2] == string2[j-1].

    NOTE(review): this free-substitution shortcut approximates the optimal
    string alignment transposition step rather than implementing it exactly;
    preserved as-is. Fixed here: Python-2-only xrange -> range so the function
    also runs under Python 3 (iteration order and results are unchanged).
    """
    m = len(string1)
    n = len(string2)
    record = [[0] * (n + 1) for _ in range(m + 1)]
    # Base cases: distance from/to an empty prefix is the prefix length.
    for col in range(n + 1):
        record[0][col] = col
    for row in range(m + 1):
        record[row][0] = row
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            tmp = 0  # substitution cost for this cell
            if string1[i - 1] != string2[j - 1]:
                if i == 1 or j == 1:
                    tmp = 1
                elif string1[i - 1] != string2[j - 2] or string1[i - 2] != string2[j - 1]:
                    tmp = 1
                # else: swap-shaped mismatch -> substitution is free
            record[i][j] = min(record[i - 1][j] + 1,        # deletion
                               record[i][j - 1] + 1,        # insertion
                               record[i - 1][j - 1] + tmp)  # substitution
    return record[m][n]
'''

# Ad-hoc smoke run (Python 2 print syntax): prints whatever `edit` returns for
# a sample pair -- `edit` is defined earlier in this file and, per the visible
# return at line "return record[m][n], pos_vector", yields a (distance,
# positions) tuple.
if __name__ == '__main__':
print edit('you', 'bu')
45 changes: 1 addition & 44 deletions error_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ def split_error(word, key, context_num=2):
for pos in xrange(len(pos_vector)):
if pos_vector[pos][0] != pos_vector[pos][1]:
re_list.append(pos_vector[pos][0] + '|' + pos_vector[pos][1])
# print pos_vector[pos][0] + '\t' + pos_vector[pos][1]

for slide in xrange(1, context_num + 1):
head = max(0, pos - slide)
Expand All @@ -25,63 +24,21 @@ def split_error(word, key, context_num=2):
if pos_vector[i + ii][1] != '#':
slice_s += pos_vector[i + ii][1]
re_list.append(slice_w + '|' + slice_s)
# print slice_w + '\t' + slice_s
return re_list


def dump_error_data(train_file, error_file, context_num=2):
    """Split training rows into error fragments and write them to a file.

    Reads tab-separated rows from ``train_file`` -- fields used are
    (word, key, count) -- and writes one line per fragment produced by
    ``split_error(word, key, context_num)``, formatted '<fragment>\t<count>'.

    Rows whose word equals its key carry no error information and are skipped.

    Fixes vs. the original: removed the dead ``error_map`` local (its only
    consumer is commented out below), removed commented-out debug prints, and
    switched to ``with`` so both files are closed even if split_error raises.
    """
    print('read train data and split...')
    with open(train_file, 'r') as infile, open(error_file, 'w') as outfile:
        for line in infile:
            strings = line.strip().split('\t')
            # word == key means nothing was corrected: skip.
            if strings[0] == strings[1]:
                continue
            for pair in split_error(strings[0], strings[1], context_num):
                outfile.write(pair + '\t' + str(strings[2]) + '\n')

'''
print 'dump error data...'
# error_list = error_map.items()
# error_list.sort(key=lambda item: item[1])
for key in error_map.keys():
outfile.write(key + '\t' + str(error_map[key]) + '\n')
outfile.flush()
'''


'''
# NOTE(review): dead code, kept inside a triple-quoted string. It calls
# dump_error_data with five arguments (adding start/end line bounds) while
# the live dump_error_data above accepts only three parameters -- presumably
# this matched an older signature; confirm before resurrecting.
# Purpose: process train_file in split_file_num shards, writing each shard's
# errors to error_file + '_<i>.txt'.
def dump_error_data_split(train_file, error_file, split_file_num=100, context_num=2):
# First pass: count input lines so the work can be divided into even shards.
line_num = 0
infile = open(train_file, 'r')
for line in infile:
line_num += 1
infile.close()
print 'line num: ' + str(line_num)
line_num_each_file = int(line_num / split_file_num)
for i in xrange(split_file_num):
# print 'split file num: ' + str(i)
print str(i*line_num_each_file) + ' ~ ' + str((i+1)*line_num_each_file)
# The last shard absorbs the remainder lines (line_num may not divide evenly).
if i != split_file_num - 1:
dump_error_data(train_file, error_file+'_'+str(i)+'.txt', context_num,
i*line_num_each_file, (i+1)*line_num_each_file)
else:
dump_error_data(train_file, error_file+'_'+str(i)+'.txt', context_num,
i*line_num_each_file, line_num)
# Uses gc and time -- assumes 'import gc' / 'import time' at the top of the
# file, which is not visible in this chunk; confirm.
gc.collect()
time.sleep(5)
'''

# Script entry point: build error_data.txt from training_data.txt with the
# default context window. The commented lines are earlier debugging variants
# (the sharded dump and a single-pair split check).
if __name__ == '__main__':
# dump_error_data_split('training_data.txt', 'error\\error_data')
dump_error_data('training_data.txt', 'error_data.txt')
# print split_error('you', 'yiu')

0 comments on commit 791ca42

Please sign in to comment.