forked from zhenming33/RAN_torch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
char2seq.py
93 lines (80 loc) · 2.43 KB
/
char2seq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 5 14:27:35 2018
@author: ensur
"""
import os
import numpy as np
import pickle
# Build the one-level decomposition table ``char_seq``: each character listed
# in the IDS file maps to the component string given on its line.
# The first two lines of the file are skipped (presumably a header - confirm
# against the data file).
with open('cjkvi-ids/ids.txt', encoding='UTF-8') as ids_file:
    lines = ids_file.readlines()[2:]

# The twelve ideographic description characters and the letters A, H, U and X
# map to themselves, so they count as atomic symbols.
char_seq = {symbol: symbol for symbol in '⿰⿱⿵⿻⿺⿹⿶⿳⿴⿸⿷⿲AHUX'}

# Deletes spaces, square brackets and the letters G, T, J, K, V in one pass
# (these look like the bracketed source tags of ids.txt - TODO confirm).
_STRIP_TABLE = str.maketrans('', '', ' []GTJKV')

_sequences = []
for line in lines:
    # Keep the text before the first space, then split the tab-separated
    # fields: field 1 is the character, field 2 its component sequence.
    fields = line.split(' ')[0].replace('\n', '').split('\t')
    seq = fields[2].translate(_STRIP_TABLE)
    char_seq[fields[1]] = seq
    _sequences.append(seq)

# Sanity check: every component used on any line must itself have an entry in
# the table. All missing components are reported together in one KeyError.
_missing = sorted({c for seq in _sequences for c in seq if c not in char_seq})
if _missing:
    raise KeyError(
        'components without an entry in char_seq: {}'.format(''.join(_missing))
    )
# Check whether a sequence is already fully expanded.
def is_all(seq):
    """Return True when every symbol of *seq* is atomic.

    A symbol is atomic when its entry in the module-level ``char_seq`` table
    is at most one character long. Every symbol of *seq* is looked up, so a
    symbol without an entry is always reported.

    An empty *seq* has nothing left to expand and therefore yields True
    (``default=0`` avoids the ValueError ``max`` raises on an empty input).

    Args:
        seq: iterable of single-character symbols, typically a string.

    Returns:
        bool: False if any symbol maps to more than one character.

    Raises:
        KeyError: if a symbol of *seq* has no entry in ``char_seq``.
    """
    return max((len(char_seq[c]) for c in seq), default=0) <= 1
# Expand every character of the input file down to atomic symbols only.
char_seq_all = {}
for i, line in enumerate(lines):
    print(i)
    a = line.split(' ')[0].replace('\n', '').split('\t')
    char = a[1]
    seq_tmp = char_seq[char]
    # Substitute one level per pass until is_all() reports that no symbol is
    # left to expand. Only entries longer than one character are substituted;
    # shorter entries keep the symbol itself.
    # NOTE(review): a decomposition that refers back to itself would make
    # this loop run forever - the input data is assumed to be acyclic.
    while not is_all(seq_tmp):
        seq_tmp = ''.join(
            char_seq[c] if len(char_seq[c]) > 1 else c for c in seq_tmp
        )
    print(seq_tmp)
    char_seq_all[char] = seq_tmp

# The alphabet is the set of distinct symbols used by any expanded sequence.
# It is sorted so that the symbol -> index assignment is identical on every
# run (plain set iteration order of strings changes between interpreter runs).
alphabet = ''.join(sorted({c for seq in char_seq_all.values() for c in seq}))
print(len(alphabet))

# Encode each expanded sequence as a list of positions in the alphabet.
_symbol_index = {symbol: idx for idx, symbol in enumerate(alphabet)}
char_seq_index = {
    char: [_symbol_index[c] for c in seq]
    for char, seq in char_seq_all.items()
}

# Save the expanded symbol sequences.
with open('char2seq_dict_real.pkl', 'wb') as save_file:
    pickle.dump(char_seq_all, save_file)

# Save the index-encoded sequences.
with open('char2seq_dict.pkl', 'wb') as save_file:
    pickle.dump(char_seq_index, save_file)

# Save the alphabet.
with open('radical_alphabet.txt', 'w', encoding='utf-8') as f:
    f.write(alphabet)