-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_phrase_ids.py
55 lines (46 loc) · 1.62 KB
/
generate_phrase_ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# author: wangjh237@mail2.sysu.edu.cn
# Generate the phrase ids of the sentences in the train, dev and test splits.
# Each of the 3 generated files lists the phrase ids of one split; the
# corresponding sentences can then be looked up by id in dictionary.txt.
from pathlib import Path

# Split labels used in datasetSplit.txt, mapped to the file that receives the
# phrase ids of that split. Note that label '2' is the test set and '3' is dev.
_SPLIT_FILENAMES = {
    '1': 'phrase_ids.train.txt',
    '2': 'phrase_ids.test.txt',
    '3': 'phrase_ids.dev.txt',
}


def _load_split_sentence_ids(path):
    """Group the sentence ids listed in *path* by their split label.

    Each data row has the form ``sentence_id,split_id``; the first line of the
    file is skipped. Returns a dict mapping every split label ('1', '2', '3')
    to the list of sentence ids assigned to it, in file order.
    """
    sentence_ids_by_split = {split_id: [] for split_id in _SPLIT_FILENAMES}
    with path.open('r', encoding='utf-8') as f:
        next(f, None)  # skip the first line (column header)
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline
            sentence_id, split_id = line.split(',')
            sentence_ids_by_split[split_id].append(sentence_id)
    return sentence_ids_by_split


def _load_sentences(path):
    """Return a ``{sentence_id: sentence}`` dict read from *path*.

    Each data row has the form ``sentence_id<TAB>sentence``; the first line of
    the file is skipped.
    """
    sentences = {}
    with path.open('r', encoding='utf-8') as f:
        next(f, None)  # skip the first line (column header)
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Split on the first tab only so the sentence text stays intact.
            sentence_id, sentence = line.split('\t', 1)
            sentences[sentence_id] = sentence
    return sentences


def _load_phrase_ids(path):
    """Return a ``{phrase: phrase_id}`` dict read from *path*.

    Each row has the form ``phrase|phrase_id``. Literal parentheses in the
    phrase are rewritten to ``-LRB-`` / ``-RRB-`` so that the keys can be
    looked up with the sentence text from the sentences file.
    """
    phrase_ids = {}
    with path.open('r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # The id is the last field, so split from the right; this keeps a
            # phrase that itself contains '|' in one piece.
            phrase, phrase_id = line.rsplit('|', 1)
            phrase = phrase.replace('(', '-LRB-').replace(')', '-RRB-')
            phrase_ids[phrase] = phrase_id
    return phrase_ids


def main(data_dir='.', out_dir='..'):
    """Write one phrase-id file per dataset split.

    Reads ``datasetSplit.txt``, ``datasetSentences.txt`` and ``dictionary.txt``
    from *data_dir* and writes ``phrase_ids.train.txt``,
    ``phrase_ids.test.txt`` and ``phrase_ids.dev.txt`` into *out_dir*, each
    holding one phrase id per line for the sentences of that split. All files
    are read and written as UTF-8.

    Args:
        data_dir: Directory containing the three input files (default: the
            current working directory).
        out_dir: Directory the output files are written to (default: the
            parent of the current working directory).

    Raises:
        KeyError: If a row uses an unknown split label, a sentence id has no
            sentence, or a sentence is not a phrase in the dictionary.
        ValueError: If an input row does not have the expected two fields.
    """
    data_dir = Path(data_dir)
    out_dir = Path(out_dir)

    sentence_ids_by_split = _load_split_sentence_ids(data_dir / 'datasetSplit.txt')
    sentences = _load_sentences(data_dir / 'datasetSentences.txt')
    phrase_ids = _load_phrase_ids(data_dir / 'dictionary.txt')

    for split_id, sentence_ids in sentence_ids_by_split.items():
        # Resolve every id before opening the output, so a failed lookup does
        # not leave a truncated file behind.
        split_phrase_ids = [phrase_ids[sentences[sid]] for sid in sentence_ids]
        out_path = out_dir / _SPLIT_FILENAMES[split_id]
        with out_path.open('w', encoding='utf-8') as f:
            f.writelines(phrase_id + '\n' for phrase_id in split_phrase_ids)


if __name__ == '__main__':
    main()