segment.py
import os
import json
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
import gensim
from conf import MIMIC3_PATH, EXTRACTED_PATH, TITLE_PATH, SECTION_PATH


def find_index(words, name_words, names):
    """Return the start index of the title `name_words` inside `words`, or -1 if absent."""
    len_name_words = len(name_words)
    # `+ 1` so a title sitting at the very end of the note can still be matched.
    for i in range(len(words) - len_name_words + 1):
        candidate = words[i:(i + len_name_words)]
        if candidate == name_words:
            if i == 0:
                return i
            # 327, 109, 50 and 562 are hard-coded word IDs from the vocabulary built in
            # load_vocab(). If the preceding word is not one of them and extends the
            # match to a longer title that was already tried, skip this occurrence.
            if words[i - 1] not in [327, 109, 50, 562]:
                if [words[i - 1]] + candidate in names:
                    continue
            return i
    return -1
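# A quick sanity check on toy word IDs (hypothetical values, not from the real
# vocabulary), illustrating the intended behaviour of find_index:
#   find_index([10, 20, 30, 40, 50], [30, 40], [])             -> 2
#   find_index([10, 20, 30, 40, 50], [60, 70], [])             -> -1
#   find_index([10, 20, 30, 40, 50], [30, 40], [[20, 30, 40]]) -> -1  (longer title already tried)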


def segmentation_words(words, sections):
    # Locate the first occurrence of each section title, trying its synonyms in order.
    section_indices = {}
    for section, names in sections.items():
        for name_index, name_words in enumerate(names):
            index = find_index(words, name_words, names[:name_index])
            if index != -1:
                section_indices[section] = (index, index + len(name_words))
                break
    # Sort the matched titles by position and append a sentinel marking the end of the note.
    section_indices = sorted(section_indices.items(), key=lambda e: e[1][0])
    section_indices.append(('', (len(words), 0)))
    # A section's body runs from the end of its title to the start of the next title.
    note_sections = {}
    for i, (section, (start, end)) in enumerate(section_indices[:-1]):
        next_start = section_indices[i + 1][1][0]
        if next_start < end:
            note_sections[section] = []
        else:
            note_sections[section] = words[end:next_start]
    # Notes in which no title was found go entirely into 'others'.
    if len(note_sections) == 0:
        note_sections['others'] = words
    else:
        note_sections['others'] = []
    # Keep only the sections listed in `sections`, filling missing ones with empty lists
    # (so 'others' survives only if it is itself a key of `sections`).
    all_note_sections = {}
    for section in sections:
        all_note_sections[section] = note_sections[section] if section in note_sections else []
    return all_note_sections
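# Example with toy IDs and hypothetical section keys: for
# sections = {'hpi': [[1, 2]], 'pmh': [[3]]} and words = [1, 2, 9, 9, 3, 8],
# the 'hpi' title matches at positions 0-1 and the 'pmh' title at position 4,
# so the result is {'hpi': [9, 9], 'pmh': [8]} ('others' would appear, empty,
# only if it were itself a key of `sections`).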


def segmentation_dataset(dataset, section_titles, tokenizer, word2id, id2word):
    # Convert every section title synonym into a list of word IDs.
    section_ids = {}
    for section, names in section_titles.items():
        name_ids = []
        for name in names:
            words = tokenizer.tokenize(name)
            word_ids = [word2id[word] for word in words]
            name_ids.append(word_ids)
        section_ids[section] = name_ids
    result = []
    for sample in tqdm(dataset):
        # Tokenise the note, drop purely numeric tokens, and map words to IDs.
        text = sample['text']
        words = [word for word in tokenizer.tokenize(text.lower()) if not word.isnumeric()]
        word_ids = [word2id.get(word, word2id['**UNK**']) for word in words]
        # Split the note into sections, then map the IDs back to space-joined words.
        note_sections = segmentation_words(word_ids, section_ids)
        for section, section_word_ids in note_sections.items():
            note_sections[section] = ' '.join([id2word[word_id] for word_id in section_word_ids])
        note = {
            'hadm_id': sample['hadm_id'],
            'labels': sample['labels'],
            'sections': note_sections,
            'text': text
        }
        result.append(note)
    return result
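# Each record produced above keeps the original note plus its segmented text:
#   {'hadm_id': ..., 'labels': [...],
#    'sections': {'<section name>': '<space-joined section words>', ...},
#    'text': '<original note text>'}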


def load_vocab(path):
    # The vocabulary comes from a pretrained gensim word2vec model.
    model = gensim.models.Word2Vec.load(path)
    words = list(model.wv.key_to_index)
    del model
    # Keep only words that also appear in the MIMIC-III word-count dictionary.
    with open(os.path.join(MIMIC3_PATH, 'word_count_dict.json'), 'r') as f:
        word_count_dict = json.load(f)
    words = [w for w in words if w in word_count_dict]
    # Make sure the special tokens are always present.
    for w in ['**UNK**', '**PAD**', '**MASK**']:
        if w not in words:
            words = words + [w]
    word2id = {word: idx for idx, word in enumerate(words)}
    id2word = {idx: word for idx, word in enumerate(words)}
    return word2id, id2word


if __name__ == '__main__':
    if not os.path.isdir(SECTION_PATH):
        os.makedirs(SECTION_PATH)
    with open(os.path.join(TITLE_PATH, 'title_synonyms.json')) as f:
        section_titles = json.load(f)
    print(section_titles)
    tokenizer = RegexpTokenizer(r'\w+')
    word2id, id2word = load_vocab(os.path.join(MIMIC3_PATH, 'word2vec_sg0_100.model'))
    for task_name in ['mimic3', 'mimic3-50', 'mimic3-50l']:
        for version in ['train', 'dev', 'test']:
            print(f'{task_name}_{version}')
            with open(os.path.join(EXTRACTED_PATH, f'{task_name}_{version}.json'), encoding='utf-8') as f:
                dataset = json.load(f)
            new_dataset = segmentation_dataset(dataset, section_titles, tokenizer, word2id, id2word)
            with open(os.path.join(SECTION_PATH, f'{task_name}_{version}.json'), 'w', encoding='utf-8') as f:
                json.dump(new_dataset, f, indent=4)