-
Notifications
You must be signed in to change notification settings - Fork 1
/
mimic.py
64 lines (46 loc) · 2.11 KB
/
mimic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
def _read_notes_csv(data_dir, filename):
    """Load ``filename`` from ``data_dir``, failing early with a clear message."""
    path = os.path.join(data_dir, filename)
    if not os.path.isfile(path):
        # Check the file that is actually about to be read, and stop here
        # instead of printing a warning and carrying on.
        raise FileNotFoundError('ERROR: Cannot find {}'.format(path))
    return pd.read_csv(path)


def get_mimic_notes(races=('eth_white', 'eth_black'), all_races=False,
                    data_dir='../data'):
    """Load the patient-notes table for mortality prediction from text.

    Two modes are supported:

    * ``all_races=False`` (default): read ``patients_notes.csv``, keep the
      rows where either of the first two columns named in ``races`` equals 1,
      and return only the note text, the target, and a binary column that is
      1 when ``races[1]`` equals 1 and 0 otherwise.
    * ``all_races=True``: read ``patients_notes_insur.csv`` and return every
      row, with added insurance-group dummy columns and ``male`` / ``female``
      columns derived from ``gender``.

    Args:
        races: Names of two columns to compare. The source lists the options
            as eth_white, eth_black, eth_hispanic, eth_asian, eth_other.
            Only the first two entries are used. Ignored when ``all_races``
            is true.
        all_races: Select the second mode described above.
        data_dir: Directory containing the CSV files.

    Returns:
        A 4-tuple ``(frame, 'chartext', 'mort_hosp', protected)`` where
        ``protected`` is the name of the binary column (for example
        ``'white-OR-black'``) in the two-race mode, or the list of the five
        ``eth_*`` column names in the all-races mode.

    Raises:
        FileNotFoundError: If the required CSV file is not in ``data_dir``.
    """
    if all_races:
        races_lst = ['eth_asian', 'eth_black',
                     'eth_hispanic', 'eth_other', 'eth_white']
        df = _read_notes_csv(data_dir, 'patients_notes_insur.csv')

        # Collapse payer names into coarse groups; any payer not listed maps
        # to None and therefore gets no dummy column of its own.
        insurance_groups = {
            'Medicare': 'public',
            'Medicaid': 'public',
            'Government': 'public',
            'Private': 'private',
            'Self Pay': 'other',
        }
        df['insur_group'] = df['insurance'].apply(
            lambda payer: insurance_groups.get(payer))
        dummies = pd.get_dummies(df[['insur_group']])
        for column in dummies.columns:
            df[column] = dummies[column]
        # The 'other' dummy only exists when some row is 'Self Pay'; do not
        # fail when it is absent.
        df = df.drop('insur_group_other', axis=1, errors='ignore')

        # NOTE(review): presumably gender is coded 0/1 with 1 = male - confirm.
        df['male'] = df['gender']
        df['female'] = 1 - df['gender']
        return df, 'chartext', 'mort_hosp', races_lst

    first_race, second_race = races[0], races[1]
    race_label = '-OR-'.join(
        name.replace('eth_', '') for name in (first_race, second_race))

    df = _read_notes_csv(data_dir, 'patients_notes.csv')
    in_either_group = (df[first_race] == 1) | (df[second_race] == 1)
    # Take an explicit copy so adding the label column never writes into a
    # view of ``df`` (avoids pandas' chained-assignment pitfalls).
    sub_df = df.loc[in_either_group, ['chartext', 'mort_hosp']].copy()
    sub_df[race_label] = (df.loc[in_either_group, second_race] == 1).astype(int)
    return sub_df, 'chartext', 'mort_hosp', race_label