-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
77 lines (64 loc) · 2.47 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import numpy as np
# Load data from file_dir
def load_data(file_dir):
    """Load drug-target interaction pairs from a whitespace-separated text file.

    Each non-blank line must hold exactly two tokens, ``<drug> <target>``.
    Every ':' character in the drug token is removed before the token is
    looked up. Drugs and targets are mapped to consecutive integer ids
    (starting at 0) in order of first appearance. Blank lines are skipped.

    Args:
        file_dir: path of the text file to read.

    Returns:
        A tuple ``(N, M, data, drug_ids_dict, target_ids_dict)`` where
        ``N`` is the number of distinct drugs, ``M`` the number of distinct
        targets, ``data`` a list of ``[drug_id, target_id, 1]`` records (one
        per input line, in file order), and the two dicts map the raw
        drug/target tokens to their integer ids.

    Raises:
        ValueError: if a non-blank line does not split into exactly two tokens.
    """
    drug_ids_dict, target_ids_dict = {}, {}
    data = []
    # The context manager closes the file even if a malformed line raises.
    with open(file_dir) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            d, t = fields
            d = d.replace(':', '')
            # setdefault assigns the next free id only on first sight of a token.
            d_idx = drug_ids_dict.setdefault(d, len(drug_ids_dict))
            t_idx = target_ids_dict.setdefault(t, len(target_ids_dict))
            data.append([d_idx, t_idx, 1])
    N = len(drug_ids_dict)
    M = len(target_ids_dict)
    return N, M, data, drug_ids_dict, target_ids_dict
# Convert the list data to array
def sequence2mat(sequence, N, M):
    """Convert a list of ``[row, col, value]`` records into a dense matrix.

    Args:
        sequence: iterable of records whose first three entries are the row
            index, the column index and the value to store.
        N: number of rows of the output matrix.
        M: number of columns of the output matrix.

    Returns:
        An ``N x M`` NumPy array (created with ``np.zeros``) holding each
        record's value at ``[row, col]`` and 0 everywhere else. An empty
        ``sequence`` yields the all-zero matrix.
    """
    mat = np.zeros([N, M])
    records_array = np.array(sequence)
    if records_array.size == 0:
        # np.array([]) is 1-D, so the column slicing below would raise
        # IndexError; with no records the zero matrix is the answer.
        return mat
    row = records_array[:, 0].astype(int)
    col = records_array[:, 1].astype(int)
    # Values are passed through float32 before being stored.
    values = records_array[:, 2].astype(np.float32)
    mat[row, col] = values
    return mat
# Sample for imbalanced data
def generate_data(train_mat, sample_size=4, mode=0):
    """Build training samples by pairing observed interactions with sampled negatives.

    For every drug (row) ``d`` and every target with a positive entry in
    that row, ``sample_size`` targets are drawn uniformly with
    ``np.random.randint`` and redrawn until the drawn entry
    ``train_mat[d, target]`` equals 0.

    Args:
        train_mat: 2-D array of shape (drugs, targets); entries > 0 are
            treated as observed interactions, entries == 0 as candidates
            for negative sampling.
        sample_size: number of negatives drawn per observed interaction.
        mode: if 0, emit pointwise records ``[d, target, 1]`` for each
            positive followed by ``[d, negative, 0]`` for each sampled
            negative; for any other value, emit pairwise records
            ``[d, positive, negative]``.

    Returns:
        The list of generated records, ordered by drug, then by positive
        target, then by draw.

    Raises:
        ValueError: if a drug has at least one positive entry but no zero
            entry while ``sample_size > 0`` (rejection sampling could
            never succeed for that row).
    """
    drugs_num, targets_num = train_mat.shape
    pointwise = mode == 0
    data = []
    for d in range(drugs_num):
        # Targets with an observed interaction with drug d.
        positive_targets = np.where(train_mat[d, :] > 0)[0]
        if positive_targets.size == 0:
            continue
        if sample_size > 0 and not np.any(train_mat[d, :] == 0):
            # Without this guard the redraw loop below would spin forever.
            raise ValueError(
                "drug %d has no zero entry to sample a negative target from" % d
            )
        for target0 in positive_targets:
            if pointwise:
                data.append([d, target0, 1])
            for _ in range(sample_size):
                # Rejection sampling: redraw until an unobserved (zero) entry is hit.
                target1 = np.random.randint(targets_num)
                while train_mat[d, target1] != 0:
                    target1 = np.random.randint(targets_num)
                if pointwise:
                    data.append([d, target1, 0])
                else:
                    data.append([d, target0, target1])
    return data