# dataloaders.py
import pickle

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def load_dataset(dataset_name, batch_size, shuffle, seed, pin_memory, scale, scale_type):
    # the bank dataset is semicolon-separated, the others are comma-separated
    sep = ';' if dataset_name == 'bank' else ','
    data = pd.read_csv(f'datasets/{dataset_name}.csv', sep=sep)
    if dataset_name == 'avocado':
        data = data.drop(columns=['Unnamed: 0'])
        data = data.drop(columns=['Date'])
    columns = list(data.columns)
    # getting information about each variable and restructuring categorical columns to one-hot indicators
    data, var_info, var_dtype = dataset_info_restructure(dataset_name, data)
    # splitting data into train and test sets
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=seed, shuffle=shuffle)  # , stratify=None)
    # splitting again to get validation data (from the training data)
    train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=seed, shuffle=shuffle)  # , stratify=None)
    # normalize the training data (similar to Ma et al.), but only the numerical columns
    save = 0
    for idx in var_info.keys():
        if var_info[idx]['dtype'] == 'numerical':
            mean = train_data[var_info[idx]['name']].mean()
            std = train_data[var_info[idx]['name']].std()
            save += train_data[var_info[idx]['name']].var()
            var_info[idx]['standardize'] = (mean, std)
            col_min = train_data[var_info[idx]['name']].min()
            col_max = train_data[var_info[idx]['name']].max()
            var_info[idx]['normalize'] = (col_min, col_max)
            # scale outside the model
            if scale_type == 'outside_model':
                if scale == 'standardize':
                    train_data[var_info[idx]['name']] = (train_data[var_info[idx]['name']] - mean) / std
                    val_data[var_info[idx]['name']] = (val_data[var_info[idx]['name']] - mean) / std
                    test_data[var_info[idx]['name']] = (test_data[var_info[idx]['name']] - mean) / std
                elif scale == 'normalize':
                    train_data[var_info[idx]['name']] = (train_data[var_info[idx]['name']] - col_min) / (col_max - col_min)
                    val_data[var_info[idx]['name']] = (val_data[var_info[idx]['name']] - col_min) / (col_max - col_min)
                    test_data[var_info[idx]['name']] = (test_data[var_info[idx]['name']] - col_min) / (col_max - col_min)
    # dump pickle
    # dump_pickle(train_mean, train_std, dataset_name)
    # wrap each split in a Dataset class with __getitem__, i.e. iterable
    train_data = iterate_data(train_data)
    val_data = iterate_data(val_data)
    test_data = iterate_data(test_data)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False, drop_last=True, pin_memory=pin_memory)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, drop_last=True, pin_memory=pin_memory)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, drop_last=True, pin_memory=pin_memory)
    return ((var_info, var_dtype), (train_loader, val_loader, test_loader))
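
# Example usage (illustrative sketch; the argument values below are assumptions,
# not defaults defined in this file):
#
#   (var_info, var_dtype), (train_loader, val_loader, test_loader) = load_dataset(
#       dataset_name='boston', batch_size=64, shuffle=True, seed=0,
#       pin_memory=False, scale='standardize', scale_type='outside_model')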


def dump_pickle(train_mean, train_std, dataset_name):
    # make pickle files for the train mean and std
    with open('{}_mean.pickle'.format(dataset_name), 'wb') as handle:
        pickle.dump(train_mean, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('{}_std.pickle'.format(dataset_name), 'wb') as handle:
        pickle.dump(train_std, handle, protocol=pickle.HIGHEST_PROTOCOL)


def dataset_info_restructure(dataset_name, data):
    boston_dtype = {'numeric': [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
                    'categorical': [3]}
    # TODO: index start with 1?
    # avocado_dtype = {'numeric': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    #                  'categorical': [10, 11]}
    avocado_dtype = {'numeric': [0, 1, 2, 3, 4, 5, 6, 7, 8],
                     'categorical': [9, 10, 11]}
    energy_dtypes = {'numeric': [0, 1, 2, 3, 4, 8, 9],
                     'categorical': [5, 6, 7]}
    bank_dtypes = {'numeric': [0, 5, 9, 11, 12, 13, 14],
                   'categorical': [1, 2, 3, 4, 6, 7, 8, 10, 15, 16]}
    # mapping from dataset name to its variable types
    dataset_info_container = {
        'boston': boston_dtype,
        'avocado': avocado_dtype,
        'energy': energy_dtypes,
        'bank': bank_dtypes}
    # getting dtype variable information
    var_dtype = dataset_info_container[dataset_name]
    # invert the mapping: column index -> dtype
    # inv_var_dtype = {v: k for k, v in var_dtype.items()}
    inv_var_dtype = {}
    for k, v in var_dtype.items():
        for x in v:
            inv_var_dtype[x] = k
    # finding the number of values per variable
    var_info = {}
    offset = 0
    for idx, variable_name in enumerate(list(data.columns)):
        if inv_var_dtype[idx] == 'categorical':
            new_columns = pd.get_dummies(data[variable_name])
            new_columns_names = list(variable_name + '_' + new_columns.columns.astype('str'))
            for i, name in enumerate(new_columns_names):
                data.insert(loc=idx + i + 1 + offset, column=name, value=new_columns.iloc[:, i])
            # data[new_columns_names] = new_columns
            num_unique = len(new_columns_names)  # number of unique values
            offset += num_unique
            offset -= 1
            # dropping the original column
            data.drop(columns=variable_name, inplace=True)
            var_info[idx] = {'name': variable_name, 'dtype': 'categorical', 'num_vals': num_unique}
        else:
            # numerical column; normal distribution
            var_info[idx] = {'name': variable_name, 'dtype': 'numerical', 'num_vals': 2}
    return data, var_info, var_dtype
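
# Illustrative note (sketch of the returned structure): var_info maps each original
# column index to a dict, e.g. {'name': <column>, 'dtype': 'numerical', 'num_vals': 2}
# for numerical columns and {'name': <column>, 'dtype': 'categorical', 'num_vals': <#one-hot columns>}
# for categorical ones; load_dataset later adds 'standardize' = (mean, std) and
# 'normalize' = (min, max) entries for the numerical columns.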


class iterate_data(Dataset):
    def __init__(self, data):
        # converting to tensor
        self.N = len(data)
        self.torch_data = torch.tensor(data.values)

    def __len__(self):
        return self.N

    def __getitem__(self, idx):
        batch = self.torch_data[idx, :]
        return batch


### PREVIOUS ###
def standardise(x: pd.DataFrame, dtypes: dict):
    '''zero mean and unit variance'''
    # split into numerical, categorical and binary features
    numerical = x.iloc[:, dtypes['numeric']].reset_index(drop=True)
    categorical = x.iloc[:, dtypes['categorical']].reset_index(drop=True)
    binary = x.iloc[:, dtypes['binary']].reset_index(drop=True)
    # standardizing training data
    # TODO: should not do it for test data!
    cols = x.columns[dtypes['numeric']]
    sc = StandardScaler(copy=False)
    x_sc = sc.fit_transform(numerical)
    numerical_OH = pd.DataFrame(x_sc, columns=cols)
    # might want to pass this to the dataloader as well to use the inverse_transform method
    on = OneHotEncoder()
    categoricalf = on.fit_transform(categorical).toarray()
    columns_f = on.get_feature_names_out()
    categorical_OH = pd.DataFrame(categoricalf, columns=columns_f)
    categorical_OH = pd.concat([categorical_OH, binary], axis=1)
    # categorical class-to-index dict
    class_idxs = {}
    for col in columns_f:
        # get the attribute and the corresponding class
        attr, cl = col.split('_')
        # if new attribute
        if attr not in class_idxs:
            class_idxs[attr] = {}
        # insert class index
        class_idxs[attr][cl] = len(class_idxs[attr])
    return numerical_OH, categorical_OH, {'numerical': numerical, 'categorical': categorical, 'class_idxs': class_idxs}


class Boston(Dataset):
    """Boston dataset"""
    def __init__(self, mode='train', transforms=None):
        boston_dtype = {'numeric': [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
                        'categorical': [3],
                        'binary': [3]}
        data = pd.read_csv('datasets/boston.csv')
        N = len(data)
        # TODO: make more random and seed dependent?
        if mode == 'train':
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[:int(0.6 * N)], boston_dtype)
        elif mode == 'val':
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[int(0.6 * N):int(0.8 * N)], boston_dtype)
        else:
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[int(0.8 * N):], boston_dtype)
        # self.transforms = transforms

    def __len__(self):
        return len(self.data_num)

    def __getitem__(self, idx):
        sample_num = torch.tensor(self.data_num.iloc[idx].values)
        sample_cat = torch.tensor(self.data_cat.iloc[idx].values).float()
        # debugging hook: flags samples whose first categorical value is NaN
        if torch.isnan(sample_cat[0]):
            stop = 0
        # if self.transforms:
        #     sample = self.transforms(sample)
        return sample_num, sample_cat


class Avocado(Boston, Dataset):
    """Avocado dataset
    inherits __len__ and __getitem__ from Boston dataset
    """
    def __init__(self, mode='train', transforms=None):
        avocado_dtype = {'numeric': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'categorical': [10, 11],
                         'binary': [10]}
        data = pd.read_csv('datasets/avocado.csv', index_col=0)
        data['Date'] = pd.to_datetime(data['Date'])
        N = len(data)
        if mode == 'train':
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[:int(0.6 * N)], avocado_dtype)
        elif mode == 'val':
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[int(0.6 * N):int(0.8 * N)], avocado_dtype)
        else:
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[int(0.8 * N):], avocado_dtype)
        # inverse_transform -> onehot_encoder


class Energy(Boston, Dataset):
    """Energy dataset
    inherits __len__ and __getitem__ from Boston dataset
    """
    def __init__(self, mode='train', transforms=None):
        dtypes = {'numeric': [0, 1, 2, 3, 4, 8, 9],
                  'categorical': [5, 6, 7],
                  'binary': []}
        data = pd.read_csv('datasets/energy.csv')
        data.columns = ['relative_compactness', 'surface_area', 'wall_area', 'roof_area', 'overall_height',
                        'orientation', 'glazing_area', 'glazing_area_distribution', 'heating_load', 'cooling_load']
        N = len(data)
        if mode == 'train':
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[:int(0.6 * N)], dtypes)
        elif mode == 'val':
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[int(0.6 * N):int(0.8 * N)], dtypes)
        else:
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[int(0.8 * N):], dtypes)


class Bank(Boston, Dataset):
    """Bank dataset
    inherits __len__ and __getitem__ from Boston dataset
    """
    def __init__(self, mode='train', transforms=None):
        dtypes = {'numeric': [0, 5, 9, 11, 12, 13, 14],
                  'categorical': [1, 2, 3, 4, 6, 7, 8, 10, 15, 16],
                  'binary': [4, 6, 7, 16]}
        data = pd.read_csv('datasets/bank-full.csv', sep=';')
        N = len(data)
        if mode == 'train':
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[:int(0.6 * N)], dtypes)
        elif mode == 'val':
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[int(0.6 * N):int(0.8 * N)], dtypes)
        else:
            self.data_num, self.data_cat, self.data_dict = standardise(data.iloc[int(0.8 * N):], dtypes)


'''
if __name__ == '__main__':
    train_data = Boston(mode='train')
    val_data = Boston(mode='val')
    test_data = Boston(mode='test')
    training_loader = DataLoader(train_data, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=64, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=64, shuffle=False)
    # create dummy dataloader to get a batch of 1
    tester_loader = DataLoader(test_data, batch_size=1, shuffle=False)
    print(next(iter(tester_loader)))
'''