forked from fxrshed/ScaledSPS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
datasets.py
113 lines (72 loc) · 3.71 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import numpy as np
from torch.utils.data import DataLoader
import torch.utils.data as data_utils
from scipy import sparse
import urllib.request
from sklearn.preprocessing import normalize
from sklearn.datasets import load_svmlight_file
from loss_fns import LogisticRegression, NLLSQ
from dotenv import load_dotenv
load_dotenv()
def get_dataset(name, batch_size, percentage=1.0, scale_range=None, loss_target_range=None):
    """Load a dataset by name.

    For ``name == "MNIST"`` returns ``(train_loader, test_loader)`` DataLoaders.
    For the libsvm datasets ("mushrooms", "colon-cancer", "covtype.libsvm.binary[.scale]")
    returns dense ``(train_data, train_target)`` float tensors, optionally with
    per-feature random exponential scaling and binary-target remapping applied.

    Args:
        name: dataset identifier (see above).
        batch_size: DataLoader batch size (MNIST only).
        percentage: fraction of rows to keep (sampled without replacement).
        scale_range: optional (r1, r2); each feature j is multiplied by
            e**u_j with u_j drawn uniformly from the interval between r2 and r1.
            Not applicable to MNIST.
        loss_target_range: optional (lo, hi); the two class labels are remapped
            to lo and hi respectively (assumes binary targets).

    Raises:
        ValueError: if ``name`` is not a known dataset.
    """
    datasets_path = './datasets'  # os.getenv("DATASETS_DIR")
    print(datasets_path)

    if name == "MNIST":
        # Feature scaling is only implemented for the libsvm datasets below.
        # (Original code asserted on an undefined name `scale` -> NameError.)
        assert scale_range is None, "Scaling not applicable."
        train_dataset, test_dataset = get_MNIST()
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False)
        return train_loader, test_loader
    elif name in ("mushrooms", "colon-cancer",
                  "covtype.libsvm.binary.scale", "covtype.libsvm.binary"):
        # All three branches were identical copy-paste; factored into a helper.
        train_data, train_target = _load_libsvm_subset(f"{datasets_path}/{name}", percentage)
    else:
        # Previously an unknown name fell through to a NameError on train_data.
        raise ValueError(f"Unknown dataset: {name}")

    if scale_range is not None:
        r1 = scale_range[0]
        r2 = scale_range[1]
        # One random exponent per feature, uniform between r2 and r1.
        scaling_vec = (r1 - r2) * torch.rand(train_data.shape[1]) + r2
        scaling_vec = torch.pow(torch.e, scaling_vec)
        train_data = scaling_vec * train_data

    if loss_target_range is not None:
        # Compute both masks BEFORE mutating: the original remapped in place and
        # re-called unique() after the first assignment, so if loss_target_range[0]
        # collided with an existing class value the classes merged and the second
        # remap was wrong (or unique()[1] raised IndexError).
        classes = train_target.unique()
        assert classes.numel() == 2, "Target remapping assumes binary labels."
        low_mask = train_target == classes[0]
        high_mask = train_target == classes[1]
        train_target[low_mask] = loss_target_range[0]
        train_target[high_mask] = loss_target_range[1]

    return train_data, train_target


def _load_libsvm_subset(path, percentage):
    """Load an svmlight file, keep a random `percentage` of rows, and return
    dense float tensors (data, target)."""
    trainX, trainY = load_svmlight_file(path)
    n_keep = round(trainX.shape[0] * percentage)
    sample = np.random.choice(trainX.shape[0], n_keep, replace=False)
    # Sanity check: sampling without replacement yields no duplicate indices.
    assert sample.shape == np.unique(sample).shape
    trainX = trainX[sample]
    trainY = trainY[sample]
    train_data = torch.tensor(trainX.toarray(), dtype=torch.float)
    train_target = torch.tensor(trainY, dtype=torch.float)
    return train_data, train_target
def get_MNIST():
    """Return the torchvision MNIST (train_dataset, test_dataset) pair.

    The training split is downloaded into ./datasets if missing; both splits
    convert images to tensors via transforms.ToTensor().
    """
    root = './datasets'
    train_dataset = torchvision.datasets.MNIST(
        root=root,
        train=True,
        transform=transforms.ToTensor(),
        download=True,
    )
    test_dataset = torchvision.datasets.MNIST(
        root=root,
        train=False,
        transform=transforms.ToTensor(),
    )
    return train_dataset, test_dataset