# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import json
import math
import torch
import numpy
import argparse
import weka.core.jvm
import weka.core.converters
import scikit_wrappers
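
# Note: the weka.core.jvm and weka.core.converters modules come from the
# python-weka-wrapper package (which runs a JVM under the hood), and
# scikit_wrappers is a local module shipped with this repository.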


def load_UEA_dataset(path, dataset):
    """
    Loads the UEA dataset given in input in numpy arrays.

    @param path Path where the UEA dataset is located.
    @param dataset Name of the UEA dataset.

    @return Quadruplet containing the training set, the corresponding training
            labels, the testing set and the corresponding testing labels.
    """
    # Initialization needed to load a file with Weka wrappers
    weka.core.jvm.start()
    loader = weka.core.converters.Loader(
        classname="weka.core.converters.ArffLoader"
    )

    train_file = os.path.join(path, dataset, dataset + "_TRAIN.arff")
    test_file = os.path.join(path, dataset, dataset + "_TEST.arff")
    train_weka = loader.load_file(train_file)
    test_weka = loader.load_file(test_file)

    train_size = train_weka.num_instances
    test_size = test_weka.num_instances
    nb_dims = train_weka.get_instance(0).get_relational_value(0).num_instances
    length = train_weka.get_instance(0).get_relational_value(0).num_attributes

    # train and test have shape (num_samples, nb_dims, length)
    train = numpy.empty((train_size, nb_dims, length))
    test = numpy.empty((test_size, nb_dims, length))
    # numpy.int was removed in recent NumPy versions; use a concrete dtype
    train_labels = numpy.empty(train_size, dtype=numpy.int64)
    test_labels = numpy.empty(test_size, dtype=numpy.int64)

    for i in range(train_size):
        train_labels[i] = int(train_weka.get_instance(i).get_value(1))
        time_series = train_weka.get_instance(i).get_relational_value(0)
        for j in range(nb_dims):
            train[i, j] = time_series.get_instance(j).values

    for i in range(test_size):
        test_labels[i] = int(test_weka.get_instance(i).get_value(1))
        time_series = test_weka.get_instance(i).get_relational_value(0)
        for j in range(nb_dims):
            test[i, j] = time_series.get_instance(j).values

    # Normalizing dimensions independently
    for j in range(nb_dims):
        # Post-publication note:
        # Using the testing set to normalize might bias the learned network,
        # but with a limited impact on the reported results on a few datasets.
        # See the related discussion here: https://github.com/White-Link/UnsupervisedScalableRepresentationLearningTimeSeries/pull/13.
        mean = numpy.mean(numpy.concatenate([train[:, j], test[:, j]]))
        var = numpy.var(numpy.concatenate([train[:, j], test[:, j]]))
        train[:, j] = (train[:, j] - mean) / math.sqrt(var)
        test[:, j] = (test[:, j] - mean) / math.sqrt(var)

    # Move the labels to {0, ..., L-1}
    labels = numpy.unique(train_labels)
    transform = {}
    for i, l in enumerate(labels):
        transform[l] = i
    train_labels = numpy.vectorize(transform.get)(train_labels)
    test_labels = numpy.vectorize(transform.get)(test_labels)

    weka.core.jvm.stop()

    return train, train_labels, test, test_labels
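
# Example usage (a sketch; the archive path and dataset name below are
# placeholders, assuming the UEA multivariate archive has been extracted as
# ARFF files under <path>/<dataset>/):
#     train, train_labels, test, test_labels = load_UEA_dataset(
#         'datasets/UEA', 'BasicMotions'
#     )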


def fit_hyperparameters(file, train, train_labels, cuda, gpu,
                        save_memory=False):
    """
    Creates a classifier from the given set of hyperparameters in the input
    file, fits it and returns it.

    @param file Path of a file containing a set of hyperparameters.
    @param train Training set.
    @param train_labels Labels for the training set.
    @param cuda If True, enables computations on the GPU.
    @param gpu GPU to use if CUDA is enabled.
    @param save_memory If True, save GPU memory by propagating gradients after
           each loss term, instead of doing it after computing the whole loss.
    """
    classifier = scikit_wrappers.CausalCNNEncoderClassifier()

    # Loads a given set of hyperparameters and fits a model with those
    with open(file, 'r') as hf:
        params = json.load(hf)
    # Set the number of input channels from the training data
    params['in_channels'] = numpy.shape(train)[1]
    params['cuda'] = cuda
    params['gpu'] = gpu
    classifier.set_params(**params)
    return classifier.fit(
        train, train_labels, save_memory=save_memory, verbose=True
    )
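
# A hypothetical sketch of the hyperparameter JSON file; the actual keys must
# match the parameters accepted by CausalCNNEncoderClassifier.set_params, and
# the values shown here are purely illustrative:
#     {
#         "batch_size": 10,
#         "lr": 0.001,
#         ...
#     }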


def parse_arguments():
    parser = argparse.ArgumentParser(
        description='Classification tests for UEA repository datasets'
    )
    parser.add_argument('--dataset', type=str, metavar='D', required=True,
                        help='dataset name')
    parser.add_argument('--path', type=str, metavar='PATH', required=True,
                        help='path where the dataset is located')
    parser.add_argument('--save_path', type=str, metavar='PATH', required=True,
                        help='path where the estimator is/should be saved')
    parser.add_argument('--cuda', action='store_true',
                        help='activate to use CUDA')
    parser.add_argument('--gpu', type=int, default=0, metavar='GPU',
                        help='index of GPU used for computations (default: 0)')
    parser.add_argument('--hyper', type=str, metavar='FILE', required=True,
                        help='path of the file of hyperparameters to use ' +
                             'for training; must be a JSON file')
    parser.add_argument('--load', action='store_true', default=False,
                        help='activate to load the estimator instead of ' +
                             'training it')
    parser.add_argument('--fit_classifier', action='store_true', default=False,
                        help='if not supervised, activate to load the ' +
                             'model and retrain the classifier')
    return parser.parse_args()
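
# Example invocation (dataset name, paths and hyperparameter file below are
# placeholders):
#     python uea.py --dataset BasicMotions --path datasets/UEA \
#         --save_path models --hyper default_hyperparameters.json --cuda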


if __name__ == '__main__':
    args = parse_arguments()
    if args.cuda and not torch.cuda.is_available():
        print("CUDA is not available, proceeding without it...")
        args.cuda = False

    train, train_labels, test, test_labels = load_UEA_dataset(
        args.path, args.dataset
    )

    if not args.load and not args.fit_classifier:
        # Train a new encoder and classifier from scratch
        classifier = fit_hyperparameters(
            args.hyper, train, train_labels, args.cuda, args.gpu,
            save_memory=True
        )
    else:
        # Load a previously saved estimator and its hyperparameters
        classifier = scikit_wrappers.CausalCNNEncoderClassifier()
        with open(
            os.path.join(
                args.save_path, args.dataset + '_hyperparameters.json'
            ), 'r'
        ) as hf:
            hp_dict = json.load(hf)
        hp_dict['cuda'] = args.cuda
        hp_dict['gpu'] = args.gpu
        classifier.set_params(**hp_dict)
        classifier.load(os.path.join(args.save_path, args.dataset))

    if not args.load:
        if args.fit_classifier:
            # Keep the loaded encoder and retrain only the classifier
            classifier.fit_classifier(classifier.encode(train), train_labels)
        classifier.save(
            os.path.join(args.save_path, args.dataset)
        )
        with open(
            os.path.join(
                args.save_path, args.dataset + '_hyperparameters.json'
            ), 'w'
        ) as fp:
            json.dump(classifier.get_params(), fp)

    print("Test accuracy: " + str(classifier.score(test, test_labels)))