from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from timbl import TimblClassifier
import scipy.sparse  # 'import scipy' alone does not expose scipy.sparse
import numpy as np


class skTiMBL(BaseEstimator, ClassifierMixin):
    """scikit-learn compatible wrapper around the TiMBL memory-based learner."""

    def __init__(self, prefix='timbl', algorithm=4, dist_metric=None,
                 k=1, normalize=False, debug=0, flushdir=None):
        self.prefix = prefix
        self.algorithm = algorithm
        self.dist_metric = dist_metric  # currently unused; see _make_timbl_options
        self.k = k
        self.normalize = normalize
        self.debug = debug
        self.flushdir = flushdir
    def _make_timbl_options(self, *options):
        """
        -a  algorithm
        -m  metric
        -w  weighting
        -k  number of neighbours
        -d  class voting weights
        -L  frequency threshold
        -T  which feature index is the label
        -N  max number of features
        -H  turn hashing on/off

        This function still has to be written; for now the appropriate
        arguments can be passed in fit().
        """
        pass
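
    # Until _make_timbl_options is implemented, fit() builds the TiMBL option
    # string inline. A minimal sketch of what that string looks like, assuming
    # the default constructor arguments and a hypothetical 3-feature dataset:
    #
    #   "-a{} -k{} -N{} -vf".format(4, 1, 3)   # -> "-a4 -k1 -N3 -vf"
    #
    # i.e. algorithm 4, 1 nearest neighbour, 3 features (see the flag list
    # above); the trailing -vf is a TiMBL verbosity setting.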
    def fit(self, X, y):
        X, y = check_X_y(X, y, dtype=np.int64, accept_sparse='csr')
        n_rows = X.shape[0]
        self.classes_ = np.unique(y)

        if scipy.sparse.issparse(X):
            if self.debug:
                print('Features are sparse, choosing faster learning')

            self.classifier = TimblClassifier(self.prefix,
                                              "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1]),
                                              format='Sparse', debug=True, sklearn=True,
                                              flushdir=self.flushdir, flushthreshold=20000,
                                              normalize=self.normalize)

            for i in range(n_rows):
                # TiMBL's sparse format expects 1-based (feature_index, value) pairs.
                sparse = ['({},{})'.format(j + 1, c) for j, c in zip(X[i].indices, X[i].data)]
                self.classifier.append(sparse, str(y[i]))

        else:
            self.classifier = TimblClassifier(self.prefix,
                                              "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1]),
                                              debug=True, sklearn=True, flushdir=self.flushdir,
                                              flushthreshold=20000, normalize=self.normalize)

            if y.dtype != 'O':
                y = y.astype(str)

            for i in range(n_rows):
                # X is a dense ndarray in this branch, so no .toarray() here.
                self.classifier.append(list(X[i]), y[i])

        self.classifier.train()
        return self
    def _timbl_predictions(self, X, part_index, y=None):
        # part_index selects which part of TiMBL's output to collect:
        # 0 -> the predicted label, 1 -> the distance to the nearest neighbour.
        # The lambdas read `label` and `distance` late, after each classify() call.
        choices = {0: lambda x: x.append(np.int64(label)),
                   1: lambda x: x.append([float(distance)]),
                   }
        X = check_array(X, dtype=np.float64, accept_sparse='csr')
        n_samples = X.shape[0]

        pred = []
        func = choices[part_index]
        if scipy.sparse.issparse(X):
            if self.debug:
                print('Features are sparse, choosing faster predictions')

            for i in range(n_samples):
                sparse = ['({},{})'.format(j + 1, c) for j, c in zip(X[i].indices, X[i].data)]
                label, proba, distance = self.classifier.classify(sparse)
                func(pred)

        else:
            for i in range(n_samples):
                label, proba, distance = self.classifier.classify(list(X[i]))
                func(pred)

        return np.array(pred)
    def predict(self, X, y=None):
        return self._timbl_predictions(X, part_index=0)
    def predict_proba(self, X, y=None):
        """
        TiMBL is a discrete classifier and cannot give probability estimates.
        This method is implemented anyway so that scikit-learn functions that
        expect it (especially metrics such as ROC AUC) still work with TiMBL.

        For ROC AUC, the classifier then corresponds to a single point in ROC
        space, rather than the probabilistic continuum produced by classifiers
        that can give probability estimates (e.g. linear classifiers). For an
        explanation, see Fawcett (2005).
        """
        return self.predict(X)
    def decision_function(self, X, y=None):
        """
        The decision function is interpreted here as the distance between the
        instance being classified and the nearest point in k space.
        """
        return self._timbl_predictions(X, part_index=1)
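

# --- Usage sketch ---
# A minimal, hypothetical example of how this wrapper plugs into scikit-learn.
# It assumes TiMBL and the python-timbl bindings are installed and that the
# current directory is writable (used as flushdir); the tiny integer dataset
# below is invented purely for illustration.
if __name__ == '__main__':
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # Small synthetic dataset with integer-valued (symbolic) features.
    X = np.array([[1, 0, 2], [0, 1, 2], [1, 1, 0], [2, 0, 1]] * 5, dtype=np.int64)
    y = np.array([0, 1, 0, 1] * 5, dtype=np.int64)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=0)

    clf = skTiMBL(prefix='demo', k=1, flushdir='.')
    clf.fit(X_train, y_train)
    print('accuracy:', accuracy_score(y_test, clf.predict(X_test)))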