13
13
from sklearn .svm import LinearSVC
14
14
from sklearn .preprocessing import StandardScaler
15
15
from sklearn .cluster import KMeans
16
-
17
- # get the data: https://www.kaggle.com/c/digit-recognizer
18
- Xtrain , Ytrain , Xtest , Ytest = getKaggleMNIST ()
16
+ from sklearn .mixture import GaussianMixture
17
+ from sklearn .model_selection import cross_val_score
18
+ from sklearn .utils import shuffle
19
+ from scipy import stats
19
20
20
21
21
22
class SigmoidFeaturizer:
    """Nonlinear featurizer: projects X onto a set of stored sample points and
    applies tanh, i.e. transform(X) = tanh(gamma * X @ samples.T).

    The samples are chosen by `method`:
      - 'random': M rows of X picked uniformly at random
      - 'kmeans': the M points closest to being equidistant from all cluster
                  centers (smallest variance of center distances)
      - 'gmm':    the M points with the highest posterior entropy under a GMM
    """

    def __init__(self, gamma=1.0, n_components=100, method='random'):
        # gamma        : scale applied before tanh
        # n_components : M, the number of stored sample points / output features
        # method       : sample-selection strategy (see class docstring)
        if method not in ('random', 'kmeans', 'gmm'):
            # raise instead of assert: asserts are stripped under `python -O`
            raise ValueError("method must be 'random', 'kmeans', or 'gmm'")
        self.M = n_components
        self.gamma = gamma
        self.method = method

    def _subsample_data(self, X, Y, n=10000):
        """Shuffle and keep at most n rows.

        Always returns a (X, Y) pair so callers can unpack uniformly;
        Y is None when no labels were provided.  (The original returned a
        bare array in that case, crashing the callers' tuple unpack.)
        """
        if Y is not None:
            X, Y = shuffle(X, Y)
            return X[:n], Y[:n]
        X = shuffle(X)
        return X[:n], None

    def fit(self, X, Y=None):
        """Select self.samples from X according to self.method. Returns self."""
        if self.method == 'random':
            N = len(X)
            idx = np.random.randint(N, size=self.M)
            self.samples = X[idx]
        elif self.method == 'kmeans':
            X, Y = self._subsample_data(X, Y)

            print("Fitting kmeans...")
            t0 = datetime.now()
            # one cluster per class when labels are available; otherwise fall
            # back to M clusters (len(set(None)) would raise)
            k = len(set(Y)) if Y is not None else self.M
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(X)
            print("Finished fitting kmeans, duration:", datetime.now() - t0)

            # keep the most ambiguous points: distance to every center is
            # roughly equal, i.e. the per-point variance of distances is small
            dists = kmeans.transform(X)  # N x K matrix of center distances
            variances = dists.var(axis=1)
            idx = np.argsort(variances)[:self.M]  # smallest variance first
            self.samples = X[idx]
        elif self.method == 'gmm':
            X, Y = self._subsample_data(X, Y)

            print("Fitting GMM")
            t0 = datetime.now()
            k = len(set(Y)) if Y is not None else self.M
            gmm = GaussianMixture(n_components=k)
            gmm.fit(X)
            print("Finished fitting GMM, duration:", datetime.now() - t0)

            # keep the most ambiguous points: highest entropy of the
            # component-posterior distribution
            probs = gmm.predict_proba(X)
            ent = stats.entropy(probs.T)  # N-length vector of entropies
            idx = np.argsort(-ent)[:self.M]  # negate: biggest entropy first
            self.samples = X[idx]
        return self

    def transform(self, X):
        # (Ntest x D) @ (D x M) -> (Ntest x M)
        Z = X.dot(self.samples.T)
        return np.tanh(self.gamma * Z)

    def fit_transform(self, X, Y=None):
        return self.fit(X, Y).transform(X)
86
# get the data: https://www.kaggle.com/c/digit-recognizer
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

# with SGD: scale -> sigmoid random features -> linear classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('sigmoid', SigmoidFeaturizer(gamma=0.05, n_components=2000, method='gmm')),
    # max_iter must be an int: recent scikit-learn rejects the float 1e6
    ('linear', SGDClassifier(max_iter=1_000_000, tol=1e-5)),
])
@@ -63,10 +102,16 @@ def fit_transform(self, X, Y=None):
63
102
# ])
64
103
65
104
66
- t0 = datetime .now ()
67
- pipeline .fit (Xtrain , Ytrain )
68
- print ("train duration:" , datetime .now () - t0 )
69
- t0 = datetime .now ()
70
- print ("train score:" , pipeline .score (Xtrain , Ytrain ), "duration:" , datetime .now () - t0 )
71
- t0 = datetime .now ()
72
- print ("test score:" , pipeline .score (Xtest , Ytest ), "duration:" , datetime .now () - t0 )
105
+ X = np .vstack ((Xtrain , Xtest ))
106
+ Y = np .concatenate ((Ytrain , Ytest ))
107
+ scores = cross_val_score (pipeline , X , Y , cv = 5 )
108
+ print (scores )
109
+ print ("avg:" , np .mean (scores ))
110
+
111
+ # t0 = datetime.now()
112
+ # pipeline.fit(Xtrain, Ytrain)
113
+ # print("train duration:", datetime.now() - t0)
114
+ # t0 = datetime.now()
115
+ # print("train score:", pipeline.score(Xtrain, Ytrain), "duration:", datetime.now() - t0)
116
+ # t0 = datetime.now()
117
+ # print("test score:", pipeline.score(Xtest, Ytest), "duration:", datetime.now() - t0)
0 commit comments