
Commit 877acd6

update
1 parent 25eb869 commit 877acd6

3 files changed: +100 -17 lines changed


svm_class/crossval.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC

# load the data
data = load_breast_cancer()

for C in (0.5, 1.0, 5.0, 10.0):
  pipeline = Pipeline([('scaler', StandardScaler()), ('svm', SVC(C=C))])
  scores = cross_val_score(pipeline, data.data, data.target, cv=5)
  print("C:", C, "mean:", scores.mean(), "std:", scores.std())

svm_class/extra_reading.txt

Lines changed: 16 additions & 1 deletion
@@ -44,4 +44,19 @@ A Tutorial on Support Vector Regression
 https://alex.smola.org/papers/2003/SmoSch03b.pdf

 LIBSVM -- A Library for Support Vector Machines
-https://www.csie.ntu.edu.tw/~cjlin/libsvm/
+https://www.csie.ntu.edu.tw/~cjlin/libsvm/
+
+Random Features for Large-Scale Kernel Machines
+http://www.robots.ox.ac.uk/~vgg/rg/papers/randomfeatures.pdf
+
+Reflections on Random Kitchen Sinks
+http://www.argmin.net/2017/12/05/kitchen-sinks/
+
+Weighted Sums of Random Kitchen Sinks: Replacing minimization with randomization in learning
+https://papers.nips.cc/paper/3495-weighted-sums-of-random-kitchen-sinks-replacing-minimization-with-randomization-in-learning
+
+Using the Nyström Method to Speed Up Kernel Machines
+https://papers.nips.cc/paper/1866-using-the-nystrom-method-to-speed-up-kernel-machines
+
+Nyström Method vs Random Fourier Features: A Theoretical and Empirical Comparison
+https://papers.nips.cc/paper/4588-nystrom-method-vs-random-fourier-features-a-theoretical-and-empirical-comparison
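The added links cover kernel approximation: random Fourier features ("random kitchen sinks") and the Nyström method. As a rough illustration of what those papers describe, scikit-learn ships both approximations in sklearn.kernel_approximation; the sketch below applies them to the same breast-cancer data used in crossval.py. It is an assumed example, not code from this commit, and the gamma / n_components values are arbitrary.

from sklearn.datasets import load_breast_cancer
from sklearn.kernel_approximation import Nystroem, RBFSampler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

data = load_breast_cancer()

for name, approx in (('rff', RBFSampler(gamma=0.05, n_components=100)),
                     ('nystroem', Nystroem(gamma=0.05, n_components=100))):
  # approximate the RBF kernel feature map, then fit a linear classifier on top
  pipeline = Pipeline([
    ('scaler', StandardScaler()),
    (name, approx),
    ('linear', SGDClassifier(max_iter=10000, tol=1e-5)),
  ])
  scores = cross_val_score(pipeline, data.data, data.target, cv=5)
  print(name, "mean:", scores.mean(), "std:", scores.std())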

svm_class/fake_neural_net.py

Lines changed: 61 additions & 16 deletions
@@ -13,44 +13,83 @@
 from sklearn.svm import LinearSVC
 from sklearn.preprocessing import StandardScaler
 from sklearn.cluster import KMeans
-
-# get the data: https://www.kaggle.com/c/digit-recognizer
-Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
+from sklearn.mixture import GaussianMixture
+from sklearn.model_selection import cross_val_score
+from sklearn.utils import shuffle
+from scipy import stats


 class SigmoidFeaturizer:
   def __init__(self, gamma=1.0, n_components=100, method='random'):
     self.M = n_components
     self.gamma = gamma
-    assert(method in ('random', 'kmeans'))
+    assert(method in ('random', 'kmeans', 'gmm'))
     self.method = method

+  def _subsample_data(self, X, Y, n=10000):
+    if Y is not None:
+      X, Y = shuffle(X, Y)
+      return X[:n], Y[:n]
+    else:
+      X = shuffle(X)
+      return X[:n]
+
   def fit(self, X, Y=None):
     if self.method == 'random':
       N = len(X)
       idx = np.random.randint(N, size=self.M)
       self.samples = X[idx]
     elif self.method == 'kmeans':
+      X, Y = self._subsample_data(X, Y)
+
       print("Fitting kmeans...")
       t0 = datetime.now()
-      kmeans = KMeans(n_clusters=self.M)
+      kmeans = KMeans(n_clusters=len(set(Y)))
       kmeans.fit(X)
       print("Finished fitting kmeans, duration:", datetime.now() - t0)
-      self.samples = kmeans.cluster_centers_
+
+      # calculate the most ambiguous points
+      # we will do this by finding the distance between each point
+      # and all cluster centers
+      # and return which points have the smallest variance
+      dists = kmeans.transform(X) # returns an N x K matrix
+      variances = dists.var(axis=1)
+      idx = np.argsort(variances) # smallest to largest
+      idx = idx[:self.M]
+      self.samples = X[idx]
+    elif self.method == 'gmm':
+      X, Y = self._subsample_data(X, Y)
+
+      print("Fitting GMM")
+      t0 = datetime.now()
+      gmm = GaussianMixture(n_components=len(set(Y)))
+      gmm.fit(X)
+      print("Finished fitting GMM, duration:", datetime.now() - t0)
+
+      # calculate the most ambiguous points
+      probs = gmm.predict_proba(X)
+      ent = stats.entropy(probs.T) # N-length vector of entropies
+      idx = np.argsort(-ent) # negate since we want biggest first
+      idx = idx[:self.M]
+      self.samples = X[idx]
     return self

   def transform(self, X):
-    Z = self.gamma * X.dot(self.samples.T) # (Ntest x D) x (D x Nsamples) -> (Ntest x Nsamples)
-    return np.tanh(Z)
+    Z = X.dot(self.samples.T) # (Ntest x D) x (D x Nsamples) -> (Ntest x Nsamples)
+    return np.tanh(self.gamma * Z)
+    # return self.gamma * Z * (Z > 0)

   def fit_transform(self, X, Y=None):
     return self.fit(X, Y).transform(X)


+# get the data: https://www.kaggle.com/c/digit-recognizer
+Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
+
 # with SGD
 pipeline = Pipeline([
   ('scaler', StandardScaler()),
-  ('sigmoid', SigmoidFeaturizer(gamma=0.05, n_components=2000, method='random')),
+  ('sigmoid', SigmoidFeaturizer(gamma=0.05, n_components=2000, method='gmm')),
   ('linear', SGDClassifier(max_iter=1e6, tol=1e-5))
 ])

@@ -63,10 +102,16 @@ def fit_transform(self, X, Y=None):
 # ])


-t0 = datetime.now()
-pipeline.fit(Xtrain, Ytrain)
-print("train duration:", datetime.now() - t0)
-t0 = datetime.now()
-print("train score:", pipeline.score(Xtrain, Ytrain), "duration:", datetime.now() - t0)
-t0 = datetime.now()
-print("test score:", pipeline.score(Xtest, Ytest), "duration:", datetime.now() - t0)
+X = np.vstack((Xtrain, Xtest))
+Y = np.concatenate((Ytrain, Ytest))
+scores = cross_val_score(pipeline, X, Y, cv=5)
+print(scores)
+print("avg:", np.mean(scores))
+
+# t0 = datetime.now()
+# pipeline.fit(Xtrain, Ytrain)
+# print("train duration:", datetime.now() - t0)
+# t0 = datetime.now()
+# print("train score:", pipeline.score(Xtrain, Ytrain), "duration:", datetime.now() - t0)
+# t0 = datetime.now()
+# print("test score:", pipeline.score(Xtest, Ytest), "duration:", datetime.now() - t0)
