Commit 029a370

add nonparametric module

1 parent 693cac0
7 files changed: +432 -0 lines changed

nonparametric/README.md

+18
@@ -0,0 +1,18 @@
# Nonparametric Models

The nonparametric module implements several popular nonparametric regression
and classification models.

- `kernel_regression.py` implements Nadaraya-Watson kernel regression
  ([Nadaraya, 1964](https://epubs.siam.org/doi/abs/10.1137/1109020); [Watson,
  1964](https://www.jstor.org/stable/pdf/25049340.pdf))
- `knn.py` implements k-nearest neighbors regression and classification
  models using a ball tree

## Plots

<p align="center">
<strong>k-Nearest Neighbors</strong>
<img src="img/knn_plots.png" align='center' height="550" />

<strong>Nadaraya-Watson Kernel Regression</strong>
<img src="img/kr_plots.png" align='center' height="550" />
</p>
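
## Usage

A minimal sketch of how the two models might be used for 1D regression; it
assumes the `KernelRegression` and `KNN` interfaces defined in this module
and the kernel-spec string format used in `plots.py`:

```python
import numpy as np
from kernel_regression import KernelRegression
from knn import KNN

X = np.random.uniform(-1, 1, (100, 1))
y = np.sin(3 * X[:, 0])  # toy 1D regression targets

# Nadaraya-Watson regression with an RBF kernel
kr = KernelRegression(kernel="RBFKernel(gamma=0.1)")
kr.fit(X, y)
y_kr = kr.predict(X)

# 5-nearest-neighbor regression backed by a ball tree
knn = KNN(k=5, classifier=False, weights="uniform")
knn.fit(X, y)
y_knn = knn.predict(X)
```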

nonparametric/img/knn_plots.png

744 KB

nonparametric/img/kr_plots.png

588 KB

nonparametric/kernel_regression.py

+63
@@ -0,0 +1,63 @@
import sys

sys.path.append("..")
from utils.kernels import KernelInitializer


class KernelRegression:
    def __init__(self, kernel=None):
        """
        A Nadaraya-Watson kernel regression model.

            f(x) = sum_i w_i(x) * y_i

        where the sample weighting functions, w_i, are simply

            w_i(x) = k(x, x_i) / sum_j k(x, x_j)

        with k being the kernel function.

        Observe that k-nearest neighbors (KNN) regression is a special case of
        kernel regression where the k closest observations have a weight 1/k,
        and all others have weight 0.

        Parameters
        ----------
        kernel : str, `KernelBase` instance, or dict (default: None)
            The kernel to use. If `None`, default to `LinearKernel`
        """
        self.parameters = {"X": None, "y": None}
        self.hyperparameters = {"kernel": str(kernel)}
        self.kernel = KernelInitializer(kernel)()

    def fit(self, X, y):
        """
        Fit the regression model to the data and targets in `X` and `y`.

        Parameters
        ----------
        X : numpy array of shape (N, M)
            An array of N training examples, each of dimension M
        y : numpy array of shape (N, ...)
            Targets associated with the N training examples in `X`
        """
        self.parameters = {"X": X, "y": y}

    def predict(self, X):
        """
        Generate predictions for the targets associated with the rows in `X`.

        Parameters
        ----------
        X : numpy array of shape (N', M')
            An array of N' examples to generate predictions on

        Returns
        -------
        y : numpy array of shape (N', ...)
            Predicted targets for the N' rows in `X`
        """
        K = self.kernel
        P = self.parameters
        # pairwise similarities between training points and queries; each
        # prediction is the similarity-weighted average of the training targets
        sim = K(P["X"], X)
        return (sim * P["y"][:, None]).sum(axis=0) / sim.sum(axis=0)
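
A self-contained NumPy sketch of the same Nadaraya-Watson computation, with a
Gaussian (RBF) kernel substituted for the repo's kernel objects (`nw_predict`
and `gamma` are illustrative names, not part of this module):

```python
import numpy as np

def nw_predict(X_train, y_train, X_query, gamma=0.5):
    # k(x, x_i) = exp(-gamma * ||x_i - x||^2); similarity matrix of shape (N, N')
    sq_dists = ((X_train[:, None, :] - X_query[None, :, :]) ** 2).sum(axis=-1)
    sim = np.exp(-gamma * sq_dists)
    # w_i(x) = k(x, x_i) / sum_j k(x, x_j); f(x) = sum_i w_i(x) * y_i
    return (sim * y_train[:, None]).sum(axis=0) / sim.sum(axis=0)

X_train = np.random.uniform(-1, 1, (50, 1))
y_train = np.sin(3 * X_train[:, 0])
X_query = np.linspace(-1, 1, 5).reshape(-1, 1)
print(nw_predict(X_train, y_train, X_query))  # smooth estimates of sin(3x)
```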

nonparametric/knn.py

+96
@@ -0,0 +1,96 @@
import sys
from collections import Counter

import numpy as np

sys.path.append("..")
from utils.data_structures import BallTree


class KNN:
    def __init__(
        self, k=5, leaf_size=40, classifier=True, metric=None, weights="uniform"
    ):
        """
        A k-nearest neighbors (kNN) model relying on a ball tree for efficient
        computation.

        Parameters
        ----------
        k : int (default: 5)
            The number of neighbors to use during prediction
        leaf_size : int (default: 40)
            The maximum number of datapoints at each leaf in the ball tree
        classifier : bool (default: True)
            Whether to treat the values in `y` as class labels (classifier =
            True) or real-valued targets (classifier = False)
        metric : function (default: None)
            The distance metric to use for computing nearest neighbors
        weights : 'uniform' or 'distance' (default: 'uniform')
            How to weight the predictions from each neighbor. 'uniform'
            assigns uniform weights to each neighbor, while 'distance' assigns
            weights proportional to the inverse of the distance from the query
            point
        """
        self._ball_tree = BallTree(leaf_size=leaf_size, metric=metric)
        self.hyperparameters = {
            "id": "KNN",
            "k": k,
            "leaf_size": leaf_size,
            "classifier": classifier,
            "metric": str(metric),
            "weights": weights,
        }

    def fit(self, X, y):
        """
        Fit the model to the data and targets in `X` and `y`.

        Parameters
        ----------
        X : numpy array of shape (N, M)
            An array of N training examples, each of dimension M
        y : numpy array of shape (N, ...)
            Targets associated with the N training examples in `X`
        """
        if X.ndim != 2:
            raise ValueError("X must be two-dimensional")
        self._ball_tree.fit(X, y)

    def predict(self, X):
        """
        Generate predictions for the targets associated with the rows in `X`.

        Parameters
        ----------
        X : numpy array of shape (N', M')
            An array of N' examples to generate predictions on

        Returns
        -------
        y : numpy array of shape (N', ...)
            Predicted targets for the N' rows in `X`
        """
        predictions = []
        H = self.hyperparameters
        for x in X:
            pred = None
            nearest = self._ball_tree.nearest_neighbors(H["k"], x)
            targets = [n.val for n in nearest]

            if H["classifier"]:
                if H["weights"] == "uniform":
                    # majority vote over the k nearest neighbors
                    pred = Counter(targets).most_common(1)[0][0]
                elif H["weights"] == "distance":
                    # the label with the largest summed inverse distance wins
                    best_score = -np.inf
                    for label in set(targets):
                        score = np.sum(
                            [1 / n.distance for n in nearest if n.val == label]
                        )
                        if score > best_score:
                            best_score = score
                            pred = label
            else:
                if H["weights"] == "uniform":
                    pred = np.mean(targets)
                elif H["weights"] == "distance":
                    weights = [1 / n.distance for n in nearest]
                    pred = np.average(targets, weights=weights)
            predictions.append(pred)
        return np.array(predictions)
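
The two weighting schemes in `predict` can disagree. A small self-contained
sketch of the same votes on a hypothetical neighbor list (labels and distances
invented purely for illustration):

```python
from collections import Counter

# three hypothetical nearest neighbors of a query point: (distance, label)
neighbors = [(0.5, "a"), (1.0, "b"), (2.0, "b")]
labels = [lbl for _, lbl in neighbors]

# 'uniform': plain majority vote -> "b" (two votes to one)
uniform_pred = Counter(labels).most_common(1)[0][0]

# 'distance': summed inverse distance per label
# "a": 1/0.5 = 2.0 beats "b": 1/1.0 + 1/2.0 = 1.5 -> "a"
scores = {l: sum(1 / d for d, lbl in neighbors if lbl == l) for l in set(labels)}
distance_pred = max(scores, key=scores.get)

print(uniform_pred, distance_pred)  # b a
```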

nonparametric/plots.py

+177
@@ -0,0 +1,177 @@
import sys

sys.path.append("..")

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("paper", font_scale=0.5)

from linear_models.lm import LinearRegression
from kernel_regression import KernelRegression
from knn import KNN

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression


def random_regression_problem(n_ex, n_in, n_out, d=3, intercept=0, std=1, seed=0):
    # sample a random polynomial with d coefficients and evaluate it (plus
    # Gaussian noise) at n_ex points drawn uniformly from [-100, 100]
    coef = np.random.uniform(0, 50, size=d)
    coef[-1] = intercept

    y = []
    X = np.random.uniform(-100, 100, size=(n_ex, n_in))
    for x in X:
        val = np.polyval(coef, x) + np.random.normal(0, std)
        y.append(val)
    y = np.array(y)

    # X, y, coef = make_regression(
    #     n_samples=n_ex,
    #     n_features=n_in,
    #     n_targets=n_out,
    #     bias=intercept,
    #     noise=std,
    #     coef=True,
    #     random_state=seed,
    # )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )
    return X_train, y_train, X_test, y_test, coef


def plot_regression():
    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)
    for i, ax in enumerate(axes.flatten()):
        n_in = 1
        n_out = 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i
        )

        # OLS baseline
        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        y_pred = LR.predict(X_test)
        loss = np.mean((y_test.flatten() - y_pred.flatten()) ** 2)

        # grid search over polynomial-kernel hyperparameters, keeping the
        # kernel with the lowest test MSE
        d = 3
        best_loss = np.inf
        for gamma in np.linspace(1e-10, 1, 100):
            for c0 in np.linspace(-1, 1000, 100):
                kernel = "PolynomialKernel(d={}, gamma={}, c0={})".format(d, gamma, c0)
                KR_poly = KernelRegression(kernel=kernel)
                KR_poly.fit(X_train, y_train)
                y_pred_poly = KR_poly.predict(X_test)
                loss_poly = np.mean((y_test.flatten() - y_pred_poly.flatten()) ** 2)
                if loss_poly <= best_loss:
                    KR_poly_best = kernel
                    best_loss = loss_poly

        print("Best kernel: {} || loss: {:.4f}".format(KR_poly_best, best_loss))
        KR_poly = KernelRegression(kernel=KR_poly_best)
        KR_poly.fit(X_train, y_train)

        KR_rbf = KernelRegression(kernel="RBFKernel(gamma=0.01)")
        KR_rbf.fit(X_train, y_train)
        y_pred_rbf = KR_rbf.predict(X_test)
        loss_rbf = np.mean((y_test.flatten() - y_pred_rbf.flatten()) ** 2)

        xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))
        xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))
        X_plot = np.linspace(xmin, xmax, 100)
        y_plot = LR.predict(X_plot)
        y_plot_poly = KR_poly.predict(X_plot)
        y_plot_rbf = KR_rbf.predict(X_plot)

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, y_plot, label="OLS", alpha=0.5)
        ax.plot(
            X_plot, y_plot_poly, label="KR (poly kernel, d={})".format(d), alpha=0.5
        )
        ax.plot(X_plot, y_plot_rbf, label="KR (rbf kernel)", alpha=0.5)
        ax.legend()
        # ax.set_title(
        #     "MSE\nLR: {:.2f} KR (poly): {:.2f}\nKR (rbf): {:.2f}".format(
        #         loss, loss_poly, loss_rbf
        #     )
        # )

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/kr_plots.png", dpi=300)
    plt.close("all")


def plot_knn():
    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)
    for i, ax in enumerate(axes.flatten()):
        n_in = 1
        n_out = 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i
        )

        # OLS baseline
        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        y_pred = LR.predict(X_test)
        loss = np.mean((y_test.flatten() - y_pred.flatten()) ** 2)

        # compare kNN regressors with increasing neighborhood sizes
        knn_1 = KNN(k=1, classifier=False, leaf_size=10, weights="uniform")
        knn_1.fit(X_train, y_train)
        y_pred_1 = knn_1.predict(X_test)
        loss_1 = np.mean((y_test.flatten() - y_pred_1.flatten()) ** 2)

        knn_5 = KNN(k=5, classifier=False, leaf_size=10, weights="uniform")
        knn_5.fit(X_train, y_train)
        y_pred_5 = knn_5.predict(X_test)
        loss_5 = np.mean((y_test.flatten() - y_pred_5.flatten()) ** 2)

        knn_10 = KNN(k=10, classifier=False, leaf_size=10, weights="uniform")
        knn_10.fit(X_train, y_train)
        y_pred_10 = knn_10.predict(X_test)
        loss_10 = np.mean((y_test.flatten() - y_pred_10.flatten()) ** 2)

        xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))
        xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))
        X_plot = np.linspace(xmin, xmax, 100)
        y_plot = LR.predict(X_plot)
        y_plot_1 = knn_1.predict(X_plot)
        y_plot_5 = knn_5.predict(X_plot)
        y_plot_10 = knn_10.predict(X_plot)

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, y_plot, label="OLS", alpha=0.5)
        ax.plot(X_plot, y_plot_1, label="KNN (k=1)", alpha=0.5)
        ax.plot(X_plot, y_plot_5, label="KNN (k=5)", alpha=0.5)
        ax.plot(X_plot, y_plot_10, label="KNN (k=10)", alpha=0.5)
        ax.legend()
        # ax.set_title(
        #     "MSE\nLR: {:.2f} KNN (k=1): {:.2f}\nKNN (k=5): {:.2f} KNN (k=10): {:.2f}".format(
        #         loss, loss_1, loss_5, loss_10
        #     )
        # )

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/knn_plots.png", dpi=300)
    plt.close("all")
