# plot_tomek_links.py
# Forked from vsmolyakov/ml_algo_in_depth.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from imblearn.under_sampling import TomekLinks
# Shared NumPy generator with a fixed seed; main() draws the synthetic data from it.
rng = np.random.RandomState(seed=42)
def main():
    """Plot the effect of Tomek-link undersampling on synthetic 2-D data.

    Draws 500 points labelled 0 and 50 points labelled 1 from two Gaussians
    (the second one scaled by 0.5 and shifted to (2, 2)), resamples the data
    with ``TomekLinks``, and shows a scatter plot of the kept samples per
    class together with the samples that the resampler dropped.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # generate data
    n_samples_1 = 500
    n_samples_2 = 50
    X_syn = np.r_[
        1.5 * rng.randn(n_samples_1, 2),
        0.5 * rng.randn(n_samples_2, 2) + [2, 2],
    ]
    y_syn = np.array([0] * n_samples_1 + [1] * n_samples_2)
    # Shuffle with the seeded module-level generator; without random_state
    # the shuffle would use NumPy's global RNG and differ on every run.
    X_syn, y_syn = shuffle(X_syn, y_syn, random_state=rng)

    # remove Tomek links
    tl = TomekLinks(sampling_strategy='auto')
    X_resampled, y_resampled = tl.fit_resample(X_syn, y_syn)
    idx_resampled = tl.sample_indices_
    # Every original row index that the resampler did not keep.
    idx_samples_removed = np.setdiff1d(np.arange(X_syn.shape[0]), idx_resampled)

    # generate plots
    _, ax = plt.subplots()
    idx_class_0 = y_resampled == 0
    ax.scatter(X_resampled[idx_class_0, 0], X_resampled[idx_class_0, 1],
               alpha=.8, marker="o", label='Class #0')
    ax.scatter(X_resampled[~idx_class_0, 0], X_resampled[~idx_class_0, 1],
               alpha=.8, marker="s", label='Class #1')
    ax.scatter(X_syn[idx_samples_removed, 0], X_syn[idx_samples_removed, 1],
               alpha=.8, marker="v", label='Removed samples')
    ax.set_title('Undersampling: Tomek links')
    ax.legend()
    plt.show()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()