This repository has been archived by the owner on Feb 15, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
MLSMOTE.py
90 lines (75 loc) · 3.06 KB
/
MLSMOTE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
import logging
from time import time
# MLSMOTE - https://www.kaggle.com/tolgadincer/upsampling-multilabel-data-with-mlsmote
def get_tail_label(df: pd.DataFrame, ql=[0.05, 1.]) -> list:
    """
    Find the underrepresented targets.

    The column sums of `df` are used as label frequencies. Only labels whose
    frequency lies strictly between the `ql[0]` and `ql[1]` quantiles of those
    frequencies are kept. Each kept label is scored as
    (largest kept frequency) / (own frequency), and the labels whose score is
    strictly above the median score are reported.
    args
    df: pandas.DataFrame, the target vector dataframe (one column per label)
    ql: pair of quantiles (lower, upper) used to filter the label frequencies
    return
    tail_label: list, column names of the underrepresented labels
    """
    label_counts = df.sum(axis=0)

    # Both cut-offs are computed on the unfiltered frequencies.
    lower_cut = label_counts.quantile(ql[0])
    upper_cut = label_counts.quantile(ql[1])
    inside_band = (label_counts > lower_cut) & (label_counts < upper_cut)
    kept_counts = label_counts[inside_band]

    # Imbalance ratio per label: the rarer the label, the larger the ratio.
    imbalance_ratio = kept_counts.max() / kept_counts
    is_tail = imbalance_ratio > imbalance_ratio.median()
    return imbalance_ratio[is_tail].index.tolist()
def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=[0.05, 1.]):
    """
    Select the samples that carry at least one underrepresented (tail) label.

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe; a label counts as
       present in a row where the cell equals 1
    ql: pair of quantiles (lower, upper) forwarded to get_tail_label
    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    Both results have their index reset to 0..len-1.
    """
    tail_labels = get_tail_label(y, ql=ql)

    # Vectorised row test instead of a per-row Python lambda: a row is a
    # minority sample when any of its tail-label cells equals 1.
    has_tail_label = (y[tail_labels] == 1).any(axis=1)
    minority_index = y.index[has_tail_label.values]

    # Rows are matched by index label, so X and y are expected to share
    # the same index.
    X_sub = X[X.index.isin(minority_index)].reset_index(drop=True)
    y_sub = y[y.index.isin(minority_index)].reset_index(drop=True)
    return X_sub, y_sub
def nearest_neighbour(X: pd.DataFrame, neigh) -> list:
    """
    Give the indices of the `neigh` nearest neighbours of every instance.

    A euclidean kd-tree model is fitted on X and then queried with X itself.
    args
    X: pandas.DataFrame, instances whose nearest neighbours are looked up
    neigh: int, number of neighbours requested per instance
    return
    indices: the index array produced by `kneighbors`, one row per instance
             of X with `neigh` entries each
    """
    logging.debug("Start to nearest_neighbour")
    knn_model = NearestNeighbors(
        n_neighbors=neigh,
        metric='euclidean',
        algorithm='kd_tree',
        n_jobs=-1,
    )
    knn_model.fit(X)
    # Distances are returned as well, but only the indices are needed.
    _distances, neighbour_indices = knn_model.kneighbors(X)
    return neighbour_indices
def MLSMOTE(X, y, n_sample, neigh=5):
    """
    Give the augmented data using MLSMOTE algorithm.

    For every synthetic sample a reference row is drawn at random, one of its
    nearest neighbours is drawn at random, and the new feature vector is
    placed at a random position on the segment between the two. A label is
    switched on in the synthetic sample when the reference row or any of its
    neighbours carries it.
    args
    X: pandas.DataFrame, input (feature) vector DataFrame
    y: pandas.DataFrame, target (label) vector DataFrame, row-aligned with X
    n_sample: int, number of newly generated sample
    neigh: int, neighbourhood size handed to nearest_neighbour; the first
           entry of each neighbourhood is skipped when picking a neighbour,
           so values below 2 leave no candidate to pick from
    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data
    """
    start_time = time()
    logging.debug("Start to MLSMOTE")

    # Pass the caller's neighbourhood size through instead of a fixed value.
    neighbour_idx = nearest_neighbour(X, neigh=neigh)
    n_rows = len(neighbour_idx)

    # The neighbour search reports row positions, not index labels. Working
    # on plain arrays keeps the lookups positional, so the result is correct
    # even when X / y do not carry a default 0..n-1 index.
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float)

    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0, n_rows - 1)
        # Column 0 is skipped - presumably because querying the fitted data
        # returns each point as its own nearest neighbour.
        neighbor = random.choice(neighbour_idx[reference, 1:])
        ratio = random.random()

        # Label set: union of the labels seen in the whole neighbourhood
        # (reference row included); NaN cells are ignored.
        label_votes = np.nansum(labels[neighbour_idx[reference]], axis=0)
        target[i] = label_votes > 0

        # SMOTE interpolation: move from the reference towards the neighbour
        # by a random fraction, so the new point lies between the two.
        gap = features[neighbor] - features[reference]
        new_X[i] = features[reference] + ratio * gap

    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    logging.debug("Finished MLSMOTE.")
    logging.debug("Used %.2fs.", time() - start_time)
    return new_X, target