Skip to content

Commit abf2d8b

Browse files
author
Jaime Céspedes Sisniega
authored
Merge pull request #122 from IFCA/feature-hellinger-distance
Add Hellinger distance data drift method
2 parents cfa7cbb + 8e9ceaa commit abf2d8b

File tree

8 files changed

+214
-81
lines changed

8 files changed

+214
-81
lines changed

frouros/data_drift/batch/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from .distance_based import (
44
EMD,
5+
Hellinger,
56
HistogramIntersection,
67
JS,
78
KL,
@@ -19,6 +20,7 @@
1920
"ChiSquareTest",
2021
"CVMTest",
2122
"EMD",
23+
"Hellinger",
2224
"HistogramIntersection",
2325
"JS",
2426
"KL",

frouros/data_drift/batch/distance_based/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Data drift batch distance based detection methods' init."""
22

33
from .emd import EMD
4+
from .hellinger import Hellinger
45
from .histogram_intersection import HistogramIntersection
56
from .js import JS
67
from .kl import KL
@@ -9,6 +10,7 @@
910

1011
__all__ = [
1112
"EMD",
13+
"Hellinger",
1214
"HistogramIntersection",
1315
"JS",
1416
"KL",

frouros/data_drift/batch/distance_based/base.py

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616

1717
class DistanceBasedBase(DataDriftBatchBase):
18-
"""Abstract class representing a distance based."""
18+
"""Abstract class representing a distance based detector."""
1919

2020
def _apply_method(
2121
self, X_ref_: np.ndarray, X: np.ndarray, **kwargs # noqa: N803
@@ -47,8 +47,71 @@ def _distance_measure(
4747
pass
4848

4949

50+
class DistanceBinsBasedBase(DistanceBasedBase):
    """Abstract class representing a distance bins based detector.

    The reference and test samples are binned over one shared set of histogram
    bin edges, converted to per-bin proportions and then handed to
    :meth:`_distance_measure_bins`, which concrete subclasses must implement.
    """

    def __init__(self, num_bins: int = 10) -> None:
        """Init method.

        :param num_bins: number of bins in which to divide probabilities
        :type num_bins: int
        """
        super().__init__(data_type=NumericalData(), statistical_type=UnivariateData())
        self.num_bins = num_bins

    @property
    def num_bins(self) -> int:
        """Number of bins property.

        :return: number of bins in which to divide probabilities
        :rtype: int
        """
        return self._num_bins

    @num_bins.setter
    def num_bins(self, value: int) -> None:
        """Number of bins setter.

        :param value: value to be set
        :type value: int
        :raises ValueError: if value is lower than 1
        """
        if value < 1:
            raise ValueError("value must be greater than 0.")
        self._num_bins = value

    def _distance_measure(
        self, X_ref_: np.ndarray, X: np.ndarray, **kwargs  # noqa: N803
    ) -> DistanceResult:
        """Compute the distance between the binned reference and test samples.

        :param X_ref_: reference data
        :type X_ref_: numpy.ndarray
        :param X: data to compare against the reference
        :type X: numpy.ndarray
        :return: result wrapping the value of :meth:`_distance_measure_bins`
        :rtype: DistanceResult
        """
        # Bin the reference sample received as argument. The previous code
        # read self.X_ref_ here, silently ignoring what the caller passed in.
        X_ref_percents, X_percents = self._calculate_bins_values(  # noqa: N806
            X_ref_=X_ref_, X=X, num_bins=self.num_bins
        )
        distance_bins = self._distance_measure_bins(X_ref_=X_ref_percents, X=X_percents)
        return DistanceResult(distance=distance_bins)

    @staticmethod
    def _calculate_bins_values(
        X_ref_: np.ndarray, X: np.ndarray, num_bins: int = 10  # noqa: N803
    ) -> np.ndarray:
        """Return the per-bin proportions of both samples over shared bin edges.

        :param X_ref_: reference data
        :type X_ref_: numpy.ndarray
        :param X: data to compare against the reference
        :type X: numpy.ndarray
        :param num_bins: number of bins passed to ``numpy.histogram``
        :type num_bins: int
        :return: proportions of ``X_ref_`` and of ``X`` in each bin
        """
        # NOTE(review): a 2-tuple of arrays is returned, so the annotation
        # should become Tuple[np.ndarray, np.ndarray] once typing.Tuple is
        # imported in this module.
        # Bin edges come from both samples stacked together, so the two
        # histograms below are computed over identical intervals.
        _, bins = np.histogram(np.hstack((X_ref_, X)), bins=num_bins)
        # Counts are divided by each sample's own size to obtain proportions.
        X_ref_percents = (  # noqa: N806
            np.histogram(a=X_ref_, bins=bins)[0] / X_ref_.shape[0]
        )
        X_percents = np.histogram(a=X, bins=bins)[0] / X.shape[0]  # noqa: N806
        return X_ref_percents, X_percents

    @abc.abstractmethod
    def _distance_measure_bins(
        self, X_ref_: np.ndarray, X: np.ndarray  # noqa: N803
    ) -> float:
        """Compute the distance between two arrays of per-bin proportions.

        :param X_ref_: per-bin proportions of the reference data
        :type X_ref_: numpy.ndarray
        :param X: per-bin proportions of the compared data
        :type X: numpy.ndarray
        :return: distance value
        :rtype: float
        """
111+
112+
50113
class DistanceProbabilityBasedBase(DistanceBasedBase):
51-
"""Abstract class representing a distance probability based."""
114+
"""Abstract class representing a distance probability based detector."""
52115

53116
def __init__(self, num_bins: int = 100) -> None:
54117
"""Init method.
frouros/data_drift/batch/distance_based/hellinger.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""Hellinger distance module."""
2+
3+
import numpy as np # type: ignore
4+
5+
from frouros.data_drift.batch.distance_based.base import (
6+
DistanceBinsBasedBase,
7+
)
8+
9+
10+
class Hellinger(DistanceBinsBasedBase):
    """Hellinger distance detector working on binned proportions."""

    def __init__(self, num_bins: int = 10) -> None:
        """Init method.

        :param num_bins: number of bins in which to divide probabilities
        :type num_bins: int
        """
        super().__init__(num_bins=num_bins)
        # Divisor applied to the result, computed once per instance.
        self._sqrt_div = np.sqrt(2)

    def _distance_measure_bins(
        self,
        X_ref_: np.ndarray,  # noqa: N803
        X: np.ndarray,  # noqa: N803
    ) -> float:
        """Return the Hellinger distance between two arrays of proportions.

        :param X_ref_: per-bin proportions of the reference data
        :type X_ref_: numpy.ndarray
        :param X: per-bin proportions of the compared data
        :type X: numpy.ndarray
        :return: Hellinger distance
        :rtype: float
        """
        return self._hellinger(X_ref_=X_ref_, X=X, sqrt_div=self._sqrt_div)

    @staticmethod
    def _hellinger(
        X_ref_: np.ndarray, X: np.ndarray, sqrt_div: float  # noqa: N803
    ) -> float:
        """Compute ``sqrt(sum((sqrt(X_ref_) - sqrt(X)) ** 2)) / sqrt_div``.

        :param X_ref_: per-bin proportions of the reference data
        :type X_ref_: numpy.ndarray
        :param X: per-bin proportions of the compared data
        :type X: numpy.ndarray
        :param sqrt_div: value the root of the summed squares is divided by
        :type sqrt_div: float
        :return: Hellinger distance
        :rtype: float
        """
        root_difference = np.sqrt(X_ref_) - np.sqrt(X)
        summed_squares = np.sum(root_difference ** 2)
        return np.sqrt(summed_squares) / sqrt_div
Lines changed: 13 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,105 +1,43 @@
11
"""PSI (Population Stability Index) module."""
22

33
import sys
4-
from typing import Optional
54

65
import numpy as np # type: ignore
76

8-
from frouros.data_drift.base import NumericalData, UnivariateData
97
from frouros.data_drift.batch.distance_based.base import (
10-
DistanceBasedBase,
8+
DistanceBinsBasedBase,
119
DistanceResult,
1210
)
1311

1412

15-
class PSI(DistanceBasedBase):
13+
class PSI(DistanceBinsBasedBase):
1614
"""PSI (Population Stability Index) algorithm class."""
1715

18-
def __init__(self, num_buckets: int = 10) -> None:
19-
"""Init method.
20-
21-
:param num_buckets: number of buckets
22-
:type num_buckets: int
23-
"""
24-
super().__init__(data_type=NumericalData(), statistical_type=UnivariateData())
25-
self.num_buckets = num_buckets
26-
self.X_ref_num: Optional[int] = None # pylint: disable=invalid-name
27-
28-
@property
29-
def num_buckets(self) -> int:
30-
"""Number of buckets.
31-
32-
:return: number of buckets
33-
:rtype: int
34-
"""
35-
return self._num_buckets
36-
37-
@num_buckets.setter
38-
def num_buckets(self, value: int) -> None:
39-
"""Number of buckets setter.
40-
41-
:param value: value to be set
42-
:type value: Optional[int]
43-
:raises ValueError: Value error exception
44-
"""
45-
if value < 1:
46-
raise ValueError("num buckets must be greater than 0.")
47-
self._num_buckets = value
48-
49-
def fit(
50-
self,
51-
X: np.ndarray, # noqa: N803
52-
) -> None:
53-
"""Fit estimator.
54-
55-
:param X: feature data
56-
:type X: numpy.ndarray
57-
:return: fitted estimator
58-
:rtype: self
59-
"""
60-
super().fit(X=X)
61-
self.X_ref_num = self.X_ref_.shape[0] # type: ignore
62-
6316
def _apply_method(
6417
self, X_ref_: np.ndarray, X: np.ndarray, **kwargs # noqa: N803
6518
) -> DistanceResult:
6619
distance = self._distance_measure(X_ref_=X_ref_, X=X, **kwargs)
6720
return distance
6821

69-
def _distance_measure(
70-
self, X_ref_: np.ndarray, X: np.ndarray, **kwargs # noqa: N803
71-
) -> DistanceResult:
72-
psi = self._psi(
22+
def _distance_measure_bins(
23+
self,
24+
X_ref_: np.ndarray, # noqa: N803
25+
X: np.ndarray, # noqa: N803
26+
) -> float:
27+
# Replace 0.0 values with the smallest number possible
28+
# in order to avoid division by zero
29+
X_ref_[X_ref_ == 0.0] = sys.float_info.min
30+
X[X == 0.0] = sys.float_info.min
31+
distance = self._psi(
7332
X_ref_=X_ref_,
7433
X=X,
75-
X_ref_num=self.X_ref_num, # type: ignore
76-
num_buckets=self.num_buckets,
7734
)
78-
distance = DistanceResult(distance=psi)
7935
return distance
8036

8137
@staticmethod
8238
def _psi(
8339
X_ref_: np.ndarray, # noqa: N803
8440
X: np.ndarray, # noqa: N803
85-
X_ref_num: int, # noqa: N803 # pylint: disable=invalid-name
86-
num_buckets: int,
8741
) -> float:
88-
X_ref_percents = ( # noqa: N806 # pylint: disable=invalid-name
89-
np.histogram(a=X_ref_, bins=num_buckets)[0] / X_ref_num
90-
)
91-
X_percents = np.histogram( # noqa: N806 # pylint: disable=invalid-name
92-
a=X, bins=num_buckets
93-
)[0] / len(
94-
X # noqa: N806
95-
)
96-
97-
# Replace 0.0 values with the smallest number possible
98-
# in order to avoid division by zero
99-
X_ref_percents[X_ref_percents == 0.0] = sys.float_info.min
100-
X_percents[X_percents == 0.0] = sys.float_info.min
101-
102-
psi = np.sum(
103-
(X_percents - X_ref_percents) * np.log(X_percents / X_ref_percents)
104-
)
42+
psi = np.sum((X - X_ref_) * np.log(X / X_ref_))
10543
return psi

frouros/tests/conftest.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,30 @@ def multivariate_distribution_q() -> Tuple[np.ndarray, np.ndarray]:
171171
return mean, cov
172172

173173

174+
@pytest.fixture(scope="module")
def univariate_distribution_p() -> Tuple[float, float]:
    """Parameters of the univariate distribution p.

    :return: mean and standard deviation of distribution p
    :rtype: Tuple[float, float]
    """
    return 1, 1
184+
185+
186+
@pytest.fixture(scope="module")
def univariate_distribution_q() -> Tuple[float, float]:
    """Parameters of the univariate distribution q.

    :return: mean and standard deviation of distribution q
    :rtype: Tuple[float, float]
    """
    return 5, 2
196+
197+
174198
@pytest.fixture(scope="module")
175199
def prequential_error():
176200
"""Prequential error.

frouros/tests/test_data_drift.py

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from frouros.data_drift.batch.base import DataDriftBatchBase
99
from frouros.data_drift.batch.distance_based import (
1010
EMD,
11+
Hellinger,
1112
HistogramIntersection,
1213
PSI,
1314
JS,
@@ -56,7 +57,6 @@ def test_batch_distance_based_categorical(
5657
"detector, expected_distance",
5758
[
5859
(EMD(), 0.54726161),
59-
(PSI(), 496.21968934),
6060
(JS(), 0.81451218),
6161
(KL(), np.inf),
6262
(HistogramIntersection(), 0.97669491),
@@ -84,6 +84,71 @@ def test_batch_distance_based_univariate(
8484
assert np.isclose(distance, expected_distance)
8585

8686

87+
@pytest.mark.parametrize(
    "detector, expected_distance",
    [(PSI(), 468.79410784), (Hellinger(), 0.77137797)],
)
def test_batch_distance_bins_based_univariate_different_distribution(
    univariate_distribution_p: Tuple[float, float],
    univariate_distribution_q: Tuple[float, float],
    detector: DataDriftBatchBase,
    expected_distance: float,
    num_samples: int = 500,
) -> None:
    """Test distance bins based detectors on samples of different distributions.

    :param univariate_distribution_p: mean and standard deviation of distribution p
    :type univariate_distribution_p: Tuple[float, float]
    :param univariate_distribution_q: mean and standard deviation of distribution q
    :type univariate_distribution_q: Tuple[float, float]
    :param detector: detector distance
    :type detector: DataDriftBatchBase
    :param expected_distance: expected distance value
    :type expected_distance: float
    :param num_samples: number of samples drawn from each distribution
    :type num_samples: int
    """
    # Seed once, then draw the reference sample before the test sample: the
    # expected distances depend on this exact draw order.
    np.random.seed(seed=31)
    mean_p, std_p = univariate_distribution_p
    mean_q, std_q = univariate_distribution_q
    reference_sample = np.random.normal(mean_p, std_p, size=num_samples)
    test_sample = np.random.normal(mean_q, std_q, size=num_samples)

    detector.fit(X=reference_sample)
    computed_distance = detector.compare(X=test_sample)

    assert np.isclose(computed_distance, expected_distance)
119+
120+
121+
@pytest.mark.parametrize(
    "detector, expected_distance",
    [(PSI(), 0.01840072), (Hellinger(), 0.04792538)],
)
def test_batch_distance_bins_based_univariate_same_distribution(
    univariate_distribution_p: Tuple[float, float],
    detector: DataDriftBatchBase,
    expected_distance: float,
    num_samples: int = 500,
) -> None:
    """Test distance bins based detectors on samples of the same distribution.

    :param univariate_distribution_p: mean and standard deviation of distribution p
    :type univariate_distribution_p: Tuple[float, float]
    :param detector: detector distance
    :type detector: DataDriftBatchBase
    :param expected_distance: expected distance value
    :type expected_distance: float
    :param num_samples: number of samples drawn for each of the two samples
    :type num_samples: int
    """
    # Seed once, then draw the reference sample before the test sample: the
    # expected distances depend on this exact draw order.
    np.random.seed(seed=31)
    mean_p, std_p = univariate_distribution_p
    reference_sample = np.random.normal(mean_p, std_p, size=num_samples)
    test_sample = np.random.normal(mean_p, std_p, size=num_samples)

    detector.fit(X=reference_sample)
    computed_distance = detector.compare(X=test_sample)

    assert np.isclose(computed_distance, expected_distance)
150+
151+
87152
@pytest.mark.parametrize(
88153
"detector, expected_statistic, expected_p_value",
89154
[

0 commit comments

Comments
 (0)