Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] K-Means anomaly detector #1607

Merged
merged 13 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .all-contributorsrc
Original file line number Diff line number Diff line change
Expand Up @@ -2348,7 +2348,10 @@
"bug",
"code",
"doc",
"research"
"research",
"tests",
"review",
"data"
]
},
{
Expand Down
2 changes: 2 additions & 0 deletions aeon/anomaly_detection/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

# Names exported by ``from aeon.anomaly_detection import *``; kept in alphabetical order.
__all__ = [
"DWT_MLEAD",
"KMeansAD",
"MERLIN",
"STRAY",
]

from aeon.anomaly_detection._dwt_mlead import DWT_MLEAD
from aeon.anomaly_detection._kmeans import KMeansAD
from aeon.anomaly_detection._merlin import MERLIN
from aeon.anomaly_detection._stray import STRAY
182 changes: 182 additions & 0 deletions aeon/anomaly_detection/_kmeans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
"""k-Means anomaly detector."""

__maintainer__ = ["CodeLionX"]
__all__ = ["KMeansAD"]

from typing import Optional

import numpy as np
from sklearn.cluster import KMeans

from aeon.anomaly_detection.base import BaseAnomalyDetector
from aeon.utils.windowing import reverse_windowing, sliding_windows


class KMeansAD(BaseAnomalyDetector):
    """KMeans anomaly detector.

    The k-Means anomaly detector uses k-Means clustering to detect anomalies in time
    series. The time series is split into windows of a fixed size, and the k-Means
    algorithm is used to cluster these windows. The anomaly score for each time point is
    the average Euclidean distance between the time point's windows and the windows'
    corresponding cluster centers.

    ``k-MeansAD`` supports univariate and multivariate time series. It can also be
    fitted on a clean reference time series and used to detect anomalies in a different
    target time series with the same number of dimensions.

    .. list-table:: Capabilities
       :stub-columns: 1

       * - Input data format
         - univariate and multivariate
       * - Output data format
         - anomaly scores
       * - Learning Type
         - unsupervised or semi-supervised

    Parameters
    ----------
    n_clusters : int, default=20
        The number of clusters to use in the k-Means algorithm. The bigger the number
        of clusters, the less noisy the anomaly scores get. However, the number of
        clusters should not be too high, as this can lead to overfitting.

    window_size : int, default=20
        The size of the sliding window used to split the time series into windows. The
        bigger the window size, the bigger the anomaly context is. If it is too big,
        however, the detector marks points anomalous that are not. If it is too small,
        the detector might not detect larger anomalies or contextual anomalies at all.
        If ``window_size`` is smaller than the anomaly, the detector might detect only
        the transitions between normal data and the anomalous subsequence.

    stride : int, default=1
        The stride of the sliding window. The stride determines how many time points
        the windows are spaced apart. A stride of 1 means that the window is moved one
        time point forward compared to the previous window. The larger the stride, the
        fewer windows are created, which leads to noisier anomaly scores.

    random_state : int or None, default=None
        The random state to use in the k-Means algorithm.

    Notes
    -----
    This implementation is inspired by [1]_. However, the original paper proposes a
    different kind of preprocessing and also uses advanced techniques to post-process
    the clustering.

    References
    ----------
    .. [1] Yairi, Takehisa, Yoshikiyo Kato, and Koichi Hori. "Fault Detection by Mining
           Association Rules from House-Keeping Data." In Proceedings of the
           International Symposium on Artificial Intelligence, Robotics and Automation
           in Space (i-SAIRAS), Vol. 6., 2001.

    Examples
    --------
    >>> import numpy as np
    >>> from aeon.anomaly_detection import KMeansAD
    >>> X = np.array(
    ...     [1, 2, 3, 4, 1, 2, 3, 3, 2, 8, 9, 8, 1, 2, 3, 4], dtype=np.float64
    ... )
    >>> detector = KMeansAD(n_clusters=3, window_size=4, stride=1, random_state=0)
    >>> detector.fit_predict(X)
    array([1.97827709, 2.45374147, 2.51929879, 2.36979677, 2.34826601,
           2.05075554, 2.57611912, 2.87642119, 3.18400743, 3.65060425,
           3.36402514, 3.94053744, 3.65448197, 3.6707922 , 3.70341266,
           1.97827709])

    """

    _tags = {
        "capability:univariate": True,
        "capability:multivariate": True,
        "capability:missing_values": False,
        "fit_is_empty": False,
    }

    def __init__(
        self,
        n_clusters: int = 20,
        window_size: int = 20,
        stride: int = 1,
        random_state: Optional[int] = None,
    ):
        self.n_clusters = n_clusters
        self.window_size = window_size
        self.stride = stride
        self.random_state = random_state

        super().__init__(axis=0)

        # Fitted k-Means model; stays ``None`` until one of the fit methods ran.
        self.estimator_: Optional[KMeans] = None

    def _fit(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> "KMeansAD":
        """Cluster the sliding windows of ``X`` and store the fitted model.

        Parameters
        ----------
        X : np.ndarray
            Reference time series with time along axis 0.
        y : np.ndarray, optional
            Ignored by this detector.

        Returns
        -------
        KMeansAD
            The fitted detector (``self``).
        """
        windows, _ = sliding_windows(
            X, window_size=self.window_size, stride=self.stride, axis=0
        )
        self._inner_fit(windows)
        return self

    def _predict(self, X) -> np.ndarray:
        """Score every time point of ``X`` with the already fitted model.

        Parameters
        ----------
        X : np.ndarray
            Target time series with time along axis 0.

        Returns
        -------
        np.ndarray
            Point-wise anomaly scores produced by ``reverse_windowing``.
        """
        windows, padding = sliding_windows(
            X, window_size=self.window_size, stride=self.stride, axis=0
        )
        return self._inner_predict(windows, padding)

    def _fit_predict(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
        """Fit on ``X`` and score the same series, windowing it only once.

        Uses the same k-Means configuration as ``_fit`` so that
        ``fit_predict(X)`` and ``fit(X).predict(X)`` build identical models.

        Parameters
        ----------
        X : np.ndarray
            Time series with time along axis 0.
        y : np.ndarray, optional
            Ignored by this detector.

        Returns
        -------
        np.ndarray
            Point-wise anomaly scores produced by ``reverse_windowing``.
        """
        windows, padding = sliding_windows(
            X, window_size=self.window_size, stride=self.stride, axis=0
        )
        self._inner_fit(windows)
        return self._inner_predict(windows, padding)

    def _inner_fit(self, windows: np.ndarray) -> None:
        """Build the k-Means model and fit it on the windowed data.

        The hyper-parameters are pinned explicitly (instead of relying on the
        library defaults) so that results do not drift between scikit-learn
        versions and all fit paths share a single configuration.
        """
        self.estimator_ = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
            init="k-means++",
            n_init=10,
            max_iter=300,
            tol=1e-4,
            verbose=0,
            algorithm="lloyd",
        )
        self.estimator_.fit(windows)

    def _inner_predict(self, windows: np.ndarray, padding: int) -> np.ndarray:
        """Turn distances to the assigned cluster centres into point scores.

        ``windows`` and ``padding`` are the two values returned by
        ``sliding_windows``; rows of ``windows`` are presumably flattened
        windows -- TODO confirm against ``aeon.utils.windowing``.
        """
        assigned = self.estimator_.predict(windows)
        # Euclidean distance of each window to the centre of its own cluster.
        window_scores = np.linalg.norm(
            windows - self.estimator_.cluster_centers_[assigned], axis=1
        )
        # Aggregate the scores of all windows covering a point with nanmean.
        return reverse_windowing(
            window_scores, self.window_size, np.nanmean, self.stride, padding
        )

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"` set.

        Returns
        -------
        dict or list of dict, default={}
            Parameters to create testing instances of the class.
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
            `create_test_instance` uses the first (or only) dictionary in `params`.
        """
        return {
            "n_clusters": 5,
            "window_size": 10,
            "stride": 1,
            "random_state": 0,
        }
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
return_names=False,
)

labels = np.zeros(15)
labels = np.zeros(15, dtype=np.int_)
labels[np.random.choice(15, 5)] = 1
uv_series = make_series(n_timepoints=15, return_numpy=True, random_state=0)
uv_series[labels == 1] += 1
Expand Down
37 changes: 37 additions & 0 deletions aeon/anomaly_detection/tests/test_kmeans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Tests for the KMeansAD class."""

__maintainer__ = ["CodeLionX"]

import numpy as np

from aeon.anomaly_detection import KMeansAD
from aeon.testing.utils.data_gen import make_series


def test_kmeansad_univariate():
    """Test KMeansAD univariate output."""
    series = make_series(n_timepoints=100, return_numpy=True, random_state=42)
    # Inject an anomalous subsequence by shifting points 50..57 down by 5.
    series[50:58] -= 5

    ad = KMeansAD(n_clusters=2, window_size=10)
    pred = ad.fit_predict(series, axis=0)

    assert pred.shape == (100,)
    # ``np.float_`` was removed in NumPy 2.0; ``np.float64`` is the same dtype.
    assert pred.dtype == np.float64
    # The highest score must fall into the injected anomaly region.
    assert 50 <= np.argmax(pred) <= 58


def test_kmeansad_multivariate():
    """Test KMeansAD multivariate output."""
    series = make_series(
        n_timepoints=100, n_columns=3, return_numpy=True, random_state=42
    )
    # Large anomaly in the first column, plus a much smaller shift in the second.
    series[50:58, 0] -= 5
    series[87:90, 1] += 0.1

    ad = KMeansAD(n_clusters=2, window_size=10)
    pred = ad.fit_predict(series, axis=0)

    # One score per time point, regardless of the number of columns.
    assert pred.shape == (100,)
    # ``np.float_`` was removed in NumPy 2.0; ``np.float64`` is the same dtype.
    assert pred.dtype == np.float64
    # The highest score must fall into the large injected anomaly region.
    assert 50 <= np.argmax(pred) <= 58
5 changes: 3 additions & 2 deletions docs/api_reference/anomaly_detection.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ for time series anomaly detection.
:toctree: auto_generated/
:template: class.rst

STRAY
MERLIN
DWT_MLEAD
KMeansAD
MERLIN
STRAY