
### Concept drift

As a quick example, we can use the breast cancer dataset, to which concept drift is induced, to show the use of a concept drift detector like DDM (Drift Detection Method) and how concept drift affects the performance in terms of accuracy.

```python
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from frouros.detectors.concept_drift import DDM, DDMConfig
from frouros.metrics import PrequentialError

np.random.seed(seed=31)

# Load breast cancer dataset
X, y = load_breast_cancer(return_X_y=True)

# Split train (70%) and test (30%)
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, train_size=0.7, random_state=31)

# Define and fit model
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("model", LogisticRegression()),
    ],
)
pipeline.fit(X=X_train, y=y_train)

# Detector configuration and instantiation
config = DDMConfig(
    warning_level=2.0,
    drift_level=3.0,
    min_num_instances=25,  # minimum number of instances before checking for concept drift
)
detector = DDM(config=config)

# Metric to compute accuracy
metric = PrequentialError(alpha=1.0) # alpha=1.0 is equivalent to normal accuracy

def stream_test(X_test, y_test, y, metric, detector):
    """Simulate data stream over X_test and y_test. y is the true label."""
    drift_flag = False
    for i, (X, y) in enumerate(zip(X_test, y_test)):
        y_pred = pipeline.predict(X.reshape(1, -1))
        error = 1 - (y_pred.item() == y.item())
        metric_error = metric(error_value=error)
        _ = detector.update(value=error)
        status = detector.status
        if status["drift"] and not drift_flag:
            drift_flag = True
            print(f"Concept drift detected at step {i}. Accuracy: {1 - metric_error:.4f}")
    if not drift_flag:
        print("No concept drift detected")
    print(f"Final accuracy: {1 - metric_error:.4f}\n")

# Simulate data stream (assuming test label available after each prediction)
# No concept drift is expected to occur
stream_test(
    X_test=X_test,
    y_test=y_test,
    y=y,
    metric=metric,
    detector=detector,
)
# >> No concept drift detected
# >> Final accuracy: 0.9766

# IMPORTANT: Induce/simulate concept drift in the last part (20%)
# of y_test by modifying some labels (approx. 50%), therefore changing P(y|X)
drift_size = int(y_test.shape[0] * 0.2)
y_test_drift = y_test[-drift_size:]
modify_idx = np.random.rand(*y_test_drift.shape) <= 0.5
y_test_drift[modify_idx] = (y_test_drift[modify_idx] + 1) % len(np.unique(y_test))
y_test[-drift_size:] = y_test_drift

# Reset detector and metric
detector.reset()
metric.reset()

# Simulate data stream (assuming test label available after each prediction)
# Concept drift is expected to occur because of the label modification
stream_test(
    X_test=X_test,
    y_test=y_test,
    y=y,
    metric=metric,
    detector=detector,
)
# >> Concept drift detected at step 142. Accuracy: 0.9510
# >> Final accuracy: 0.8480
```
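
In a streaming setting, detecting drift is usually followed by some adaptation step. The snippet below is a minimal sketch of one possible policy, assuming labels keep arriving after each prediction: keep a sliding window of recent labelled samples, retrain the pipeline on it when DDM signals drift, and reset the detector afterwards. The window size (`RETRAIN_WINDOW`) and the retraining policy are illustrative assumptions, not part of Frouros.

```python
import collections

RETRAIN_WINDOW = 100  # hypothetical number of recent samples kept for retraining
buffer_X = collections.deque(maxlen=RETRAIN_WINDOW)
buffer_y = collections.deque(maxlen=RETRAIN_WINDOW)

detector.reset()

for i, (X_i, y_i) in enumerate(zip(X_test, y_test)):
    y_pred = pipeline.predict(X_i.reshape(1, -1))
    error = 1 - (y_pred.item() == y_i.item())
    _ = detector.update(value=error)
    buffer_X.append(X_i)
    buffer_y.append(y_i)
    if detector.status["drift"]:
        # Illustrative reaction: retrain on the buffered recent samples and monitor again
        pipeline.fit(X=np.array(buffer_X), y=np.array(buffer_y))
        detector.reset()
        print(f"Drift at step {i}: model retrained on last {len(buffer_X)} samples")
```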

More concept drift examples can be found [here](https://frouros.readthedocs.io/en/latest/examples/concept_drift.html).

### Data drift

As a quick example, we can use the iris dataset, to which data drift is induced, to show the use of a data drift detector like the Kolmogorov-Smirnov test.

```python
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from frouros.detectors.data_drift import KSTest

np.random.seed(seed=31)

# Load iris dataset
X, y = load_iris(return_X_y=True)

# Split train (70%) and test (30%)
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, train_size=0.7, random_state=31)

# Set the feature index to which detector is applied
feature_idx = 0

# IMPORTANT: Induce/simulate data drift in the selected feature of X_test by
# applying some Gaussian noise, therefore changing P(X)
X_test[:, feature_idx] += np.random.normal(
loc=0.0,
scale=3.0,
size=X_test.shape[0],
)

# Significance level for the hypothesis test
alpha = 0.001
# Define and fit detector
detector = KSTest()
_ = detector.fit(X=X_train[:, feature_idx])

# Apply detector to the selected feature of X_test
result, _ = detector.compare(X=X_test[:, feature_idx])

# Check if drift is taking place
if result.p_value <= alpha:
    print(f"Data drift detected at feature {feature_idx}")
else:
    print(f"No data drift detected at feature {feature_idx}")
# >> Data drift detected at feature 0
# Therefore, we can reject H0 (the hypothesis that both samples come from the same distribution).
```
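
The example above monitors a single feature. A natural extension is to run one univariate test per feature; the sketch below does this with `KSTest`, using a Bonferroni correction on `alpha` as one simple, purely illustrative way to account for testing several features at once.

```python
# Minimal sketch: one KSTest per feature, flagging those whose p-value falls
# below the (Bonferroni-corrected) significance level. The correction is an
# illustrative choice, not something prescribed by Frouros.
n_features = X_train.shape[1]
corrected_alpha = alpha / n_features

for idx in range(n_features):
    feature_detector = KSTest()
    _ = feature_detector.fit(X=X_train[:, idx])
    result, _ = feature_detector.compare(X=X_test[:, idx])
    drifted = result.p_value <= corrected_alpha
    print(f"Feature {idx}: p-value={result.p_value:.4f} -> "
          f"{'drift' if drifted else 'no drift'}")
```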

More data drift examples can be found [here](https://frouros.readthedocs.io/en/latest/examples/data_drift.html).

## 🛠 Installation
