diff --git a/.circleci/config.yml b/.circleci/config.yml
index 5cadd5d..1b4d989 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -46,7 +46,7 @@ jobs:
name: python/default
steps:
- coveralls/upload:
- carryforward: 3.11, 3.12
+ carryforward: 3.11, 3.12, 3.13
parallel_finished: true
workflows:
@@ -56,7 +56,7 @@ workflows:
- tests:
matrix:
parameters:
- version: ["3.11", "3.12"]
+ version: ["3.11", "3.12", "3.13"]
- coverage:
requires:
- tests
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..555f555
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,2 @@
+[run]
+omit = sequentia/model_selection/_validation.py
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..60404dc
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.ipynb linguist-documentation
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 5025358..b953c5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -94,3 +94,6 @@ venv.bak/
# Changelog entry
ENTRY.md
+
+# Jupyter Notebook checkpoints
+*.ipynb_checkpoints/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8230514..c25eafb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,13 +11,13 @@ repos:
pass_filenames: false
# ruff check (w/autofix)
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.1.3 # should match version in pyproject.toml
+ rev: v0.8.4 # should match version in pyproject.toml
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
# ruff format
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.1.3 # should match version in pyproject.toml
+ rev: v0.8.4 # should match version in pyproject.toml
hooks:
- id: ruff-format
# # pydoclint - docstring formatting
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c141487..c63cf24 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -388,6 +388,21 @@ Nothing, initial release!
+## [v2.5.0](https://github.com/eonu/sequentia/releases/tag/v2.5.0) - 2024-12-27
+
+### Documentation
+
+- update copyright notice ([#255](https://github.com/eonu/sequentia/issues/255))
+
+### Features
+
+- add `mise.toml` and support `numpy>=2` ([#254](https://github.com/eonu/sequentia/issues/254))
+- add python v3.13 support ([#253](https://github.com/eonu/sequentia/issues/253))
+- add library benchmarks ([#256](https://github.com/eonu/sequentia/issues/256))
+- add `model_selection` sub-package for hyper-parameters ([#257](https://github.com/eonu/sequentia/issues/257))
+- add model spec support to `HMMClassifier.__init__` ([#258](https://github.com/eonu/sequentia/issues/258))
+- add `HMMClassifier.fit` multiprocessing ([#259](https://github.com/eonu/sequentia/issues/259))
+
## [v2.0.2](https://github.com/eonu/sequentia/releases/tag/v2.0.2) - 2024-04-13
### Bug Fixes
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index fe583f6..3988459 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -50,6 +50,6 @@ We are thankful for their work and all the communities who have paved the way wi
---
- Sequentia © 2019-2025, Edwin Onuonga - Released under the MIT license.
+ Sequentia © 2019, Edwin Onuonga - Released under the MIT license.
Authored and maintained by Edwin Onuonga.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1430004..f934143 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -105,6 +105,6 @@ By contributing, you agree that your contributions will be licensed under the re
---
- Sequentia © 2019-2025, Edwin Onuonga - Released under the MIT license.
+ Sequentia © 2019, Edwin Onuonga - Released under the MIT license.
Authored and maintained by Edwin Onuonga.
diff --git a/LICENSE b/LICENSE
index c5d8701..c02e87e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2019-2025 Edwin Onuonga (eonu)
+Copyright (c) 2019 Edwin Onuonga (eonu)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 9721eec..b52d532 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@
About ·
Build Status ·
Features ·
+ Installation ·
Documentation ·
Examples ·
Acknowledgments ·
@@ -57,6 +58,8 @@ Some examples of how Sequentia can be used on sequence data include:
- **Simplicity and interpretability**: Sequentia offers a limited set of machine learning algorithms, chosen specifically to be more interpretable and easier to configure than more complex alternatives such as recurrent neural networks and transformers, while maintaining a high level of effectiveness.
- **Familiar and user-friendly**: To fit more seamlessly into the workflow of data science practitioners, Sequentia follows the ubiquitous Scikit-Learn API, providing a familiar model development process for many, as well as enabling wider access to the rapidly growing Scikit-Learn ecosystem.
+- **Speed**: Some algorithms offered by Sequentia naturally have restrictive runtime scaling, such as k-nearest neighbors. However, our implementation is
+optimized to the point of being multiple orders of magnitude faster than similar packages — see the [Benchmarks](#benchmarks) section for more information.
## Build Status
@@ -68,33 +71,99 @@ Some examples of how Sequentia can be used on sequence data include:
### Models
-The following models provided by Sequentia all support variable length sequences.
-
#### [Dynamic Time Warping + k-Nearest Neighbors](https://sequentia.readthedocs.io/en/latest/sections/models/knn/index.html) (via [`dtaidistance`](https://github.com/wannesm/dtaidistance))
+Dynamic Time Warping (DTW) is a distance measure that can be applied to two sequences of different length.
+When used as a distance measure for the k-Nearest Neighbors (kNN) algorithm this results in a simple yet
+effective inference algorithm.
+
- [x] Classification
- [x] Regression
+- [x] Variable length sequences
- [x] Multivariate real-valued observations
- [x] Sakoe–Chiba band global warping constraint
- [x] Dependent and independent feature warping (DTWD/DTWI)
- [x] Custom distance-weighted predictions
-- [x] Multi-processed predictions
+- [x] Multi-processed prediction
#### [Hidden Markov Models](https://sequentia.readthedocs.io/en/latest/sections/models/hmm/index.html) (via [`hmmlearn`](https://github.com/hmmlearn/hmmlearn))
-Parameter estimation with the Baum-Welch algorithm and prediction with the forward algorithm [[1]](#references)
+A Hidden Markov Model (HMM) is a state-based statistical model which represents a sequence as
+a series of observations that are emitted from a collection of latent hidden states which form
+an underlying Markov chain. Each hidden state has an emission distribution that models its observations.
+
+Expectation-maximization via the Baum-Welch algorithm (or forward-backward algorithm) [[1]](#references) is used to
+derive a maximum likelihood estimate of the Markov chain probabilities and emission distribution parameters
+based on the provided training sequence data.
- [x] Classification
-- [x] Multivariate real-valued observations (Gaussian mixture model emissions)
-- [x] Univariate categorical observations (discrete emissions)
+- [x] Variable length sequences
+- [x] Multivariate real-valued observations (modeled with Gaussian mixture emissions)
+- [x] Univariate categorical observations (modeled with discrete emissions)
- [x] Linear, left-right and ergodic topologies
-- [x] Multi-processed predictions
+- [x] Multi-processed training and prediction
### Scikit-Learn compatibility
-**Sequentia (≥2.0) is fully compatible with the Scikit-Learn API (≥1.4), enabling for rapid development and prototyping of sequential models.**
+**Sequentia (≥2.0) is compatible with the Scikit-Learn API (≥1.4), enabling rapid development and prototyping of sequential models.**
+
+The integration relies on the use of [metadata routing](https://scikit-learn.org/stable/metadata_routing.html),
+which means that in most cases, the only necessary change is to add a `lengths` key-word argument to provide
+sequence length information, e.g. `fit(X, y, lengths=lengths)` instead of `fit(X, y)`.
+
+### Similar libraries
+
+As DTW k-nearest neighbors is the core algorithm offered by Sequentia, below is a comparison of the DTW k-nearest neighbors algorithm features supported by Sequentia and similar libraries.
+
+||**`sequentia`**|[`aeon`](https://github.com/aeon-toolkit/aeon)|[`tslearn`](https://github.com/tslearn-team/tslearn)|[`sktime`](https://github.com/sktime/sktime)|[`pyts`](https://github.com/johannfaouzi/pyts)|
+|-|:-:|:-:|:-:|:-:|:-:|
+|Scikit-Learn compatible|✅|✅|✅|✅|✅|
+|Multivariate sequences|✅|✅|✅|✅|❌|
+|Variable length sequences|✅|✅|➖<sup>1</sup>|❌<sup>2</sup>|❌<sup>3</sup>|
+|No padding required|✅|❌|➖<sup>1</sup>|❌<sup>2</sup>|❌<sup>3</sup>|
+|Classification|✅|✅|✅|✅|✅|
+|Regression|✅|✅|✅|✅|❌|
+|Preprocessing|✅|✅|✅|✅|✅|
+|Multiprocessing|✅|✅|✅|✅|✅|
+|Custom weighting|✅|✅|✅|✅|✅|
+|Sakoe-Chiba band constraint|✅|✅|✅|✅|✅|
+|Itakura parallelogram constraint|❌|✅|✅|✅|✅|
+|Dependent DTW (DTWD)|✅|✅|✅|✅|❌|
+|Independent DTW (DTWI)|✅|❌|❌|❌|✅|
+|Custom DTW measures|❌<sup>4</sup>|✅|❌|✅|✅|
+
+- <sup>1</sup> `tslearn` supports variable length sequences with padding, but doesn't seem to mask the padding.
+- <sup>2</sup> `sktime` does not support variable length sequences, so they are padded (and padding is not masked).
+- <sup>3</sup> `pyts` does not support variable length sequences, so they are padded (and padding is not masked).
+- <sup>4</sup> `sequentia` only supports [`dtaidistance`](https://github.com/wannesm/dtaidistance), which is one of the fastest DTW libraries as it is written in C.
+
+### Benchmarks
+
+To compare the above libraries in runtime performance on dynamic time warping k-nearest neighbors classification tasks, a simple benchmark was performed on a univariate sequence dataset.
+
+The [Free Spoken Digit Dataset](https://sequentia.readthedocs.io/en/latest/sections/datasets/digits.html) was used for benchmarking and consists of:
+
+- 3000 recordings of 10 spoken digits (0-9)
+ - 50 recordings of each digit for each of 6 speakers
+ - 1500 used for training, 1500 used for testing (split via label stratification)
+- 13 features ([MFCCs](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum))
+ - Only the first feature was used as not all of the above libraries support multivariate sequences
+- Sequence length statistics: (min 6, median 17, max 92)
+
+Each result measures the total time taken to complete training and prediction repeated 10 times.
+
+All of the above libraries support multiprocessing, and prediction was performed using 16 workers.
-In most cases, the only necessary change is to add a `lengths` key-word argument to provide sequence length information, e.g. `fit(X, y, lengths=lengths)` instead of `fit(X, y)`.
+* : `sktime`, `tslearn` and `pyts` seem to not mask padding, which may result in incorrect predictions.
+
+
+
+> **Device information**:
+> - Product: Lenovo ThinkPad T14s (Gen 6)
+> - Processor: AMD Ryzen™ AI 7 PRO 360 (8 cores, 16 threads, 2-5GHz)
+> - Memory: 64 GB LPDDR5X-7500MHz
+> - Solid State Drive: 1 TB SSD M.2 2280 PCIe Gen4 Performance TLC Opal
+> - Operating system: Fedora Linux 41 (Workstation Edition)
## Installation
@@ -104,19 +173,21 @@ The latest stable version of Sequentia can be installed with the following comma
pip install sequentia
```
-### C library compilation
+### C libraries
-For optimal performance when using any of the k-NN based models, it is important that `dtaidistance` C libraries are compiled correctly.
+For optimal performance when using any of the k-NN based models, it is important that the correct `dtaidistance` C libraries are accessible.
-Please see the [`dtaidistance` installation guide](https://dtaidistance.readthedocs.io/en/latest/usage/installation.html) for troubleshooting if you run into C compilation issues, or if setting `use_c=True` on k-NN based models results in a warning.
+Please see the [`dtaidistance` installation guide](https://dtaidistance.readthedocs.io/en/latest/usage/installation.html) for troubleshooting if you run into C compilation issues, or if using k-NN based models with `use_c=True` results in a warning.
-You can use the following to check if the appropriate C libraries have been installed.
+You can use the following to check if the appropriate C libraries are available.
```python
from dtaidistance import dtw
dtw.try_import_c()
```
+If these libraries are unavailable, Sequentia will fall back to using a Python alternative.
+
### Development
Please see the [contribution guidelines](/CONTRIBUTING.md) to see installation instructions for contributing to Sequentia.
@@ -127,26 +198,25 @@ Documentation for the package is available on [Read The Docs](https://sequentia.
## Examples
-Demonstration of classifying multivariate sequences with two features into two classes using the `KNNClassifier`.
+Demonstration of classifying multivariate sequences into two classes using the `KNNClassifier`.
-This example also shows a typical preprocessing workflow, as well as compatibility with Scikit-Learn.
+This example also shows a typical preprocessing workflow, as well as compatibility with
+Scikit-Learn for pipelining and hyper-parameter optimization.
-```python
-import numpy as np
+---
-from sklearn.preprocessing import scale
-from sklearn.decomposition import PCA
-from sklearn.pipeline import Pipeline
+First, we create some sample multivariate input data consisting of three sequences with two features.
-from sequentia.models import KNNClassifier
-from sequentia.preprocessing import IndependentFunctionTransformer, median_filter
+- Sequentia expects sequences to be concatenated and represented as a single NumPy array.
+- Sequence lengths are provided separately and used to decode the sequences when needed.
+
+This avoids the need for complex structures such as lists of nested arrays with different lengths,
+or a 3D array with wasteful and annoying padding.
-# Create input data
-# - Sequentia expects sequences to be concatenated into a single array
-# - Sequence lengths are provided separately and used to decode the sequences when needed
-# - This avoids the need for complex structures such as lists of arrays with different lengths
+```python
+import numpy as np
-# Sequences
+# Sequence data
X = np.array([
# Sequence 1 - Length 3
[1.2 , 7.91],
@@ -168,12 +238,47 @@ lengths = np.array([3, 5, 2])
# Sequence classes
y = np.array([0, 1, 1])
+```
+
+With this data, we can train a `KNNClassifier` and use it for prediction and scoring.
+
+**Note**: Each of the `fit()`, `predict()` and `score()` methods requires the sequence lengths
+to be provided in addition to the sequence data `X` and labels `y`.
+
+```python
+from sequentia.models import KNNClassifier
+
+# Initialize and fit the classifier
+clf = KNNClassifier(k=1)
+clf.fit(X, y, lengths=lengths)
+
+# Make predictions based on the provided sequences
+y_pred = clf.predict(X, lengths=lengths)
+
+# Make predictions based on the provided sequences and calculate accuracy
+acc = clf.score(X, y, lengths=lengths)
+```
+
+Alternatively, we can use [`sklearn.pipeline.Pipeline`](https://scikit-learn.org/1.5/modules/generated/sklearn.pipeline.Pipeline.html) to build a more complex preprocessing pipeline:
+
+1. Individually denoise each sequence by applying a [median filter](https://sequentia.readthedocs.io/en/latest/sections/preprocessing/transforms/filters.html#sequentia.preprocessing.transforms.median_filter) to each sequence.
+2. Individually [standardize](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.scale.html) each sequence by subtracting the mean and dividing by the s.d. for each feature.
+3. Reduce the dimensionality of the data to a single feature by using [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html).
+4. Pass the resulting transformed data into a `KNNClassifier`.
+
+**Note**: Steps 1 and 2 use [`IndependentFunctionTransformer`](https://sequentia.readthedocs.io/en/latest/sections/preprocessing/transforms/function_transformer.html#sequentia.preprocessing.transforms.IndependentFunctionTransformer) provided by Sequentia to
+apply the specified transformation to each sequence in `X` individually, rather than using
+[`FunctionTransformer`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html#sklearn.preprocessing.FunctionTransformer) from Scikit-Learn which would transform the entire `X`
+array once, treating it as a single sequence.
-# Create a transformation pipeline that feeds into a KNNClassifier
-# 1. Individually denoise each sequence by applying a median filter for each feature
-# 2. Individually standardize each sequence by subtracting the mean and dividing the s.d. for each feature
-# 3. Reduce the dimensionality of the data to a single feature by using PCA
-# 4. Pass the resulting transformed data into a KNNClassifier
+```python
+from sklearn.preprocessing import scale
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+
+from sequentia.preprocessing import IndependentFunctionTransformer, median_filter
+
+# Create a preprocessing pipeline that feeds into a KNNClassifier
pipeline = Pipeline([
('denoise', IndependentFunctionTransformer(median_filter)),
('scale', IndependentFunctionTransformer(scale)),
@@ -181,14 +286,51 @@ pipeline = Pipeline([
('knn', KNNClassifier(k=1))
])
-# Fit the pipeline to the data - lengths must be provided
+# Fit the pipeline to the data
pipeline.fit(X, y, lengths=lengths)
-# Predict classes for the sequences and calculate accuracy - lengths must be provided
+# Predict classes for the sequences
y_pred = pipeline.predict(X, lengths=lengths)
+
+# Make predictions based on the provided sequences and calculate accuracy
acc = pipeline.score(X, y, lengths=lengths)
```
+For hyper-parameter optimization, Sequentia provides a `sequentia.model_selection` sub-package
+that includes most of the hyper-parameter search and cross-validation methods provided by
+[`sklearn.model_selection`](https://scikit-learn.org/stable/api/sklearn.model_selection.html),
+but adapted to work with sequences.
+
+For instance, we can perform a grid search with k-fold cross-validation stratifying over labels
+in order to find an optimal value for the number of neighbors in `KNNClassifier` for the
+above pipeline.
+
+```python
+from sequentia.model_selection import StratifiedKFold, GridSearchCV
+
+# Define hyper-parameter search and specify cross-validation method
+search = GridSearchCV(
+ # Re-use the above pipeline
+ estimator=Pipeline([
+ ('denoise', IndependentFunctionTransformer(median_filter)),
+ ('scale', IndependentFunctionTransformer(scale)),
+ ('pca', PCA(n_components=1)),
+ ('knn', KNNClassifier(k=1))
+ ]),
+ # Try a range of values of k
+ param_grid={"knn__k": [1, 2, 3, 4, 5]},
+ # Specify k-fold cross-validation with label stratification using 4 splits
+ cv=StratifiedKFold(n_splits=4),
+)
+
+# Perform cross-validation over accuracy and retrieve the best model
+search.fit(X, y, lengths=lengths)
+clf = search.best_estimator_
+
+# Make predictions using the best model and calculate accuracy
+acc = clf.score(X, y, lengths=lengths)
+```
+
## Acknowledgments
In earlier versions of the package, an approximate DTW implementation [`fastdtw`](https://github.com/slaypni/fastdtw) was used in hopes of speeding up k-NN predictions, as the authors of the original FastDTW paper [[2]](#references) claim that approximated DTW alignments can be computed in linear memory and time, compared to the O(N2 ) runtime complexity of the usual exact DTW implementation.
@@ -262,12 +404,12 @@ All contributions to this repository are greatly appreciated. Contribution guide
Sequentia is released under the [MIT](https://opensource.org/licenses/MIT) license.
-Certain parts of the source code are heavily adapted from [Scikit-Learn](scikit-learn.org/).
+Certain parts of the source code are heavily adapted from [Scikit-Learn](https://scikit-learn.org/).
Such files contain a copy of [their license](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING).
---
- Sequentia © 2019-2025, Edwin Onuonga - Released under the MIT license.
+ Sequentia © 2019, Edwin Onuonga - Released under the MIT license.
Authored and maintained by Edwin Onuonga.
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 0000000..f8f49c1
--- /dev/null
+++ b/benchmarks/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Collection of runtime benchmarks for Python packages
+providing dynamic time warping k-nearest neighbors algorithms.
+"""
diff --git a/benchmarks/benchmark.svg b/benchmarks/benchmark.svg
new file mode 100644
index 0000000..3f9a775
--- /dev/null
+++ b/benchmarks/benchmark.svg
@@ -0,0 +1,1621 @@
+
+
+
+
+
+
+
+ 2024-12-24T17:13:37.655962
+ image/svg+xml
+
+
+ Matplotlib v3.10.0, https://matplotlib.org/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/benchmarks/plot.ipynb b/benchmarks/plot.ipynb
new file mode 100644
index 0000000..0642d70
--- /dev/null
+++ b/benchmarks/plot.ipynb
@@ -0,0 +1,106 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "ed902379-677e-4c90-aa1c-95ef9dbb1d11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "plt.style.use(\"ggplot\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6649bf2d-7430-401d-8113-f3c1e1cf4779",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "fig, ax = plt.subplots(figsize=(8, 4))\n",
+ "\n",
+ "runtimes = [31.871, 828.855, 887.367, 1210.012, 2778.706]\n",
+ "labels = [\"sequentia\", \"sktime*\", \"aeon\", \"tslearn*\", \"pyts*\"]\n",
+ "\n",
+ "bars = ax.bar(labels, runtimes, width=0.5, color=\"C1\")\n",
+ "ax.set(xlabel=\"Package\", ylabel=\"Runtime (s)\")\n",
+ "ax.set_title(\n",
+ " (\n",
+ " \"Univariate DTW-kNN performance \"\n",
+ " \"(1,500 FSDD train/test sequences, 16 workers)\"\n",
+ " ),\n",
+ " fontsize=11,\n",
+ ")\n",
+ "\n",
+ "\n",
+ "def fmt(s: float) -> str:\n",
+ " \"\"\"Formats the runtime.\"\"\"\n",
+ " if s < 60:\n",
+ " return f\"{round(s)}s\"\n",
+ " m, s = divmod(s, 60)\n",
+ " return f\"{round(m)}m {round(s)}s\"\n",
+ "\n",
+ "\n",
+ "for bar in bars:\n",
+ " plt.text(\n",
+ " bar.get_x() + bar.get_width() / 2,\n",
+ " bar.get_height(),\n",
+ " fmt(bar.get_height()),\n",
+ " ha=\"center\",\n",
+ " va=\"bottom\",\n",
+ " fontsize=9,\n",
+ " )\n",
+ "\n",
+ "for lab in ax.get_xticklabels():\n",
+ " if lab.get_text() == \"sequentia\":\n",
+ " lab.set_fontweight(\"bold\")\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.savefig(\"benchmark.svg\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "07aeb22f-d8be-4759-9012-1a3e9479343a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt
new file mode 100644
index 0000000..1353452
--- /dev/null
+++ b/benchmarks/requirements.txt
@@ -0,0 +1,6 @@
+# python==3.12.8
+sequentia==2.1.0
+aeon==1.0.0
+tslearn==0.6.3
+sktime==0.35.0
+pyts==0.13.0
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
new file mode 100755
index 0000000..ed732a8
--- /dev/null
+++ b/benchmarks/run.sh
@@ -0,0 +1,19 @@
+echo "sequentia"
+python test_sequentia.py --n-jobs 16 --number 10
+echo
+
+echo "aeon"
+python test_aeon.py --n-jobs 16 --number 10
+echo
+
+echo "tslearn"
+python test_tslearn.py --n-jobs 16 --number 10
+echo
+
+echo "sktime"
+python test_sktime.py --n-jobs 16 --number 10
+echo
+
+echo "pyts"
+python test_pyts.py --n-jobs 16 --number 10
+echo
diff --git a/benchmarks/test_aeon.py b/benchmarks/test_aeon.py
new file mode 100644
index 0000000..a03f13e
--- /dev/null
+++ b/benchmarks/test_aeon.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Runtime benchmarks for aeon's dynamic time warping
+k-nearest neighbors algorithm.
+"""
+
+from __future__ import annotations
+
+import timeit
+import typing as t
+
+import numpy as np
+from aeon.classification.distance_based import KNeighborsTimeSeriesClassifier
+from aeon.transformations.collection import Padder
+from dtaidistance import dtw_ndim
+from utils import load_dataset
+
+from sequentia.datasets.base import SequentialDataset
+
+np.random.seed(0)
+random_state: np.random.RandomState = np.random.RandomState(0)
+
+DataSplit: t.TypeAlias = tuple[np.ndarray, np.ndarray]
+
+
+def distance(s1: np.ndarray, s2: np.ndarray) -> float:
+    """DTW distance between two sequences via dtaidistance (C backend) - not used."""
+    # sequences are transposed in prepare(); transpose them back for dtaidistance
+    return dtw_ndim.distance(s1.T, s2.T, use_c=True)
+
+
+def prepare(data: SequentialDataset) -> DataSplit:
+ """Prepare the dataset - padding."""
+ # transpose sequences and pad
+ X = [x.T for x, _ in data]
+ padder = Padder()
+ X_pad = padder.fit_transform(X)
+ # X_pad = X_pad.astype("float64")
+ return X_pad, data.y
+
+
+def run(*, train_data: DataSplit, test_data: DataSplit, n_jobs: int) -> None:
+ """Fit and predict the classifier."""
+ # initialize model
+ clf = KNeighborsTimeSeriesClassifier(
+ n_neighbors=1,
+ n_jobs=n_jobs,
+ distance="dtw",
+ # distance=distance,
+ )
+
+ # fit model
+ X_train, y_train = train_data
+ clf.fit(X_train, y_train)
+
+ # predict model
+ X_test, _ = test_data
+ clf.predict(X_test)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser: argparse.ArgumentParser = argparse.ArgumentParser()
+ parser.add_argument("--n-jobs", type=int, default=1)
+ parser.add_argument("--number", type=int, default=10)
+ args: argparse.Namespace = parser.parse_args()
+
+ train_data, test_data = load_dataset(multivariate=False)
+ train_data, test_data = prepare(train_data), prepare(test_data)
+
+ benchmark = timeit.timeit(
+ "run(train_data=train_data, test_data=test_data, n_jobs=args.n_jobs)",
+ globals=locals(),
+ number=args.number,
+ )
+
+ print(args) # noqa: T201
+ print(f"{benchmark:.3f}s") # noqa: T201
diff --git a/benchmarks/test_pyts.py b/benchmarks/test_pyts.py
new file mode 100644
index 0000000..09a3d96
--- /dev/null
+++ b/benchmarks/test_pyts.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Runtime benchmarks for pyts's dynamic time warping
+k-nearest neighbors algorithm.
+"""
+
+from __future__ import annotations
+
+import timeit
+import typing as t
+
+import numpy as np
+from aeon.transformations.collection import Padder
+from pyts.classification import KNeighborsClassifier
+from utils import load_dataset
+
+from sequentia.datasets.base import SequentialDataset
+
+np.random.seed(0)
+random_state: np.random.RandomState = np.random.RandomState(0)
+
+DataSplit: t.TypeAlias = tuple[np.ndarray, np.ndarray]
+
+
+def prepare(data: SequentialDataset, length: int) -> DataSplit:
+ """Prepare the dataset - pad and flatten."""
+ # transpose sequences and pad
+ X = [x.T for x, _ in data]
+ padder = Padder(pad_length=length)
+ X_pad = padder.fit_transform(X)
+ return X_pad[:, 0], data.y
+
+
+def run(*, train_data: DataSplit, test_data: DataSplit, n_jobs: int) -> None:
+ """Fit and predict the classifier."""
+ # initialize model
+ clf = KNeighborsClassifier(
+ n_neighbors=1,
+ n_jobs=n_jobs,
+ metric="dtw",
+ )
+
+ # fit model
+ X_train, y_train = train_data
+ clf.fit(X_train, y_train)
+
+ # predict model
+ X_test, _ = test_data
+ clf.predict(X_test)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser: argparse.ArgumentParser = argparse.ArgumentParser()
+ parser.add_argument("--n-jobs", type=int, default=1)
+ parser.add_argument("--number", type=int, default=10)
+ args: argparse.Namespace = parser.parse_args()
+
+ train_data, test_data = load_dataset(multivariate=False)
+ length = max(train_data.lengths.max(), test_data.lengths.max())
+ train_data, test_data = (
+ prepare(train_data, length=length),
+ prepare(test_data, length=length),
+ )
+
+ benchmark = timeit.timeit(
+ "run(train_data=train_data, test_data=test_data, n_jobs=args.n_jobs)",
+ globals=locals(),
+ number=args.number,
+ )
+
+ print(args) # noqa: T201
+ print(f"{benchmark:.3f}s") # noqa: T201
diff --git a/benchmarks/test_sequentia.py b/benchmarks/test_sequentia.py
new file mode 100644
index 0000000..521d222
--- /dev/null
+++ b/benchmarks/test_sequentia.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Runtime benchmarks for sequentia's dynamic time warping
+k-nearest neighbors algorithm.
+"""
+
+from __future__ import annotations
+
+import timeit
+
+import numpy as np
+from utils import load_dataset
+
+import sequentia
+from sequentia.datasets.base import SequentialDataset
+
+np.random.seed(0)
+random_state: np.random.RandomState = np.random.RandomState(0)
+
+
+def run(
+ *, train_data: SequentialDataset, test_data: SequentialDataset, n_jobs: int
+) -> None:
+ """Fit and predict the classifier."""
+ # initialize model
+ clf = sequentia.models.KNNClassifier(
+ k=1,
+ use_c=True,
+ n_jobs=n_jobs,
+ random_state=random_state,
+ classes=train_data.classes,
+ )
+
+ # fit model
+ clf.fit(X=train_data.X, y=train_data.y, lengths=train_data.lengths)
+
+ # predict model
+ clf.predict(X=test_data.X, lengths=test_data.lengths)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser: argparse.ArgumentParser = argparse.ArgumentParser()
+ parser.add_argument("--n-jobs", type=int, default=1)
+ parser.add_argument("--number", type=int, default=10)
+ args: argparse.Namespace = parser.parse_args()
+
+ train_data, test_data = load_dataset(multivariate=False)
+
+ benchmark = timeit.timeit(
+ "run(train_data=train_data, test_data=test_data, n_jobs=args.n_jobs)",
+ globals=locals(),
+ number=args.number,
+ )
+
+ print(args) # noqa: T201
+ print(f"{benchmark:.3f}s") # noqa: T201
diff --git a/benchmarks/test_sktime.py b/benchmarks/test_sktime.py
new file mode 100644
index 0000000..7fc5297
--- /dev/null
+++ b/benchmarks/test_sktime.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Runtime benchmarks for sktime's dynamic time warping
+k-nearest neighbors algorithm.
+"""
+
+from __future__ import annotations
+
+import timeit
+import typing as t
+
+import numpy as np
+import pandas as pd
+from dtaidistance import dtw_ndim
+from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
+from utils import load_dataset
+
+from sequentia.datasets.base import SequentialDataset
+
+np.random.seed(0)
+random_state: np.random.RandomState = np.random.RandomState(0)
+
+DataSplit: t.TypeAlias = tuple[pd.DataFrame, np.ndarray]
+
+
+def distance(s1: pd.Series, s2: pd.Series) -> np.ndarray:
+    """Pairwise DTW distance matrix via DTAIDistance - not used."""
+    left, right = s1.droplevel(1), s2.droplevel(1)
+    n_rows, n_cols = left.index.max() + 1, right.index.max() + 1
+    out = np.zeros((n_rows, n_cols))
+    for row in range(n_rows):
+        # drop leading/trailing zeros before comparing
+        a = np.trim_zeros(left.loc[row].to_numpy(dtype=np.float64))
+        for col in range(n_cols):
+            b = np.trim_zeros(right.loc[col].to_numpy(dtype=np.float64))
+            out[row, col] = dtw_ndim.distance(a, b, use_c=True)
+    return out
+
+
+def pad(x: np.ndarray, length: int) -> np.ndarray:
+    """Append all-zero rows to a 2D ``x`` until it has ``length`` rows."""
+    return np.concatenate((x, np.zeros((length - len(x), x.shape[-1]))))
+
+
+def prepare(data: SequentialDataset) -> DataSplit:
+    """Zero-pad every sequence to the longest one in ``data`` and stack
+    them into a single multi-indexed pandas object.
+    """
+    # sequences are padded to this split's own maximum length
+    target = data.lengths.max()
+    frames = [pd.DataFrame(pad(seq, length=target)) for seq, _ in data]
+    stacked = pd.concat(frames, keys=range(len(frames)), axis=0)
+    return stacked, data.y
+
+
+def run(*, train_data: DataSplit, test_data: DataSplit, n_jobs: int) -> None:
+    """Fit sktime's 1-NN DTW classifier on one split, predict the other."""
+    X_train, y_train = train_data
+    # configure the classifier
+    model = KNeighborsTimeSeriesClassifier(
+        distance="dtw",
+        # distance=distance,
+        n_jobs=n_jobs,
+        n_neighbors=1,
+    )
+
+    # train on the training split
+    model.fit(X_train, y_train)
+
+    # predict the test split (its labels are unused)
+    X_test, _ = test_data
+    model.predict(X_test)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser: argparse.ArgumentParser = argparse.ArgumentParser()
+ parser.add_argument("--n-jobs", type=int, default=1)  # passed to run()
+ parser.add_argument("--number", type=int, default=10)  # timeit repetitions
+ args: argparse.Namespace = parser.parse_args()
+
+ train_data, test_data = load_dataset(multivariate=False)
+ train_data, test_data = prepare(train_data), prepare(test_data)
+
+ benchmark = timeit.timeit(  # total seconds for all runs
+ "run(train_data=train_data, test_data=test_data, n_jobs=args.n_jobs)",
+ globals=locals(),  # names used by the statement string
+ number=args.number,
+ )
+
+ print(args) # noqa: T201
+ print(f"{benchmark:.3f}s") # noqa: T201
diff --git a/benchmarks/test_tslearn.py b/benchmarks/test_tslearn.py
new file mode 100644
index 0000000..b8e0306
--- /dev/null
+++ b/benchmarks/test_tslearn.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Runtime benchmarks for tslearn's dynamic time warping
+k-nearest neighbors algorithm.
+"""
+
+from __future__ import annotations
+
+import timeit
+import typing as t
+
+import numpy as np
+from aeon.transformations.collection import Padder
+from dtaidistance import dtw_ndim
+from tslearn.neighbors import KNeighborsTimeSeriesClassifier
+from utils import load_dataset
+
+from sequentia.datasets.base import SequentialDataset
+
+np.random.seed(0)
+random_state: np.random.RandomState = np.random.RandomState(0)
+
+DataSplit: t.TypeAlias = tuple[np.ndarray, np.ndarray]
+
+
+def distance(s1: np.ndarray, s2: np.ndarray) -> float:
+ """DTW distance between ``s1`` and ``s2`` via DTAIDistance - not used."""
+ return dtw_ndim.distance(s1, s2, use_c=True)
+
+
+def prepare(data: SequentialDataset, length: int) -> DataSplit:
+    """Transpose each sequence and pad the collection to ``length``."""
+    # transpose every sequence before handing it to the padder
+    transposed = [seq.T for seq, _ in data]
+    # padded values are kept as-is - zeros/nans are not ignored (!!!)
+    padded = Padder(pad_length=length).fit_transform(transposed)
+    # padded[(padded == 0).all(axis=1, keepdims=True)] = np.nan
+    return padded, data.y
+
+
+def run(*, train_data: DataSplit, test_data: DataSplit, n_jobs: int) -> None:
+    """Fit tslearn's 1-NN classifier on one split, predict the other."""
+    X_train, y_train = train_data
+    # configure the classifier
+    model = KNeighborsTimeSeriesClassifier(
+        n_jobs=n_jobs,
+        n_neighbors=1,
+    )
+
+    # train on the training split
+    model.fit(X_train, y_train)
+
+    # predict the test split (its labels are unused)
+    X_test, _ = test_data
+    model.predict(X_test)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser: argparse.ArgumentParser = argparse.ArgumentParser()
+ parser.add_argument("--n-jobs", type=int, default=1)  # passed to run()
+ parser.add_argument("--number", type=int, default=10)  # timeit repetitions
+ args: argparse.Namespace = parser.parse_args()
+
+ train_data, test_data = load_dataset(multivariate=False)
+ length = max(train_data.lengths.max(), test_data.lengths.max())
+ train_data, test_data = (  # both splits use the same pad length
+ prepare(train_data, length=length),
+ prepare(test_data, length=length),
+ )
+
+ benchmark = timeit.timeit(  # total seconds for all runs
+ "run(train_data=train_data, test_data=test_data, n_jobs=args.n_jobs)",
+ globals=locals(),  # names used by the statement string
+ number=args.number,
+ )
+
+ print(args) # noqa: T201
+ print(f"{benchmark:.3f}s") # noqa: T201
diff --git a/benchmarks/utils.py b/benchmarks/utils.py
new file mode 100644
index 0000000..7a52713
--- /dev/null
+++ b/benchmarks/utils.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Utilities for benchmarking."""
+
+from __future__ import annotations
+
+import numpy as np
+
+from sequentia.datasets.base import SequentialDataset
+from sequentia.datasets.digits import load_digits
+
+__all__ = ["load_dataset"]
+
+np.random.seed(0)
+random_state: np.random.RandomState = np.random.RandomState(0)
+
+
+def load_dataset(
+    *, multivariate: bool
+) -> tuple[SequentialDataset, SequentialDataset]:
+    """Load the Free Spoken Digit Dataset as a stratified 50/50 split.
+
+    Parameters
+    ----------
+    multivariate:
+        Whether to return the splits untransformed. If ``False``, the
+        values along the last axis of ``X`` are averaged into one column.
+
+    Returns
+    -------
+    tuple[SequentialDataset, SequentialDataset]
+        Training and test splits, in that order.
+    """
+    # shuffled, stratified split driven by the module-level random state
+    train_data, test_data = load_digits().split(
+        test_size=0.5,
+        random_state=random_state,
+        shuffle=True,
+        stratify=True,
+    )
+
+    # multivariate: hand the splits back untouched
+    if multivariate:
+        return train_data, test_data
+
+    # univariate: collapse the last axis of each split to a single column
+    def flatten(split: SequentialDataset) -> SequentialDataset:
+        """Rebuild ``split`` with ``X`` averaged over its last axis."""
+        return SequentialDataset(
+            X=split.X.mean(axis=-1, keepdims=True),
+            y=split.y,
+            lengths=split.lengths,
+            classes=split.classes,
+        )
+
+    # apply the same reduction to both splits
+    return flatten(train_data), flatten(test_data)
diff --git a/docs/source/__init__.py b/docs/source/__init__.py
index 1c08549..b49759a 100644
--- a/docs/source/__init__.py
+++ b/docs/source/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/docs/source/_static/css/toc.css b/docs/source/_static/css/toc.css
index 3a8238c..d08fe3f 100644
--- a/docs/source/_static/css/toc.css
+++ b/docs/source/_static/css/toc.css
@@ -1,9 +1,7 @@
-/* Adds overflow to the Table of Contents on the side bar */
-div[aria-label="main navigation"] div.sphinxsidebarwrapper div:first-child {
+div.sphinxsidebarwrapper {
overflow-x: auto;
}
-/* Hides any API reference lists in the Table of Contents */
-div[aria-label="main navigation"] div.sphinxsidebarwrapper div:first-child a[href="#api-reference"] + ul {
+div.sphinxsidebarwrapper a[href="#definitions"] + ul > li > ul {
display: none;
-}
\ No newline at end of file
+}
diff --git a/docs/source/conf.py b/docs/source/conf.py
index deb2e21..1e0f753 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -21,9 +21,9 @@
# -- Project information -----------------------------------------------------
project = "sequentia"
-copyright = "2019-2025, Sequentia Developers" # noqa: A001
+copyright = "2019, Sequentia Developers" # noqa: A001
author = "Edwin Onuonga (eonu)"
-release = "2.0.2"
+release = "2.5.0"
# -- General configuration ---------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 961fcaf..d8fc7b2 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -42,6 +42,7 @@ Features
sections/models/index
sections/preprocessing/index
+ sections/model_selection/index
sections/datasets/index
sections/configuration
diff --git a/docs/source/sections/configuration.rst b/docs/source/sections/configuration.rst
index 62d1e9c..755269e 100644
--- a/docs/source/sections/configuration.rst
+++ b/docs/source/sections/configuration.rst
@@ -13,7 +13,10 @@ API Reference
~sequentia.enums.TopologyMode
~sequentia.enums.TransitionMode
-|
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
.. automodule:: sequentia.enums
:members:
diff --git a/docs/source/sections/datasets/digits.rst b/docs/source/sections/datasets/digits.rst
index 9206723..dc56611 100644
--- a/docs/source/sections/datasets/digits.rst
+++ b/docs/source/sections/datasets/digits.rst
@@ -4,4 +4,9 @@ Digits
API reference
-------------
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
+
.. autofunction:: sequentia.datasets.load_digits
diff --git a/docs/source/sections/datasets/gene_families.rst b/docs/source/sections/datasets/gene_families.rst
index 77add39..87c4979 100644
--- a/docs/source/sections/datasets/gene_families.rst
+++ b/docs/source/sections/datasets/gene_families.rst
@@ -4,4 +4,9 @@ Gene Families
API reference
-------------
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
+
.. autofunction:: sequentia.datasets.load_gene_families
diff --git a/docs/source/sections/datasets/index.rst b/docs/source/sections/datasets/index.rst
index 29cf5cd..90c17ad 100644
--- a/docs/source/sections/datasets/index.rst
+++ b/docs/source/sections/datasets/index.rst
@@ -49,7 +49,10 @@ Properties
~sequentia.datasets.base.SequentialDataset.lengths
~sequentia.datasets.base.SequentialDataset.y
-|
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
.. autoclass:: sequentia.datasets.base.SequentialDataset
:members:
diff --git a/docs/source/sections/model_selection/index.rst b/docs/source/sections/model_selection/index.rst
new file mode 100644
index 0000000..e61aeb5
--- /dev/null
+++ b/docs/source/sections/model_selection/index.rst
@@ -0,0 +1,20 @@
+Model Selection
+===============
+
+.. toctree::
+ :titlesonly:
+
+ searching.rst
+ splitting.rst
+
+----
+
+For validating models and performing hyper-parameter selection, it is common
+to use cross-validation methods such as those in :mod:`sklearn.model_selection`.
+
+Although :mod:`sklearn.model_selection` is partially compatible with Sequentia,
+we define our own wrapped versions of certain classes and functions to allow
+support for sequences.
+
+- :ref:`searching` defines methods for searching hyper-parameter spaces in different ways, such as :class:`sequentia.model_selection.GridSearchCV`.
+- :ref:`splitting` defines methods for partitioning data into training/validation splits for cross-validation, such as :class:`sequentia.model_selection.KFold`.
diff --git a/docs/source/sections/model_selection/searching.rst b/docs/source/sections/model_selection/searching.rst
new file mode 100644
index 0000000..e5b6635
--- /dev/null
+++ b/docs/source/sections/model_selection/searching.rst
@@ -0,0 +1,101 @@
+.. _searching:
+
+Hyper-parameter search methods
+==============================
+
+In order to optimize the hyper-parameters for a specific model,
+hyper-parameter search methods are used (often in conjunction with
+:ref:`cross-validation methods <splitting>`) to evaluate the performance of a model
+with different configurations and find the optimal settings.
+
+:mod:`sklearn.model_selection` provides such hyper-parameter search methods,
+but does not support sequence data. Sequentia provides modified
+versions of these methods to support sequence data.
+
+API reference
+-------------
+
+Classes/Methods
+^^^^^^^^^^^^^^^
+
+.. autosummary::
+
+ ~sequentia.model_selection.param_grid
+ ~sequentia.model_selection.GridSearchCV
+ ~sequentia.model_selection.RandomizedSearchCV
+ ~sequentia.model_selection.HalvingGridSearchCV
+ ~sequentia.model_selection.HalvingRandomSearchCV
+
+Example
+^^^^^^^
+
+Using :class:`.GridSearchCV` with :class:`.StratifiedKFold` to
+cross-validate a :class:`.KNNClassifier` training pipeline. ::
+
+ import numpy as np
+
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import minmax_scale
+
+ from sequentia.datasets import load_digits
+ from sequentia.models import KNNClassifier
+ from sequentia.preprocessing import IndependentFunctionTransformer
+ from sequentia.model_selection import StratifiedKFold, GridSearchCV
+
+ EPS: np.float32 = np.finfo(np.float32).eps
+
+ # Define model and hyper-parameter search space
+ search = GridSearchCV(
+ # Create a basic pipeline with a KNNClassifier to be optimized
+ estimator=Pipeline(
+ [
+ ("scale", IndependentFunctionTransformer(minmax_scale)),
+ ("clf", KNNClassifier(use_c=True, n_jobs=-1))
+ ]
+ ),
+ # Optimize over k, weighting function and window size
+ param_grid={
+ "clf__k": [1, 2, 3, 4, 5],
+ "clf__weighting": [
+ None, lambda x: 1 / (x + EPS), lambda x: np.exp(-x)
+ ],
+ "clf__window": [1.0, 0.75, 0.5, 0.25, 0.1],
+ },
+ # Use StratifiedKFold cross-validation
+ cv=StratifiedKFold(),
+ n_jobs=-1,
+ )
+
+ # Load the spoken digit dataset with a train/test set split
+ data = load_digits()
+ train_data, test_data = data.split(test_size=0.2, stratify=True)
+
+ # Perform cross-validation over accuracy and retrieve the best model
+ search.fit(train_data.X, train_data.y, lengths=train_data.lengths)
+ clf = search.best_estimator_
+
+ # Calculate accuracy on the test set split
+ acc = clf.score(test_data.X, test_data.y, lengths=test_data.lengths)
+
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
+
+.. autofunction:: sequentia.model_selection.param_grid
+
+.. autoclass:: sequentia.model_selection.GridSearchCV
+ :members: __init__
+ :exclude-members: __new__
+
+.. autoclass:: sequentia.model_selection.RandomizedSearchCV
+ :members: __init__
+ :exclude-members: __new__
+
+.. autoclass:: sequentia.model_selection.HalvingGridSearchCV
+ :members: __init__
+ :exclude-members: __new__
+
+.. autoclass:: sequentia.model_selection.HalvingRandomSearchCV
+ :members: __init__
+ :exclude-members: __new__
\ No newline at end of file
diff --git a/docs/source/sections/model_selection/splitting.rst b/docs/source/sections/model_selection/splitting.rst
new file mode 100644
index 0000000..f2a8d9d
--- /dev/null
+++ b/docs/source/sections/model_selection/splitting.rst
@@ -0,0 +1,114 @@
+.. _splitting:
+
+Cross-validation splitting methods
+==================================
+
+During cross-validation, a dataset is divided into splits for training and validation.
+
+This can either be done using a single basic split, or alternatively via successive
+*folds* which re-use parts of the dataset for different splits.
+
+:mod:`sklearn.model_selection` provides such cross-validation splitting methods,
+but does not support sequence data. Sequentia provides modified
+versions of these methods to support sequence data.
+
+API reference
+-------------
+
+Classes
+^^^^^^^
+
+.. autosummary::
+
+ ~sequentia.model_selection.KFold
+ ~sequentia.model_selection.StratifiedKFold
+ ~sequentia.model_selection.ShuffleSplit
+ ~sequentia.model_selection.StratifiedShuffleSplit
+ ~sequentia.model_selection.RepeatedKFold
+ ~sequentia.model_selection.RepeatedStratifiedKFold
+
+Example
+^^^^^^^
+
+Using :class:`.GridSearchCV` with :class:`.StratifiedKFold` to
+cross-validate a :class:`.KNNClassifier` training pipeline. ::
+
+ import numpy as np
+
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import minmax_scale
+
+ from sequentia.datasets import load_digits
+ from sequentia.models import KNNClassifier
+ from sequentia.preprocessing import IndependentFunctionTransformer
+ from sequentia.model_selection import StratifiedKFold, GridSearchCV
+
+ EPS: np.float32 = np.finfo(np.float32).eps
+
+ # Define model and hyper-parameter search space
+ search = GridSearchCV(
+ # Create a basic pipeline with a KNNClassifier to be optimized
+ estimator=Pipeline(
+ [
+ ("scale", IndependentFunctionTransformer(minmax_scale)),
+ ("clf", KNNClassifier(use_c=True, n_jobs=-1))
+ ]
+ ),
+ # Optimize over k, weighting function and window size
+ param_grid={
+ "clf__k": [1, 2, 3, 4, 5],
+ "clf__weighting": [
+ None, lambda x: 1 / (x + EPS), lambda x: np.exp(-x)
+ ],
+ "clf__window": [1.0, 0.75, 0.5, 0.25, 0.1],
+ },
+ # Use StratifiedKFold cross-validation
+ cv=StratifiedKFold(),
+ n_jobs=-1,
+ )
+
+ # Load the spoken digit dataset with a train/test set split
+ data = load_digits()
+ train_data, test_data = data.split(test_size=0.2, stratify=True)
+
+ # Perform cross-validation over accuracy and retrieve the best model
+ search.fit(train_data.X, train_data.y, lengths=train_data.lengths)
+ clf = search.best_estimator_
+
+ # Calculate accuracy on the test set split
+ acc = clf.score(test_data.X, test_data.y, lengths=test_data.lengths)
+
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
+
+.. autoclass:: sequentia.model_selection.KFold
+ :members:
+ :inherited-members:
+ :exclude-members: get_metadata_routing, get_n_splits, split
+
+.. autoclass:: sequentia.model_selection.StratifiedKFold
+ :members:
+ :inherited-members:
+ :exclude-members: get_metadata_routing, get_n_splits, split
+
+.. autoclass:: sequentia.model_selection.ShuffleSplit
+ :members:
+ :inherited-members:
+ :exclude-members: get_metadata_routing, get_n_splits, split
+
+.. autoclass:: sequentia.model_selection.StratifiedShuffleSplit
+ :members:
+ :inherited-members:
+ :exclude-members: get_metadata_routing, get_n_splits, split
+
+.. autoclass:: sequentia.model_selection.RepeatedKFold
+ :members:
+ :inherited-members:
+ :exclude-members: get_metadata_routing, get_n_splits, split
+
+.. autoclass:: sequentia.model_selection.RepeatedStratifiedKFold
+ :members:
+ :inherited-members:
+ :exclude-members: get_metadata_routing, get_n_splits, split
diff --git a/docs/source/sections/models/hmm/classifier.rst b/docs/source/sections/models/hmm/classifier.rst
index a94a087..bc3d2ee 100644
--- a/docs/source/sections/models/hmm/classifier.rst
+++ b/docs/source/sections/models/hmm/classifier.rst
@@ -62,7 +62,10 @@ Methods
~sequentia.models.hmm.classifier.HMMClassifier.save
~sequentia.models.hmm.classifier.HMMClassifier.score
-|
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
.. autoclass:: sequentia.models.hmm.classifier.HMMClassifier
:members:
diff --git a/docs/source/sections/models/hmm/variants/categorical.rst b/docs/source/sections/models/hmm/variants/categorical.rst
index e746af8..a028ce1 100644
--- a/docs/source/sections/models/hmm/variants/categorical.rst
+++ b/docs/source/sections/models/hmm/variants/categorical.rst
@@ -62,7 +62,10 @@ Methods
~sequentia.models.hmm.variants.CategoricalHMM.unfreeze
~sequentia.models.hmm.variants.CategoricalHMM.n_params
-|
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
.. autoclass:: sequentia.models.hmm.variants.CategoricalHMM
:members:
diff --git a/docs/source/sections/models/hmm/variants/gaussian_mixture.rst b/docs/source/sections/models/hmm/variants/gaussian_mixture.rst
index bc322e6..36b9be5 100644
--- a/docs/source/sections/models/hmm/variants/gaussian_mixture.rst
+++ b/docs/source/sections/models/hmm/variants/gaussian_mixture.rst
@@ -73,7 +73,10 @@ Methods
~sequentia.models.hmm.variants.GaussianMixtureHMM.unfreeze
~sequentia.models.hmm.variants.GaussianMixtureHMM.n_params
-|
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
.. autoclass:: sequentia.models.hmm.variants.GaussianMixtureHMM
:members:
diff --git a/docs/source/sections/models/index.rst b/docs/source/sections/models/index.rst
index 2b9708e..ba03888 100644
--- a/docs/source/sections/models/index.rst
+++ b/docs/source/sections/models/index.rst
@@ -16,9 +16,9 @@ The following models provided by Sequentia all support variable length sequences
| | | | +----------+------------+
| | | | | Training | Prediction |
+=========================+==============================+================+===============+==============+==========+============+
-| :class:`.HMMClassifier` | :class:`.GaussianMixtureHMM` | Classification | Real | ✔ | ✗ | ✔ |
+| :class:`.HMMClassifier` | :class:`.GaussianMixtureHMM` | Classification | Real | ✔ | ✔ | ✔ |
| +------------------------------+----------------+---------------+--------------+----------+------------+
-| | :class:`.CategoricalHMM` | Classification | Categorical | ✗ | ✗ | ✔ |
+| | :class:`.CategoricalHMM` | Classification | Categorical | ✗ | ✔ | ✔ |
+-------------------------+------------------------------+----------------+---------------+--------------+----------+------------+
| :class:`.KNNRegressor` | Regression | Real | ✔ | N/A | ✔ |
+--------------------------------------------------------+----------------+---------------+--------------+----------+------------+
diff --git a/docs/source/sections/models/knn/classifier.rst b/docs/source/sections/models/knn/classifier.rst
index 906fa3b..42fdeff 100644
--- a/docs/source/sections/models/knn/classifier.rst
+++ b/docs/source/sections/models/knn/classifier.rst
@@ -47,7 +47,10 @@ Methods
~sequentia.models.knn.classifier.KNNClassifier.save
~sequentia.models.knn.classifier.KNNClassifier.score
-|
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
.. autoclass:: sequentia.models.knn.classifier.KNNClassifier
:members:
diff --git a/docs/source/sections/models/knn/regressor.rst b/docs/source/sections/models/knn/regressor.rst
index 2e1926f..f5aa9d5 100644
--- a/docs/source/sections/models/knn/regressor.rst
+++ b/docs/source/sections/models/knn/regressor.rst
@@ -48,7 +48,10 @@ Methods
~sequentia.models.knn.regressor.KNNRegressor.save
~sequentia.models.knn.regressor.KNNRegressor.score
-|
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
.. autoclass:: sequentia.models.knn.regressor.KNNRegressor
:members:
diff --git a/docs/source/sections/preprocessing/transforms/filters.rst b/docs/source/sections/preprocessing/transforms/filters.rst
index ccb6a27..75459f7 100644
--- a/docs/source/sections/preprocessing/transforms/filters.rst
+++ b/docs/source/sections/preprocessing/transforms/filters.rst
@@ -21,7 +21,10 @@ Methods
~sequentia.preprocessing.transforms.mean_filter
~sequentia.preprocessing.transforms.median_filter
-|
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
.. autofunction:: sequentia.preprocessing.transforms.mean_filter
.. autofunction:: sequentia.preprocessing.transforms.median_filter
diff --git a/docs/source/sections/preprocessing/transforms/function_transformer.rst b/docs/source/sections/preprocessing/transforms/function_transformer.rst
index 0fe8954..1b23691 100644
--- a/docs/source/sections/preprocessing/transforms/function_transformer.rst
+++ b/docs/source/sections/preprocessing/transforms/function_transformer.rst
@@ -29,7 +29,10 @@ Methods
~sequentia.preprocessing.transforms.IndependentFunctionTransformer.inverse_transform
~sequentia.preprocessing.transforms.IndependentFunctionTransformer.transform
-|
+.. _definitions:
+
+Definitions
+^^^^^^^^^^^
.. autoclass:: sequentia.preprocessing.transforms.IndependentFunctionTransformer
:members:
diff --git a/make/__init__.py b/make/__init__.py
index f25c976..e818b59 100644
--- a/make/__init__.py
+++ b/make/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/make/cov.py b/make/cov.py
index bd10475..4169231 100644
--- a/make/cov.py
+++ b/make/cov.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/make/docs.py b/make/docs.py
index 592e69c..9fdbb7a 100644
--- a/make/docs.py
+++ b/make/docs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/make/lint.py b/make/lint.py
index 0bd9dd7..17151ef 100644
--- a/make/lint.py
+++ b/make/lint.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -33,7 +33,7 @@ def check(c: Config) -> None:
def format_(c: Config) -> None:
"""Format Python files."""
commands: list[str] = [
- "poetry run ruff --fix .",
+ "poetry run ruff check --fix .",
"poetry run ruff format .",
]
for command in commands:
diff --git a/make/release.py b/make/release.py
index 9175905..8f2cf19 100644
--- a/make/release.py
+++ b/make/release.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/make/tests.py b/make/tests.py
index 84111ee..12fb507 100644
--- a/make/tests.py
+++ b/make/tests.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -23,6 +23,8 @@ def unit(c: Config, *, cov: bool = False) -> None:
command: str = "poetry run pytest tests/"
if cov:
- command = f"{command} --cov sequentia --cov-report xml"
-
+ command = (
+ f"{command} --cov-config .coveragerc "
+ "--cov sequentia --cov-report xml"
+ )
c.run(command)
diff --git a/mise.toml b/mise.toml
new file mode 100644
index 0000000..0fe0947
--- /dev/null
+++ b/mise.toml
@@ -0,0 +1,6 @@
+[tools]
+poetry = { version = 'latest', pyproject = 'pyproject.toml' }
+python = '3.13'
+
+[env]
+_.python.venv = ".venv"
diff --git a/notice.py b/notice.py
index 564a3a9..7151b83 100644
--- a/notice.py
+++ b/notice.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -12,7 +12,7 @@
from pathlib import Path
notice = """
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/pyproject.toml b/pyproject.toml
index e20d3db..55f9311 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sequentia"
-version = "2.0.2"
+version = "2.5.0"
license = "MIT"
authors = ["Edwin Onuonga "]
maintainers = ["Edwin Onuonga "]
@@ -23,6 +23,7 @@ classifiers = [
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
@@ -65,8 +66,11 @@ build-backend = 'poetry.core.masonry.api'
[tool.poetry.dependencies]
python = "^3.11"
-numba = ">=0.56,<1"
-numpy = "^1.19.5"
+numba = [
+ { version = ">=0.56,<1", python = "^3.11,<3.13" },
+ { version = ">=0.61.0rc2", python = ">=3.13" }
+]
+numpy = ">=1.19.5,<3"
hmmlearn = ">=0.2.8,<1"
dtaidistance = "^2.3.10"
scikit-learn = "^1.4"
@@ -82,7 +86,7 @@ tox = "4.11.3"
pre-commit = ">=3"
[tool.poetry.group.lint.dependencies]
-ruff = "0.1.3"
+ruff = "0.8.4"
pydoclint = "0.3.8"
[tool.poetry.group.docs.dependencies]
@@ -96,8 +100,8 @@ pytest = { version = "^7.4.0" }
pytest-cov = { version = "^4.1.0" }
[tool.ruff]
-required-version = "0.1.3"
-select = [
+required-version = "0.8.4"
+lint.select = [
"F", # pyflakes: https://pypi.org/project/pyflakes/
"E", # pycodestyle (error): https://pypi.org/project/pycodestyle/
"W", # pycodestyle (warning): https://pypi.org/project/pycodestyle/
@@ -140,7 +144,7 @@ select = [
"PERF", # perflint: https://pypi.org/project/perflint/
"RUF", # ruff
]
-ignore = [
+lint.ignore = [
"ANN401", # https://beta.ruff.rs/docs/rules/any-type/
"B905", # https://beta.ruff.rs/docs/rules/zip-without-explicit-strict/
"TD003", # https://beta.ruff.rs/docs/rules/missing-todo-link/
@@ -158,16 +162,15 @@ ignore = [
"C408", # Unnecessary `dict` call (rewrite as a literal)
"D401", # First line of docstring should be in imperative mood
]
-ignore-init-module-imports = true # allow unused imports in __init__.py
line-length = 79
-[tool.ruff.pydocstyle]
+[tool.ruff.lint.pydocstyle]
convention = "numpy"
-[tool.ruff.flake8-annotations]
+[tool.ruff.lint.flake8-annotations]
allow-star-arg-any = true
-[tool.ruff.extend-per-file-ignores]
+[tool.ruff.lint.extend-per-file-ignores]
"__init__.py" = ["PLC0414", "F403", "F401", "F405"]
"sequentia/datasets/*.py" = ["B006"]
"sequentia/enums.py" = ["E501"]
@@ -181,6 +184,21 @@ allow-star-arg-any = true
"SLF",
"ARG",
]
+"sequentia/model_selection/*.py" = [
+ "D",
+ "E",
+ "ANN",
+ "PLR",
+ "TRY",
+ "EM",
+ "T",
+ "BLE",
+ "RET",
+ "SLF",
+ "UP",
+ "ARG",
+ "FA"
+]
"tests/**/*.py" = ["D", "E", "S101"]
# "tests/**/test_*.py" = ["ARG001", "S101", "D", "FA100", "FA102", "PLR0915"]
"tests/**/test_*.py" = [
diff --git a/sequentia/__init__.py b/sequentia/__init__.py
index ee898a0..f15f1aa 100644
--- a/sequentia/__init__.py
+++ b/sequentia/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -9,8 +9,22 @@
import sklearn
-from sequentia import datasets, enums, models, preprocessing, version
+from sequentia import (
+ datasets,
+ enums,
+ model_selection,
+ models,
+ preprocessing,
+ version,
+)
-__all__ = ["datasets", "models", "preprocessing", "enums", "version"]
+__all__ = [
+ "datasets",
+ "enums",
+ "model_selection",
+ "models",
+ "preprocessing",
+ "version",
+]
sklearn.set_config(enable_metadata_routing=True)
diff --git a/sequentia/_internal/__init__.py b/sequentia/_internal/__init__.py
index cd11e40..b4cba4c 100644
--- a/sequentia/_internal/__init__.py
+++ b/sequentia/_internal/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/sequentia/_internal/_data.py b/sequentia/_internal/_data.py
index 9d57786..96ccc72 100644
--- a/sequentia/_internal/_data.py
+++ b/sequentia/_internal/_data.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/sequentia/_internal/_hmm/__init__.py b/sequentia/_internal/_hmm/__init__.py
index 017d051..9391a8b 100644
--- a/sequentia/_internal/_hmm/__init__.py
+++ b/sequentia/_internal/_hmm/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/sequentia/_internal/_hmm/topologies.py b/sequentia/_internal/_hmm/topologies.py
index 6b605bb..c74278f 100644
--- a/sequentia/_internal/_hmm/topologies.py
+++ b/sequentia/_internal/_hmm/topologies.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -13,10 +13,10 @@
from sequentia.enums import TopologyMode
__all__ = [
+ "TOPOLOGY_MAP",
"ErgodicTopology",
"LeftRightTopology",
"LinearTopology",
- "TOPOLOGY_MAP",
]
@@ -36,15 +36,15 @@ class BaseTopology:
mode: TopologyMode
def __init__(
- self: BaseTopology,
+ self,
*,
n_states: int,
random_state: np.random.RandomState,
- ) -> BaseTopology:
+ ) -> None:
self.n_states = n_states
self.random_state = random_state
- def uniform_start_probs(self: BaseTopology) -> FloatArray:
+ def uniform_start_probs(self) -> FloatArray:
"""Set the initial state distribution as a discrete uniform
distribution.
@@ -55,7 +55,7 @@ def uniform_start_probs(self: BaseTopology) -> FloatArray:
"""
return np.ones(self.n_states) / self.n_states
- def random_start_probs(self: BaseTopology) -> FloatArray:
+ def random_start_probs(self) -> FloatArray:
"""Set the initial state distribution by randomly sampling
probabilities generated by a Dirichlet distribution.
@@ -69,7 +69,7 @@ def random_start_probs(self: BaseTopology) -> FloatArray:
size=1,
).flatten()
- def uniform_transition_probs(self: BaseTopology) -> FloatArray:
+ def uniform_transition_probs(self) -> FloatArray:
"""Set the transition matrix as uniform (equal probability of
transitioning to all other possible states from each state)
corresponding to the topology.
@@ -81,7 +81,7 @@ def uniform_transition_probs(self: BaseTopology) -> FloatArray:
"""
raise NotImplementedError
- def random_transition_probs(self: BaseTopology) -> FloatArray:
+ def random_transition_probs(self) -> FloatArray:
"""Set the transition matrix as random (random probability of
transitioning to all other possible states from each state) by
sampling probabilitiesfrom a Dirichlet distribution - according
@@ -94,7 +94,7 @@ def random_transition_probs(self: BaseTopology) -> FloatArray:
"""
raise NotImplementedError
- def check_start_probs(self: BaseTopology, initial: FloatArray, /) -> None:
+ def check_start_probs(self, initial: FloatArray, /) -> None:
"""Validate an initial state distribution according to the
topology's restrictions.
@@ -114,9 +114,7 @@ def check_start_probs(self: BaseTopology, initial: FloatArray, /) -> None:
raise ValueError(msg)
return initial
- def check_transition_probs(
- self: BaseTopology, transitions: FloatArray, /
- ) -> FloatArray:
+ def check_transition_probs(self, transitions: FloatArray, /) -> FloatArray:
"""Validate a transition matrix according to the topology's
restrictions.
@@ -152,7 +150,7 @@ class ErgodicTopology(BaseTopology):
mode: TopologyMode = TopologyMode.ERGODIC
- def uniform_transition_probs(self: ErgodicTopology) -> FloatArray:
+ def uniform_transition_probs(self) -> FloatArray:
"""Set the transition matrix as uniform (equal probability of
transitioning to all other possible states from each state)
corresponding to the topology.
@@ -164,7 +162,7 @@ def uniform_transition_probs(self: ErgodicTopology) -> FloatArray:
"""
return np.ones((self.n_states, self.n_states)) / self.n_states
- def random_transition_probs(self: ErgodicTopology) -> FloatArray:
+ def random_transition_probs(self) -> FloatArray:
"""Set the transition matrix as random (random probability of
transitioning to all other possible states from each state) by
sampling probabilities from a Dirichlet distribution - according
@@ -180,9 +178,7 @@ def random_transition_probs(self: ErgodicTopology) -> FloatArray:
size=self.n_states,
)
- def check_transition_probs(
- self: ErgodicTopology, transitions: FloatArray, /
- ) -> FloatArray:
+ def check_transition_probs(self, transitions: FloatArray, /) -> FloatArray:
"""Validate a transition matrix according to the topology's
restrictions.
@@ -216,7 +212,7 @@ class LeftRightTopology(BaseTopology):
mode: TopologyMode = TopologyMode.LEFT_RIGHT
- def uniform_transition_probs(self: LeftRightTopology) -> FloatArray:
+ def uniform_transition_probs(self) -> FloatArray:
"""Set the transition matrix as uniform (equal probability of
transitioning to all other possible states from each state)
corresponding to the topology.
@@ -233,7 +229,7 @@ def uniform_transition_probs(self: LeftRightTopology) -> FloatArray:
lower_ones = np.tril(np.ones(self.n_states), k=-1)
return upper_ones / (upper_divisors + lower_ones)
- def random_transition_probs(self: LeftRightTopology) -> FloatArray:
+ def random_transition_probs(self) -> FloatArray:
"""Set the transition matrix as random (random probability of
transitioning to all other possible states from each state) by
sampling probabilities from a Dirichlet distribution, according
@@ -249,9 +245,7 @@ def random_transition_probs(self: LeftRightTopology) -> FloatArray:
row[i:] = self.random_state.dirichlet(np.ones(self.n_states - i))
return transitions
- def check_transition_probs(
- self: LeftRightTopology, transitions: FloatArray, /
- ) -> FloatArray:
+ def check_transition_probs(self, transitions: FloatArray, /) -> FloatArray:
"""Validate a transition matrix according to the topology's
restrictions.
@@ -281,7 +275,7 @@ class LinearTopology(LeftRightTopology):
mode: TopologyMode = TopologyMode.LINEAR
- def uniform_transition_probs(self: LinearTopology) -> FloatArray:
+ def uniform_transition_probs(self) -> FloatArray:
"""Set the transition matrix as uniform (equal probability of
transitioning to all other possible states from each state)
corresponding to the topology.
@@ -297,7 +291,7 @@ def uniform_transition_probs(self: LinearTopology) -> FloatArray:
row[i : (i + size)] = np.ones(size) / size
return transitions
- def random_transition_probs(self: LinearTopology) -> FloatArray:
+ def random_transition_probs(self) -> FloatArray:
"""Set the transition matrix as random (random probability of
transitioning to all other possible states from each state) by
sampling probabilities from a Dirichlet distribution, according to the
@@ -314,9 +308,7 @@ def random_transition_probs(self: LinearTopology) -> FloatArray:
row[i : (i + size)] = self.random_state.dirichlet(np.ones(size))
return transitions
- def check_transition_probs(
- self: LinearTopology, transitions: FloatArray, /
- ) -> FloatArray:
+ def check_transition_probs(self, transitions: FloatArray, /) -> FloatArray:
"""Validate a transition matrix according to the topology's
restrictions.
diff --git a/sequentia/_internal/_multiprocessing.py b/sequentia/_internal/_multiprocessing.py
index b0260d0..2134c64 100644
--- a/sequentia/_internal/_multiprocessing.py
+++ b/sequentia/_internal/_multiprocessing.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/sequentia/_internal/_sklearn.py b/sequentia/_internal/_sklearn.py
new file mode 100644
index 0000000..d364f57
--- /dev/null
+++ b/sequentia/_internal/_sklearn.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+import sklearn
+
+__all__ = ["routing_enabled"]
+
+
+def routing_enabled() -> bool:
+ return sklearn.get_config()["enable_metadata_routing"]
diff --git a/sequentia/_internal/_typing.py b/sequentia/_internal/_typing.py
index d9db94b..2e37a98 100644
--- a/sequentia/_internal/_typing.py
+++ b/sequentia/_internal/_typing.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -6,8 +6,8 @@
import numpy as np
import numpy.typing as npt
-__all__ = ["FloatArray", "IntArray", "Array"]
+__all__ = ["Array", "FloatArray", "IntArray"]
-FloatArray = npt.NDArray[np.float_]
-IntArray = npt.NDArray[np.int_]
+FloatArray = npt.NDArray[np.float64]
+IntArray = npt.NDArray[np.int64]
Array = FloatArray | IntArray
diff --git a/sequentia/_internal/_validation.py b/sequentia/_internal/_validation.py
index 04c8122..89f26cb 100644
--- a/sequentia/_internal/_validation.py
+++ b/sequentia/_internal/_validation.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -20,15 +20,15 @@
from sequentia._internal._typing import Array, FloatArray, IntArray
__all__ = [
- "check_random_state",
- "check_is_fitted",
- "requires_fit",
- "check_classes",
"check_X",
"check_X_lengths",
- "check_y",
- "check_weighting",
+ "check_classes",
+ "check_is_fitted",
+ "check_random_state",
"check_use_c",
+ "check_weighting",
+ "check_y",
+ "requires_fit",
]
@@ -60,7 +60,7 @@ def check_is_fitted(
def requires_fit(function: t.Callable) -> t.Callable:
@functools.wraps(function)
- def wrapper(self: t.Self, *args: t.Any, **kwargs: t.Any) -> t.Any:
+ def wrapper(self, *args: t.Any, **kwargs: t.Any) -> t.Any: # noqa: ANN001
check_is_fitted(self)
return function(self, *args, **kwargs)
@@ -100,20 +100,20 @@ def check_X(
X: t.Iterable[int] | t.Iterable[float],
/,
*,
- dtype: np.float_ | np.int_,
+ dtype: np.float64 | np.int64,
univariate: bool = False,
) -> Array:
if not isinstance(X, np.ndarray):
try:
X = np.array(X).astype(dtype)
- except Exception as e: # noqa: BLE001
+ except Exception as e:
type_ = type(X).__name__
msg = f"Expected value to be a numpy.ndarray, got {type_!r}"
raise TypeError(msg) from e
if (dtype_ := X.dtype) != dtype:
try:
X = X.astype(dtype)
- except Exception as e: # noqa: BLE001
+ except Exception as e:
msg = f"Expected array to have dtype {dtype}, got {dtype_}"
raise TypeError(msg) from e
if (ndim_ := X.ndim) != 2:
@@ -133,7 +133,7 @@ def check_X_lengths(
/,
*,
lengths: t.Iterable[int] | None,
- dtype: np.float_ | np.int_,
+ dtype: np.float64 | np.int64,
univariate: bool = False,
) -> tuple[Array, IntArray]:
# validate observations
@@ -172,7 +172,7 @@ def check_y(
/,
*,
lengths: IntArray,
- dtype: np.float_ | np.int_ | None = None,
+ dtype: np.float64 | np.int64 | None = None,
) -> Array:
if y is None:
msg = "No output values `y` provided"
@@ -214,7 +214,7 @@ def check_weighting(
if x.shape != weights.shape:
msg = "Weights should have the same shape as inputs"
raise ValueError(msg) # noqa: TRY301
- except Exception as e: # noqa: BLE001
+ except Exception as e:
msg = "Invalid weighting function"
raise ValueError(msg) from e
diff --git a/sequentia/datasets/__init__.py b/sequentia/datasets/__init__.py
index 66efacc..817b0a2 100644
--- a/sequentia/datasets/__init__.py
+++ b/sequentia/datasets/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -10,4 +10,4 @@
from sequentia.datasets.digits import load_digits
from sequentia.datasets.gene_families import load_gene_families
-__all__ = ["data", "load_digits", "load_gene_families", "SequentialDataset"]
+__all__ = ["SequentialDataset", "data", "load_digits", "load_gene_families"]
diff --git a/sequentia/datasets/base.py b/sequentia/datasets/base.py
index 18984f2..4bc096b 100644
--- a/sequentia/datasets/base.py
+++ b/sequentia/datasets/base.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -26,19 +26,17 @@ class SequentialDataset:
"""Utility wrapper for a generic sequential dataset."""
def __init__(
- self: SequentialDataset,
+ self,
X: Array,
y: Array | None = None,
*,
lengths: IntArray | None = None,
classes: list[int] | None = None,
- ) -> SequentialDataset:
+ ) -> None:
"""Initialize a :class:`.SequentialDataset`.
Parameters
----------
- self: SequentialDataset
-
X:
Sequence(s).
@@ -80,17 +78,21 @@ def __init__(
self._idxs = _data.get_idxs(self.lengths)
def split(
- self: SequentialDataset,
+ self,
*,
test_size: (
- pyd.NonNegativeInt | pyd.confloat(ge=0, le=1) | None
- ) = None, # placeholder
+ pyd.NonNegativeInt
+ | t.Annotated[float, pyd.Field(ge=0, le=1)]
+ | None
+ ) = None,
train_size: (
- pyd.NonNegativeInt | pyd.confloat(ge=0, le=1) | None
- ) = None, # placeholder
+ pyd.NonNegativeInt
+ | t.Annotated[float, pyd.Field(ge=0, le=1)]
+ | None
+ ) = None,
random_state: (
pyd.NonNegativeInt | np.random.RandomState | None
- ) = None, # placeholder
+ ) = None,
shuffle: bool = True,
stratify: bool = False,
) -> tuple[SequentialDataset, SequentialDataset]:
@@ -100,8 +102,6 @@ def split(
Parameters
----------
- self: SequentialDataset
-
test_size:
Size of the test partition.
@@ -171,9 +171,7 @@ def split(
return data_train, data_test
- def iter_by_class(
- self: SequentialDataset,
- ) -> t.Generator[tuple[Array, Array, int]]:
+ def iter_by_class(self) -> t.Generator[tuple[Array, Array, int]]:
"""Subset the observation sequences by class.
Returns
@@ -207,24 +205,18 @@ def iter_by_class(
lengths = self._lengths[ind]
yield np.vstack(X), lengths, c
- def __len__(self: SequentialDataset) -> int:
+ def __len__(self) -> int:
"""Return the number of sequences in the dataset."""
return len(self._lengths)
- def __getitem__(
- self: SequentialDataset,
- /,
- i: int,
- ) -> Array | tuple[Array, Array]:
+ def __getitem__(self, /, i: int) -> Array | tuple[Array, Array]:
"""Slice observation sequences and corresponding outputs."""
idxs = np.atleast_2d(self._idxs[i])
X = list(_data.iter_X(self._X, idxs=idxs))
X = X[0] if isinstance(i, int) and len(X) == 1 else X
return X if self._y is None else (X, self._y[i])
- def __iter__(
- self: SequentialDataset,
- ) -> t.Generator[Array | tuple[Array, Array]]:
+ def __iter__(self) -> t.Generator[Array | tuple[Array, Array]]:
"""Create a generator over sequences and their corresponding
outputs.
"""
@@ -232,7 +224,7 @@ def __iter__(
yield self[i]
@property
- def X(self: SequentialDataset) -> Array:
+ def X(self) -> Array:
"""Observation sequences.
Returns
@@ -243,7 +235,7 @@ def X(self: SequentialDataset) -> Array:
return self._X
@property
- def y(self: SequentialDataset) -> Array:
+ def y(self) -> Array:
"""Outputs corresponding to ``X``.
Returns
@@ -262,7 +254,7 @@ def y(self: SequentialDataset) -> Array:
return self._y
@property
- def lengths(self: SequentialDataset) -> IntArray:
+ def lengths(self) -> IntArray:
"""Lengths corresponding to ``X``.
Returns
@@ -273,7 +265,7 @@ def lengths(self: SequentialDataset) -> IntArray:
return self._lengths
@property
- def classes(self: SequentialDataset) -> IntArray | None:
+ def classes(self) -> IntArray | None:
"""Set of unique classes in ``y``.
Returns
@@ -284,7 +276,7 @@ def classes(self: SequentialDataset) -> IntArray | None:
return self._classes
@property
- def idxs(self: SequentialDataset) -> IntArray:
+ def idxs(self) -> IntArray:
"""Observation sequence start and end indices.
Returns
@@ -295,7 +287,7 @@ def idxs(self: SequentialDataset) -> IntArray:
return self._idxs
@property
- def X_y(self: SequentialDataset) -> dict[str, Array]:
+ def X_y(self) -> dict[str, Array]:
"""Observation sequences and corresponding outputs.
Returns
@@ -317,7 +309,7 @@ def X_y(self: SequentialDataset) -> dict[str, Array]:
return {"X": self._X, "y": self._y}
@property
- def X_lengths(self: SequentialDataset) -> dict[str, Array]:
+ def X_lengths(self) -> dict[str, Array]:
"""Observation sequences and corresponding lengths.
Returns
@@ -331,7 +323,7 @@ def X_lengths(self: SequentialDataset) -> dict[str, Array]:
return {"X": self._X, "lengths": self._lengths}
@property
- def X_y_lengths(self: SequentialDataset) -> dict[str, Array]:
+ def X_y_lengths(self) -> dict[str, Array]:
"""Observation sequences and corresponding outputs and lengths.
Returns
@@ -354,7 +346,7 @@ def X_y_lengths(self: SequentialDataset) -> dict[str, Array]:
return {"X": self._X, "y": self._y, "lengths": self._lengths}
def save(
- self: SequentialDataset,
+ self,
path: str | pathlib.Path | t.IO,
/,
*,
@@ -389,9 +381,7 @@ def save(
save_fun(path, **arrs)
@classmethod
- def load(
- cls: type[SequentialDataset], path: str | pathlib.Path | t.IO, /
- ) -> SequentialDataset:
+ def load(cls, path: str | pathlib.Path | t.IO, /) -> SequentialDataset:
"""Load a stored dataset in ``.npz`` format.
See :func:`numpy:numpy.load`.
@@ -413,7 +403,7 @@ def load(
"""
return cls(**np.load(path))
- def copy(self: SequentialDataset) -> SequentialDataset:
+ def copy(self) -> SequentialDataset:
"""Create a copy of the dataset.
Returns
diff --git a/sequentia/datasets/data/__init__.py b/sequentia/datasets/data/__init__.py
index e206863..3f2a533 100644
--- a/sequentia/datasets/data/__init__.py
+++ b/sequentia/datasets/data/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/sequentia/datasets/digits.py b/sequentia/datasets/digits.py
index defe491..7fbca04 100644
--- a/sequentia/datasets/digits.py
+++ b/sequentia/datasets/digits.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -9,6 +9,7 @@
import importlib.resources
import operator
+import typing as t
import numpy as np
import pydantic as pyd
@@ -22,7 +23,19 @@
@pyd.validate_call
def load_digits(
- *, digits: set[pyd.conint(ge=0, le=9)] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+ *,
+ digits: set[t.Annotated[int, pyd.Field(ge=0, le=9)]] = {
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ },
) -> SequentialDataset:
"""Load a dataset of MFCC features of spoken digit audio samples from the
Free Spoken Digit Dataset.
diff --git a/sequentia/datasets/gene_families.py b/sequentia/datasets/gene_families.py
index 2151568..54559ff 100644
--- a/sequentia/datasets/gene_families.py
+++ b/sequentia/datasets/gene_families.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -9,6 +9,7 @@
import importlib.resources
import operator
+import typing as t
import numpy as np
import pydantic as pyd
@@ -23,7 +24,16 @@
@pyd.validate_call
def load_gene_families(
- *, families: set[pyd.conint(ge=0, le=6)] = {0, 1, 2, 3, 4, 5, 6}
+ *,
+ families: set[t.Annotated[int, pyd.Field(ge=0, le=6)]] = {
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ },
) -> tuple[SequentialDataset, LabelEncoder]:
"""Load a dataset of human DNA sequences grouped by gene family.
diff --git a/sequentia/enums.py b/sequentia/enums.py
index 4d3128f..3b70919 100644
--- a/sequentia/enums.py
+++ b/sequentia/enums.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -7,7 +7,7 @@
import enum
-__all__ = ["TopologyMode", "CovarianceMode", "TransitionMode", "PriorMode"]
+__all__ = ["CovarianceMode", "PriorMode", "TopologyMode", "TransitionMode"]
class TopologyMode(enum.StrEnum):
diff --git a/sequentia/model_selection/__init__.py b/sequentia/model_selection/__init__.py
new file mode 100644
index 0000000..07e3c40
--- /dev/null
+++ b/sequentia/model_selection/__init__.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""Hyper-parameter search and dataset splitting utilities."""
+
+from sequentia.model_selection._search import (
+ GridSearchCV,
+ RandomizedSearchCV,
+ param_grid,
+)
+from sequentia.model_selection._search_successive_halving import (
+ HalvingGridSearchCV,
+ HalvingRandomSearchCV,
+)
+from sequentia.model_selection._split import (
+ KFold,
+ RepeatedKFold,
+ RepeatedStratifiedKFold,
+ ShuffleSplit,
+ StratifiedKFold,
+ StratifiedShuffleSplit,
+)
+
+__all__ = [
+ "GridSearchCV",
+ "HalvingGridSearchCV",
+ "HalvingRandomSearchCV",
+ "KFold",
+ "RandomizedSearchCV",
+ "RepeatedKFold",
+ "RepeatedStratifiedKFold",
+ "ShuffleSplit",
+ "StratifiedKFold",
+ "StratifiedShuffleSplit",
+ "param_grid",
+]
diff --git a/sequentia/model_selection/_search.py b/sequentia/model_selection/_search.py
new file mode 100644
index 0000000..d0b5ef5
--- /dev/null
+++ b/sequentia/model_selection/_search.py
@@ -0,0 +1,352 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""This file is an adapted version of the same file from the
+sklearn.model_selection sub-package.
+
+Below is the original license from Scikit-Learn, copied on 27th December 2024
+from https://github.com/scikit-learn/scikit-learn/blob/main/COPYING.
+
+---
+
+BSD 3-Clause License
+
+Copyright (c) 2007-2024 The scikit-learn developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+# Author: Alexandre Gramfort,
+# Gael Varoquaux
+# Andreas Mueller
+# Olivier Grisel
+# Raghav RV
+# License: BSD 3 clause
+
+import time
+import typing as t
+from collections import defaultdict
+from itertools import product
+
+from sklearn.base import _fit_context, clone, is_classifier
+from sklearn.metrics._scorer import _MultimetricScorer
+from sklearn.model_selection import _search
+from sklearn.model_selection._split import check_cv
+from sklearn.model_selection._validation import (
+ _insert_error_scores,
+ _warn_or_raise_about_fit_failures,
+)
+from sklearn.utils.parallel import Parallel, delayed
+from sklearn.utils.validation import _check_method_params
+
+from sequentia.model_selection._validation import _fit_and_score
+
+__all__ = ["BaseSearchCV", "GridSearchCV", "RandomizedSearchCV", "param_grid"]
+
+
+def param_grid(**kwargs: list[t.Any]) -> list[dict[str, t.Any]]:
+ """Generates a hyper-parameter grid for a nested object.
+
+ Examples
+ --------
+ Using :func:`.param_grid` in a grid search to cross-validate over
+ settings for :class:`.GaussianMixtureHMM`, which is a nested model
+ specified in the constructor of a :class:`.HMMClassifier`. ::
+
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import minmax_scale
+
+ from sequentia.enums import PriorMode, CovarianceMode, TopologyMode
+ from sequentia.models import HMMClassifier, GaussianMixtureHMM
+ from sequentia.preprocessing import IndependentFunctionTransformer
+ from sequentia.model_selection import GridSearchCV, StratifiedKFold
+
+ GridSearchCV(
+ estimator=Pipeline(
+ [
+ ("scale", IndependentFunctionTransformer(minmax_scale)),
+ ("clf", HMMClassifier(variant=GaussianMixtureHMM)),
+ ]
+ ),
+ param_grid={
+ "clf__prior": [PriorMode.UNIFORM, PriorMode.FREQUENCY],
+ "clf__model_kwargs": param_grid(
+ n_states=[3, 5, 7],
+ n_components=[2, 3, 4],
+ covariance=[
+ CovarianceMode.DIAGONAL, CovarianceMode.SPHERICAL
+ ],
+ topology=[
+ TopologyMode.LEFT_RIGHT, TopologyMode.LINEAR
+ ],
+ )
+ },
+ cv=StratifiedKFold(),
+ )
+
+ Parameters
+ ----------
+ **kwargs:
+ Hyper-parameter name and corresponding values.
+
+ Returns
+ -------
+ Hyper-parameter grid for a nested object.
+ """
+ return [
+ dict(zip(kwargs.keys(), values))
+ for values in product(*kwargs.values())
+ ]
+
+
+class BaseSearchCV(_search.BaseSearchCV):
+ @_fit_context(
+ # *SearchCV.estimator is not validated yet
+ prefer_skip_nested_validation=False
+ )
+ def fit(self, X, y=None, **params):
+ """Run fit with all sets of parameters.
+
+ Parameters
+ ----------
+ X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)
+ Training vectors, where `n_samples` is the number of samples and
+ `n_features` is the number of features. For precomputed kernel or
+ distance matrix, the expected shape of X is (n_samples, n_samples).
+
+ y : array-like of shape (n_samples, n_output) \
+ or (n_samples,), default=None
+ Target relative to X for classification or regression;
+ None for unsupervised learning.
+
+ **params : dict of str -> object
+ Parameters passed to the ``fit`` method of the estimator, the scorer,
+ and the CV splitter.
+
+ If a fit parameter is an array-like whose length is equal to
+ `n_samples` then it will be split across CV groups along with `X`
+ and `y`. For example, the :term:`sample_weight` parameter is split
+ because `len(sample_weight) == len(X)`.
+
+ Returns
+ -------
+ self : object
+ Instance of fitted estimator.
+ """
+ estimator = self.estimator
+ scorers, refit_metric = self._get_scorers()
+
+ # X, y = indexable(X, y) # NOTE @eonu: removed
+ params = _check_method_params(X, params=params)
+
+ routed_params = self._get_routed_params_for_fit(params)
+
+ cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
+ n_splits = cv_orig.get_n_splits(X, y, **routed_params.splitter.split)
+
+ base_estimator = clone(self.estimator)
+
+ parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)
+
+ fit_and_score_kwargs = dict(
+ scorer=scorers,
+ fit_params=routed_params.estimator.fit,
+ score_params=routed_params.scorer.score,
+ return_train_score=self.return_train_score,
+ return_n_test_samples=True,
+ return_times=True,
+ return_parameters=False,
+ error_score=self.error_score,
+ verbose=self.verbose,
+ )
+ results = {}
+ with parallel:
+ all_candidate_params = []
+ all_out = []
+ all_more_results = defaultdict(list)
+
+ def evaluate_candidates(
+ candidate_params, cv=None, more_results=None
+ ):
+ cv = cv or cv_orig
+ candidate_params = list(candidate_params)
+ n_candidates = len(candidate_params)
+
+ if self.verbose > 0:
+ print(
+ "Fitting {0} folds for each of {1} candidates,"
+ " totalling {2} fits".format(
+ n_splits, n_candidates, n_candidates * n_splits
+ )
+ )
+
+ out = parallel(
+ delayed(_fit_and_score)(
+ clone(base_estimator),
+ X,
+ y,
+ train=train,
+ test=test,
+ parameters=parameters,
+ split_progress=(split_idx, n_splits),
+ candidate_progress=(cand_idx, n_candidates),
+ **fit_and_score_kwargs,
+ )
+ for (cand_idx, parameters), (
+ split_idx,
+ (train, test),
+ ) in product(
+ enumerate(candidate_params),
+ enumerate(
+ cv.split(X, y, **routed_params.splitter.split)
+ ),
+ )
+ )
+
+ if len(out) < 1:
+ raise ValueError(
+ "No fits were performed. "
+ "Was the CV iterator empty? "
+ "Were there no candidates?"
+ )
+ elif len(out) != n_candidates * n_splits:
+ raise ValueError(
+ "cv.split and cv.get_n_splits returned "
+ f"inconsistent results. Expected {n_splits} "
+ f"splits, got {len(out) // n_candidates}"
+ )
+
+ _warn_or_raise_about_fit_failures(out, self.error_score)
+
+ # For callable self.scoring, the return type is only known after
+ # calling. If the return type is a dictionary, the error scores
+ # can now be inserted with the correct key. The type checking
+ # of out will be done in `_insert_error_scores`.
+ if callable(self.scoring):
+ _insert_error_scores(out, self.error_score)
+
+ all_candidate_params.extend(candidate_params)
+ all_out.extend(out)
+
+ if more_results is not None:
+ for key, value in more_results.items():
+ all_more_results[key].extend(value)
+
+ nonlocal results
+ results = self._format_results(
+ all_candidate_params, n_splits, all_out, all_more_results
+ )
+
+ return results
+
+ self._run_search(evaluate_candidates)
+
+ # multimetric is determined here because in the case of a callable
+ # self.scoring the return type is only known after calling
+ first_test_score = all_out[0]["test_scores"]
+ self.multimetric_ = isinstance(first_test_score, dict)
+
+ # check refit_metric now for a callable scorer that is multimetric
+ if callable(self.scoring) and self.multimetric_:
+ self._check_refit_for_multimetric(first_test_score)
+ refit_metric = self.refit
+
+ # For multi-metric evaluation, store the best_index_, best_params_ and
+ # best_score_ iff refit is one of the scorer names
+ # In single metric evaluation, refit_metric is "score"
+ if self.refit or not self.multimetric_:
+ self.best_index_ = self._select_best_index(
+ self.refit, refit_metric, results
+ )
+ if not callable(self.refit):
+ # With a non-custom callable, we can select the best score
+ # based on the best index
+ self.best_score_ = results[f"mean_test_{refit_metric}"][
+ self.best_index_
+ ]
+ self.best_params_ = results["params"][self.best_index_]
+
+ if self.refit:
+ # here we clone the estimator as well as the parameters, since
+ # sometimes the parameters themselves might be estimators, e.g.
+ # when we search over different estimators in a pipeline.
+ # ref: https://github.com/scikit-learn/scikit-learn/pull/26786
+ self.best_estimator_ = clone(base_estimator).set_params(
+ **clone(self.best_params_, safe=False)
+ )
+
+ refit_start_time = time.time()
+ if y is not None:
+ self.best_estimator_.fit(X, y, **routed_params.estimator.fit)
+ else:
+ self.best_estimator_.fit(X, **routed_params.estimator.fit)
+ refit_end_time = time.time()
+ self.refit_time_ = refit_end_time - refit_start_time
+
+ if hasattr(self.best_estimator_, "feature_names_in_"):
+ self.feature_names_in_ = self.best_estimator_.feature_names_in_
+
+ # Store the only scorer not as a dict for single metric evaluation
+ if isinstance(scorers, _MultimetricScorer):
+ self.scorer_ = scorers._scorers
+ else:
+ self.scorer_ = scorers
+
+ self.cv_results_ = results
+ self.n_splits_ = n_splits
+
+ return self
+
+
+class GridSearchCV(_search.GridSearchCV, BaseSearchCV):
+ """Exhaustive search over specified parameter values for an estimator.
+
+ ``cv`` must be a valid splitting method from
+ :mod:`sequentia.model_selection`.
+
+ See Also
+ --------
+ :class:`sklearn.model_selection.GridSearchCV`
+ :class:`.GridSearchCV` is a modified version
+ of this class that supports sequences.
+ """
+
+
+class RandomizedSearchCV(_search.RandomizedSearchCV, BaseSearchCV):
+ """Randomized search on hyper parameters.
+
+ ``cv`` must be a valid splitting method from
+ :mod:`sequentia.model_selection`.
+
+ See Also
+ --------
+ :class:`sklearn.model_selection.RandomizedSearchCV`
+ :class:`.RandomizedSearchCV` is a modified version
+ of this class that supports sequences.
+ """
diff --git a/sequentia/model_selection/_search_successive_halving.py b/sequentia/model_selection/_search_successive_halving.py
new file mode 100644
index 0000000..499e5b1
--- /dev/null
+++ b/sequentia/model_selection/_search_successive_halving.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""This file is an adapted version of the same file from the
+sklearn.model_selection sub-package.
+
+Below is the original license from Scikit-Learn, copied on 27th December 2024
+from https://github.com/scikit-learn/scikit-learn/blob/main/COPYING.
+
+---
+
+BSD 3-Clause License
+
+Copyright (c) 2007-2024 The scikit-learn developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+from sklearn.model_selection import _search_successive_halving as _search
+
+from sequentia.model_selection._search import BaseSearchCV
+
+__all__ = ["HalvingGridSearchCV", "HalvingRandomSearchCV"]
+
+
+class HalvingGridSearchCV(_search.HalvingGridSearchCV, BaseSearchCV):
+    """Search over specified parameter values with successive halving.
+
+    This class defines no members of its own: all behaviour is inherited
+    from Scikit-Learn's ``HalvingGridSearchCV`` (imported here via the
+    ``_search`` alias) and from
+    ``sequentia.model_selection._search.BaseSearchCV``.
+
+    ``cv`` must be a valid splitting method from
+    :mod:`sequentia.model_selection`.
+
+    See Also
+    --------
+    :class:`sklearn.model_selection.HalvingGridSearchCV`
+        The original Scikit-Learn class, of which
+        :class:`.HalvingGridSearchCV` is a modified version that
+        supports sequences.
+    """
+
+
+class HalvingRandomSearchCV(_search.HalvingRandomSearchCV, BaseSearchCV):
+    """Randomized search on hyper parameters with successive halving.
+
+    This class defines no members of its own: all behaviour is inherited
+    from Scikit-Learn's ``HalvingRandomSearchCV`` (imported here via the
+    ``_search`` alias) and from
+    ``sequentia.model_selection._search.BaseSearchCV``.
+
+    ``cv`` must be a valid splitting method from
+    :mod:`sequentia.model_selection`.
+
+    See Also
+    --------
+    :class:`sklearn.model_selection.HalvingRandomSearchCV`
+        The original Scikit-Learn class, of which
+        :class:`.HalvingRandomSearchCV` is a modified version that
+        supports sequences.
+    """
diff --git a/sequentia/model_selection/_split.py b/sequentia/model_selection/_split.py
new file mode 100644
index 0000000..d710f2f
--- /dev/null
+++ b/sequentia/model_selection/_split.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""This file is an adapted version of the same file from the
+sklearn.model_selection sub-package.
+
+Below is the original license from Scikit-Learn, copied on 27th December 2024
+from https://github.com/scikit-learn/scikit-learn/blob/main/COPYING.
+
+---
+
+BSD 3-Clause License
+
+Copyright (c) 2007-2024 The scikit-learn developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+import typing as t
+
+import numpy as np
+from sklearn.model_selection import _split
+
+__all__ = [
+ "KFold",
+ "RepeatedKFold",
+ "RepeatedStratifiedKFold",
+ "ShuffleSplit",
+ "StratifiedKFold",
+ "StratifiedShuffleSplit",
+]
+
+
+class KFold(_split.KFold):
+ """K-Fold cross-validator.
+
+ Provides train/test indices to split data in train/test sets.
+ Split dataset into k consecutive folds (without shuffling by default).
+
+ Each fold is then used once as a validation while the
+ k - 1 remaining folds form the training set.
+
+ See Also
+ --------
+ :class:`sklearn.model_selection.KFold`
+ :class:`.KFold` is a modified version
+ of this class that supports sequences.
+ """
+
+ def split(
+ self, X: np.ndarray, y: np.ndarray, groups: t.Any = None
+ ) -> None:
+ return super().split(y, y, groups)
+
+
+class StratifiedKFold(_split.StratifiedKFold):
+ """Stratified K-Fold cross-validator.
+
+ Provides train/test indices to split data in train/test sets.
+
+ This cross-validation object is a variation of
+ KFold that returns stratified folds.
+
+ The folds are made by preserving the percentage of samples for each class.
+
+ See Also
+ --------
+ :class:`sklearn.model_selection.StratifiedKFold`
+ :class:`.StratifiedKFold` is a modified version
+ of this class that supports sequences.
+ """
+
+ def split(
+ self, X: np.ndarray, y: np.ndarray, groups: t.Any = None
+ ) -> None:
+ return super().split(y, y, groups)
+
+
+class ShuffleSplit(_split.ShuffleSplit):
+ """Random permutation cross-validator.
+
+ Yields indices to split data into training and test sets.
+
+ Note: contrary to other cross-validation strategies, random splits do not
+ guarantee that test sets across all folds will be mutually exclusive,
+ and might include overlapping samples. However, this is still very likely
+ for sizeable datasets.
+
+ See Also
+ --------
+ :class:`sklearn.model_selection.ShuffleSplit`
+ :class:`.ShuffleSplit` is a modified version
+ of this class that supports sequences.
+ """
+
+ def split(
+ self,
+ X: np.ndarray,
+ y: np.ndarray | None = None,
+ groups: t.Any = None,
+ ) -> None:
+ return super().split(y, y, groups)
+
+
+class StratifiedShuffleSplit(_split.StratifiedShuffleSplit):
+ """Stratified :class:`.ShuffleSplit` cross-validator.
+
+ Provides train/test indices to split data in train/test sets.
+
+ This cross-validation object is a merge of :class:`.StratifiedKFold`
+ and :class:`.ShuffleSplit`, which returns stratified randomized folds.
+ The folds are made by preserving the percentage of samples for each class.
+
+ See Also
+ --------
+ :class:`sklearn.model_selection.StratifiedShuffleSplit`
+ :class:`.StratifiedShuffleSplit` is a modified version
+ of this class that supports sequences.
+ """
+
+ def split(
+ self,
+ X: np.ndarray,
+ y: np.ndarray | None = None,
+ groups: t.Any = None,
+ ) -> None:
+ return super().split(y, y, groups)
+
+
+class RepeatedKFold(_split.RepeatedKFold):
+ """Repeated :class:`.KFold` cross validator.
+
+ Repeats :class:`.KFold` n times with different randomization in each repetition.
+
+ See Also
+ --------
+ :class:`sklearn.model_selection.RepeatedKFold`
+ :class:`.RepeatedKFold` is a modified version
+ of this class that supports sequences.
+ """
+
+ def split(
+ self,
+ X: np.ndarray,
+ y: np.ndarray | None = None,
+ groups: t.Any = None,
+ ) -> None:
+ return super().split(y, y, groups)
+
+
+class RepeatedStratifiedKFold(_split.RepeatedStratifiedKFold):
+ """Repeated :class:`.StratifiedKFold` cross validator.
+
+ Repeats :class:`.StratifiedKFold` n times with different randomization
+ in each repetition.
+
+ See Also
+ --------
+ :class:`sklearn.model_selection.RepeatedStratifiedKFold`
+ :class:`.RepeatedStratifiedKFold` is a modified version
+ of this class that supports sequences.
+ """
+
+ def split(
+ self,
+ X: np.ndarray,
+ y: np.ndarray | None = None,
+ groups: t.Any = None,
+ ) -> None:
+ return super().split(y, y, groups)
diff --git a/sequentia/model_selection/_validation.py b/sequentia/model_selection/_validation.py
new file mode 100644
index 0000000..6cd0674
--- /dev/null
+++ b/sequentia/model_selection/_validation.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+"""This file is an adapted version of the same file from the
+sklearn.model_selection sub-package.
+
+Below is the original license from Scikit-Learn, copied on 27th December 2024
+from https://github.com/scikit-learn/scikit-learn/blob/main/COPYING.
+
+---
+
+BSD 3-Clause License
+
+Copyright (c) 2007-2024 The scikit-learn developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import numbers
+import time
+from traceback import format_exc
+
+import numpy as np
+from joblib import logger
+from sklearn.base import clone
+from sklearn.metrics._scorer import _MultimetricScorer
+from sklearn.model_selection._validation import _score
+from sklearn.utils._array_api import device, get_namespace
+from sklearn.utils.validation import _check_method_params, _num_samples
+
+from sequentia._internal import _data
+
+__all__ = ["_fit_and_score"]
+
+
+def _fit_and_score(
+ estimator,
+ X,
+ y,
+ *,
+ scorer,
+ train,
+ test,
+ verbose,
+ parameters,
+ fit_params,
+ score_params,
+ return_train_score=False,
+ return_parameters=False,
+ return_n_test_samples=False,
+ return_times=False,
+ return_estimator=False,
+ split_progress=None,
+ candidate_progress=None,
+ error_score=np.nan,
+):
+ xp, _ = get_namespace(X)
+ X_device = device(X)
+
+ # Make sure that we can fancy index X even if train and test are provided
+ # as NumPy arrays by NumPy only cross-validation splitters.
+ train, test = (
+ xp.asarray(train, device=X_device),
+ xp.asarray(test, device=X_device),
+ )
+
+ if not isinstance(error_score, numbers.Number) and error_score != "raise":
+ raise ValueError(
+ "error_score must be the string 'raise' or a numeric value. "
+ "(Hint: if using 'raise', please make sure that it has been "
+ "spelled correctly.)"
+ )
+
+ progress_msg = ""
+ if verbose > 2:
+ if split_progress is not None:
+ progress_msg = f" {split_progress[0]+1}/{split_progress[1]}"
+ if candidate_progress and verbose > 9:
+ progress_msg += (
+ f"; {candidate_progress[0]+1}/{candidate_progress[1]}"
+ )
+
+ if verbose > 1:
+ if parameters is None:
+ params_msg = ""
+ else:
+ sorted_keys = sorted(parameters) # Ensure deterministic o/p
+ params_msg = ", ".join(f"{k}={parameters[k]}" for k in sorted_keys)
+ if verbose > 9:
+ start_msg = f"[CV{progress_msg}] START {params_msg}"
+ print(f"{start_msg}{(80 - len(start_msg)) * '.'}")
+
+ # Adjust length of sample weights
+ lengths = fit_params["lengths"] # NOTE @eonu: added this
+ fit_params = fit_params if fit_params is not None else {}
+ fit_params = _check_method_params(X, params=fit_params, indices=train)
+ score_params = score_params if score_params is not None else {}
+ score_params_train = _check_method_params(
+ X, params=score_params, indices=train
+ )
+ score_params_test = _check_method_params(
+ X, params=score_params, indices=test
+ )
+
+ if parameters is not None:
+ # here we clone the parameters, since sometimes the parameters
+ # themselves might be estimators, e.g. when we search over different
+ # estimators in a pipeline.
+ # ref: https://github.com/scikit-learn/scikit-learn/pull/26786
+ estimator = estimator.set_params(**clone(parameters, safe=False))
+
+ start_time = time.time()
+
+ # NOTE @eonu: modified this block
+ idxs = _data.get_idxs(lengths)
+ idxs_train, idxs_test = idxs[train], idxs[test]
+ y_train, y_test = y[train], y[test]
+ lengths_train, lengths_test = lengths[train], lengths[test]
+ X_train = np.concatenate(list(_data.iter_X(X, idxs=idxs_train)))
+ X_test = np.concatenate(list(_data.iter_X(X, idxs=idxs_test)))
+ fit_params["lengths"] = lengths_train
+ score_params_train["lengths"] = lengths_train
+ score_params_test["lengths"] = lengths_test
+
+ result = {}
+ try:
+ if y_train is None:
+ estimator.fit(X_train, **fit_params)
+ else:
+ estimator.fit(X_train, y_train, **fit_params)
+
+ except Exception:
+ # Note fit time as time until error
+ fit_time = time.time() - start_time
+ score_time = 0.0
+ if error_score == "raise":
+ raise
+ elif isinstance(error_score, numbers.Number):
+ if isinstance(scorer, _MultimetricScorer):
+ test_scores = {name: error_score for name in scorer._scorers}
+ if return_train_score:
+ train_scores = test_scores.copy()
+ else:
+ test_scores = error_score
+ if return_train_score:
+ train_scores = error_score
+ result["fit_error"] = format_exc()
+ else:
+ result["fit_error"] = None
+
+ fit_time = time.time() - start_time
+ test_scores = _score(
+ estimator, X_test, y_test, scorer, score_params_test, error_score
+ )
+ score_time = time.time() - start_time - fit_time
+ if return_train_score:
+ train_scores = _score(
+ estimator,
+ X_train,
+ y_train,
+ scorer,
+ score_params_train,
+ error_score,
+ )
+
+ if verbose > 1:
+ total_time = score_time + fit_time
+ end_msg = f"[CV{progress_msg}] END "
+ result_msg = params_msg + (";" if params_msg else "")
+ if verbose > 2:
+ if isinstance(test_scores, dict):
+ for scorer_name in sorted(test_scores):
+ result_msg += f" {scorer_name}: ("
+ if return_train_score:
+ scorer_scores = train_scores[scorer_name]
+ result_msg += f"train={scorer_scores:.3f}, "
+ result_msg += f"test={test_scores[scorer_name]:.3f})"
+ else:
+ result_msg += ", score="
+ if return_train_score:
+ result_msg += (
+ f"(train={train_scores:.3f}, test={test_scores:.3f})"
+ )
+ else:
+ result_msg += f"{test_scores:.3f}"
+ result_msg += f" total time={logger.short_format_time(total_time)}"
+
+ # Right align the result_msg
+ end_msg += "." * (80 - len(end_msg) - len(result_msg))
+ end_msg += result_msg
+ print(end_msg)
+
+ result["test_scores"] = test_scores
+ if return_train_score:
+ result["train_scores"] = train_scores
+ if return_n_test_samples:
+ result["n_test_samples"] = _num_samples(X_test)
+ if return_times:
+ result["fit_time"] = fit_time
+ result["score_time"] = score_time
+ if return_parameters:
+ result["parameters"] = parameters
+ if return_estimator:
+ result["estimator"] = estimator
+ return result
diff --git a/sequentia/models/__init__.py b/sequentia/models/__init__.py
index bbff6eb..e9b3bbc 100644
--- a/sequentia/models/__init__.py
+++ b/sequentia/models/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/sequentia/models/base.py b/sequentia/models/base.py
index 7a412d5..55874be 100644
--- a/sequentia/models/base.py
+++ b/sequentia/models/base.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -8,6 +8,7 @@
from __future__ import annotations
import abc
+import typing as t
import numpy as np
import sklearn.base
@@ -28,18 +29,18 @@ class ClassifierMixin(
@abc.abstractmethod
def fit(
- self: ClassifierMixin,
+ self,
X: Array,
y: IntArray,
*,
lengths: IntArray | None = None,
- ) -> ClassifierMixin:
+ ) -> t.Self:
"""Fit the classifier with the provided sequences and outputs."""
raise NotImplementedError
@abc.abstractmethod
def predict(
- self: ClassifierMixin,
+ self,
X: Array,
*,
lengths: IntArray | None = None,
@@ -48,7 +49,7 @@ def predict(
raise NotImplementedError
def fit_predict(
- self: ClassifierMixin,
+ self,
X: Array,
y: IntArray,
*,
@@ -59,8 +60,6 @@ def fit_predict(
Parameters
----------
- self: ClassifierMixin
-
X:
Sequence(s).
@@ -82,7 +81,7 @@ def fit_predict(
@abc.abstractmethod
def predict_proba(
- self: ClassifierMixin,
+ self,
X: Array,
*,
lengths: IntArray | None = None,
@@ -92,7 +91,7 @@ def predict_proba(
@abc.abstractmethod
def predict_scores(
- self: ClassifierMixin,
+ self,
X: Array,
*,
lengths: IntArray | None = None,
@@ -102,7 +101,7 @@ def predict_scores(
@_validation.requires_fit
def score(
- self: ClassifierMixin,
+ self,
X: Array,
y: IntArray,
*,
@@ -114,8 +113,6 @@ def score(
Parameters
----------
- self: ClassifierMixin
-
X:
Sequence(s).
@@ -155,24 +152,24 @@ class RegressorMixin(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
@abc.abstractmethod
def fit(
- self: RegressorMixin,
+ self,
X: FloatArray,
y: FloatArray,
*,
lengths: IntArray | None = None,
- ) -> RegressorMixin:
+ ) -> t.Self:
"""Fit the regressor with the provided sequences and outputs."""
raise NotImplementedError
@abc.abstractmethod
def predict(
- self: RegressorMixin, X: FloatArray, lengths: IntArray | None = None
+ self, X: FloatArray, lengths: IntArray | None = None
) -> FloatArray:
"""Predict outputs for the provided sequences."""
raise NotImplementedError
def fit_predict(
- self: RegressorMixin,
+ self,
X: FloatArray,
y: FloatArray,
*,
@@ -183,8 +180,6 @@ def fit_predict(
Parameters
----------
- self: RegressorMixin
-
X:
Sequence(s).
@@ -206,7 +201,7 @@ def fit_predict(
@_validation.requires_fit
def score(
- self: RegressorMixin,
+ self,
X: FloatArray,
y: FloatArray,
*,
@@ -218,8 +213,6 @@ def score(
Parameters
----------
- self: RegressorMixin
-
X:
Sequence(s).
diff --git a/sequentia/models/hmm/__init__.py b/sequentia/models/hmm/__init__.py
index 480da35..ea7f699 100644
--- a/sequentia/models/hmm/__init__.py
+++ b/sequentia/models/hmm/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/sequentia/models/hmm/classifier.py b/sequentia/models/hmm/classifier.py
index be0f31e..2da8a38 100644
--- a/sequentia/models/hmm/classifier.py
+++ b/sequentia/models/hmm/classifier.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -17,12 +17,12 @@
import pydantic as pyd
from sklearn.utils.validation import NotFittedError
-from sequentia._internal import _data, _multiprocessing, _validation
+from sequentia._internal import _data, _multiprocessing, _sklearn, _validation
from sequentia._internal._typing import Array, FloatArray, IntArray
from sequentia.datasets.base import SequentialDataset
from sequentia.enums import PriorMode
from sequentia.models.base import ClassifierMixin
-from sequentia.models.hmm.variants.base import BaseHMM
+from sequentia.models.hmm import variants
class HMMClassifier(ClassifierMixin):
@@ -35,8 +35,9 @@ class HMMClassifier(ClassifierMixin):
Examples
--------
- Using a :class:`.HMMClassifier` (with :class:`.GaussianMixtureHMM`
- models) to classify spoken digits. ::
+ Using a :class:`.HMMClassifier` with :class:`.GaussianMixtureHMM`
+ models for each class (all with identical settings),
+ to classify spoken digits. ::
import numpy as np
from sequentia.datasets import load_digits
@@ -47,7 +48,29 @@ class HMMClassifier(ClassifierMixin):
# Fetch MFCCs of spoken digits
data = load_digits()
- train_data, test_data = data.split(test_size=0.2, random_state=random_state)
+ train_data, test_data = data.split(
+ test_size=0.2, random_state=random_state
+ )
+
+ # Create a HMMClassifier using:
+ # - a separate GaussianMixtureHMM for each class (with 3 states)
+ # - a class frequency prior
+ clf = HMMClassifier(
+ variant=GaussianMixtureHMM,
+ model_kwargs=dict(n_states=3, random_state=random_state),
+ prior='frequency',
+ )
+
+ # Fit the HMMs by providing observation sequences for all classes
+ clf.fit(train_data.X, train_data.y, lengths=train_data.lengths)
+
+ # Predict classes for the test observation sequences
+ y_pred = clf.predict(test_data.X, lengths=test_data.lengths)
+
+ For more complex problems, it might be necessary to specify different
+ hyper-parameters for each individual class HMM. This can be done by
+ using :func:`add_model` or :func:`add_models` to add HMM objects
+ after the :class:`HMMClassifier` has been initialized. ::
# Create a HMMClassifier using a class frequency prior
clf = HMMClassifier(prior='frequency')
@@ -57,24 +80,18 @@ class HMMClassifier(ClassifierMixin):
model = GaussianMixtureHMM(random_state=random_state)
clf.add_model(model, label=label)
- # Fit the HMMs by providing training observation sequences for all classes
+ # Fit the HMMs by providing observation sequences for all classes
clf.fit(train_data.X, train_data.y, lengths=train_data.lengths)
- # Predict classes for the test observation sequences
- y_pred = clf.predict(test_data.X, lengths=test_data.lengths)
-
- As done in the above example, we can provide unfitted HMMs using
- :func:`add_model` or :func:`add_models`, then provide training
- observation sequences for all classes to :func:`fit`, which will
- automatically train each HMM on the appropriate subset of data.
-
- Alternatively, we may provide pre-fitted HMMs and call :func:`fit` with
- no arguments. ::
+ Alternatively, we might want to pre-fit the HMMs individually,
+ then add these fitted HMMs to the :class:`.HMMClassifier`. In this case,
+ :func:`fit` on the :class:`.HMMClassifier` is called without providing any
+ data as arguments, since the HMMs are already fitted. ::
# Create a HMMClassifier using a class frequency prior
clf = HMMClassifier(prior='frequency')
- # Manually fit each HMM on its own subset of data
+ # Manually fit each HMM on its own subset of data
for X_train, lengths_train, label for train_data.iter_by_class():
model = GaussianMixtureHMM(random_state=random_state)
model.fit(X_train, lengths=lengths_train)
@@ -82,28 +99,41 @@ class HMMClassifier(ClassifierMixin):
# Fit the classifier
clf.fit()
- """ # noqa: E501
+ """
@pyd.validate_call(config=dict(arbitrary_types_allowed=True))
def __init__(
- self: pyd.SkipValidation,
+ self,
*,
+ variant: type[variants.CategoricalHMM]
+ | type[variants.GaussianMixtureHMM]
+ | None = None,
+ model_kwargs: dict[str, t.Any] | None = None,
prior: (
- PriorMode | dict[int, pyd.confloat(ge=0, le=1)]
- ) = PriorMode.UNIFORM, # placeholder
+ PriorMode | dict[int, t.Annotated[float, pyd.Field(ge=0, le=1)]]
+ ) = PriorMode.UNIFORM,
classes: list[int] | None = None,
n_jobs: pyd.PositiveInt | pyd.NegativeInt = 1,
- ) -> pyd.SkipValidation:
+ ) -> None:
"""Initialize a :class:`.HMMClassifier`.
Parameters
----------
- self: HMMClassifier
+ variant:
+ Variant of HMM to use for modelling each class. If not specified,
+ models must instead be added using the :func:`add_model` or
+ :func:`add_models` methods after the :class:`.HMMClassifier` has
+ been initialized.
+
+ model_kwargs:
+ If ``variant`` is specified, these parameters are used to
+ initialize the created HMM object(s). Note that all HMMs
+ will be created with identical settings.
prior:
Type of prior probability to assign to each HMM.
- - If ``None``, a uniform prior will be used, making each HMM
+ - If ``"uniform"``, a uniform prior will be used, making each HMM
equally likely.
- If ``"frequency"``, the prior probability of each HMM is equal
to the fraction of total observation sequences that the HMM was
@@ -134,39 +164,43 @@ class labels provided here.
-------
HMMClassifier
"""
+ #: Type of HMM to use for each class.
+ self.variant: (
+ type[variants.CategoricalHMM]
+ | type[variants.GaussianMixtureHMM]
+ | None
+ ) = variant
+ #: Model parameters for initializing HMMs.
+ self.model_kwargs: dict[str, t.Any] | None = model_kwargs
#: Type of prior probability to assign to each HMM.
- self.prior: PriorMode | dict[int, pyd.confloat(ge=0, le=1)] = prior
+ self.prior: (
+ PriorMode | dict[int, t.Annotated[float, pyd.Field(ge=0, le=1)]]
+ ) = prior
#: Set of possible class labels.
self.classes: list[int] | None = classes
#: Maximum number of concurrently running workers.
self.n_jobs: pyd.PositiveInt | pyd.NegativeInt = n_jobs
#: HMMs constituting the :class:`.HMMClassifier`.
- self.models: dict[int, BaseHMM] = {}
+ self.models: dict[int, variants.BaseHMM] = {}
+
# Allow metadata routing for lengths
- self.set_fit_request(lengths=True)
- self.set_predict_request(lengths=True)
- self.set_predict_proba_request(lengths=True)
- self.set_predict_log_proba_request(lengths=True)
- self.set_score_request(
- lengths=True,
- normalize=True,
- sample_weight=True,
- )
+ if _sklearn.routing_enabled():
+ self.set_fit_request(lengths=True)
+ self.set_predict_request(lengths=True)
+ self.set_predict_proba_request(lengths=True)
+ self.set_predict_log_proba_request(lengths=True)
+ self.set_score_request(
+ lengths=True,
+ normalize=True,
+ sample_weight=True,
+ )
@pyd.validate_call(config=dict(arbitrary_types_allowed=True))
- def add_model(
- self: pyd.SkipValidation,
- model: BaseHMM,
- /,
- *,
- label: int,
- ) -> pyd.SkipValidation:
+ def add_model(self, model: variants.BaseHMM, /, *, label: int) -> t.Self:
"""Add a single HMM to the classifier.
Parameters
----------
- self: HMMClassifier
-
model:
HMM to add to the classifier.
@@ -196,17 +230,11 @@ def add_model(
return self
@pyd.validate_call(config=dict(arbitrary_types_allowed=True))
- def add_models(
- self: pyd.SkipValidation,
- models: dict[int, BaseHMM],
- /,
- ) -> pyd.SkipValidation:
+ def add_models(self, models: dict[int, variants.BaseHMM], /) -> t.Self:
"""Add HMMs to the classifier.
Parameters
----------
- self: HMMClassifier
-
models:
HMMs to add to the classifier. The key for each HMM should be the
label of the class represented by the HMM.
@@ -226,24 +254,23 @@ def add_models(
return self
def fit(
- self: HMMClassifier,
+ self,
X: Array | None = None,
y: IntArray | None = None,
*,
lengths: IntArray | None = None,
- ) -> HMMClassifier:
+ ) -> t.Self:
"""Fit the HMMs to the sequence(s) in ``X``.
- If fitted models were provided with :func:`add_model` or
:func:`add_models`, no arguments should be passed to :func:`fit`.
- If unfitted models were provided with :func:`add_model` or
- :func:`add_models`, training data ``X``, ``y`` and ``lengths``
- must be provided to :func:`fit`.
+ :func:`add_models`, or a ``variant`` was specified in
+ :func:`HMMClassifier.__init__`, training data ``X``, ``y`` and
+ ``lengths`` must be provided to :func:`fit`.
Parameters
----------
- self: HMMClassifier
-
X:
Sequence(s).
@@ -289,6 +316,13 @@ def fit(
y = _validation.check_y(y, lengths=lengths, dtype=np.int8)
self.classes_ = _validation.check_classes(y, classes=self.classes)
+ # Initialize models based on constructor spec if provided
+ if self.variant:
+ model_kwargs = self.model_kwargs or {}
+ self.models = {
+ label: self.variant(**model_kwargs) for label in self.classes_
+ }
+
# Check that each label has a HMM (and vice versa)
if set(self.models.keys()) != set(self.classes_):
msg = (
@@ -306,11 +340,27 @@ def fit(
lengths=lengths,
classes=self.classes_,
)
- for X_c, lengths_c, c in dataset.iter_by_class():
- self.models[c].fit(X_c, lengths=lengths_c)
+
+ # get number of jobs
+ n_jobs = _multiprocessing.effective_n_jobs(
+ self.n_jobs, x=self.classes_
+ )
+
+ # fit models in parallel
+ self.models = dict(
+ zip(
+ self.classes_,
+ joblib.Parallel(n_jobs=n_jobs, mmap_mode="r+")(
+ joblib.delayed(self.models[c].fit)(
+ X_c, lengths=lengths_c
+ )
+ for X_c, lengths_c, c in dataset.iter_by_class()
+ ),
+ )
+ )
# Set class priors
- models: t.Iterator[int, BaseHMM] = self.models.items()
+ models: t.Iterable[int, variants.BaseHMM] = self.models.items()
if self.prior == PriorMode.UNIFORM:
self.prior_ = {c: 1 / len(self.classes_) for c, _ in models}
elif self.prior == PriorMode.FREQUENCY:
@@ -330,7 +380,7 @@ def fit(
@_validation.requires_fit
def predict(
- self: HMMClassifier,
+ self,
X: Array,
*,
lengths: IntArray | None = None,
@@ -339,8 +389,6 @@ def predict(
Parameters
----------
- self: HMMClassifier
-
X:
Sequence(s).
@@ -365,15 +413,13 @@ def predict(
@_validation.requires_fit
def predict_log_proba(
- self: HMMClassifier, X: Array, *, lengths: IntArray | None = None
+ self, X: Array, *, lengths: IntArray | None = None
) -> FloatArray:
"""Predict log un-normalized posterior probabilities for the
sequences in ``X``.
Parameters
----------
- self: HMMClassifier
-
X:
Sequence(s).
@@ -396,7 +442,7 @@ def predict_log_proba(
@_validation.requires_fit
def predict_proba(
- self: HMMClassifier, X: Array, *, lengths: IntArray | None = None
+ self, X: Array, *, lengths: IntArray | None = None
) -> FloatArray:
"""Predict class probabilities for the sequence(s) in ``X``.
@@ -405,8 +451,6 @@ def predict_proba(
Parameters
----------
- self: HMMClassifier
-
X:
Sequence(s).
@@ -433,7 +477,7 @@ def predict_proba(
@_validation.requires_fit
def predict_scores(
- self: HMMClassifier, X: Array, *, lengths: IntArray | None = None
+ self, X: Array, *, lengths: IntArray | None = None
) -> FloatArray:
"""Predict class scores for the sequence(s) in ``X``.
@@ -442,8 +486,6 @@ def predict_scores(
Parameters
----------
- self: HMMClassifier
-
X:
Sequence(s).
@@ -462,7 +504,7 @@ def predict_scores(
-----
This method requires a trained classifier — see :func:`fit`.
"""
- model: BaseHMM = next(iter(self.models.values()))
+ model: variants.BaseHMM = next(iter(self.models.values()))
X, lengths = _validation.check_X_lengths(
X,
lengths=lengths,
@@ -471,20 +513,18 @@ def predict_scores(
n_jobs = _multiprocessing.effective_n_jobs(self.n_jobs, x=lengths)
chunk_idxs = np.array_split(_data.get_idxs(lengths), n_jobs)
return np.concatenate(
- joblib.Parallel(n_jobs=n_jobs, max_nbytes=None)(
+ joblib.Parallel(n_jobs=n_jobs, mmap_mode="r+")(
joblib.delayed(self._compute_scores_chunk)(X, idxs=idxs)
for idxs in chunk_idxs
)
)
@_validation.requires_fit
- def save(self: HMMClassifier, path: str | pathlib.Path | t.IO, /) -> None:
+ def save(self, path: str | pathlib.Path | t.IO, /) -> None:
"""Serialize and save a fitted HMM classifier.
Parameters
----------
- self: HMMClassifier
-
path:
Location to save the serialized classifier.
@@ -509,17 +549,11 @@ def save(self: HMMClassifier, path: str | pathlib.Path | t.IO, /) -> None:
joblib.dump(state, path)
@classmethod
- def load(
- cls: type[HMMClassifier],
- path: str | pathlib.Path | t.IO,
- /,
- ) -> HMMClassifier:
+ def load(cls, path: str | pathlib.Path | t.IO, /) -> HMMClassifier:
"""Load and deserialize a fitted HMM classifier.
Parameters
----------
- cls: type[HMMClassifier]
-
path:
Location to load the serialized classifier from.
@@ -547,7 +581,7 @@ def load(
return model
def _compute_scores_chunk(
- self: HMMClassifier, X: Array, /, *, idxs: IntArray
+ self, X: Array, /, *, idxs: IntArray
) -> FloatArray:
"""Compute log posterior probabilities for a chunk of sequences."""
scores = np.zeros((len(idxs), len(self.classes_)))
@@ -556,7 +590,7 @@ def _compute_scores_chunk(
return scores
def _compute_log_posterior(
- self: HMMClassifier,
+ self,
x: Array,
/,
) -> FloatArray:
diff --git a/sequentia/models/hmm/variants/__init__.py b/sequentia/models/hmm/variants/__init__.py
index b40b57f..ea339bb 100644
--- a/sequentia/models/hmm/variants/__init__.py
+++ b/sequentia/models/hmm/variants/__init__.py
@@ -1,11 +1,12 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
"""Supported hidden Markov Model variants."""
+from sequentia.models.hmm.variants.base import BaseHMM
from sequentia.models.hmm.variants.categorical import CategoricalHMM
from sequentia.models.hmm.variants.gaussian_mixture import GaussianMixtureHMM
-__all__ = ["CategoricalHMM", "GaussianMixtureHMM"]
+__all__ = ["BaseHMM", "CategoricalHMM", "GaussianMixtureHMM"]
diff --git a/sequentia/models/hmm/variants/base.py b/sequentia/models/hmm/variants/base.py
index d80d4e7..3e0b864 100644
--- a/sequentia/models/hmm/variants/base.py
+++ b/sequentia/models/hmm/variants/base.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -34,13 +34,13 @@ class BaseHMM(BaseEstimator, metaclass=abc.ABCMeta):
@abc.abstractmethod
def __init__(
- self: BaseHMM,
+ self,
*,
n_states: pyd.PositiveInt,
topology: enums.TopologyMode | None,
random_state: pyd.NonNegativeInt | np.random.RandomState | None,
hmmlearn_kwargs: dict[str, t.Any] | None,
- ) -> BaseHMM:
+ ) -> None:
self.n_states: int = n_states
"""Number of states in the Markov chain."""
@@ -66,19 +66,12 @@ def __init__(
self._skip_init_params = set()
self._skip_params = set()
- def fit(
- self: BaseHMM,
- X: Array,
- *,
- lengths: IntArray | None = None,
- ) -> BaseHMM:
+ def fit(self, X: Array, *, lengths: IntArray | None = None) -> t.Self:
"""Fit the HMM to the sequences in ``X``, using the Baum—Welch
algorithm.
Parameters
----------
- self: BaseHMM
-
X:
Sequence(s).
@@ -123,14 +116,12 @@ def fit(
return self
@_validation.requires_fit
- def score(self: BaseHMM, x: Array, /) -> float:
+ def score(self, x: Array, /) -> float:
"""Calculate the log-likelihood of the HMM generating a single
observation sequence.
Parameters
----------
- self: BaseHMM
-
x:
Sequence.
@@ -152,7 +143,7 @@ def score(self: BaseHMM, x: Array, /) -> float:
@abc.abstractproperty
@_validation.requires_fit
- def n_params(self: BaseHMM) -> int:
+ def n_params(self) -> int:
"""Number of trainable parameters — requires :func:`fit`."""
n_params = 0
if "s" not in self._skip_params:
@@ -163,7 +154,7 @@ def n_params(self: BaseHMM) -> int:
@_validation.requires_fit
def bic(
- self: BaseHMM,
+ self,
X: Array,
*,
lengths: IntArray | None = None,
@@ -173,8 +164,6 @@ def bic(
Parameters
----------
- self: BaseHMM
-
X:
Sequence(s).
@@ -200,7 +189,7 @@ def bic(
@_validation.requires_fit
def aic(
- self: BaseHMM,
+ self,
X: Array,
*,
lengths: IntArray | None = None,
@@ -210,8 +199,6 @@ def aic(
Parameters
----------
- self: BaseHMM
-
X:
Sequence(s).
@@ -236,10 +223,10 @@ def aic(
@pyd.validate_call(config=dict(arbitrary_types_allowed=True))
def set_state_start_probs(
- self: pyd.SkipValidation,
+ self,
probs: (
FloatArray | enums.TransitionMode
- ) = enums.TransitionMode.RANDOM, # placeholder
+ ) = enums.TransitionMode.RANDOM,
/,
) -> None:
"""Set the initial state probabilities.
@@ -258,8 +245,6 @@ def set_state_start_probs(
Parameters
----------
- self: BaseHMM
-
probs:
Probabilities or probability type to assign as initial state
probabilities.
@@ -285,10 +270,10 @@ def set_state_start_probs(
@pyd.validate_call(config=dict(arbitrary_types_allowed=True))
def set_state_transition_probs(
- self: pyd.SkipValidation,
+ self,
probs: (
FloatArray | enums.TransitionMode
- ) = enums.TransitionMode.RANDOM, # placeholder
+ ) = enums.TransitionMode.RANDOM,
/,
) -> None:
"""Set the transition probability matrix.
@@ -307,8 +292,6 @@ def set_state_transition_probs(
Parameters
----------
- self: BaseHMM
-
probs:
Probabilities or probability type to assign as state transition
probabilities.
@@ -335,7 +318,7 @@ def set_state_transition_probs(
self._skip_init_params |= set("t")
@abc.abstractmethod
- def freeze(self: BaseHMM, params: str | None, /) -> None:
+ def freeze(self, params: str | None, /) -> None:
"""Freeze the trainable parameters of the HMM,
preventing them from be updated during the Baum—Welch algorithm.
"""
@@ -343,19 +326,19 @@ def freeze(self: BaseHMM, params: str | None, /) -> None:
self._skip_params |= set(self._modify_params(params or defaults))
@abc.abstractmethod
- def unfreeze(self: BaseHMM, params: str | None, /) -> None:
+ def unfreeze(self, params: str | None, /) -> None:
"""Unfreeze the trainable parameters of the HMM,
allowing them to be updated during the Baum—Welch algorithm.
"""
defaults = self._hmmlearn_kwargs_defaults()["params"]
self._skip_params -= set(self._modify_params(params or defaults))
- def _modify_params(self: BaseHMM, params: str) -> str:
+ def _modify_params(self, params: str) -> str:
"""Validate parameters to be frozen/unfrozen."""
defaults = self._hmmlearn_kwargs_defaults()["params"]
msg = (
"Expected a string consisting of any combination of "
- f"{defaults!r}" #
+ f"{defaults!r}"
)
if isinstance(params, str):
if bool(re.compile(rf"[^{defaults}]").search(params)):
@@ -364,7 +347,7 @@ def _modify_params(self: BaseHMM, params: str) -> str:
raise TypeError(msg)
return params
- def _check_init_params(self: BaseHMM) -> None:
+ def _check_init_params(self) -> None:
"""Validate hmmlearn init_params argument."""
topology = self.topology_ or _hmm.topologies.ErgodicTopology(
n_states=self.n_states,
@@ -401,7 +384,7 @@ def _check_init_params(self: BaseHMM) -> None:
@classmethod
def _check_hmmlearn_kwargs(
- cls: type[BaseHMM], kwargs: dict[str, t.Any] | None
+ cls, kwargs: dict[str, t.Any] | None
) -> dict[str, t.Any]:
"""Check hmmlearn forwarded key-word arguments."""
defaults: dict[str, t.Any] = cls._hmmlearn_kwargs_defaults()
@@ -456,7 +439,7 @@ def _check_hmmlearn_kwargs(
return kwargs
@abc.abstractmethod
- def _init_hmm(self: BaseHMM, **kwargs: t.Any) -> hmmlearn.base.BaseHMM:
+ def _init_hmm(self, **kwargs: t.Any) -> hmmlearn.base.BaseHMM:
"""Initialize the hmmlearn model."""
raise NotImplementedError
diff --git a/sequentia/models/hmm/variants/categorical.py b/sequentia/models/hmm/variants/categorical.py
index fdc9404..3c1ddc6 100644
--- a/sequentia/models/hmm/variants/categorical.py
+++ b/sequentia/models/hmm/variants/categorical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -59,19 +59,17 @@ class CategoricalHMM(BaseHMM):
@pyd.validate_call(config=dict(arbitrary_types_allowed=True))
def __init__(
- self: pyd.SkipValidation,
+ self,
*,
n_states: pyd.PositiveInt = 5,
topology: enums.TopologyMode | None = enums.TopologyMode.LEFT_RIGHT,
random_state: pyd.NonNegativeInt | np.random.RandomState | None = None,
hmmlearn_kwargs: dict[str, t.Any] | None = None,
- ) -> pyd.SkipValidation:
+ ) -> None:
"""Initializes the :class:`.CategoricalHMM`.
Parameters
----------
- self: CategoricalHMM
-
n_states:
Number of states in the Markov chain.
@@ -104,18 +102,14 @@ def __init__(
@property
@_validation.requires_fit
- def n_params(self: CategoricalHMM) -> int:
+ def n_params(self) -> int:
"""Number of trainable parameters — requires :func:`fit`."""
n_params = super().n_params
if "e" not in self._skip_params:
n_params += self.model.emissionprob_.size
return n_params
- def set_state_emission_probs(
- self: CategoricalHMM,
- probs: FloatArray,
- /,
- ) -> None:
+ def set_state_emission_probs(self, probs: FloatArray, /) -> None:
"""Set the state emission distribution of the HMM's emission model.
If this method is **not** called, emission probabilities will be
@@ -124,8 +118,6 @@ def set_state_emission_probs(
Parameters
----------
- self: CategoricalHMM
-
probs:
Array of emission probabilities.
@@ -136,14 +128,12 @@ def set_state_emission_probs(
self._emissionprob = np.array(probs, dtype=np.float64)
self._skip_init_params |= set("e")
- def freeze(self: CategoricalHMM, params: str | None = None, /) -> None:
+ def freeze(self, params: str | None = None, /) -> None:
"""Freeze the trainable parameters of the HMM,
preventing them from being updated during the Baum—Welch algorithm.
Parameters
----------
- self: CategoricalHMM
-
params:
A string specifying which parameters to freeze.
Can contain a combination of:
@@ -164,14 +154,12 @@ def freeze(self: CategoricalHMM, params: str | None = None, /) -> None:
"""
super().freeze(params)
- def unfreeze(self: CategoricalHMM, params: str | None = None, /) -> None:
+ def unfreeze(self, params: str | None = None, /) -> None:
"""Unfreeze the trainable parameters of the HMM,
allowing them to be updated during the Baum—Welch algorithm.
Parameters
----------
- self: CategoricalHMM
-
params:
A string specifying which parameters to unfreeze.
Can contain a combination of:
@@ -189,10 +177,7 @@ def unfreeze(self: CategoricalHMM, params: str | None = None, /) -> None:
"""
super().unfreeze(params)
- def _init_hmm(
- self: CategoricalHMM,
- **kwargs: t.Any,
- ) -> hmmlearn.hmm.CategoricalHMM:
+ def _init_hmm(self, **kwargs: t.Any) -> hmmlearn.hmm.CategoricalHMM:
"""Initialize the hmmlearn model."""
return hmmlearn.hmm.CategoricalHMM(
n_components=self.n_states,
diff --git a/sequentia/models/hmm/variants/gaussian_mixture.py b/sequentia/models/hmm/variants/gaussian_mixture.py
index 87bfcf8..58042a8 100644
--- a/sequentia/models/hmm/variants/gaussian_mixture.py
+++ b/sequentia/models/hmm/variants/gaussian_mixture.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -60,7 +60,7 @@ class GaussianMixtureHMM(BaseHMM):
@pyd.validate_call(config=dict(arbitrary_types_allowed=True))
def __init__(
- self: pyd.SkipValidation,
+ self,
*,
n_states: pyd.PositiveInt = 5,
n_components: pyd.PositiveInt = 3,
@@ -68,13 +68,11 @@ def __init__(
topology: enums.TopologyMode | None = enums.TopologyMode.LEFT_RIGHT,
random_state: pyd.NonNegativeInt | np.random.RandomState | None = None,
hmmlearn_kwargs: dict[str, t.Any] | None = None,
- ) -> pyd.SkipValidation:
+ ) -> None:
"""Initializes the :class:`.GaussianMixtureHMM`.
Parameters
----------
- self: GaussianMixtureHMM
-
n_states:
Number of states in the Markov chain.
@@ -122,7 +120,7 @@ def __init__(
@property
@_validation.requires_fit
- def n_params(self: GaussianMixtureHMM) -> int:
+ def n_params(self) -> int:
"""Number of trainable parameters — requires :func:`fit`."""
n_params = super().n_params
if "m" not in self._skip_params:
@@ -133,11 +131,7 @@ def n_params(self: GaussianMixtureHMM) -> int:
n_params += self.model.weights_.size
return n_params
- def set_state_means(
- self: GaussianMixtureHMM,
- means: FloatArray,
- /,
- ) -> None:
+ def set_state_means(self, means: FloatArray, /) -> None:
"""Set the mean vectors of the state emission distributions.
If this method is **not** called, mean vectors will be
@@ -146,8 +140,6 @@ def set_state_means(
Parameters
----------
- self: GaussianMixtureHMM
-
means:
Array of mean values.
@@ -158,11 +150,7 @@ def set_state_means(
self._means = np.array(means, dtype=np.float64)
self._skip_init_params |= set("m")
- def set_state_covars(
- self: GaussianMixtureHMM,
- covars: FloatArray,
- /,
- ) -> None:
+ def set_state_covars(self, covars: FloatArray, /) -> None:
"""Set the covariance matrices of the state emission distributions.
If this method is **not** called, covariance matrices will be
@@ -171,8 +159,6 @@ def set_state_covars(
Parameters
----------
- self: GaussianMixtureHMM
-
covars:
Array of covariance values.
@@ -183,11 +169,7 @@ def set_state_covars(
self._covars = np.array(covars, dtype=np.float64)
self._skip_init_params |= set("c")
- def set_state_weights(
- self: GaussianMixtureHMM,
- weights: FloatArray,
- /,
- ) -> None:
+ def set_state_weights(self, weights: FloatArray, /) -> None:
"""Set the component mixture weights of the state emission
distributions.
@@ -197,8 +179,6 @@ def set_state_weights(
Parameters
----------
- self: GaussianMixtureHMM
-
weights:
Array of component mixture weights.
@@ -209,18 +189,12 @@ def set_state_weights(
self._weights = np.array(weights, dtype=np.float64)
self._skip_init_params |= set("w")
- def freeze(
- self: GaussianMixtureHMM,
- params: str | None = None,
- /,
- ) -> None:
+ def freeze(self, params: str | None = None, /) -> None:
"""Freeze the trainable parameters of the HMM,
preventing them from be updated during the Baum—Welch algorithm.
Parameters
----------
- self: GaussianMixtureHMM
-
params:
A string specifying which parameters to freeze. Can contain a
combination of:
@@ -239,18 +213,12 @@ def freeze(
"""
super().freeze(params)
- def unfreeze(
- self: GaussianMixtureHMM,
- params: str | None = None,
- /,
- ) -> None:
+ def unfreeze(self, params: str | None = None, /) -> None:
"""Unfreeze the trainable parameters of the HMM,
allowing them to be updated during the Baum—Welch algorithm.
Parameters
----------
- self: GaussianMixtureHMM
-
params:
A string specifying which parameters to unfreeze. Can contain
a combination of:
@@ -270,7 +238,7 @@ def unfreeze(
super().unfreeze(params)
def _init_hmm(
- self: GaussianMixtureHMM,
+ self,
**kwargs: t.Any,
) -> hmmlearn.hmm.GMMHMM:
"""Initialize the hmmlearn model."""
diff --git a/sequentia/models/knn/__init__.py b/sequentia/models/knn/__init__.py
index 8e88274..abb405c 100644
--- a/sequentia/models/knn/__init__.py
+++ b/sequentia/models/knn/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/sequentia/models/knn/base.py b/sequentia/models/knn/base.py
index d2d91e9..0d25325 100644
--- a/sequentia/models/knn/base.py
+++ b/sequentia/models/knn/base.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -30,7 +30,7 @@ class KNNMixin:
@_validation.requires_fit
def query_neighbors(
- self: KNNMixin,
+ self,
X: FloatArray,
*,
lengths: IntArray | None = None,
@@ -41,8 +41,6 @@ def query_neighbors(
Parameters
----------
- self: KNNMixin
-
X:
Sequence(s).
@@ -94,7 +92,7 @@ def query_neighbors(
@_validation.requires_fit
def compute_distance_matrix(
- self: KNNMixin,
+ self,
X: FloatArray,
*,
lengths: IntArray | None = None,
@@ -104,8 +102,6 @@ def compute_distance_matrix(
Parameters
----------
- self: KNNMixin
-
X:
Sequence(s).
@@ -143,7 +139,7 @@ def compute_distance_matrix(
# multiprocessed DTW calculation
return np.vstack(
- joblib.Parallel(n_jobs=n_jobs, max_nbytes=None)(
+ joblib.Parallel(n_jobs=n_jobs, mmap_mode="r+")(
joblib.delayed(self._distance_matrix_row_chunk)(
row_idxs, col_chunk_idxs, X, n_jobs, dtw
)
@@ -152,13 +148,11 @@ def compute_distance_matrix(
)
@_validation.requires_fit
- def dtw(self: KNNMixin, A: FloatArray, B: FloatArray) -> float:
+ def dtw(self, A: FloatArray, B: FloatArray) -> float:
"""Calculate the DTW distance between two observation sequences.
Parameters
----------
- self: KNNMixin
-
A:
The first sequence.
@@ -179,7 +173,7 @@ def dtw(self: KNNMixin, A: FloatArray, B: FloatArray) -> float:
return self._dtw()(A, B)
def _dtw1d(
- self: KNNMixin,
+ self,
a: FloatArray,
b: FloatArray,
*,
@@ -193,11 +187,11 @@ def _dtw1d(
window=window,
)
- def _window(self: KNNMixin, A: FloatArray, B: FloatArray) -> int:
+ def _window(self, A: FloatArray, B: FloatArray) -> int:
"""Calculate the absolute DTW window size."""
return int(self.window * min(len(A), len(B)))
- def _dtwi(self: KNNMixin, A: FloatArray, B: FloatArray) -> float:
+ def _dtwi(self, A: FloatArray, B: FloatArray) -> float:
"""Compute the multivariate DTW distance as the sum of the pairwise
per-feature DTW distances, allowing each feature to be warped
independently.
@@ -210,7 +204,7 @@ def dtw(a: FloatArray, b: FloatArray) -> float:
return np.sum([dtw(A[:, i], B[:, i]) for i in range(A.shape[1])])
- def _dtwd(self: KNNMixin, A: FloatArray, B: FloatArray) -> float:
+ def _dtwd(self, A: FloatArray, B: FloatArray) -> float:
"""Compute the multivariate DTW distance so that the warping of the
features depends on each other, by modifying the local distance
measure.
@@ -223,18 +217,18 @@ def _dtwd(self: KNNMixin, A: FloatArray, B: FloatArray) -> float:
window=window,
)
- def _dtw(self: KNNMixin) -> t.Callable[[FloatArray], float]:
+ def _dtw(self) -> t.Callable[[FloatArray], float]:
"""Conditional DTW callable."""
return self._dtwi if self.independent else self._dtwd
- def _weighting(self: KNNMixin) -> t.Callable[[FloatArray], FloatArray]:
+ def _weighting(self) -> t.Callable[[FloatArray], FloatArray]:
"""Weighting function - use equal weighting if not provided."""
if callable(self.weighting):
return self.weighting
return np.ones_like
def _distance_matrix_row_chunk(
- self: KNNMixin,
+ self,
row_idxs: IntArray,
col_chunk_idxs: list[IntArray],
X: FloatArray,
@@ -245,7 +239,7 @@ def _distance_matrix_row_chunk(
columns.
"""
return np.hstack(
- joblib.Parallel(n_jobs=n_jobs, max_nbytes=None)(
+ joblib.Parallel(n_jobs=n_jobs, mmap_mode="r+")(
joblib.delayed(self._distance_matrix_row_col_chunk)(
col_idxs, row_idxs, X, dtw
)
@@ -254,7 +248,7 @@ def _distance_matrix_row_chunk(
)
def _distance_matrix_row_col_chunk(
- self: KNNMixin,
+ self,
col_idxs: IntArray,
row_idxs: IntArray,
X: FloatArray,
@@ -270,17 +264,11 @@ def _distance_matrix_row_col_chunk(
return distances
@_validation.requires_fit
- def save(
- self: KNNMixin,
- path: str | pathlib.Path | t.IO,
- /,
- ) -> None:
+ def save(self, path: str | pathlib.Path | t.IO, /) -> None:
"""Serialize and save a fitted KNN estimator.
Parameters
----------
- self: KNNMixin
-
path:
Location to save the serialized estimator.
@@ -312,17 +300,11 @@ def save(
joblib.dump(state, path)
@classmethod
- def load(
- cls: type[KNNMixin],
- path: str | pathlib.Path | t.IO,
- /,
- ) -> KNNMixin:
+ def load(cls, path: str | pathlib.Path | t.IO, /) -> KNNMixin:
"""Load and deserialize a fitted KNN estimator.
Parameters
----------
- cls: type[KNNMixin]
-
path:
Location to load the serialized estimator from.
diff --git a/sequentia/models/knn/classifier.py b/sequentia/models/knn/classifier.py
index 51af161..76b5240 100644
--- a/sequentia/models/knn/classifier.py
+++ b/sequentia/models/knn/classifier.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -16,7 +16,7 @@
import numpy as np
import pydantic as pyd
-from sequentia._internal import _data, _multiprocessing, _validation
+from sequentia._internal import _data, _multiprocessing, _sklearn, _validation
from sequentia._internal._typing import Array, FloatArray, IntArray
from sequentia.models.base import ClassifierMixin
from sequentia.models.knn.base import KNNMixin
@@ -59,23 +59,21 @@ class KNNClassifier(KNNMixin, ClassifierMixin):
@pyd.validate_call(config=dict(arbitrary_types_allowed=True))
def __init__(
- self: pyd.SkipValidation,
+ self,
*,
k: pyd.PositiveInt = 1,
weighting: t.Callable[[FloatArray], FloatArray] | None = None,
- window: pyd.confloat(ge=0.0, le=1.0) = 1.0,
+ window: t.Annotated[float, pyd.Field(ge=0, le=1)] = 1.0,
independent: bool = False,
- use_c: bool = False,
+ use_c: bool = True,
n_jobs: pyd.PositiveInt | pyd.NegativeInt = 1,
random_state: pyd.NonNegativeInt | np.random.RandomState | None = None,
classes: list[int] | None = None,
- ) -> pyd.SkipValidation:
+ ) -> None:
"""Initializes the :class:`.KNNClassifier`.
Parameters
----------
- self: KNNClassifier
-
k:
Number of neighbors.
@@ -142,9 +140,7 @@ def __init__(
self.k: int = k
"""Number of neighbors."""
- self.weighting: t.Callable[[np.ndarray], np.ndarray] | None = (
- weighting # placeholder
- )
+ self.weighting: t.Callable[[np.ndarray], np.ndarray] | None = weighting
"""A callable that specifies how distance weighting should be
performed."""
@@ -172,29 +168,28 @@ def __init__(
"""Set of possible class labels."""
# Allow metadata routing for lengths
- self.set_fit_request(lengths=True)
- self.set_predict_request(lengths=True)
- self.set_predict_log_proba_request(lengths=True)
- self.set_predict_proba_request(lengths=True)
- self.set_score_request(
- lengths=True,
- normalize=True,
- sample_weight=True,
- )
+ if _sklearn.routing_enabled():
+ self.set_fit_request(lengths=True)
+ self.set_predict_request(lengths=True)
+ self.set_predict_log_proba_request(lengths=True)
+ self.set_predict_proba_request(lengths=True)
+ self.set_score_request(
+ lengths=True,
+ normalize=True,
+ sample_weight=True,
+ )
def fit(
- self: KNNClassifier,
+ self,
X: FloatArray,
y: IntArray,
*,
lengths: IntArray | None = None,
- ) -> KNNClassifier:
+ ) -> t.Self:
"""Fit the classifier to the sequence(s) in ``X``.
Parameters
----------
- self: KNNClassifier
-
X:
Sequence(s).
@@ -232,7 +227,7 @@ def fit(
@_validation.requires_fit
def predict(
- self: KNNClassifier,
+ self,
X: FloatArray,
*,
lengths: IntArray | None = None,
@@ -241,8 +236,6 @@ def predict(
Parameters
----------
- self: KNNClassifier
-
X:
Sequence(s).
@@ -266,7 +259,7 @@ def predict(
@_validation.requires_fit
def predict_log_proba(
- self: KNNClassifier,
+ self,
X: FloatArray,
*,
lengths: IntArray | None = None,
@@ -277,8 +270,6 @@ def predict_log_proba(
Parameters
----------
- self: KNNClassifier
-
X:
Sequence(s).
@@ -301,7 +292,7 @@ def predict_log_proba(
@_validation.requires_fit
def predict_proba(
- self: KNNClassifier,
+ self,
X: FloatArray,
*,
lengths: IntArray | None = None,
@@ -312,8 +303,6 @@ def predict_proba(
Parameters
----------
- self: KNNClassifier
-
X:
Sequence(s).
@@ -337,7 +326,7 @@ def predict_proba(
@_validation.requires_fit
def predict_scores(
- self: KNNClassifier,
+ self,
X: FloatArray,
*,
lengths: IntArray | None = None,
@@ -349,8 +338,6 @@ def predict_scores(
Parameters
----------
- self: KNNClassifier
-
X:
Sequence(s).
@@ -378,7 +365,7 @@ def predict_scores(
return self._compute_scores(k_labels, k_weightings)
def _compute_scores(
- self: KNNClassifier, labels: IntArray, weightings: FloatArray
+ self, labels: IntArray, weightings: FloatArray
) -> FloatArray:
"""Calculate the sum of the weightings for each label group."""
scores = np.zeros((len(labels), len(self.classes_)))
@@ -387,7 +374,7 @@ def _compute_scores(
return scores
def _find_max_labels(
- self: KNNClassifier,
+ self,
scores: FloatArray,
/,
) -> IntArray:
@@ -397,15 +384,13 @@ def _find_max_labels(
n_jobs = _multiprocessing.effective_n_jobs(self.n_jobs, x=scores)
score_chunks = np.array_split(scores, n_jobs)
return np.concatenate(
- joblib.Parallel(n_jobs=n_jobs, max_nbytes=None)(
+ joblib.Parallel(n_jobs=n_jobs, mmap_mode="r+")(
joblib.delayed(self._find_max_labels_chunk)(score_chunk)
for score_chunk in score_chunks
)
)
- def _find_max_labels_chunk(
- self: KNNClassifier, score_chunk: FloatArray, /
- ) -> IntArray:
+ def _find_max_labels_chunk(self, score_chunk: FloatArray, /) -> IntArray:
"""Return the label with the highest score for each item in the
chunk.
"""
diff --git a/sequentia/models/knn/regressor.py b/sequentia/models/knn/regressor.py
index 88ed9ba..8e9e83c 100644
--- a/sequentia/models/knn/regressor.py
+++ b/sequentia/models/knn/regressor.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -14,7 +14,7 @@
import numpy as np
import pydantic as pyd
-from sequentia._internal import _data, _validation
+from sequentia._internal import _data, _sklearn, _validation
from sequentia._internal._typing import FloatArray, IntArray
from sequentia.models.base import RegressorMixin
from sequentia.models.knn.base import KNNMixin
@@ -33,22 +33,20 @@ class KNNRegressor(KNNMixin, RegressorMixin):
@pyd.validate_call(config=dict(arbitrary_types_allowed=True))
def __init__(
- self: pyd.SkipValidation,
+ self,
*,
k: pyd.PositiveInt = 1,
weighting: t.Callable[[FloatArray], FloatArray] | None = None,
- window: pyd.confloat(ge=0.0, le=1.0) = 1.0,
+ window: t.Annotated[float, pyd.Field(ge=0, le=1)] = 1.0,
independent: bool = False,
- use_c: bool = False,
+ use_c: bool = True,
n_jobs: pyd.PositiveInt | pyd.NegativeInt = 1,
random_state: pyd.NonNegativeInt | np.random.RandomState | None = None,
- ) -> pyd.SkipValidation:
+ ) -> None:
"""Initializes the :class:`.KNNRegressor`.
Parameters
----------
- self: KNNRegressor
-
k:
Number of neighbors.
@@ -64,7 +62,8 @@ def __init__(
If ``None``, then a uniform weighting of 1 will be applied to all
distances.
- window: The size of the Sakoe—Chiba band global constrant as a
+ window:
+ The size of the Sakoe—Chiba band global constraint as a
fraction of the length of the shortest of the two sequences being
compared.
@@ -106,9 +105,7 @@ def __init__(
self.k: int = k
"""Number of neighbors."""
- self.weighting: t.Callable[[np.ndarray], np.ndarray] | None = (
- weighting # placeholder
- )
+ self.weighting: t.Callable[[np.ndarray], np.ndarray] | None = weighting
"""A callable that specifies how distance weighting should be
performed."""
@@ -131,23 +128,22 @@ def __init__(
reproducible pseudo-randomness."""
# Allow metadata routing for lengths
- self.set_fit_request(lengths=True)
- self.set_predict_request(lengths=True)
- self.set_score_request(lengths=True, sample_weight=True)
+ if _sklearn.routing_enabled():
+ self.set_fit_request(lengths=True)
+ self.set_predict_request(lengths=True)
+ self.set_score_request(lengths=True, sample_weight=True)
def fit(
- self: KNNRegressor,
+ self,
X: FloatArray,
y: FloatArray,
*,
lengths: IntArray | None = None,
- ) -> KNNRegressor:
+ ) -> t.Self:
"""Fits the regressor to the sequence(s) in ``X``.
Parameters
----------
- self: KNNRegressor
-
X:
Sequence(s).
@@ -181,7 +177,7 @@ def fit(
@_validation.requires_fit
def predict(
- self: KNNRegressor,
+ self,
X: FloatArray,
*,
lengths: IntArray | None = None,
@@ -190,8 +186,6 @@ def predict(
Parameters
----------
- self: KNNRegressor
-
X:
Sequence(s).
diff --git a/sequentia/preprocessing/__init__.py b/sequentia/preprocessing/__init__.py
index 236a880..ad77bd4 100644
--- a/sequentia/preprocessing/__init__.py
+++ b/sequentia/preprocessing/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/sequentia/preprocessing/transforms.py b/sequentia/preprocessing/transforms.py
index d609d60..52c7509 100644
--- a/sequentia/preprocessing/transforms.py
+++ b/sequentia/preprocessing/transforms.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -45,15 +45,17 @@
from __future__ import annotations
+import typing as t
import warnings
import numpy as np
import scipy.signal
+import sklearn
import sklearn.base
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils.validation import _allclose_dense_sparse, check_array
-from sequentia._internal import _data, _validation
+from sequentia._internal import _data, _sklearn, _validation
from sequentia._internal._typing import Array, FloatArray, IntArray
__all__ = ["IndependentFunctionTransformer", "mean_filter", "median_filter"]
@@ -122,10 +124,12 @@ def __init__(
self.feature_names_out = feature_names_out
self.kw_args = kw_args
self.inv_kw_args = inv_kw_args
+
# Allow metadata routing for lengths
- self.set_fit_request(lengths=True)
- self.set_transform_request(lengths=True)
- self.set_inverse_transform_request(lengths=True)
+ if _sklearn.routing_enabled():
+ self.set_fit_request(lengths=True)
+ self.set_transform_request(lengths=True)
+ self.set_inverse_transform_request(lengths=True)
def _check_input(self, X, *, lengths, reset):
if self.validate:
@@ -173,18 +177,16 @@ def _check_inverse_transform(self, X, *, lengths):
@sklearn.base._fit_context(prefer_skip_nested_validation=True)
def fit(
- self: IndependentFunctionTransformer,
+ self,
X: Array,
y: Array | None = None,
*,
lengths: IntArray | None = None,
- ) -> IndependentFunctionTransformer:
+ ) -> t.Self:
"""Fits the transformer to ``X``.
Parameters
----------
- self: IndependentFunctionTransformer
-
X:
Sequence(s).
@@ -210,7 +212,7 @@ def fit(
return self
def transform(
- self: IndependentFunctionTransformer,
+ self,
X: Array,
*,
lengths: IntArray | None = None,
@@ -220,8 +222,6 @@ def transform(
Parameters
----------
- self: IndependentFunctionTransformer
-
X:
Sequence(s).
@@ -242,7 +242,7 @@ def transform(
)
def inverse_transform(
- self: IndependentFunctionTransformer,
+ self,
X: Array,
*,
lengths: IntArray | None = None,
@@ -251,8 +251,6 @@ def inverse_transform(
Parameters
----------
- self: IndependentFunctionTransformer
-
X:
Sequence(s).
@@ -280,7 +278,7 @@ def inverse_transform(
)
def fit_transform(
- self: IndependentFunctionTransformer,
+ self,
X: Array,
y: Array | None = None,
*,
@@ -291,8 +289,6 @@ def fit_transform(
Parameters
----------
- self: IndependentFunctionTransformer
-
X:
Sequence(s).
diff --git a/sequentia/version.py b/sequentia/version.py
index bcec5ca..91e71f8 100644
--- a/sequentia/version.py
+++ b/sequentia/version.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -33,7 +33,7 @@
__all__ = ["VERSION", "version_info"]
-VERSION = "2.0.2"
+VERSION = "2.5.0"
def version_info() -> str:
@@ -79,6 +79,6 @@ def version_info() -> str:
"related packages": ", ".join(related_packages),
}
return "\n".join(
- "{:>30} {}".format(k + ":", str(v).replace("\n", " ")) #
+ "{:>30} {}".format(k + ":", str(v).replace("\n", " "))
for k, v in info.items()
)
diff --git a/tasks.py b/tasks.py
index b075595..65de3f7 100644
--- a/tasks.py
+++ b/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/__init__.py b/tests/__init__.py
index 8d3537b..244ba10 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/conftest.py b/tests/conftest.py
index db78dcc..a007cf6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -25,7 +25,7 @@ def combinations(string: str, /) -> t.Iterable[str]:
return map( # noqa: C417
lambda params: "".join(params),
itertools.chain.from_iterable(
- itertools.combinations(string, i) # placeholder
+ itertools.combinations(string, i)
for i in range(1, len(string))
),
)
@@ -39,17 +39,12 @@ def assert_not_equal(a: Array, b: Array, /) -> None:
assert not np.allclose(a, b, rtol=1e-3)
@classmethod
- def assert_all_equal(cls: type[Helpers], A: Array, B: Array, /) -> None:
+ def assert_all_equal(cls, A: Array, B: Array, /) -> None:
for a, b in zip(A, B):
cls.assert_equal(a, b)
@classmethod
- def assert_all_not_equal(
- cls: type[Helpers],
- A: Array,
- B: Array,
- /,
- ) -> None:
+ def assert_all_not_equal(cls, A: Array, B: Array, /) -> None:
for a, b in zip(A, B):
cls.assert_not_equal(a, b)
@@ -61,6 +56,6 @@ def assert_distribution(x: Array, /) -> None:
assert_almost_equal(x.sum(axis=1), np.ones(len(x)))
-@pytest.fixture()
+@pytest.fixture
def helpers() -> type[Helpers]:
return Helpers
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
index cd11e40..b4cba4c 100644
--- a/tests/unit/__init__.py
+++ b/tests/unit/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_datasets/__init__.py b/tests/unit/test_datasets/__init__.py
index cd11e40..b4cba4c 100644
--- a/tests/unit/test_datasets/__init__.py
+++ b/tests/unit/test_datasets/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_datasets/test_base.py b/tests/unit/test_datasets/test_base.py
index 17bbe1b..0059b88 100644
--- a/tests/unit/test_datasets/test_base.py
+++ b/tests/unit/test_datasets/test_base.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_datasets/test_digits.py b/tests/unit/test_datasets/test_digits.py
index 6aab914..de8e9a8 100644
--- a/tests/unit/test_datasets/test_digits.py
+++ b/tests/unit/test_datasets/test_digits.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_datasets/test_gene_families.py b/tests/unit/test_datasets/test_gene_families.py
index 2baae10..05b5d0b 100644
--- a/tests/unit/test_datasets/test_gene_families.py
+++ b/tests/unit/test_datasets/test_gene_families.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_internal/__init__.py b/tests/unit/test_internal/__init__.py
index cd11e40..b4cba4c 100644
--- a/tests/unit/test_internal/__init__.py
+++ b/tests/unit/test_internal/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_internal/test_data.py b/tests/unit/test_internal/test_data.py
index b0421f9..323e9d9 100644
--- a/tests/unit/test_internal/test_data.py
+++ b/tests/unit/test_internal/test_data.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_internal/test_hmm/__init__.py b/tests/unit/test_internal/test_hmm/__init__.py
index cd11e40..b4cba4c 100644
--- a/tests/unit/test_internal/test_hmm/__init__.py
+++ b/tests/unit/test_internal/test_hmm/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_internal/test_hmm/test_topologies.py b/tests/unit/test_internal/test_hmm/test_topologies.py
index 9b7e7ce..cea4f22 100644
--- a/tests/unit/test_internal/test_hmm/test_topologies.py
+++ b/tests/unit/test_internal/test_hmm/test_topologies.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -328,7 +328,7 @@ def test_ergodic_random_transitions_many(
def test_ergodic_check_transitions_invalid(
- random_state: np.random.RandomState
+ random_state: np.random.RandomState,
) -> None:
"""Validate an invalid ergodic transition matrix"""
topology = topologies.ErgodicTopology(
@@ -342,7 +342,7 @@ def test_ergodic_check_transitions_invalid(
def test_ergodic_check_transitions_valid(
- random_state: np.random.RandomState
+ random_state: np.random.RandomState,
) -> None:
"""Validate a valid ergodic transition matrix"""
topology = topologies.ErgodicTopology(
@@ -451,7 +451,7 @@ def test_linear_random_transitions_many(
def test_linear_check_transitions_invalid(
- random_state: np.random.RandomState
+ random_state: np.random.RandomState,
) -> None:
"""Validate an invalid linear transition matrix"""
topology = topologies.LinearTopology(n_states=5, random_state=random_state)
@@ -466,7 +466,7 @@ def test_linear_check_transitions_invalid(
def test_linear_check_transitions_valid(
- random_state: np.random.RandomState
+ random_state: np.random.RandomState,
) -> None:
"""Validate a valid linear transition matrix"""
topology = topologies.LinearTopology(n_states=5, random_state=random_state)
diff --git a/tests/unit/test_model_selection.py b/tests/unit/test_model_selection.py
new file mode 100644
index 0000000..1b88d1f
--- /dev/null
+++ b/tests/unit/test_model_selection.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2019 Sequentia Developers.
+# Distributed under the terms of the MIT License (see the LICENSE file).
+# SPDX-License-Identifier: MIT
+# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
+
+from __future__ import annotations
+
+import numpy as np
+import numpy.testing as npt
+import pytest
+from sklearn.model_selection._split import (
+ BaseCrossValidator,
+ BaseShuffleSplit,
+)
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import minmax_scale
+
+from sequentia.datasets import SequentialDataset, load_digits
+from sequentia.enums import CovarianceMode, PriorMode, TopologyMode
+from sequentia.model_selection import (
+ GridSearchCV,
+ HalvingGridSearchCV,
+ KFold,
+ RandomizedSearchCV,
+ RepeatedKFold,
+ RepeatedStratifiedKFold,
+ ShuffleSplit,
+ StratifiedKFold,
+ StratifiedShuffleSplit,
+ param_grid,
+)
+from sequentia.model_selection._search import BaseSearchCV
+from sequentia.models import (
+ GaussianMixtureHMM,
+ HMMClassifier,
+ KNNClassifier,
+ KNNRegressor,
+)
+from sequentia.preprocessing import IndependentFunctionTransformer
+
+# Machine epsilon for float32; ``inv_weight`` adds it to its input so that
+# an input of exactly zero does not lead to a division by zero.
+EPS: np.float32 = np.finfo(np.float32).eps
+# Module-level random state (seed 0), passed to the split in the ``data``
+# fixture.
+random_state: np.random.RandomState = np.random.RandomState(0)
+
+
+def exp_weight(x: np.ndarray) -> np.ndarray:
+    """Weight values by exponential decay, i.e. ``exp(-x)`` element-wise."""
+    decayed = np.exp(np.negative(x))
+    return decayed
+
+
+def inv_weight(x: np.ndarray) -> np.ndarray:
+    """Weight values by their reciprocal, with ``EPS`` added to the input."""
+    shifted = x + EPS
+    return 1 / shifted
+
+
+@pytest.fixture(scope="module")
+def data() -> SequentialDataset:
+    """Small subset of the spoken digits dataset (digits 0 and 1 only).
+
+    The loaded dataset is split with shuffling and stratification, and only
+    the second part returned by ``split`` (sized by ``test_size=0.1``) is
+    kept. The fixture is module-scoped, so it is built once per module.
+    """
+    split_options = {
+        "test_size": 0.1,
+        "random_state": random_state,
+        "shuffle": True,
+        "stratify": True,
+    }
+    full = load_digits(digits={0, 1})
+    # The first part of the split is not needed by any test in this module
+    _unused, subset = full.split(**split_options)
+    return subset
+
+
+@pytest.mark.parametrize(
+    "cv",
+    [
+        KFold,
+        StratifiedKFold,
+        ShuffleSplit,
+        StratifiedShuffleSplit,
+        RepeatedKFold,
+        RepeatedStratifiedKFold,
+    ],
+)
+@pytest.mark.parametrize(
+    "search", [GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV]
+)
+def test_knn_classifier(
+    data: SequentialDataset,
+    search: type[BaseSearchCV],
+    cv: type[BaseCrossValidator] | type[BaseShuffleSplit],
+) -> None:
+    """Tune a scaled k-NN classifier pipeline and sanity-check the winner.
+
+    Every ``search`` strategy is combined with every ``cv`` splitter. The
+    best estimator must score above 0.8 and produce valid labels,
+    probabilities and log-probabilities.
+    """
+    # Splitter configuration: KFold and StratifiedKFold additionally get
+    # ``shuffle=True`` (presumably so that ``random_state`` is accepted)
+    splitter_kwargs = {"random_state": 0, "n_splits": 2}
+    if cv in (KFold, StratifiedKFold):
+        splitter_kwargs["shuffle"] = True
+
+    # Estimator under search: a min-max scaling step followed by k-NN
+    scaler = IndependentFunctionTransformer(minmax_scale)
+    knn = KNNClassifier(use_c=True, n_jobs=-1)
+    pipeline = Pipeline([("scale", scaler), ("knn", knn)])
+
+    # Candidate hyper-parameters, addressed through the "knn" step prefix
+    candidates = {
+        "knn__k": [1, 5],
+        "knn__weighting": [exp_weight, inv_weight],
+    }
+
+    optimizer = search(
+        pipeline,
+        candidates,
+        cv=cv(**splitter_kwargs),
+        n_jobs=-1,
+    )
+
+    # Run the search; the best cross-validated score must clear 0.8
+    optimizer.fit(data.X, data.y, lengths=data.lengths)
+    assert optimizer.best_score_ > 0.8
+    best = optimizer.best_estimator_
+
+    # Labels: only the two digit classes may be predicted
+    labels = best.predict(data.X, lengths=data.lengths)
+    assert np.isin(labels, (0, 1)).all()
+
+    # Probabilities: each within [0, 1], each row summing to one
+    probs = best.predict_proba(data.X, lengths=data.lengths)
+    assert np.logical_and(probs >= 0, probs <= 1).all()
+    npt.assert_almost_equal(probs.sum(axis=1), 1.0)
+
+    # Log-probabilities: non-positive and consistent with the probabilities
+    log_probs = best.predict_log_proba(data.X, lengths=data.lengths)
+    assert (log_probs <= 0).all()
+    npt.assert_almost_equal(log_probs, np.log(probs))
+
+    # Accuracy on the same data the search was run on
+    accuracy = best.score(data.X, data.y, lengths=data.lengths)
+    assert accuracy > 0.8
+
+
+@pytest.mark.parametrize(
+    "cv",
+    [
+        KFold,
+        StratifiedKFold,
+        ShuffleSplit,
+        StratifiedShuffleSplit,
+        RepeatedKFold,
+        RepeatedStratifiedKFold,
+    ],
+)
+@pytest.mark.parametrize(
+    "search", [GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV]
+)
+def test_knn_regressor(
+    data: SequentialDataset,
+    search: type[BaseSearchCV],
+    cv: type[BaseCrossValidator] | type[BaseShuffleSplit],
+) -> None:
+    """Tune a scaled k-NN regressor pipeline and sanity-check the winner.
+
+    Every ``search`` strategy is combined with every ``cv`` splitter. The
+    digit labels are cast to ``float64`` and used as regression targets;
+    the best estimator must score above 0.8 and predict within [0, 1].
+    """
+    # Splitter configuration: KFold and StratifiedKFold additionally get
+    # ``shuffle=True`` (presumably so that ``random_state`` is accepted)
+    splitter_kwargs = {"random_state": 0, "n_splits": 2}
+    if cv in (KFold, StratifiedKFold):
+        splitter_kwargs["shuffle"] = True
+
+    # Estimator under search: a min-max scaling step followed by k-NN
+    scaler = IndependentFunctionTransformer(minmax_scale)
+    knn = KNNRegressor(use_c=True, n_jobs=-1)
+    pipeline = Pipeline([("scale", scaler), ("knn", knn)])
+
+    # Candidate hyper-parameters, addressed through the "knn" step prefix
+    candidates = {
+        "knn__k": [3, 5],
+        "knn__weighting": [exp_weight, inv_weight],
+    }
+
+    optimizer = search(
+        pipeline,
+        candidates,
+        cv=cv(**splitter_kwargs),
+        n_jobs=-1,
+    )
+
+    # Regression targets: the class labels converted to floats
+    targets = data.y.astype(np.float64)
+
+    # Run the search; the best cross-validated score must clear 0.8
+    optimizer.fit(data.X, targets, lengths=data.lengths)
+    assert optimizer.best_score_ > 0.8
+    best = optimizer.best_estimator_
+
+    # Predictions must stay between the two target values 0 and 1
+    predictions = best.predict(data.X, lengths=data.lengths)
+    assert np.logical_and(predictions >= 0, predictions <= 1).all()
+
+    # R^2 on the same data the search was run on
+    r_squared = best.score(data.X, targets, lengths=data.lengths)
+    assert r_squared > 0.8
+
+
+def test_hmm_classifier(data: SequentialDataset) -> None:
+    """Grid-search a scaled HMM classifier pipeline and check the winner.
+
+    The grid covers the classifier prior and the keyword arguments of the
+    ``GaussianMixtureHMM`` variant (``param_grid`` supplies the candidate
+    ``model_kwargs`` values). The best estimator must score above 0.8 and
+    produce valid labels and probabilities.
+    """
+    # Estimator under search: a min-max scaling step followed by the HMMs
+    scaler = IndependentFunctionTransformer(minmax_scale)
+    classifier = HMMClassifier(variant=GaussianMixtureHMM, n_jobs=-1)
+    pipeline = Pipeline([("scale", scaler), ("clf", classifier)])
+
+    # Candidate hyper-parameters, addressed through the "clf" step prefix
+    candidates = {
+        "clf__prior": [PriorMode.UNIFORM, PriorMode.FREQUENCY],
+        "clf__model_kwargs": param_grid(
+            n_states=[3, 4, 5],
+            n_components=[2, 3, 4],
+            covariance=[CovarianceMode.DIAGONAL, CovarianceMode.SPHERICAL],
+            topology=[TopologyMode.LEFT_RIGHT, TopologyMode.LINEAR],
+        ),
+    }
+
+    optimizer = GridSearchCV(
+        estimator=pipeline,
+        param_grid=candidates,
+        cv=StratifiedKFold(),
+        n_jobs=-1,
+    )
+
+    # Run the search; the best cross-validated score must clear 0.8
+    optimizer.fit(data.X, data.y, lengths=data.lengths)
+    assert optimizer.best_score_ > 0.8
+    best = optimizer.best_estimator_
+
+    # Labels: only the two digit classes may be predicted
+    labels = best.predict(data.X, lengths=data.lengths)
+    assert np.isin(labels, (0, 1)).all()
+
+    # Probabilities: each within [0, 1], each row summing to one
+    probs = best.predict_proba(data.X, lengths=data.lengths)
+    assert np.logical_and(probs >= 0, probs <= 1).all()
+    npt.assert_almost_equal(probs.sum(axis=1), 1.0)
+
+    # Log-probabilities: only required to run; the output is not asserted
+    best.predict_log_proba(data.X, lengths=data.lengths)
+
+    # Accuracy on the same data the search was run on
+    accuracy = best.score(data.X, data.y, lengths=data.lengths)
+    assert accuracy > 0.8
diff --git a/tests/unit/test_models/__init__.py b/tests/unit/test_models/__init__.py
index cd11e40..b4cba4c 100644
--- a/tests/unit/test_models/__init__.py
+++ b/tests/unit/test_models/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/hmm/__init__.py b/tests/unit/test_models/hmm/__init__.py
index cd11e40..b4cba4c 100644
--- a/tests/unit/test_models/hmm/__init__.py
+++ b/tests/unit/test_models/hmm/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/hmm/test_classifier.py b/tests/unit/test_models/hmm/test_classifier.py
index 0ca4dbd..e7d180d 100644
--- a/tests/unit/test_models/hmm/test_classifier.py
+++ b/tests/unit/test_models/hmm/test_classifier.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -6,6 +6,7 @@
from __future__ import annotations
import copy
+import enum
import os
import tempfile
import typing as t
@@ -37,6 +38,12 @@
n_classes = 7
+class FitMode(enum.StrEnum):
+    """Ways in which ``test_classifier_e2e`` fits the classifier under test."""
+
+    # Fit each class model individually, then call ``clf.fit()`` without data
+    PREFIT = "prefit"
+    # Build the classifier from ``variant``/``model_kwargs``, then fit on data
+    POSTFIT_IDENTICAL = "postfit_identical"
+    # Add per-class model instances via ``add_models``, then fit on data
+    POSTFIT_FLEXIBLE = "postfit_flexible"
+
+
@pytest.fixture(scope="module")
def random_state(request: SubRequest) -> np.random.RandomState:
return np.random.RandomState(1)
@@ -113,18 +120,19 @@ def assert_fit(clf: BaseHMM):
},
],
)
-@pytest.mark.parametrize("prefit", [True, False])
+@pytest.mark.parametrize("fit_mode", list(FitMode))
+@pytest.mark.parametrize("n_jobs", [1, -1])
def test_classifier_e2e(
request: SubRequest,
helpers: t.Any,
model: BaseHMM,
dataset: SequentialDataset,
prior: enums.PriorMode | dict[int, float],
+ fit_mode: FitMode,
+ n_jobs: int,
random_state: np.random.RandomState,
- *,
- prefit: bool,
) -> None:
- clf = HMMClassifier(prior=prior)
+ clf = HMMClassifier(prior=prior, n_jobs=n_jobs)
clf.add_models({i: copy.deepcopy(model) for i in range(n_classes)})
assert clf.prior == prior
@@ -139,12 +147,20 @@ def test_classifier_e2e(
test_size=0.2, random_state=random_state, stratify=True
)
- if prefit:
+ if fit_mode == FitMode.PREFIT:
for X, lengths, c in train.iter_by_class():
clf.models[c].fit(X, lengths=lengths)
assert_fit(clf.fit())
- else:
+ elif fit_mode == FitMode.POSTFIT_FLEXIBLE:
assert_fit(clf.fit(**train.X_y_lengths))
+ elif fit_mode == FitMode.POSTFIT_IDENTICAL:
+ clf = HMMClassifier(
+ variant=type(model),
+ model_kwargs=model.get_params(),
+ prior=prior,
+ n_jobs=n_jobs,
+ )
+ clf.fit(**train.X_y_lengths)
scores_pred = clf.predict_scores(**test.X_lengths)
assert scores_pred.shape == (len(test), n_classes)
diff --git a/tests/unit/test_models/hmm/variants/__init__.py b/tests/unit/test_models/hmm/variants/__init__.py
index cd11e40..b4cba4c 100644
--- a/tests/unit/test_models/hmm/variants/__init__.py
+++ b/tests/unit/test_models/hmm/variants/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/hmm/variants/test_categorical.py b/tests/unit/test_models/hmm/variants/test_categorical.py
index ba35326..92f53d5 100644
--- a/tests/unit/test_models/hmm/variants/test_categorical.py
+++ b/tests/unit/test_models/hmm/variants/test_categorical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/hmm/variants/test_gaussian_mixture.py b/tests/unit/test_models/hmm/variants/test_gaussian_mixture.py
index d091099..f677fc0 100644
--- a/tests/unit/test_models/hmm/variants/test_gaussian_mixture.py
+++ b/tests/unit/test_models/hmm/variants/test_gaussian_mixture.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/knn/__init__.py b/tests/unit/test_models/knn/__init__.py
index cd11e40..b4cba4c 100644
--- a/tests/unit/test_models/knn/__init__.py
+++ b/tests/unit/test_models/knn/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/knn/test_classifier.py b/tests/unit/test_models/knn/test_classifier.py
index 15f4544..de05694 100644
--- a/tests/unit/test_models/knn/test_classifier.py
+++ b/tests/unit/test_models/knn/test_classifier.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_models/knn/test_regressor.py b/tests/unit/test_models/knn/test_regressor.py
index 715cd16..fffc3a2 100644
--- a/tests/unit/test_models/knn/test_regressor.py
+++ b/tests/unit/test_models/knn/test_regressor.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_pipeline.py b/tests/unit/test_pipeline.py
index dfdbff0..38909cb 100644
--- a/tests/unit/test_pipeline.py
+++ b/tests/unit/test_pipeline.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
@@ -51,7 +51,7 @@ def test_pipeline_with_transforms(
)
# check that transforming without fitting doesn't work
- with pytest.raises(NotFittedError):
+ with pytest.raises((NotFittedError, AttributeError)):
pipeline.transform(**data.X_lengths)
# check that fitting without y works
diff --git a/tests/unit/test_preprocessing/__init__.py b/tests/unit/test_preprocessing/__init__.py
index cd11e40..b4cba4c 100644
--- a/tests/unit/test_preprocessing/__init__.py
+++ b/tests/unit/test_preprocessing/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).
diff --git a/tests/unit/test_preprocessing/test_transforms.py b/tests/unit/test_preprocessing/test_transforms.py
index 229ad05..d202f47 100644
--- a/tests/unit/test_preprocessing/test_transforms.py
+++ b/tests/unit/test_preprocessing/test_transforms.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2025 Sequentia Developers.
+# Copyright (c) 2019 Sequentia Developers.
# Distributed under the terms of the MIT License (see the LICENSE file).
# SPDX-License-Identifier: MIT
# This source code is part of the Sequentia project (https://github.com/eonu/sequentia).