From db24460c91e76e357d7e4892493b873e504d8ee4 Mon Sep 17 00:00:00 2001 From: Edwin Onuonga Date: Sun, 29 Dec 2019 19:16:15 +0400 Subject: [PATCH] [add:docs] Add proper documentation (#40) * Add proper documentation * Remove and ignore docs/_build directory --- .gitignore | 3 + CHANGELOG.md | 8 +- README.md | 39 +- docs/Makefile | 20 + docs/_includes/examples/classifiers/dtwknn.py | 13 + docs/_includes/examples/classifiers/hmm.py | 11 + .../examples/classifiers/hmm_classifier.py | 21 + .../examples/preprocessing/downsample.py | 8 + docs/_includes/examples/preprocessing/fft.py | 8 + .../examples/preprocessing/normalize.py | 8 + .../examples/preprocessing/preprocess.py | 14 + docs/changelog.rst | 6 + docs/conf.py | 69 ++ docs/index.rst | 58 ++ docs/sections/classifiers/dtwknn.rst | 45 ++ docs/sections/classifiers/hmm.rst | 123 ++++ docs/sections/preprocessing/downsample.rst | 25 + docs/sections/preprocessing/fft.rst | 26 + docs/sections/preprocessing/normalize.rst | 38 + docs/sections/preprocessing/preprocessing.rst | 22 + examples/1 - Input Format (Tutorial).ipynb | 32 - examples/2 - Preprocessing (Tutorial).ipynb | 32 - examples/Pen-Tip Trajectories (Example).ipynb | 654 ------------------ lib/sequentia/classifiers/dtwknn/dtwknn.py | 112 +-- lib/sequentia/classifiers/hmm/hmm.py | 174 ++--- .../classifiers/hmm/hmm_classifier.py | 104 +-- lib/sequentia/preprocessing/methods.py | 60 +- lib/sequentia/preprocessing/preprocess.py | 55 +- 28 files changed, 820 insertions(+), 968 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/_includes/examples/classifiers/dtwknn.py create mode 100644 docs/_includes/examples/classifiers/hmm.py create mode 100644 docs/_includes/examples/classifiers/hmm_classifier.py create mode 100644 docs/_includes/examples/preprocessing/downsample.py create mode 100644 docs/_includes/examples/preprocessing/fft.py create mode 100644 docs/_includes/examples/preprocessing/normalize.py create mode 100644 
docs/_includes/examples/preprocessing/preprocess.py create mode 100644 docs/changelog.rst create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/sections/classifiers/dtwknn.rst create mode 100644 docs/sections/classifiers/hmm.rst create mode 100644 docs/sections/preprocessing/downsample.rst create mode 100644 docs/sections/preprocessing/fft.rst create mode 100644 docs/sections/preprocessing/normalize.rst create mode 100644 docs/sections/preprocessing/preprocessing.rst delete mode 100644 examples/1 - Input Format (Tutorial).ipynb delete mode 100644 examples/2 - Preprocessing (Tutorial).ipynb delete mode 100644 examples/Pen-Tip Trajectories (Example).ipynb diff --git a/.gitignore b/.gitignore index 2f5fd2b5..abce4b0d 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,9 @@ coverage.xml .hypothesis/ .pytest_cache/ +# Documentation +docs/_build + # Jupyter Notebook .ipynb_checkpoints diff --git a/CHANGELOG.md b/CHANGELOG.md index 6876fee8..2577a73f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,9 @@ -# [0.2.0](https://github.com/eonu/sequentia/releases/tag/v0.2.0) +## [0.2.0](https://github.com/eonu/sequentia/releases/tag/v0.2.0) #### Major changes - Add multi-processing support for `DTWKNN` predictions. ([#29](https://github.com/eonu/sequentia/pull/29)) - Rename the `fit_transform()` function in `Preprocess` to `transform()` since there is nothing being fitted. ([#35](https://github.com/eonu/sequentia/pull/35)) -- Modify package classifiers in `setup.py`: ([#31](https://github.com/eonu/sequentia/pull/31)) +- Modify package classifiers in `setup.py` ([#31](https://github.com/eonu/sequentia/pull/31)): - Set development status classifier to `Pre-Alpha`. - Add Python version classifiers for v3.5+. - Specify UNIX and macOS operating system classifiers. @@ -11,11 +11,11 @@ #### Minor changes - Finish tutorial and example notebooks. ([#35](https://github.com/eonu/sequentia/pull/35)) - Rename `examples` directory to `notebooks`. 
([#32](https://github.com/eonu/sequentia/pull/32)) -- Host notebooks statically on [`nbviewer`](https://github.com/jupyter/nbviewer). ([#32](https://github.com/eonu/sequentia/pull/32)) +- Host notebooks statically on [nbviewer](https://github.com/jupyter/nbviewer). ([#32](https://github.com/eonu/sequentia/pull/32)) - Add reference to Pomegranate [paper](http://jmlr.org/papers/volume18/17-636/17-636.pdf) and [repository](https://github.com/jmschrei/pomegranate). ([#30](https://github.com/eonu/sequentia/pull/30)) - Add badges to `README.md`. ([#28](https://github.com/eonu/sequentia/pull/28)) -# [0.1.0](https://github.com/eonu/sequentia/releases/tag/v0.1.0) +## [0.1.0](https://github.com/eonu/sequentia/releases/tag/v0.1.0) #### Major changes diff --git a/README.md b/README.md index bd058bba..8c4346df 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,26 @@ +

+ Sequentia +

+ +

+ A machine learning interface for isolated temporal sequence classification algorithms in Python. +

+
- + PyPI - + PyPI - Python Version PyPI - License + + Read The Docs - Documentation +
-# Sequentia - -_A machine learning interface for isolated temporal sequence classification algorithms in Python._ - ## Introduction @@ -44,7 +51,7 @@ Sequentia offers the use of **multivariate observation sequences with differing ### Preprocessing methods -- [x] Normalization (centering observation sequences) +- [x] Normalization - [x] Downsampling (by decimation and averaging) - [x] Discrete (Fast) Fourier Transform @@ -60,6 +67,10 @@ Sequentia offers the use of **multivariate observation sequences with differing pip install sequentia ``` +## Documentation + +Documentation for the package is available on [Read The Docs](https://sequentia.readthedocs.io/en/latest). + ## Tutorials and examples For tutorials and examples on the usage of Sequentia, [look at the notebooks here](https://nbviewer.jupyter.org/github/eonu/sequentia/tree/master/notebooks/)! @@ -91,13 +102,13 @@ All contributions to this repository are greatly appreciated. Contribution guide - - Edwin Onuonga -
Edwin Onuonga -
-
- βœ‰οΈ - 🌍 + + Edwin Onuonga +
Edwin Onuonga +
+
+ βœ‰οΈ + 🌍 diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d4bb2cbb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_includes/examples/classifiers/dtwknn.py b/docs/_includes/examples/classifiers/dtwknn.py new file mode 100644 index 00000000..2754ec24 --- /dev/null +++ b/docs/_includes/examples/classifiers/dtwknn.py @@ -0,0 +1,13 @@ +import numpy as np +from sequentia.classifiers import DTWKNN + +# Create some sample data +X = [np.random.random((10 * i, 3)) for i in range(1, 4)] +y = ['class0', 'class1', 'class1'] + +# Create and fit the classifier +clf = DTWKNN(k=1, radius=5) +clf.fit(X, y) + +# Predict labels for the training data (just as an example) +clf.predict(X) \ No newline at end of file diff --git a/docs/_includes/examples/classifiers/hmm.py b/docs/_includes/examples/classifiers/hmm.py new file mode 100644 index 00000000..510b9582 --- /dev/null +++ b/docs/_includes/examples/classifiers/hmm.py @@ -0,0 +1,11 @@ +import numpy as np +from sequentia.classifiers import HMM + +# Create some sample data +X = [np.random.random((10 * i, 3)) for i in range(1, 4)] + +# Create and fit a left-right HMM with random transitions and initial state distribution +hmm = HMM(label='class1', n_states=5, topology='left-right') +hmm.set_random_initial() +hmm.set_random_transitions() +hmm.fit(X) \ No 
newline at end of file diff --git a/docs/_includes/examples/classifiers/hmm_classifier.py b/docs/_includes/examples/classifiers/hmm_classifier.py new file mode 100644 index 00000000..9a85a7d0 --- /dev/null +++ b/docs/_includes/examples/classifiers/hmm_classifier.py @@ -0,0 +1,21 @@ +import numpy as np +from sequentia.classifiers import HMM, HMMClassifier + +# Create and fit some sample HMMs +hmms = [] +for i in range(5): + hmm = HMM(label=f'class{i}', n_states=(i + 3), topology='left-right') + hmm.set_random_initial() + hmm.set_random_transitions() + hmm.fit([np.arange((i + j * 20) * 30).reshape(-1, 3) for j in range(1, 4)]) + hmms.append(hmm) + +# Create some sample test data and labels +X = [np.random.random((10 * i, 3)) for i in range(1, 4)] +y = ['class0', 'class1', 'class1'] + +# Create a classifier and calculate predictions and evaluations +clf = HMMClassifier() +clf.fit(hmms) +predictions = clf.predict(X) +accuracy, confusion = clf.evaluate(X, y) \ No newline at end of file diff --git a/docs/_includes/examples/preprocessing/downsample.py b/docs/_includes/examples/preprocessing/downsample.py new file mode 100644 index 00000000..87ee97cf --- /dev/null +++ b/docs/_includes/examples/preprocessing/downsample.py @@ -0,0 +1,8 @@ +import numpy as np +from sequentia.preprocessing import downsample + +# Create some sample data +X = [np.random.random((10 * i, 3)) for i in range(1, 4)] + +# Downsample the data with downsample factor 5 and decimation +X = downsample(X, n=5, method='decimate') \ No newline at end of file diff --git a/docs/_includes/examples/preprocessing/fft.py b/docs/_includes/examples/preprocessing/fft.py new file mode 100644 index 00000000..b31b9180 --- /dev/null +++ b/docs/_includes/examples/preprocessing/fft.py @@ -0,0 +1,8 @@ +import numpy as np +from sequentia.preprocessing import fft + +# Create some sample data +X = [np.random.random((10 * i, 3)) for i in range(1, 4)] + +# Transform the data +X = fft(X) \ No newline at end of file diff --git 
a/docs/_includes/examples/preprocessing/normalize.py b/docs/_includes/examples/preprocessing/normalize.py new file mode 100644 index 00000000..2dd811ba --- /dev/null +++ b/docs/_includes/examples/preprocessing/normalize.py @@ -0,0 +1,8 @@ +import numpy as np +from sequentia.preprocessing import normalize + +# Create some sample data +X = [np.random.random((10 * i, 3)) for i in range(1, 4)] + +# Normalize the data +X = normalize(X) \ No newline at end of file diff --git a/docs/_includes/examples/preprocessing/preprocess.py b/docs/_includes/examples/preprocessing/preprocess.py new file mode 100644 index 00000000..aa4ee39c --- /dev/null +++ b/docs/_includes/examples/preprocessing/preprocess.py @@ -0,0 +1,14 @@ +import numpy as np +from sequentia.preprocessing import Preprocess + +# Create some sample data +X = [np.random.random((10 * i, 3)) for i in range(1, 4)] + +# Create the Preprocess object +pre = Preprocess() +pre.normalize() +pre.downsample(10, method='average') +pre.fft() + +# Transform the data applying transformations in order +X = pre.transform(X) \ No newline at end of file diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 00000000..28d0cbf1 --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1,6 @@ +.. _changelog: + +Changelog +========= + +.. mdinclude:: ../CHANGELOG.md \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..356eaef9 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,69 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +import sys +import os +import subprocess + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.abspath('..')) + +subprocess.call('pip install numpydoc sphinx_rtd_theme m2r', shell=True) + +# -- Project information ----------------------------------------------------- + +project = 'sequentia' +copyright = '2019-2020, Edwin Onuonga' +author = 'Edwin Onuonga' + +# The full version, including alpha/beta/rc tags +release = '0.2.0' + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'numpydoc', + 'm2r' +] + +autodoc_member_order = 'bysource' +autosummary_generate = True +numpydoc_show_class_members = False + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +source_suffix = ['.rst', '.md'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..3bbd1b13 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,58 @@ +.. Sequentia documentation master file, created by + sphinx-quickstart on Sat Dec 28 19:22:34 2019. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. image:: https://i.ibb.co/42GkhfR/sequentia.png + :alt: Sequentia + :width: 275 + :target: https://github.com/eonu/sequentia + +About +===== + +Sequentia is a collection of machine learning algorithms for performing the classification of isolated temporal sequences. + +Each isolated sequence is generally modeled as a section of a longer multivariate time series +that represents the entire sequence. Naturally, this fits the description of many types of problems such as: + +- isolated word utterance frequencies in speech audio signals, +- isolated hand-written character pen-tip trajectories, +- isolated hand or head gesture positions in a video or motion-capture recording. + +Most modern machine learning algorithms won't work directly out of the box when applied to such +sequential data – mostly due to the fact that the dependencies between observations at different +time frames must be considered, and also because each isolated sequence generally has a different duration. + +Sequentia offers some appropriate classification algorithms for these kinds of tasks. + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: Sequentia + + self + changelog.rst + +.. toctree:: + :maxdepth: 1 + :caption: Classifiers and Models + + sections/classifiers/hmm.rst + sections/classifiers/dtwknn.rst + +.. 
toctree:: + :maxdepth: 1 + :caption: Preprocessing Methods + + sections/preprocessing/normalize.rst + sections/preprocessing/downsample.rst + sections/preprocessing/fft.rst + sections/preprocessing/preprocessing.rst + +Documentation Search and Index +============================== + +* :ref:`search` +* :ref:`genindex` +* :ref:`modindex` \ No newline at end of file diff --git a/docs/sections/classifiers/dtwknn.rst b/docs/sections/classifiers/dtwknn.rst new file mode 100644 index 00000000..19a67117 --- /dev/null +++ b/docs/sections/classifiers/dtwknn.rst @@ -0,0 +1,45 @@ +.. _dtwknn: + +Dynamic Time Warping `k`-Nearest Neighbors Classifier (``DTWKNN``) +================================================================== + +| Recall that the isolated sequences we are dealing with are represented as + multivariate time series of different durations. +| Suppose that our sequences are all :math:`D`-dimensional. The main requirement of + `k-Nearest Neighbor `_ + (:math:`k`-NN) classifiers is that each example must have the same number of + dimensions – and hence, be in the same feature space. This is indeed the case with + our :math:`D`-dimensional sequences. However, we can't use :math:`k`-NN with simple + distance metrics such as Euclidean distance because we are comparing sequences + (which represent an ordered collection of points in :math:`D`-dimensional space) + rather than individual points in :math:`D`-dimensional space. + +One distance metric that allows us to compare multivariate sequences of different length +is `Dynamic Time Warping `_. Coupling this metric +with :math:`k`-NN creates a powerful classifier that assigns the class of a new +observation sequence by looking at the classes of observation sequences with similar patterns. 
+ +However, :math:`k`-NN classifiers suffer from the fact that they are non-parametric, +which means that when predicting the class for a new observation sequence, +we must look back at every observation sequence that was used to fit the model. +To speed up prediction times, we have chosen to use a constrained DTW algorithm that +sacrifices accuracy by calculating an approximate distance, but saves **a lot** of time. +This is the `FastDTW `_ +implementation, which has a `radius` parameter for controlling the imposed constraint on the distance calculation. + +This approximate DTW :math:`k`-NN classifier is implemented by the :class:`~DTWKNN` class. + +Example +------- + +.. literalinclude:: ../../_includes/examples/classifiers/dtwknn.py + :language: python + :linenos: + +For more elaborate examples, please have a look at the `example notebooks `_. + +API reference +------------- + +.. autoclass:: sequentia.classifiers.dtwknn.DTWKNN + :members: \ No newline at end of file diff --git a/docs/sections/classifiers/hmm.rst b/docs/sections/classifiers/hmm.rst new file mode 100644 index 00000000..e27173e7 --- /dev/null +++ b/docs/sections/classifiers/hmm.rst @@ -0,0 +1,123 @@ +.. _hmm: + +Hidden Markov Model (``HMM``) +============================= + +The `Hidden Markov Model `_ (HMM) +is a state-based statistical model that can be used to represent an individual +observation sequence class :math:`c`. As seen in the diagram below, the rough idea is that +each state should correspond to one 'section' of the sequence. + +.. image:: https://i.ibb.co/GFtV46t/HMM.jpg + :alt: HMM + :width: 350 + +A single HMM is modeled by the :class:`~HMM` class. + +Parameters and Training +----------------------- + +The 'sections' in the image above are determined by the parameters of the HMM, explained below. + +- | **Initial state distribution** :math:`\boldsymbol{\pi}`: + | A discrete probability distribution that dictates the probability of the HMM starting in each state. 
+ +- | **Transition probability matrix** :math:`A`: + | A matrix whose rows represent a discrete probability distribution that dictates how likely the HMM is + to transition to each state, given some current state. + +- | **Emission probability distributions** :math:`B`: + | A collection of :math:`N` continuous multivariate probability distributions (one for each state) + that each dictate the probability of the HMM generating an observation :math:`\mathbf{o}`, given some current state. + Recall that we are generally considering multivariate observation sequences – that is, + at time :math:`t`, we have an observation :math:`\mathbf{o}^{(t)}=\left(o_1^{(t)}, o_2^{(t)}, \ldots, o_D^{(t)}\right)`. + The fact that the observations are multivariate necessitates a multivariate emission distribution. + Sequentia uses the `multivariate Gaussian distribution `_. + +In order to learn these parameters, we must train the HMM on examples that are labeled +with the class :math:`c` that the HMM models. Denote the HMM that models class :math:`c` as +:math:`\lambda_c=(\boldsymbol{\pi}_c, A_c, B_c)`. We can use the `Baum-Welch algorithm `_ +(an application of the `Expectation-Maximization algorithm `_) +to fit :math:`\lambda_c` and learn its parameters. This fitting is implemented by the :func:`~HMM.fit` function. + +Model Topologies +^^^^^^^^^^^^^^^^ + +As we usually wish to preserve the natural ordering of time, we normally want to prevent our HMM +from transitioning to previous states (this is shown in the figure above). This restriction leads +to what is known as a **left-right** HMM, and is the most commonly used type of HMM for sequential +modeling. Mathematically, a left-right HMM is defined by an upper-triangular transition matrix. + +If we allow transitions to any state at any time, this HMM topology is known as **ergodic**. + +**Note**: Ergodicity is mathematically defined as having a transition matrix with no zero entries. 
+Using the ergodic topology in Sequentia will still permit zero entries in the transition matrix, +but will issue a warning stating that those probabilities will not be learned. + +Sequentia offers both topologies, specified by a string parameter ``topology`` in the +:class:`~HMM` constructor that takes values ``'left-right'`` or ``'ergodic'``. + +Making Predictions +------------------ + +A score for how likely a HMM is to generate an observation sequence is given by the +`Forward algorithm `_. It calculates the likelihood +:math:`\mathbb{P}(O|\lambda_c)` of the HMM :math:`\lambda_c` generating the observation sequence :math:`O`. + +**Note**: The likelihood does not account for the fact that a particular observation class +may occur more or less frequently than other observation classes. Once an ensemble of :class:`~HMM` objects +(represented by a :class:`~HMMClassifier`) is created and configured, this can be accounted for by +calculating the joint probability (or un-normalized posterior) +:math:`\mathbb{P}(O, \lambda_c)=\mathbb{P}(O|\lambda_c)\mathbb{P}(\lambda_c)` +and using this score to classify instead. The addition of the prior term :math:`\mathbb{P}(\lambda_c)` +accounts for some classes occurring more frequently than others. + +Example +------- + +.. literalinclude:: ../../_includes/examples/classifiers/hmm.py + :language: python + :linenos: + +For more elaborate examples, please have a look at the +`example notebooks `_. + +API reference +------------- + +.. autoclass:: sequentia.classifiers.hmm.HMM + :members: + +Ensemble Hidden Markov Model Classifier (``HMMClassifier``) +=========================================================== + +Multiple HMMs can be combined to form an ensemble multi-class classifier. +To classify a new observation sequence :math:`O'`, this works by: + +1. | Creating and training the HMMs :math:`\lambda_1, \lambda_2, \ldots, \lambda_N`. + +2. 
| Calculating the likelihoods :math:`\mathbb{P}(O'|\lambda_1), \mathbb{P}(O'|\lambda_2), \ldots, \mathbb{P}(O'|\lambda_N)` of each model generating :math:`O'`. + | **Note**: You can also use the un-normalized posterior :math:`\mathbb{P}(O'|\lambda_c)\mathbb{P}(\lambda_c)` instead of the likelihood. + +3. | Choosing the class represented by the HMM with the highest likelihood – that is, :math:`c^*=\mathop{\arg\max}_{c\in\{1,\ldots,N\}}{\mathbb{P}(O'|\lambda_c)}`. + +These steps are summarized in the diagram below. + +.. image:: https://i.ibb.co/gPymgs4/classifier.png + :alt: Ensemble HMM Classifier System + :width: 400 + +Example +------- + +.. literalinclude:: ../../_includes/examples/classifiers/hmm_classifier.py + :language: python + :linenos: + +For more elaborate examples, please have a look at the `example notebooks `_. + +API reference +------------- + +.. autoclass:: sequentia.classifiers.hmm.HMMClassifier + :members: \ No newline at end of file diff --git a/docs/sections/preprocessing/downsample.rst b/docs/sections/preprocessing/downsample.rst new file mode 100644 index 00000000..03d79e01 --- /dev/null +++ b/docs/sections/preprocessing/downsample.rst @@ -0,0 +1,25 @@ +.. _downsample: + +Downsampling (``downsample``) +============================= + +Downsampling reduces the number of frames in an observation sequence according +to a specified downsample factor and one of two methods: **averaging** and **decimation**. + +This is an especially helpful preprocessing method for speeding up classification times. + +For further information, please see the `preprocessing tutorial notebook `_. + +Example +------- + +.. literalinclude:: ../../_includes/examples/preprocessing/downsample.py + :language: python + :linenos: + +API reference +------------- + +.. automodule:: sequentia.preprocessing + :noindex: +.. 
autofunction:: downsample \ No newline at end of file diff --git a/docs/sections/preprocessing/fft.rst b/docs/sections/preprocessing/fft.rst new file mode 100644 index 00000000..65656348 --- /dev/null +++ b/docs/sections/preprocessing/fft.rst @@ -0,0 +1,26 @@ +.. _fft: + +Discrete Fourier Transform (``fft``) +==================================== + +The Discrete Fourier Transform (DFT) converts the observation sequence into a real-valued, +same-length sequence of equally-spaced samples of the +`discrete-time Fourier transform `_. + +The popular `Fast Fourier Transform `_ (FFT) implementation is used to efficiently compute the DFT. + +For further information, please see the `preprocessing tutorial notebook `_. + +Example +------- + +.. literalinclude:: ../../_includes/examples/preprocessing/fft.py + :language: python + :linenos: + +API reference +------------- + +.. automodule:: sequentia.preprocessing + :noindex: +.. autofunction:: fft \ No newline at end of file diff --git a/docs/sections/preprocessing/normalize.rst b/docs/sections/preprocessing/normalize.rst new file mode 100644 index 00000000..923f6781 --- /dev/null +++ b/docs/sections/preprocessing/normalize.rst @@ -0,0 +1,38 @@ +.. _normalize: + +Normalization (``normalize``) +============================= + +Normalizing centers an observation sequence about the mean of its observations – that is, given: + +.. math:: + + O=\begin{pmatrix} + o_1^{(1)} & o_2^{(1)} & \cdots & o_D^{(1)} \\ + o_1^{(2)} & o_2^{(2)} & \cdots & o_D^{(2)} \\ + \vdots & \vdots & \ddots & \vdots \\ + o_1^{(T)} & o_2^{(T)} & \cdots & o_D^{(T)} + \end{pmatrix} + \qquad + \boldsymbol{\mu}=\begin{pmatrix} + \overline{o_1} & \overline{o_2} & \cdots & \overline{o_D} + \end{pmatrix} + +Where :math:`\overline{o_d}` represents the mean of the :math:`d^\text{th}` feature of :math:`O`. + +We subtract :math:`\boldsymbol{\mu}` from each observation, or row in :math:`O`. This centers the observations. 
+ +For further information, please see the `preprocessing tutorial notebook `_. + +Example +------- + +.. literalinclude:: ../../_includes/examples/preprocessing/normalize.py + :language: python + :linenos: + +API reference +------------- + +.. automodule:: sequentia.preprocessing +.. autofunction:: normalize \ No newline at end of file diff --git a/docs/sections/preprocessing/preprocessing.rst b/docs/sections/preprocessing/preprocessing.rst new file mode 100644 index 00000000..893f898e --- /dev/null +++ b/docs/sections/preprocessing/preprocessing.rst @@ -0,0 +1,22 @@ +.. _combined: + +Combined Preprocessing (``Preprocess``) +======================================= + +The :class:`~Preprocess` class provides a way of efficiently applying multiple +preprocessing transformations to provided input observation sequences. + +For further information, please see the `preprocessing tutorial notebook `_. + +Example +------- + +.. literalinclude:: ../../_includes/examples/preprocessing/preprocess.py + :language: python + :linenos: + +API reference +------------- + +.. 
autoclass:: sequentia.preprocessing.Preprocess + :members: \ No newline at end of file diff --git a/examples/1 - Input Format (Tutorial).ipynb b/examples/1 - Input Format (Tutorial).ipynb deleted file mode 100644 index 8d151e13..00000000 --- a/examples/1 - Input Format (Tutorial).ipynb +++ /dev/null @@ -1,32 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/2 - Preprocessing (Tutorial).ipynb b/examples/2 - Preprocessing (Tutorial).ipynb deleted file mode 100644 index 8d151e13..00000000 --- a/examples/2 - Preprocessing (Tutorial).ipynb +++ /dev/null @@ -1,32 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/Pen-Tip Trajectories (Example).ipynb b/examples/Pen-Tip Trajectories (Example).ipynb deleted file mode 100644 index b660941d..00000000 --- a/examples/Pen-Tip Trajectories (Example).ipynb +++ /dev/null @@ -1,654 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - 
"import os\n", - "import requests\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "from tqdm.auto import tqdm\n", - "from scipy.io import loadmat\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "# Set seed for reproducible randomness\n", - "seed = 101\n", - "np.random.seed(seed)\n", - "rng = np.random.RandomState(seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Pen-Tip Trajectories (Example)\n", - "\n", - "This notebook aims to showcase some of the preprocessing methods and classification algorithms offered by Sequentia on the [Character Trajectories Data Set](https://archive.ics.uci.edu/ml/datasets/Character+Trajectories). This dataset consists of pen-tip trajectories generated by writing English letters on a [WACOM tablet](https://www.wacom.com/en-us). \n", - "\n", - "The **$x$-velocity**, **$y$-velocity** and **pen-tip force** were recorded.\n", - "\n", - "Some more specific details about the dataset:\n", - "\n", - "- The data consists of 2858 character samples\n", - "- The data has been numerically differentiated and Gaussian smoothed\n", - "- Only characters with a single 'PEN-DOWN' segment were considered (these characters are shown later)\n", - "- Characters have been shifted so that their velocity profiles best match the mean of the set\n", - "\n", - "Each character sample is a 3-dimensional pen tip velocity (and force) trajectory. 
This is contained in matrix format, with 3 rows and $T$ columns where $T$ is the length of the character sample, which represents our observation sequence.\n", - "\n", - "---\n", - "\n", - "First, we will download the dataset and extract the samples and labels, then convert them so that they are in a format compatible with Sequentia (see the [_Input Format_](https://nbviewer.jupyter.org/github/eonu/sequentia/blob/master/examples/1%20-%20Input%20Format%20(Tutorial).ipynb) notebook for more information): " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "data = None\n", - "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/character-trajectories/mixoutALL_shifted.mat'\n", - "\n", - "try:\n", - " path = os.path.join(os.getcwd(), 'temp.mat')\n", - " response = requests.get(url)\n", - "except:\n", - " raise\n", - "else:\n", - " with open(path, 'wb') as file:\n", - " file.write(response.content)\n", - " data = loadmat(path)\n", - "finally:\n", - " os.remove(path)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of trajectories: 2858\n" - ] - } - ], - "source": [ - "# Load the trajectories\n", - "X = [x.T for x in data['mixout'][0]] # Transpose from 3xT to Tx3\n", - "print('Number of trajectories: {}'.format(len(X)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Only lowercase characters with a single pen-down segment were considered in this dataset. 
In total, there were 20 of these characters as shown below:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Labels: ['a', 'b', 'c', 'd', 'e', 'g', 'h', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 'u', 'v', 'w', 'y', 'z']\n", - "Number of labels: 20\n" - ] - } - ], - "source": [ - "# Retrieve the set of unique labels and report the number of labels\n", - "labels = [label[0] for label in data['consts'][0][0][3][0]]\n", - "n_labels = len(labels)\n", - "print('Labels: {}'.format(str(labels)))\n", - "print('Number of labels: {}'.format(n_labels))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# View distribution of observation sequence lengths\n", - "plt.title('Histogram of observation sequence lengths')\n", - "plt.xlabel('Number of time frames')\n", - "plt.ylabel('Count')\n", - "plt.hist([len(x) for x in X], bins=n_labels)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The sample rate of each trajectory recording was 200hz–meaning that in every second, 200 pen-tip trajectories were recorded!\n", - "\n", - "As seen in the histogram above, most characters can be drawn in less than 200 frames, or in less than one second.\n", - "\n", - "Although keeping all of these frames/data-points might result in a more accurate classifier, it also significantly increases the time required for training or prediction. This is especially the case for $k$-NN, since it is a non-parametric classifier that requires going through each training example during prediction time.\n", - "\n", - "---\n", - "\n", - "There are three features offered by Sequentia that can help to reduce the time taken for predictions:\n", - "\n", - "- Downsampling (representing each trajectory in a fewer number of frames) through two different methods:\n", - " - **Decimation**: Only keeping the observation at every every $n$th time frame.\n", - " - **Averaging**: Averaging every group of $n$ observations to form a single observation.\n", - "- Using a faster, restricted distance measure that can handle sequences of different length (see [FastDTW](https://pdfs.semanticscholar.org/05a2/0cde15e172fc82f32774dd0cf4fe5827cad2.pdf))\n", - "- Parallelization (only supported in the `DTWKNN` class)\n", - "\n", - "The `DTWKNN` class always uses the FastDTW algorithm to calculate distances. 
Downsampling is offered as one of the preprocessing methods in Sequentia, and is used as follows (see the [_Preprocessing_](https://nbviewer.jupyter.org/github/eonu/sequentia/blob/master/examples/2%20-%20Preprocessing%20(Tutorial).ipynb) notebook for more information):" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "from sequentia.preprocessing import downsample\n", - "\n", - "# Pick an example trajectory for visualization\n", - "x = X[0]\n", - "# Downsample the example trajectory, using a downsample factor of n=10\n", - "x_down = downsample(x, n=10, method='average')\n", - "\n", - "# Create the plot to visualize the downsampling\n", - "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=True)\n", - "ax1.plot(x)\n", - "ax1.set_title('Original velocity and force pen-tip trajectory sample')\n", - "ax1.legend(labels=['$x$ velocity', '$y$ velocity', 'pen-tip force'])\n", - "ax2.plot(x_down)\n", - "ax2.set_title('Downsampled ($n=10$) velocity and force pen-tip trajectory sample')\n", - "ax2.legend(labels=['$x$ velocity', '$y$ velocity', 'pen-tip force'])\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# Downsample the entire dataset\n", - "X = downsample(X, n=10, method='average')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Extract the labels\n", - "y = [labels[idx - 1] for idx in data['consts'][0][0][4][0]]\n", - "\n", - "# Plot a histogram of the labels for each class\n", - "plt.title('Histogram of the dataset label counts')\n", - "plt.xlabel('Label (character)')\n", - "plt.ylabel('Count')\n", - "plt.hist(y, bins=n_labels)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Training set size: 2286\n", - "Test set size: 572\n" - ] - } - ], - "source": [ - "# Shuffle and split the dataset into a training and test set\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rng, shuffle=True)\n", - "print('Training set size: {}'.format(len(X_train)))\n", - "print('Test set size: {}'.format(len(X_test)))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a function for displaying results (accuracy and confusion matrix)\n", - "def show_results(acc, cm, dataset):\n", - " df = pd.DataFrame(cm, index=labels, columns=labels)\n", - " plt.figure(figsize=(10, 7))\n", - " sns.heatmap(df, annot=True)\n", - " plt.title('Confusion matrix for {} set predictions'.format(dataset), fontsize=14)\n", - " plt.xlabel('Predicted')\n", - " plt.ylabel('Actual')\n", - " # Fix for matplotlib bug that cuts off top/bottom of seaborn visualizations\n", - " b, t = plt.ylim()\n", - " plt.ylim(b + 0.5, t - 0.5)\n", - " plt.show()\n", - " print('Accuracy: {:.2f}%'.format(acc * 100))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dynamic Time Warping $k$-NN\n", - "\n", - "The $k$-Nearest Neighbor ($k$-NN) classifier is a conceptually simple machine learning algorithm that is also easy to implement. 
As a result, it is often used as a baseline, despite often being able to perform much better than more complex algorithms.\n", - "\n", - "However, applying $k$-NN to isolated temporal observation sequences is not so straightforward since different observation sequences may have different durations, making it difficult to come up with a distance measure that can be used to compare the two sequences. \n", - "\n", - "One such appropriate distance measure is [Dynamic Time Warping](https://en.wikipedia.org/wiki/Dynamic_time_warping). However, due to the non-parametric nature of $k$-NN, it may take very long to predict new observation sequences. In an effort to reduce this wait, Sequentia uses the [FastDTW](https://github.com/slaypni/fastdtw) implementation of the Dynamic Time Warping algorithm, which allows for faster, configurable approximatations to the DTW distance calculations which can save memory and time. \n", - "\n", - "---\n", - "\n", - "Importing, creating and fitting the classifier:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "from sequentia.classifiers import DTWKNN\n", - "\n", - "# Create and fit a DTWKNN classifier using the single nearest neighbor and a radius of 1\n", - "# NOTE: The radius parameter is a parameter that constrains the FastDTW algorithm.\n", - "clf = DTWKNN(k=1, radius=1)\n", - "clf.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To predict single or multiple examples, we can use the `predict` function:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c7cd63f3f6db45f8bd9558575733161a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Calculating distances', max=2286, style=ProgressStyle(descrip…" - ] - }, - "metadata": {}, - 
"output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/plain": [ - "'r'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Predict the first test example\n", - "clf.predict(X_test[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "331e1528ac3b40e0aea36cd70cec7dbc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Classifying examples', max=5, style=ProgressStyle(description…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "CPU times: user 56.1 s, sys: 759 ms, total: 56.8 s\n", - "Wall time: 58.7 s\n" - ] - }, - { - "data": { - "text/plain": [ - "['r', 'w', 'l', 'w', 'y']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%time\n", - "# Predict the first 5 test examples\n", - "clf.predict(X_test[:5])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This can be sped up a bit by using multiple jobs, as specified by `n_jobs`. By default this is set to 1. 
A setting of -1 will use all available cores:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 561 ms, sys: 81.6 ms, total: 642 ms\n", - "Wall time: 41.4 s\n" - ] - }, - { - "data": { - "text/plain": [ - "['r', 'w', 'l', 'w', 'y']" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%time\n", - "# NOTE: Progress bars for predict() and evaluate() are only displayed in the console if multiple jobs are used\n", - "clf.predict(X_test[:5], n_jobs=-1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To calculate the model's accuracy and confusion matrix on some data, we can use the `evaluate` function:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 616 ms, sys: 183 ms, total: 799 ms\n", - "Wall time: 1h 3min 34s\n" - ] - } - ], - "source": [ - "%%time\n", - "acc, cm = clf.evaluate(X_test, y_test, labels=labels, n_jobs=-1)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 98.25%\n" - ] - } - ], - "source": [ - "show_results(acc, cm, dataset='test')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see, Dynamic Time Warping $k$-NN classification often works with near perfect performance, but suffers due to the fact that $k$-NN is a non-parametric machine learning algorithm. \n", - "\n", - "This means that we have to look through every training example when we make a single prediction. Even with FastDTW, downsampling and multi-processing, the example classification on the test set consisting of 572 examples **took just over an hour**!\n", - "\n", - "---\n", - "\n", - "As a result, parametric methods such as Hidden Markov Models are often more feasible to use–but also generally perform worse." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Ensemble Hidden Markov Models\n", - "\n", - "An ensemble of HMMs can be a good classifier for isolated sequences. The main idea behind using ensemble HMMs for classification is as follows:\n", - "\n", - "1. Create $N$ HMMs $\\lambda_1,\\lambda_2,\\ldots,\\lambda_N$, each representing a different class (character in this case).\n", - "2. Fit each of these HMMs only using the training examples labeled with the class that the HMM represents. _The Baum-Welch algorithm is used for training here_.\n", - "3. For a new observation sequence $O$, calculate the likelihood of each HMM generating $O$–that is, calculate $\\mathbb{P}(O|\\lambda_c) \\quad \\forall c\\in\\{1, 2, \\ldots, N\\}$. _This is done using the Forward algorithm_.\n", - "4. 
Then $O$ is then classified as the class corresponding to the HMM that was most likely to generate $O$, giving a classification rule of: \n", - "\n", - "$$c^*=\\mathop{\\arg\\max}_{c\\in\\{1,2,\\ldots,N\\}}\\mathbb{P}(O|\\lambda_c)$$\n", - "\n", - "**Note**: In order to account for some classes naturally occurring more frequently than others, we can instead introduce a prior by using the Maximum A Posterior (MAP) classification rule:\n", - "\n", - "$$c^*=\\mathop{\\arg\\max}_{c\\in\\{1,2,\\ldots,N\\}}\\mathbb{P}(O|\\lambda_c)\\mathbb{P}(\\lambda_c)$$\n", - "\n", - "---\n", - "\n", - "Creating the individual `HMM` objects and fitting each one on the training examples corresponding to the label (character) that it represents:\n", - "\n", - "**Note**: Here we naively set the number of states for all HMMs to 10. In reality, you will probably want to have different numbers of states for HMMs that represent more complex or more simple characters." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c03a0b387cfd4389befc6a3d0a039d03", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Training HMMs', max=20, style=ProgressStyle(description_width…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "from sequentia.classifiers import HMM, HMMClassifier\n", - "\n", - "hmms = []\n", - "for label in tqdm(labels, desc='Training HMMs'):\n", - " hmm = HMM(label=label, n_states=10, random_state=rng)\n", - " hmm.set_random_initial()\n", - " hmm.set_random_transitions()\n", - " hmm.fit([X_train[i] for i, y_i in enumerate(y_train) if y_i == label])\n", - " hmms.append(hmm)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A `HMMClassifier` object collects each of 
the individual `HMM` objects in order to create the ensemble classifier:" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "clf = HMMClassifier()\n", - "clf.fit(hmms)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 88.46%\n" - ] - } - ], - "source": [ - "acc, cm = clf.evaluate(X_test, y_test, labels=labels)\n", - "show_results(acc, cm, dataset='test')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/lib/sequentia/classifiers/dtwknn/dtwknn.py b/lib/sequentia/classifiers/dtwknn/dtwknn.py index 07e73099..1bab5380 100644 --- a/lib/sequentia/classifiers/dtwknn/dtwknn.py +++ b/lib/sequentia/classifiers/dtwknn/dtwknn.py @@ -8,37 +8,26 @@ from collections import Counter from scipy.spatial.distance import euclidean from sklearn.metrics import confusion_matrix -from typing import Callable, Union, List, Tuple from ...internals import Validator class DTWKNN: - """A k-Nearest Neighbor classifier that compares differing length observation sequences - using the efficient FastDTW dynamic time warping algorithm. - - Example: - >>> import numpy as np - >>> from sequentia.classifiers import DTWKNN - >>> ​ - >>> # Create some sample data - >>> X = [np.random.random((10 * i, 3)) for i in range(1, 4)] - >>> y = ['class0', 'class1', 'class1'] - >>> ​ - >>> # Create and fit the classifier - >>> clf = DTWKNN(k=1, radius=5) - >>> clf.fit(X, y) - >>> ​ - >>> # Predict labels for the training data (just as an example) - >>> clf.predict(X) + """A k-Nearest Neighbor classifier that compares differing length observation sequences using the efficient FastDTW dynamic time warping algorithm. + + Parameters + ---------- + k: int + Number of neighbors. 
+ + radius: int + Radius parameter for FastDTW. + + See: `Stan Salvador, and Philip Chan. "FastDTW: Toward accurate dynamic time warping in linear time and space." Intelligent Data Analysis 11.5 (2007), 561-580. `_ + + metric: callable + Distance metric for FastDTW. """ - def __init__(self, k: int, radius: int = 10, metric: Callable = euclidean): - """ - Parameters: - k {int} - Number of neighbors. - radius {int} - Radius parameter for FastDTW. - See: https://pdfs.semanticscholar.org/05a2/0cde15e172fc82f32774dd0cf4fe5827cad2.pdf - metric {Callable} - Distance metric for FastDTW. - """ + def __init__(self, k, radius, metric: euclidean): self._val = Validator() self._k = self._val.restricted_integer( k, lambda x: x > 0, desc='number of neighbors', expected='greater than zero') @@ -46,26 +35,38 @@ def __init__(self, k: int, radius: int = 10, metric: Callable = euclidean): radius, lambda x: x > 0, desc='radius parameter', expected='greater than zero') self._metric = metric - def fit(self, X: List[np.ndarray], y: List[str]) -> None: + def fit(self, X, y): """Fits the classifier by adding labeled training observation sequences. - Parameters: - X {list(numpy.ndarray)} - A list of multiple observation sequences. - y {list(str)} - A list of labels for the observation sequences. + Parameters + ---------- + X: List[numpy.ndarray] + A list of multiple observation sequences. + + y: List[str] + A list of labels for the observation sequences. """ self._X, self._y = self._val.observation_sequences_and_labels(X, y) - def predict(self, X: Union[np.ndarray, List[np.ndarray]], verbose=True, n_jobs=1) -> Union[str, List[str]]: + def predict(self, X, verbose=True, n_jobs=1): """Predicts the label for an observation sequence (or multiple sequences). - Parameters: - X {numpy.ndarray, list(numpy.ndarray)} - An individual observation sequence or - a list of multiple observation sequences. - verbose {bool} - Whether to display a progress bar or not. 
- n_jobs {int} - The number of jobs to run in parallel. + Parameters + ---------- + X: numpy.ndarray or List[numpy.ndarray] + An individual observation sequence or a list of multiple observation sequences. - Returns {numpy.ndarray, list(numpy.ndarray)}: - The predicted labels for the observation sequence(s). + verbose: bool + Whether to display a progress bar or not. + + n_jobs: int + | The number of jobs to run in parallel. + | Setting this to -1 will use all available CPU cores. + + Returns + ------- + prediction(s): str or List[str] + The predicted label(s) for the observation sequence(s). """ try: (self._X, self._y) @@ -115,19 +116,34 @@ def parallel_predict(process, X_chunk): labels = Parallel(n_jobs=n_jobs)(delayed(parallel_predict)(i+1, chunk) for i, chunk in enumerate(X_chunks)) return [label for sublist in labels for label in sublist] # Flatten the resulting array - def evaluate(self, X: List[np.ndarray], y: List[str], labels=None, verbose=True, n_jobs=1) -> Tuple[float, np.ndarray]: + def evaluate(self, X, y, labels=None, verbose=True, n_jobs=1): """Evaluates the performance of the classifier on a batch of observation sequences and their labels. - Parameters: - X {list(numpy.ndarray)} - A list of multiple observation sequences. - y {list(str)} - A list of labels for the observation sequences. - labels {list(str)} - A list of labels for ordering the axes of the confusion matrix. - verbose {bool} - Whether to display a progress bar for predictions or not. - n_jobs {int} - The number of jobs to run in parallel. + Parameters + ---------- + X: List[numpy.ndarray] + A list of multiple observation sequences. + + y: List[str] + A list of labels for the observation sequences. + + labels: List[str] + A list of labels for ordering the axes of the confusion matrix. + + verbose: bool + Whether to display a progress bar for predictions or not. + + n_jobs: int + | The number of jobs to run in parallel. + | Setting this to -1 will use all available CPU cores. 
+ + Returns + ------- + accuracy: float + The categorical accuracy of the classifier on the observation sequences. - Return: {tuple(float, numpy.ndarray)} - - The categorical accuracy of the classifier on the observation sequences. - - A confusion matrix representing the discrepancy between predicted and actual labels. + confusion: numpy.ndarray + The confusion matrix representing the discrepancy between predicted and actual labels. """ self._val.observation_sequences_and_labels(X, y) self._val.boolean(verbose, desc='verbose') diff --git a/lib/sequentia/classifiers/hmm/hmm.py b/lib/sequentia/classifiers/hmm/hmm.py index 7f7d196b..274559c4 100644 --- a/lib/sequentia/classifiers/hmm/hmm.py +++ b/lib/sequentia/classifiers/hmm/hmm.py @@ -2,41 +2,44 @@ import pomegranate as pg from .topologies.ergodic import ErgodicTopology from .topologies.left_right import LeftRightTopology -from typing import List from ...internals import Validator class HMM: """A hidden Markov model representing an isolated temporal sequence class. - Example: - >>> import numpy as np - >>> from sequentia.classifiers import HMM - >>> ​ - >>> # Create some sample data - >>> X = [np.random.random((10 * i, 3)) for i in range(1, 4)] - >>> ​ - >>> # Create and fit a left-right HMM with random transitions and initial state distribution - >>> hmm1 = HMM(label='class1', n_states=5, topology='left-right') - >>> hmm1.set_random_initial() - >>> hmm1.set_random_transitions() - >>> hmm1.fit(X) - - Attributes: - label (getter) - The label for the model. - n_states (getter) - The number of states for the model. - n_seqs (getter) - The number of observation sequences use to train the model. - initial (setter/getter) - The initial state distribution of the model. - transitions (setter/getter) - The transition matrix of the model. + Parameters + ---------- + label: str + A label for the model, corresponding to the class being represented. + + n_states: int + The number of states for the model. 
+ + topology: {'ergodic', 'left-right'} + The topology for the model. + + random_state: numpy.random.RandomState, int, optional + A random state object or seed for reproducible randomness. + + Attributes + ---------- + label: str + The label for the model. + + n_states: int + The number of states for the model. + + n_seqs: int + The number of observation sequences use to train the model. + + initial: numpy.ndarray + The initial state distribution of the model. + + transitions: numpy.ndarray + The transition matrix of the model. """ - def __init__(self, label: str, n_states: int, topology='left-right', random_state=None): - """ - Parameters: - label {str} - A label for the model (should ideally correspond to the class label). - n_states {int} - The number of states for the model. - topology {str} - The topology ('ergodic' or 'left-right') for the model. - random_state {numpy.random.RandomState, int} - A random state object or seed for reproducible randomness. - """ + def __init__(self, label, n_states, topology='left-right', random_state=None): self._val = Validator() self._label = self._val.string(label, 'model label') self._n_states = self._val.restricted_integer( @@ -57,65 +60,34 @@ def __init__(self, label: str, n_states: int, topology='left-right', random_stat elif topology == 'left-right': self._topology = LeftRightTopology(self._n_states, self._random_state) - @property - def label(self) -> str: - return self._label - - @property - def n_states(self) -> int: - return self._n_states - - @property - def n_seqs(self) -> int: - """Number of observation sequences used to train the model.""" - try: - return self._n_seqs - except AttributeError as e: - raise AttributeError('The model has not been fitted and has not seen any observation sequences') from e - - @property - def initial(self) -> np.ndarray: - try: - return self._initial - except AttributeError as e: - raise AttributeError('No initial state distribution has been defined') from e - - @initial.setter - def 
initial(self, probabilities: np.ndarray): - self._topology.validate_initial(probabilities) - self._initial = probabilities - - @property - def transitions(self) -> np.ndarray: - try: - return self._transitions - except AttributeError as e: - raise AttributeError('No transition matrix has been defined') from e - - @transitions.setter - def transitions(self, probabilities: np.ndarray): - self._topology.validate_transitions(probabilities) - self._transitions = probabilities - def set_uniform_initial(self): + """Sets a uniform initial state distribution.""" self._initial = self._topology.uniform_initial() def set_random_initial(self): + """Sets a random initial state distribution.""" self._initial = self._topology.random_initial() def set_uniform_transitions(self): + """Sets a uniform transition matrix according to the topology.""" self._transitions = self._topology.uniform_transitions() def set_random_transitions(self): + """Sets a random transition matrix according to the topology.""" self._transitions = self._topology.random_transitions() - def fit(self, X: List[np.ndarray], n_jobs=1): + def fit(self, X, n_jobs=1): """Fits the HMM to observation sequences assumed to be labeled as the class that the model represents. - Parameters: - X {list(numpy.ndarray)} - Collection of multivariate observation sequences, each of shape (T, D) - where T may vary per observation sequence. - n_jobs {int} - The number of jobs to run in parallel. + Parameters + ---------- + X: List[numpy.ndarray] + Collection of multivariate observation sequences, each of shape :math:`(T \\times D)` where + :math:`T` may vary per observation sequence. + + n_jobs: int + | The number of jobs to run in parallel. + | Setting this to -1 will use all available CPU cores. 
""" self._val.observation_sequences(X) self._val.restricted_integer(n_jobs, lambda x: x == -1 or x > 0, 'number of jobs', '-1 or greater than zero') @@ -147,14 +119,19 @@ def fit(self, X: List[np.ndarray], n_jobs=1): self._initial = inner_tx[self._n_states] self._transitions = inner_tx[:self._n_states] - def forward(self, sequence: np.ndarray) -> float: + def forward(self, sequence): """Runs the forward algorithm to calculate the (negative log) likelihood of the model generating an observation sequence. - Parameters: - sequence {numpy.ndarray} - An individual sequence of observations of size (T, D) where: - T is the number of time frames (or observations) and D is the number of features. + Parameters + ---------- + sequence: numpy.ndarray + An individual sequence of observations of size :math:`(T \\times D)` where + :math:`T` is the number of time frames (or observations) and + :math:`D` is the number of features. - Returns {float}: + Returns + ------- + negative log-likelihood: float The negative log-likelihood of the model generating the observation sequence. 
""" if not isinstance(sequence, np.ndarray): @@ -164,4 +141,43 @@ def forward(self, sequence: np.ndarray) -> float: if not sequence.shape[1] == self._n_features: raise ValueError('Number of observation features must match the dimensionality of the original data used to fit the model') - return -self._model.log_probability(sequence) \ No newline at end of file + return -self._model.log_probability(sequence) + + @property + def label(self): + return self._label + + @property + def n_states(self): + return self._n_states + + @property + def n_seqs(self): + try: + return self._n_seqs + except AttributeError as e: + raise AttributeError('The model has not been fitted and has not seen any observation sequences') from e + + @property + def initial(self): + try: + return self._initial + except AttributeError as e: + raise AttributeError('No initial state distribution has been defined') from e + + @initial.setter + def initial(self, probabilities): + self._topology.validate_initial(probabilities) + self._initial = probabilities + + @property + def transitions(self): + try: + return self._transitions + except AttributeError as e: + raise AttributeError('No transition matrix has been defined') from e + + @transitions.setter + def transitions(self, probabilities): + self._topology.validate_transitions(probabilities) + self._transitions = probabilities \ No newline at end of file diff --git a/lib/sequentia/classifiers/hmm/hmm_classifier.py b/lib/sequentia/classifiers/hmm/hmm_classifier.py index a7b47fb9..101b5701 100644 --- a/lib/sequentia/classifiers/hmm/hmm_classifier.py +++ b/lib/sequentia/classifiers/hmm/hmm_classifier.py @@ -1,43 +1,21 @@ import numpy as np from .hmm import HMM from sklearn.metrics import confusion_matrix -from typing import Dict, Union, List, Tuple, Any from ...internals import Validator class HMMClassifier: - """An ensemble classifier that combines individual HMMs which model isolated sequences from different classes. 
- - Example: - >>> import numpy as np - >>> from sequentia.classifiers import HMM, HMMClassifier - >>> ​ - >>> # Create and fit some sample HMMs - >>> hmms = [] - >>> for i in range(5): - >>> hmm = HMM(label=f'class{i}', n_states=(i + 3), topology='left-right') - >>> hmm.set_random_initial() - >>> hmm.set_random_transitions() - >>> hmm.fit([np.arange((i + j * 20) * 30).reshape(-1, 3) for j in range(1, 4)]) - >>> hmms.append(hmm) - >>> ​ - >>> # Create some sample test data and labels - >>> X = [np.random.random((10 * i, 3)) for i in range(1, 4)] - >>> y = ['class0', 'class1', 'class1'] - >>> ​ - >>> # Create a classifier and calculate predictions and evaluations - >>> clf = HMMClassifier() - >>> clf.fit(hmms) - >>> predictions = clf.predict(X) - >>> f1, confusion = clf.evaluate(X, y) - """ + """An ensemble classifier that combines individual :class:`~HMM` objects, which model isolated sequences from different classes.""" def __init__(self): self._val = Validator() - def fit(self, models: Union[List[HMM], Dict[Any, HMM]]): - """ - Parameters: - models {list(HMM),dict(HMM)} - A collection of HMM objects to use for classification. + def fit(self, models): + """Fits the ensemble classifier with a collection of :class:`~HMM` objects. + + Parameters + ---------- + models: List[HMM] or Dict[Any, HMM] + A collection of :class:`~HMM` objects to use for classification. """ if isinstance(models, list): if not all(isinstance(model, HMM) for model in models): @@ -55,18 +33,31 @@ def fit(self, models: Union[List[HMM], Dict[Any, HMM]]): else: raise RuntimeError('Must fit the classifier with at least one HMM') - def predict(self, X: Union[np.ndarray, List[np.ndarray]], prior=True, return_scores=False) -> Union[str, List[str]]: + def predict(self, X, prior=True, return_scores=False): """Predicts the label for an observation sequence (or multiple sequences) according to maximum likelihood or posterior scores. 
- Parameters: - X {numpy.ndarray, list(numpy.ndarray)} - An individual observation sequence or - a list of multiple observation sequences. - prior {bool} - Whether to calculate a prior and perform MAP estimation. If this parameter is set - to False, then the negative log likelihoods generated from the models' `forward` function are used. - return_scores {bool} - Whether to return the scores of each model on the observation sequence(s). + Parameters + ---------- + X: numpy.ndarray or List[numpy.ndarray] + An individual observation sequence or a list of multiple observation sequences. + + prior: bool + Whether to calculate a prior for each model and perform MAP estimation by scoring with + the joint probability (or un-normalized posterior) :math:`\mathbb{P}(O, \lambda_c)=\mathbb{P}(O|\lambda_c)\mathbb{P}(\lambda_c)`. + + If this parameter is set to false, then the negative log likelihoods + :math:`\mathbb{P}(O|\lambda_c)` generated from the models' :func:`~HMM.forward` function are used. + + return_scores: bool + Whether to return the scores of each model on the observation sequence(s). + + Returns + ------- + prediction(s): str or List[str] + The predicted label(s) for the observation sequence(s). - Returns {str, list(str)}: - The predicted labels for the observation sequence(s). + If ``return_scores`` is true, then for each observation sequence, a tuple `(label, scores)` is returned for each label, + consisting of the `scores` of each HMM and the `label` of the HMM with the best score. 
""" self._val.boolean(prior, desc='prior') self._val.boolean(return_scores, desc='return_scores') @@ -91,19 +82,34 @@ def predict(self, X: Union[np.ndarray, List[np.ndarray]], prior=True, return_sco predictions.append((best[0], scores) if return_scores else best[0]) return predictions - def evaluate(self, X: List[np.ndarray], y: List[str], prior=True, labels=None) -> Tuple[float, np.ndarray]: + def evaluate(self, X, y, prior=True, labels=None): """Evaluates the performance of the classifier on a batch of observation sequences and their labels. - Parameters: - X {list(numpy.ndarray)} - A list of multiple observation sequences. - y {list(str)} - A list of labels for the observation sequences. - prior {bool} - Whether to calculate a prior and perform MAP estimation. If this parameter is set - to False, then the negative log likelihoods generated from the models' `forward` function are used. - labels {list(str)} - A list of labels for ordering the axes of the confusion matrix. + Parameters + ---------- + X: List[numpy.ndarray] + A list of multiple observation sequences. + + y: List[str] + A list of labels for the observation sequences. + + prior: bool + Whether to calculate a prior for each model and perform MAP estimation by scoring with + the joint probability (or un-normalized posterior) :math:`\mathbb{P}(O, \lambda_c)=\mathbb{P}(O|\lambda_c)\mathbb{P}(\lambda_c)`. + + If this parameter is set to false, then the negative log likelihoods + :math:`\mathbb{P}(O|\lambda_c)` generated from the models' :func:`~HMM.forward` function are used. + + labels: List[str] + A list of labels for ordering the axes of the confusion matrix. + + Returns + ------- + accuracy: float + The categorical accuracy of the classifier on the observation sequences. - Return: {tuple(float, numpy.ndarray)} - - The categorical accuracy of the classifier on the observation sequences. - - A confusion matrix representing the discrepancy between predicted and actual labels. 
+ confusion: numpy.ndarray + The confusion matrix representing the discrepancy between predicted and actual labels. """ self._val.observation_sequences_and_labels(X, y) self._val.boolean(prior, desc='prior') diff --git a/lib/sequentia/preprocessing/methods.py b/lib/sequentia/preprocessing/methods.py index 9c7e17ea..d5d11794 100644 --- a/lib/sequentia/preprocessing/methods.py +++ b/lib/sequentia/preprocessing/methods.py @@ -1,23 +1,25 @@ import scipy.fftpack import numpy as np -from typing import Union, List from ..internals import Validator -def normalize(X: Union[np.ndarray, List[np.ndarray]]) -> Union[np.ndarray, List[np.ndarray]]: +def normalize(X): """Normalizes an observation sequence (or multiple sequences) by centering observations around the mean. - Parameters: - X {numpy.ndarray, list(numpy.ndarray)} - An individual observation sequence or - a list of multiple observation sequences. + Parameters + ---------- + X: numpy.ndarray or List[numpy.ndarray] + An individual observation sequence or a list of multiple observation sequences. - Returns {numpy.ndarray, list(numpy.ndarray)}: + Returns + ------- + normalized: numpy.ndarray or List[numpy.ndarray] The normalized input observation sequence(s). 
""" val = Validator() val.observation_sequences(X, allow_single=True) return _normalize(X) -def _normalize(X: Union[np.ndarray, List[np.ndarray]]) -> Union[np.ndarray, List[np.ndarray]]: +def _normalize(X): def transform(x): return x - x.mean(axis=0) @@ -26,20 +28,25 @@ def transform(x): elif isinstance(X, np.ndarray): return transform(X) -def downsample(X: Union[np.ndarray, List[np.ndarray]], n: int, method='decimate') -> Union[np.ndarray, List[np.ndarray]]: - """Downsamples an observation sequence (or multiple sequences) by: - - Decimating the next n-1 observations - - Averaging the current observation with the next n-1 observations +def downsample(X, n, method='decimate'): + """Downsamples an observation sequence (or multiple sequences) by either: + - Decimating the next :math:`n-1` observations + - Averaging the current observation with the next :math:`n-1` observations - Parameters: - X {numpy.ndarray, list(numpy.ndarray)} - An individual observation sequence or - a list of multiple observation sequences. - n {int} - Downsample factor. - NOTE: This downsamples the current observation by either decimating the next n-1 - observations or computing an average with them. - method {str} - The downsampling method, either 'decimate' or 'average'. + Parameters + ---------- + X: numpy.ndarray or List[numpy.ndarray] + An individual observation sequence or a list of multiple observation sequences. - Returns {numpy.ndarray, list(numpy.ndarray)}: + n: int + Downsample factor. + + method: {'decimate', 'average'} + The downsampling method. + + Returns + ------- + downsampled: numpy.ndarray or List[numpy.ndarray] The downsampled input observation sequence(s). 
""" val = Validator() @@ -48,7 +55,7 @@ def downsample(X: Union[np.ndarray, List[np.ndarray]], n: int, method='decimate' val.one_of(method, ['decimate', 'average'], desc='downsampling method') return _downsample(X, n, method) -def _downsample(X: Union[np.ndarray, List[np.ndarray]], n: int, method: str) -> Union[np.ndarray, List[np.ndarray]]: +def _downsample(X, n, method): def transform(x): N, D = x.shape if method == 'decimate': @@ -63,17 +70,24 @@ def transform(x): elif isinstance(X, np.ndarray): return transform(X) -def fft(X: Union[np.ndarray, List[np.ndarray]]) -> Union[np.ndarray, List[np.ndarray]]: +def fft(X): """Applies a Discrete Fourier Transform to the input observation sequence(s). - Returns {numpy.ndarray, list(numpy.ndarray)}: + Parameters + ---------- + X: numpy.ndarray or List[numpy.ndarray] + An individual observation sequence or a list of multiple observation sequences. + + Returns + ------- + transformed: numpy.ndarray or List[numpy.ndarray] The transformed input observation sequence(s). """ val = Validator() val.observation_sequences(X, allow_single=True) return _fft(X) -def _fft(X: Union[np.ndarray, List[np.ndarray]]) -> Union[np.ndarray, List[np.ndarray]]: +def _fft(X): def transform(x): return scipy.fftpack.rfft(x, axis=0) diff --git a/lib/sequentia/preprocessing/preprocess.py b/lib/sequentia/preprocessing/preprocess.py index c0eb24da..819fd0df 100644 --- a/lib/sequentia/preprocessing/preprocess.py +++ b/lib/sequentia/preprocessing/preprocess.py @@ -1,61 +1,50 @@ import numpy as np -from typing import Union, List from .methods import _normalize, _downsample, _fft from ..internals import Validator class Preprocess: - """Efficiently applies multiple preprocessing transformations to provided input observation sequences. 
- - Example: - >>> import numpy as np - >>> from sequentia.preprocessing import Preprocess - >>> ​ - >>> # Create some sample data - >>> X = [np.random.random((10 * i, 3)) for i in range(1, 4)] - >>> ​ - >>> # Create the Preprocess object - >>> pre = Preprocess() - >>> pre.normalize() - >>> pre.downsample(10, method='average') - >>> pre.fft() - >>> ​ - >>> # Transform the data applying transformations in order - >>> X = pre.transform(X) - """ + """Efficiently applies multiple preprocessing transformations to provided input observation sequences.""" def __init__(self): self._transforms = [] self._val = Validator() - def normalize(self) -> None: + def normalize(self): """Normalizes an observation sequence (or multiple sequences) by centering observations around the mean.""" self._transforms.append((_normalize, {})) - def downsample(self, n: int, method='decimate') -> None: - """Downsamples an observation sequence (or multiple sequences) by: - - Decimating the next n-1 observations - - Averaging the current observation with the next n-1 observations + def downsample(self, n, method='decimate'): + """Downsamples an observation sequence (or multiple sequences) by either: + - Decimating the next :math:`n-1` observations + - Averaging the current observation with the next :math:`n-1` observations + + Parameters + ---------- + n: int + Downsample factor. - Parameters: - n {int} - Downsample factor. This downsamples the current observation - by either decimating the next n-1 observations or computing an average with them. - method {str} - The downsamplimg method, either 'decimate' or 'average'. + method: {'decimate', 'average'} + The downsampling method. 
""" self._val.restricted_integer(n, lambda x: x > 1, desc='downsample factor', expected='greater than one') self._val.one_of(method, ['decimate', 'average'], desc='downsampling method') self._transforms.append((_downsample, {'n': n, 'method': method})) - def fft(self) -> None: + def fft(self): """Applies a Discrete Fourier Transform to the input observation sequence(s).""" self._transforms.append((_fft, {})) - def transform(self, X: List[np.ndarray]) -> List[np.ndarray]: + def transform(self, X): """Applies the preprocessing transformations to the provided input observation sequence(s). - Parameters: - X {list(numpy.ndarray)} - A list of multiple observation sequences. + Parameters + ---------- + X: List[numpy.ndarray] + A list of multiple observation sequences. - Returns {list(numpy.ndarray)}: + Returns + ------- + transformed: List[numpy.ndarray] The input observation sequences with preprocessing transformations applied in order. """ self._val.observation_sequences(X)