Skip to content

Commit

Permalink
Fixed #1058 Added support for Polars
Browse files Browse the repository at this point in the history
  • Loading branch information
seanlaw committed Jan 7, 2025
1 parent 4416873 commit ce0cd8c
Show file tree
Hide file tree
Showing 10 changed files with 56 additions and 24 deletions.
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ dependencies:
- pydata-sphinx-theme>=0.5.2
- scikit-learn>=0.21.3
- numpydoc>=1.1.0
- build>=0.7.0
- python-build>=0.7.0
- pytest-check-links>=0.7.1
- isort>=5.11.0
- jupyterlab-myst>=2.0.0
- myst-nb>=1.0.0
- polars>=1.14.0
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ ci = [
"black >= 22.1.0",
"pytest >= 4.4.1",
"isort >= 5.11.0",
'tbb >= 2019.5 ; platform_system == "Linux"'
'tbb >= 2019.5 ; platform_system == "Linux"',
"polars >= 1.14.0"
]

[project.urls]
Expand Down
21 changes: 15 additions & 6 deletions stumpy/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,25 +445,26 @@ def check_dtype(a, dtype=np.float64): # pragma: no cover

def transpose_dataframe(df): # pragma: no cover
"""
Check if the input is a column-wise Pandas `DataFrame`. If `True`, return a
Check if the input is a column-wise pandas/polars `DataFrame`. If `True`, return a
transpose dataframe since stumpy assumes that each row represents data from a
different dimension while each column represents data from the same dimension.
If `False`, return `a` unchanged. Pandas `Series` do not need to be transposed.
If `False`, return `df` unchanged. Pandas/polars `Series` do not need to be
transposed.
Note that this function has zero dependency on pandas or polars (not even a soft dependency).
Parameters
----------
df : numpy.ndarray
Pandas dataframe
df : DataFrame
pandas/polars dataframe
Returns
-------
output : df
If `df` is a pandas/polars `DataFrame` then return `df.transpose()`. Otherwise, return `df`
"""
if type(df).__name__ == "DataFrame":
return df.T
return df.transpose()

return df

Expand Down Expand Up @@ -2062,8 +2063,16 @@ def _preprocess(T, copy=True):
Modified time series
"""
if copy:
T = T.copy()
try:
T = T.copy()
except AttributeError: # Polars copy
T = T.clone()

T = transpose_dataframe(T)

if "polars" in str(type(T)):
T = T.to_numpy(writable=True)

T = np.asarray(T)
check_dtype(T)

Expand Down
4 changes: 2 additions & 2 deletions stumpy/maamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,8 +879,8 @@ def maamp(T, m, include=None, discords=False, p=2.0):
----------
T : numpy.ndarray
The time series or sequence for which to compute the multi-dimensional
matrix profile. Each row in `T` represents data from a different
dimension while each column in `T` represents data from the same
matrix profile. Each row in `T` represents data from the same
dimension while each column in `T` represents data from a different
dimension.
m : int
Expand Down
12 changes: 6 additions & 6 deletions stumpy/maamped.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ def _dask_maamped(
T_A : numpy.ndarray
The time series or sequence for which to compute the multi-dimensional
matrix profile. Each row in `T_A` represents data from a different
dimension while each column in `T_A` represents data from the same
matrix profile. Each row in `T_A` represents data from the same
dimension while each column in `T_A` represents data from a different
dimension.
T_B : numpy.ndarray
Expand Down Expand Up @@ -194,8 +194,8 @@ def _ray_maamped(
T_A : numpy.ndarray
The time series or sequence for which to compute the multi-dimensional
matrix profile. Each row in `T_A` represents data from a different
dimension while each column in `T_A` represents data from the same
matrix profile. Each row in `T_A` represents data from the same
dimension while each column in `T_A` represents data from a different
dimension.
T_B : numpy.ndarray
Expand Down Expand Up @@ -335,8 +335,8 @@ def maamped(client, T, m, include=None, discords=False, p=2.0):
T : numpy.ndarray
The time series or sequence for which to compute the multi-dimensional
matrix profile. Each row in `T` represents data from a different
dimension while each column in `T` represents data from the same
matrix profile. Each row in `T` represents data from the same
dimension while each column in `T` represents data from a different
dimension.
m : int
Expand Down
4 changes: 2 additions & 2 deletions stumpy/mstump.py
Original file line number Diff line number Diff line change
Expand Up @@ -1126,8 +1126,8 @@ def mstump(
----------
T : numpy.ndarray
The time series or sequence for which to compute the multi-dimensional
matrix profile. Each row in ``T`` represents data from a different
dimension while each column in ``T`` represents data from the same
matrix profile. Each row in ``T`` represents data from the same
dimension while each column in ``T`` represents data from a different
dimension.
m : int
Expand Down
12 changes: 6 additions & 6 deletions stumpy/mstumped.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def _dask_mstumped(
T_A : numpy.ndarray
The time series or sequence for which to compute the multi-dimensional
matrix profile. Each row in `T_A` represents data from a different
dimension while each column in `T_A` represents data from the same
matrix profile. Each row in `T_A` represents data from the same
dimension while each column in `T_A` represents data from a different
dimension.
T_B : numpy.ndarray
Expand Down Expand Up @@ -216,8 +216,8 @@ def _ray_mstumped(
T_A : numpy.ndarray
The time series or sequence for which to compute the multi-dimensional
matrix profile. Each row in `T_A` represents data from a different
dimension while each column in `T_A` represents data from the same
matrix profile. Each row in `T_A` represents data from the same
dimension while each column in `T_A` represents data from a different
dimension.
T_B : numpy.ndarray
Expand Down Expand Up @@ -387,8 +387,8 @@ def mstumped(
T : numpy.ndarray
The time series or sequence for which to compute the multi-dimensional
matrix profile. Each row in ``T`` represents data from a different
dimension while each column in ``T`` represents data from the same
matrix profile. Each row in ``T`` represents data from the same
dimension while each column in ``T`` represents data from a different
dimension.
m : int
Expand Down
7 changes: 7 additions & 0 deletions tests/test_mstump.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
import numpy.testing as npt
import pandas as pd
import polars as pl
import pytest

from stumpy import config, core, mdl, mstump, subspace
Expand Down Expand Up @@ -305,6 +306,12 @@ def test_mstump_wrapper(T, m):
npt.assert_almost_equal(ref_P, comp_P)
npt.assert_almost_equal(ref_I, comp_I)

df = pl.DataFrame(T.T)
comp_P, comp_I = mstump(df, m)

npt.assert_almost_equal(ref_P, comp_P)
npt.assert_almost_equal(ref_I, comp_I)


@pytest.mark.parametrize("T, m", test_data)
def test_mstump_wrapper_include(T, m):
Expand Down
9 changes: 9 additions & 0 deletions tests/test_stump.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
import numpy.testing as npt
import pandas as pd
import polars as pl
import pytest

from stumpy import config, stump
Expand Down Expand Up @@ -42,6 +43,10 @@ def test_stump_self_join(T_A, T_B):
naive.replace_inf(comp_mp)
npt.assert_almost_equal(ref_mp, comp_mp)

comp_mp = stump(pl.Series(T_B), m, ignore_trivial=True)
naive.replace_inf(comp_mp)
npt.assert_almost_equal(ref_mp, comp_mp)


@pytest.mark.parametrize("T_A, T_B", test_data)
def test_stump_A_B_join(T_A, T_B):
Expand All @@ -56,6 +61,10 @@ def test_stump_A_B_join(T_A, T_B):
naive.replace_inf(comp_mp)
npt.assert_almost_equal(ref_mp, comp_mp)

comp_mp = stump(pl.Series(T_A), m, pl.Series(T_B), ignore_trivial=False)
naive.replace_inf(comp_mp)
npt.assert_almost_equal(ref_mp, comp_mp)


def test_stump_constant_subsequence_self_join():
T_A = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64)))
Expand Down
5 changes: 5 additions & 0 deletions tests/test_stumped.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
import numpy.testing as npt
import pandas as pd
import polars as pl
import pytest
from dask.distributed import Client, LocalCluster

Expand Down Expand Up @@ -75,6 +76,10 @@ def test_stumped_self_join_df(T_A, T_B, dask_cluster):
naive.replace_inf(comp_mp)
npt.assert_almost_equal(ref_mp, comp_mp)

comp_mp = stumped(dask_client, pl.Series(T_B), m, ignore_trivial=True)
naive.replace_inf(comp_mp)
npt.assert_almost_equal(ref_mp, comp_mp)


@pytest.mark.filterwarnings("ignore:numpy.dtype size changed")
@pytest.mark.filterwarnings("ignore:numpy.ufunc size changed")
Expand Down

0 comments on commit ce0cd8c

Please sign in to comment.