Fixed #1058 Added support for Polars

TDAmeritrade · Jan 7, 2025 · ce0cd8c · ce0cd8c
1 parent 4416873
commit ce0cd8c
Show file tree

Hide file tree

Showing 10 changed files with 56 additions and 24 deletions.
diff --git a/environment.yml b/environment.yml
@@ -21,8 +21,9 @@ dependencies:
   - pydata-sphinx-theme>=0.5.2
   - scikit-learn>=0.21.3
   - numpydoc>=1.1.0
-  - build>=0.7.0
+  - python-build>=0.7.0
   - pytest-check-links>=0.7.1
   - isort>=5.11.0
   - jupyterlab-myst>=2.0.0
   - myst-nb>=1.0.0
+  - polars>=1.14.0
diff --git a/pyproject.toml b/pyproject.toml
@@ -52,7 +52,8 @@ ci = [
     "black >= 22.1.0",
     "pytest >= 4.4.1",
     "isort >= 5.11.0",
-    'tbb >= 2019.5 ; platform_system == "Linux"'
+    'tbb >= 2019.5 ; platform_system == "Linux"',
+    "polars >= 1.14.0"
 ]
 
 [project.urls]

diff --git a/stumpy/core.py b/stumpy/core.py
@@ -445,25 +445,26 @@ def check_dtype(a, dtype=np.float64):  # pragma: no cover
 
 def transpose_dataframe(df):  # pragma: no cover
     """
-    Check if the input is a column-wise Pandas `DataFrame`. If `True`, return a
+    Check if the input is a column-wise pandas/polars `DataFrame`. If `True`, return a
     transpose dataframe since stumpy assumes that each row represents data from a
     different dimension while each column represents data from the same dimension.
-    If `False`, return `a` unchanged. Pandas `Series` do not need to be transposed.
+    If `False`, return `df` unchanged. Pandas/polars `Series` do not need to be
+    transposed.
 
-    Note that this function has zero dependency on Pandas (not even a soft dependency).
+    Note that this function has zero (not even a soft) dependency on pandas or polars.
 
     Parameters
     ----------
-    df : numpy.ndarray
-        Pandas dataframe
+    df : DataFrame
+        pandas/polars dataframe
 
     Returns
     -------
     output : df
-        If `df` is a Pandas `DataFrame` then return `df.T`. Otherwise, return `df`
+        The transpose of `df` if it is a pandas/polars `DataFrame`. Otherwise, `df`
     """
     if type(df).__name__ == "DataFrame":
-        return df.T
+        return df.transpose()
 
     return df
 
@@ -2062,8 +2063,16 @@ def _preprocess(T, copy=True):
         Modified time series
     """
     if copy:
-        T = T.copy()
+        try:
+            T = T.copy()
+        except AttributeError:  # Polars copy
+            T = T.clone()
+
     T = transpose_dataframe(T)
+
+    if "polars" in str(type(T)):
+        T = T.to_numpy(writable=True)
+
     T = np.asarray(T)
     check_dtype(T)
 

diff --git a/stumpy/maamp.py b/stumpy/maamp.py
@@ -879,8 +879,8 @@ def maamp(T, m, include=None, discords=False, p=2.0):
     ----------
     T : numpy.ndarray
         The time series or sequence for which to compute the multi-dimensional
-        matrix profile. Each row in `T` represents data from a different
-        dimension while each column in `T` represents data from the same
+        matrix profile. Each row in `T` represents data from the same
+        dimension while each column in `T` represents data from a different
         dimension.
 
     m : int

diff --git a/stumpy/maamped.py b/stumpy/maamped.py
@@ -39,8 +39,8 @@ def _dask_maamped(
 
     T_A : numpy.ndarray
         The time series or sequence for which to compute the multi-dimensional
-        matrix profile. Each row in `T_A` represents data from a different
-        dimension while each column in `T_A` represents data from the same
+        matrix profile. Each row in `T_A` represents data from the same
+        dimension while each column in `T_A` represents data from a different
         dimension.
 
     T_B : numpy.ndarray
@@ -194,8 +194,8 @@ def _ray_maamped(
 
     T_A : numpy.ndarray
         The time series or sequence for which to compute the multi-dimensional
-        matrix profile. Each row in `T_A` represents data from a different
-        dimension while each column in `T_A` represents data from the same
+        matrix profile. Each row in `T_A` represents data from the same
+        dimension while each column in `T_A` represents data from a different
         dimension.
 
     T_B : numpy.ndarray
@@ -335,8 +335,8 @@ def maamped(client, T, m, include=None, discords=False, p=2.0):
 
     T : numpy.ndarray
         The time series or sequence for which to compute the multi-dimensional
-        matrix profile. Each row in `T` represents data from a different
-        dimension while each column in `T` represents data from the same
+        matrix profile. Each row in `T` represents data from the same
+        dimension while each column in `T` represents data from a different
         dimension.
 
     m : int

diff --git a/stumpy/mstump.py b/stumpy/mstump.py
@@ -1126,8 +1126,8 @@ def mstump(
     ----------
     T : numpy.ndarray
         The time series or sequence for which to compute the multi-dimensional
-        matrix profile. Each row in ``T`` represents data from a different
-        dimension while each column in ``T`` represents data from the same
+        matrix profile. Each row in ``T`` represents data from the same
+        dimension while each column in ``T`` represents data from a different
         dimension.
 
     m : int

diff --git a/stumpy/mstumped.py b/stumpy/mstumped.py
@@ -43,8 +43,8 @@ def _dask_mstumped(
 
     T_A : numpy.ndarray
         The time series or sequence for which to compute the multi-dimensional
-        matrix profile. Each row in `T_A` represents data from a different
-        dimension while each column in `T_A` represents data from the same
+        matrix profile. Each row in `T_A` represents data from the same
+        dimension while each column in `T_A` represents data from a different
         dimension.
 
     T_B : numpy.ndarray
@@ -216,8 +216,8 @@ def _ray_mstumped(
 
     T_A : numpy.ndarray
         The time series or sequence for which to compute the multi-dimensional
-        matrix profile. Each row in `T_A` represents data from a different
-        dimension while each column in `T_A` represents data from the same
+        matrix profile. Each row in `T_A` represents data from the same
+        dimension while each column in `T_A` represents data from a different
         dimension.
 
     T_B : numpy.ndarray
@@ -387,8 +387,8 @@ def mstumped(
 
     T : numpy.ndarray
         The time series or sequence for which to compute the multi-dimensional
-        matrix profile. Each row in ``T`` represents data from a different
-        dimension while each column in ``T`` represents data from the same
+        matrix profile. Each row in ``T`` represents data from the same
+        dimension while each column in ``T`` represents data from a different
         dimension.
 
     m : int

diff --git a/tests/test_mstump.py b/tests/test_mstump.py
@@ -4,6 +4,7 @@
 import numpy as np
 import numpy.testing as npt
 import pandas as pd
+import polars as pl
 import pytest
 
 from stumpy import config, core, mdl, mstump, subspace
@@ -305,6 +306,12 @@ def test_mstump_wrapper(T, m):
     npt.assert_almost_equal(ref_P, comp_P)
     npt.assert_almost_equal(ref_I, comp_I)
 
+    df = pl.DataFrame(T.T)
+    comp_P, comp_I = mstump(df, m)
+
+    npt.assert_almost_equal(ref_P, comp_P)
+    npt.assert_almost_equal(ref_I, comp_I)
+
 
 @pytest.mark.parametrize("T, m", test_data)
 def test_mstump_wrapper_include(T, m):

diff --git a/tests/test_stump.py b/tests/test_stump.py
@@ -4,6 +4,7 @@
 import numpy as np
 import numpy.testing as npt
 import pandas as pd
+import polars as pl
 import pytest
 
 from stumpy import config, stump
@@ -42,6 +43,10 @@ def test_stump_self_join(T_A, T_B):
     naive.replace_inf(comp_mp)
     npt.assert_almost_equal(ref_mp, comp_mp)
 
+    comp_mp = stump(pl.Series(T_B), m, ignore_trivial=True)
+    naive.replace_inf(comp_mp)
+    npt.assert_almost_equal(ref_mp, comp_mp)
+
 
 @pytest.mark.parametrize("T_A, T_B", test_data)
 def test_stump_A_B_join(T_A, T_B):
@@ -56,6 +61,10 @@ def test_stump_A_B_join(T_A, T_B):
     naive.replace_inf(comp_mp)
     npt.assert_almost_equal(ref_mp, comp_mp)
 
+    comp_mp = stump(pl.Series(T_A), m, pl.Series(T_B), ignore_trivial=False)
+    naive.replace_inf(comp_mp)
+    npt.assert_almost_equal(ref_mp, comp_mp)
+
 
 def test_stump_constant_subsequence_self_join():
     T_A = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64)))

diff --git a/tests/test_stumped.py b/tests/test_stumped.py
@@ -4,6 +4,7 @@
 import numpy as np
 import numpy.testing as npt
 import pandas as pd
+import polars as pl
 import pytest
 from dask.distributed import Client, LocalCluster
 
@@ -75,6 +76,10 @@ def test_stumped_self_join_df(T_A, T_B, dask_cluster):
         naive.replace_inf(comp_mp)
         npt.assert_almost_equal(ref_mp, comp_mp)
 
+        comp_mp = stumped(dask_client, pl.Series(T_B), m, ignore_trivial=True)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(ref_mp, comp_mp)
+
 
 @pytest.mark.filterwarnings("ignore:numpy.dtype size changed")
 @pytest.mark.filterwarnings("ignore:numpy.ufunc size changed")