From de3b2686c22b18dea956abb725b225ced72c0489 Mon Sep 17 00:00:00 2001
From: Philip Ndikum
Date: Sun, 6 Oct 2024 04:19:19 +0000
Subject: [PATCH] feat: refactor core api and add systematic data generator

- implement new core api to support multiple backends (pandas, polars, modin)
- add synthetic_data_generator for systematic testing across backends
- refactor core modules: core_utils, exceptions, temporal_data_loader, temporal_target_shifter
- add new temporal_core_processing module
- restructure and update test files to align with new api design
- enhance functionality to support both single-step and multi-step operations
- update pyproject.toml to reflect new structure and dependencies
- fix pre-commit issues with MyPy and Ruff
- merge changes from main branch to integrate latest updates and resolve conflicts
---
 src/temporalscope/core/core_utils.py          | 76 +++++-------------
 src/temporalscope/core/exceptions.py          | 20 +++--
 .../core/temporal_core_processing.py          | 20 ++---
 .../core/temporal_data_loader.py              | 73 ++++++-----------
 .../core/temporal_target_shifter.py           | 17 ----
 src/temporalscope/datasets/datasets.py        | 29 ++++---
 .../datasets/synthetic_data_generator.py      | 41 ++++------
 src/temporalscope/partition/padding.py        | 42 +++++-----
 .../partition/partition_validators.py         |  3 +-
 src/temporalscope/partition/sliding_window.py | 36 ++++-----
 test/unit/core/test_core_utils.py             | 27 -------
 test/unit/core/test_exceptions.py             | 24 +++---
 test/unit/core/test_temporal_data_loader.py   | 18 -----
 test/unit/datasets/test_datasets.py           | 21 +++--
 .../datasets/test_synthetic_data_generator.py | 79 +++++++++++--------
 test/unit/partition/test_partition_padding.py |  6 --
 16 files changed, 198 insertions(+), 334 deletions(-)

diff --git a/src/temporalscope/core/core_utils.py b/src/temporalscope/core/core_utils.py
index 1683628..a4f0b14 100644
--- a/src/temporalscope/core/core_utils.py
+++ b/src/temporalscope/core/core_utils.py
@@ -101,15 +101,15 @@
 """
 
 import os
-from typing import Dict, Optional, Union, cast, Callable, Type
-from datetime import datetime, timedelta, date
 import warnings
+from typing import Callable, Dict, Optional, Type, Union, cast
 
 import modin.pandas as mpd
 import pandas as pd
 import polars as pl
 from dotenv import load_dotenv
-from temporalscope.core.exceptions import UnsupportedBackendError, MixedFrequencyWarning
+
+from temporalscope.core.exceptions import MixedFrequencyWarning, UnsupportedBackendError
 
 # Load environment variables from the .env file
 load_dotenv()
@@ -191,10 +191,7 @@ def validate_mode(backend: str, mode: str) -> None:
 
 
 def validate_and_convert_input(
-    df: SupportedBackendDataFrame,
-    backend: str,
-    time_col: Optional[str] = None,
-    mode: str = MODE_SINGLE_STEP
+    df: SupportedBackendDataFrame, backend: str, time_col: Optional[str] = None, mode: str = MODE_SINGLE_STEP
 ) -> SupportedBackendDataFrame:
     """Validates and converts the input DataFrame to the specified backend type, with optional time column casting.
 
@@ -203,35 +200,9 @@
     :param time_col: Optional; the name of the time column for casting.
     :param mode: The processing mode ('single_step' or 'multi_step').
     :raises TypeError: If input DataFrame type doesn't match the specified backend or conversion fails.
-    :raises NotImplementedError: If multi-step mode is requested for unsupported backends or unsupported conversion to Polars.
+    :raises NotImplementedError: If multi-step mode is requested for unsupported backends.
:return: The DataFrame converted to the specified backend type. - - Example - ------- - Here's how you would use this function to convert a Pandas DataFrame to Polars: - - .. code-block:: python - - import pandas as pd - import polars as pl - - data = {'col1': [1, 2], 'col2': [3, 4], 'time': pd.date_range(start='1/1/2023', periods=2)} - df = pd.DataFrame(data) - - # Convert the DataFrame from Pandas to Polars, with an optional time column for casting - converted_df = validate_and_convert_input(df, 'pl', time_col='time') - print(type(converted_df)) # Output: - - # If you don't need to cast the time column, just omit the time_col argument - converted_df = validate_and_convert_input(df, 'pl') - print(type(converted_df)) # Output: - - .. note:: - - This function first converts the input DataFrame into the appropriate backend. - - If `time_col` is specified and the backend is Polars, it casts the time column to `pl.Datetime`. - - Pandas to Polars conversion is currently unsupported and raises a `NotImplementedError`. This needs to be implemented later. """ - # Validate the backend and mode combination validate_backend(backend) validate_mode(backend, mode) @@ -240,12 +211,11 @@ def validate_and_convert_input( str, Dict[Type[SupportedBackendDataFrame], Callable[[SupportedBackendDataFrame], SupportedBackendDataFrame]] ] = { BACKEND_POLARS: { - # Polars to Polars pl.DataFrame: lambda x: x, - # Pandas to Polars - currently not supported - pd.DataFrame: lambda x: (_ for _ in ()).throw(NotImplementedError("Pandas to Polars conversion is not currently supported.")), - # Modin to Polars - mpd.DataFrame: lambda x: pl.from_pandas(x._to_pandas()), + pd.DataFrame: lambda x: pl.from_pandas(x), # Use polars.from_pandas for conversion + mpd.DataFrame: lambda x: pl.from_pandas( + x._to_pandas() if hasattr(x, "_to_pandas") else x + ), # Safely handle the Modin conversion }, BACKEND_PANDAS: { pd.DataFrame: lambda x: x, # Pandas to Pandas @@ -260,27 +230,20 @@ def validate_and_convert_input( } # Step 1: Convert the DataFrame to the desired backend - converted_df = None for dataframe_type, conversion_func in backend_conversion_map[backend].items(): if isinstance(df, dataframe_type): converted_df = conversion_func(df) break - - if converted_df is None: + else: raise TypeError(f"Input DataFrame type {type(df)} does not match the specified backend '{backend}'") # Step 2: Explicitly cast the time column to pl.Datetime if backend is Polars and the column exists if backend == BACKEND_POLARS and time_col and time_col in converted_df.columns: - # Force cast time_col to pl.Datetime converted_df = converted_df.with_columns(pl.col(time_col).cast(pl.Datetime)) - # Check the type of the column and assert it is correct - assert isinstance(converted_df[time_col][0], pl.Datetime), f"Expected a timestamp-like time column, but got {type(converted_df[time_col][0])}" - return converted_df - def get_api_keys() -> Dict[str, Optional[str]]: """Retrieve API keys from environment variables. @@ -332,8 +295,7 @@ def check_nulls(df: SupportedBackendDataFrame, backend: str) -> bool: elif backend == BACKEND_MODIN: return bool(cast(mpd.DataFrame, df).isnull().values.any()) - # Suppress the warning since this path is unreachable due to `validate_backend` - # mypy: ignore + raise UnsupportedBackendError(f"Unsupported backend: {backend}") def check_nans(df: SupportedBackendDataFrame, backend: str) -> bool: @@ -341,7 +303,7 @@ def check_nans(df: SupportedBackendDataFrame, backend: str) -> bool: :param df: The DataFrame to check for NaN values. 
:type df: SupportedBackendDataFrame - :param backend: The backend used for the DataFrame ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin'). + :param backend: The backend used for the DataFrame ('pl', 'pd', 'mpd'). :type backend: str :return: True if there are NaN values, False otherwise. :rtype: bool @@ -357,8 +319,7 @@ def check_nans(df: SupportedBackendDataFrame, backend: str) -> bool: elif backend == BACKEND_MODIN: return bool(cast(mpd.DataFrame, df).isna().values.any()) - # Suppress the warning since this path is unreachable due to `validate_backend` - # mypy: ignore + raise UnsupportedBackendError(f"Unsupported backend: {backend}") def is_timestamp_like(df: SupportedBackendDataFrame, time_col: str) -> bool: @@ -393,6 +354,8 @@ def is_timestamp_like(df: SupportedBackendDataFrame, time_col: str) -> bool: elif isinstance(df, pl.DataFrame): return time_column.dtype == pl.Datetime + raise UnsupportedBackendError(f"Unsupported DataFrame type: {type(df)}") + def is_numeric(df: SupportedBackendDataFrame, time_col: str) -> bool: """Check if the specified column in the DataFrame is numeric. @@ -412,15 +375,12 @@ def is_numeric(df: SupportedBackendDataFrame, time_col: str) -> bool: # Handle empty columns for different backends if isinstance(df, pl.DataFrame): - # Polars: Check if the DataFrame has zero rows or if the column is empty if df.height == 0 or time_column.is_empty(): return False elif isinstance(df, mpd.DataFrame): - # Modin: Check if the column is empty by using length if len(time_column) == 0: return False elif isinstance(df, pd.DataFrame): - # Pandas: Check if the column is empty if isinstance(time_column, pd.Series) and time_column.empty: return False @@ -430,6 +390,8 @@ def is_numeric(df: SupportedBackendDataFrame, time_col: str) -> bool: elif isinstance(df, pl.DataFrame): return time_column.dtype in [pl.Int32, pl.Int64, pl.Float32, pl.Float64] + raise UnsupportedBackendError(f"Unsupported DataFrame type: {type(df)}") + def has_mixed_frequencies(df: SupportedBackendDataFrame, time_col: str, min_non_null_values: int = 3) -> bool: """Check if the given time column in the DataFrame contains mixed frequencies. @@ -501,10 +463,8 @@ def sort_dataframe( :raises TypeError: If the DataFrame type does not match the backend. :raises UnsupportedBackendError: If the backend is unsupported or validation fails. """ - # Validate backend validate_backend(backend) - # Select backend-specific sorting logic if backend == BACKEND_POLARS: if not isinstance(df, pl.DataFrame): raise TypeError(f"Expected Polars DataFrame but got {type(df)}") @@ -522,6 +482,8 @@ def sort_dataframe( df.sort_values(by=time_col, ascending=ascending, inplace=True) return df + raise UnsupportedBackendError(f"Unsupported backend: {backend}") + def check_empty_columns(df: SupportedBackendDataFrame, backend: str) -> bool: """Check for empty columns in the DataFrame using the specified backend. diff --git a/src/temporalscope/core/exceptions.py b/src/temporalscope/core/exceptions.py index 3095e48..50085c4 100644 --- a/src/temporalscope/core/exceptions.py +++ b/src/temporalscope/core/exceptions.py @@ -39,14 +39,13 @@ -------------- .. 
code-block:: python - from temporalscope.core.exceptions import ( - TimeColumnError, MixedTypesWarning, MixedTimezonesWarning - ) + from temporalscope.core.exceptions import TimeColumnError, MixedTypesWarning, MixedTimezonesWarning + def validate_time_column(df): - if df['time'].dtype == object: + if df["time"].dtype == object: raise TimeColumnError("Invalid time column data type.") - elif contains_mixed_types(df['time']): + elif contains_mixed_types(df["time"]): warnings.warn("Mixed numeric and timestamp types.", MixedTypesWarning) """ @@ -64,7 +63,7 @@ class TimeFrameError(Exception): class TimeColumnError(TimeFrameError): - """ Exception raised for errors related to the `time_col`. + """Exception raised for errors related to the `time_col`. This error is raised when the `time_col` in the TimeFrame is either missing, contains unsupported types (non-numeric or non-timestamp), @@ -80,6 +79,7 @@ class TimeColumnError(TimeFrameError): if not pd.api.types.is_numeric_dtype(df[time_col]) and \ not pd.api.types.is_datetime64_any_dtype(df[time_col]): raise TimeColumnError("`time_col` must be numeric or timestamp-like.") + """ pass @@ -149,9 +149,15 @@ class UnsupportedBackendError(Exception): Attributes: backend (str): The invalid backend that caused the error. message (str): Explanation of the error. + """ - def __init__(self, backend: str, message: str = "Unsupported backend"): + def __init__(self, backend, message="Unsupported backend"): + """Initialize the UnsupportedBackendError. + + :param backend: The invalid backend (e.g., 'pl', 'pd', 'mpd') that caused the error. + :param message: Optional; a custom error message. Defaults to "Unsupported backend". + """ self.backend = backend self.message = f"{message}: {backend}. Supported backends are 'pd', 'mpd', 'pl'." super().__init__(self.message) diff --git a/src/temporalscope/core/temporal_core_processing.py b/src/temporalscope/core/temporal_core_processing.py index bfc2e6e..ab5c1bc 100644 --- a/src/temporalscope/core/temporal_core_processing.py +++ b/src/temporalscope/core/temporal_core_processing.py @@ -42,11 +42,9 @@ from temporal_core_processing import convert_to_tensorflow, convert_to_pandas # Example DataFrame - df = pd.DataFrame({ - 'time': pd.date_range(start='2023-01-01', periods=100, freq='D'), - 'feature_1': range(100), - 'target': range(100) - }) + df = pd.DataFrame( + {"time": pd.date_range(start="2023-01-01", periods=100, freq="D"), "feature_1": range(100), "target": range(100)} + ) # Convert DataFrame to TensorFlow Dataset tf_dataset = convert_to_tensorflow(df) @@ -55,18 +53,14 @@ df_back = convert_to_pandas(tf_dataset) """ -from typing import Union import pandas as pd -import polars as pl -import modin.pandas as mpd import tensorflow as tf from temporalscope.core.core_utils import SupportedBackendDataFrame def convert_to_tensorflow(df: SupportedBackendDataFrame) -> tf.data.Dataset: - """ - Stub: Convert a DataFrame to a TensorFlow Dataset. + """Stub: Convert a DataFrame to a TensorFlow Dataset. This function will convert Pandas, Modin, or Polars DataFrames into a TensorFlow Dataset to enable compatibility with deep learning frameworks like TensorFlow. @@ -78,8 +72,7 @@ def convert_to_tensorflow(df: SupportedBackendDataFrame) -> tf.data.Dataset: def convert_to_pandas(df: SupportedBackendDataFrame) -> pd.DataFrame: - """ - Stub: Convert a DataFrame or TensorFlow Dataset to a Pandas DataFrame. + """Stub: Convert a DataFrame or TensorFlow Dataset to a Pandas DataFrame. 
This function will handle converting Modin, Polars, or TensorFlow Datasets back to Pandas DataFrames to ensure interoperability across backends and downstream tasks. @@ -91,8 +84,7 @@ def convert_to_pandas(df: SupportedBackendDataFrame) -> pd.DataFrame: def handle_multi_step_conversion(df: pd.DataFrame, sequence_length: int) -> pd.DataFrame: - """ - Stub: Prepare DataFrame for multi-step forecasting. + """Stub: Prepare DataFrame for multi-step forecasting. This function will handle the preparation of multi-step targets by expanding the target column into sequences of the specified length, suitable for sequential models. diff --git a/src/temporalscope/core/temporal_data_loader.py b/src/temporalscope/core/temporal_data_loader.py index 31ded7d..ad7affa 100644 --- a/src/temporalscope/core/temporal_data_loader.py +++ b/src/temporalscope/core/temporal_data_loader.py @@ -109,37 +109,29 @@ """ -import warnings -from typing import Optional, Union, cast -from datetime import datetime, timedelta, date +from typing import Optional, Union import modin.pandas as mpd import pandas as pd import polars as pl -from temporalscope.core.exceptions import ( - TimeColumnError, - MixedTypesWarning, - MixedFrequencyWarning, - UnsupportedBackendError, -) - from temporalscope.core.core_utils import ( BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS, SupportedBackendDataFrame, - validate_and_convert_input, + check_empty_columns, + check_nulls, infer_backend_from_dataframe, - validate_backend, is_numeric, is_timestamp_like, - has_mixed_frequencies, sort_dataframe, - check_empty_columns, - check_nulls, + validate_and_convert_input, +) +from temporalscope.core.exceptions import ( + TimeColumnError, + UnsupportedBackendError, ) - # Define alias with forward reference TimeFrameCompatibleData = Union["TimeFrame", SupportedBackendDataFrame] @@ -180,11 +172,9 @@ class TimeFrame: .. code-block:: python import polars as pl - data = pl.DataFrame({ - 'time': pl.date_range(start='2021-01-01', periods=100, interval='1d'), - 'value': range(100) - }) - tf = TimeFrame(data, time_col='time', target_col='value') + + data = pl.DataFrame({"time": pl.date_range(start="2021-01-01", periods=100, interval="1d"), "value": range(100)}) + tf = TimeFrame(data, time_col="time", target_col="value") print(tf.get_data().head()) .. 
seealso:: @@ -242,15 +232,11 @@ def __init__( import polars as pl from temporalscope.core.temporal_data_loader import TimeFrame - data = pl.DataFrame({ - 'time': pl.date_range(start='2021-01-01', periods=5, interval='1d'), - 'value': range(5) - }) + data = pl.DataFrame({"time": pl.date_range(start="2021-01-01", periods=5, interval="1d"), "value": range(5)}) - tf = TimeFrame(data, time_col='time', target_col='value') + tf = TimeFrame(data, time_col="time", target_col="value") print(tf.get_data().head()) """ - # Ensure time_col and target_col are valid strings if not isinstance(time_col, str) or not time_col: raise ValueError("`time_col` must be a non-empty string.") @@ -345,14 +331,11 @@ def get_data(self) -> SupportedBackendDataFrame: import pandas as pd # Create a Pandas DataFrame - data = { - 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), - 'target': range(5, 0, -1) - } + data = {"time": pd.date_range(start="2021-01-01", periods=5, freq="D"), "target": range(5, 0, -1)} df = pd.DataFrame(data) # Initialize a TimeFrame - tf = TimeFrame(df, time_col='time', target_col='target') + tf = TimeFrame(df, time_col="time", target_col="target") # Retrieve the DataFrame data = tf.get_data() @@ -379,14 +362,11 @@ def sort_data(self, ascending: bool = True) -> None: import pandas as pd # Create a Pandas DataFrame - data = { - 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), - 'target': range(5, 0, -1) - } + data = {"time": pd.date_range(start="2021-01-01", periods=5, freq="D"), "target": range(5, 0, -1)} df = pd.DataFrame(data) # Initialize a TimeFrame - tf = TimeFrame(df, time_col='time', target_col='target') + tf = TimeFrame(df, time_col="time", target_col="target") # Sort the DataFrame in ascending order tf.sort_data(ascending=True) @@ -435,20 +415,16 @@ def update_data( import pandas as pd # Create a Pandas DataFrame - df = pd.DataFrame({ - 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), - 'target': range(5, 0, -1) - }) + df = pd.DataFrame({"time": pd.date_range(start="2021-01-01", periods=5, freq="D"), "target": range(5, 0, -1)}) # Initialize a TimeFrame - tf = TimeFrame(df, time_col='time', target_col='target') + tf = TimeFrame(df, time_col="time", target_col="target") # Update the DataFrame and target column - new_target = pd.Series([1, 2, 3, 4, 5], name='target') + new_target = pd.Series([1, 2, 3, 4, 5], name="target") tf.update_data(new_df=None, new_target_col=new_target) print(tf.get_data()) """ - # Update time_col and target_col if provided if time_col: self._time_col = time_col @@ -504,13 +480,10 @@ def validate_data(self) -> None: import pandas as pd # Create a Pandas DataFrame - df = pd.DataFrame({ - 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), - 'target': range(5, 0, -1) - }) + df = pd.DataFrame({"time": pd.date_range(start="2021-01-01", periods=5, freq="D"), "target": range(5, 0, -1)}) # Initialize a TimeFrame - tf = TimeFrame(df, time_col='time', target_col='target') + tf = TimeFrame(df, time_col="time", target_col="target") # Run validation on the TimeFrame tf.validate_data() @@ -534,4 +507,4 @@ def validate_data(self) -> None: # 4. 
Check for missing values in `time_col` and `target_col` if check_nulls(self.df, self._dataframe_backend): - raise ValueError(f"Missing values found in `time_col` or `target_col`.") + raise ValueError("Missing values found in `time_col` or `target_col`.") diff --git a/src/temporalscope/core/temporal_target_shifter.py b/src/temporalscope/core/temporal_target_shifter.py index a09be28..1019619 100644 --- a/src/temporalscope/core/temporal_target_shifter.py +++ b/src/temporalscope/core/temporal_target_shifter.py @@ -28,23 +28,6 @@ 3. Tang, Y., Song, Z., Zhu, Y., Yuan, H., Hou, M., Ji, J., Tang, C., & Li, J. (2022). A survey on machine learning models for financial time series forecasting. Neurocomputing, 512, 363-380. https://doi.org/10.1016/j.neucom.2022.09.078 """ -import warnings -from typing import Optional, Union, cast - -import modin.pandas as mpd -import pandas as pd -import polars as pl - -from temporalscope.core.core_utils import ( - BACKEND_MODIN, - BACKEND_PANDAS, - BACKEND_POLARS, - SupportedBackendDataFrame, - validate_backend, -) -from temporalscope.core.temporal_data_loader import TimeFrame -from temporalscope.core.temporal_data_loader import TimeFrameCompatibleData - # class TemporalTargetShifter: # """A class for shifting the target variable in time series data for machine learning or deep learning. diff --git a/src/temporalscope/datasets/datasets.py b/src/temporalscope/datasets/datasets.py index fad04e7..458b70b 100644 --- a/src/temporalscope/datasets/datasets.py +++ b/src/temporalscope/datasets/datasets.py @@ -26,7 +26,7 @@ Modin, and Polars. The class can be easily extended to include additional datasets in the future. Example: ---------- +------- .. code-block:: python from temporalscope.datasets.datasets import DatasetLoader @@ -46,19 +46,20 @@ """ -import pandas as pd +from typing import Callable, Dict, Tuple, Union + import modin.pandas as mpd +import pandas as pd import polars as pl from statsmodels.datasets import macrodata -from typing import Tuple, Dict, Callable, Union -from temporalscope.core.temporal_data_loader import TimeFrame + from temporalscope.core.core_utils import ( - BACKEND_PANDAS, BACKEND_MODIN, + BACKEND_PANDAS, BACKEND_POLARS, - SupportedBackendDataFrame, print_divider, ) +from temporalscope.core.temporal_data_loader import TimeFrame def _load_macrodata() -> Tuple[pd.DataFrame, str]: @@ -106,17 +107,17 @@ class DatasetLoader: and demonstration of time series forecasting workflows. Attributes: - ------------ + ---------- dataset_name : str The name of the dataset to be loaded. It must be available in the `AVAILABLE_DATASETS` dictionary. Methods: - --------- + ------- load_and_init_timeframes: Load the specified dataset and initialize TimeFrame objects for multiple backends. Example: - --------- + ------- .. code-block:: python # Initialize the loader with the 'macrodata' dataset @@ -132,8 +133,7 @@ class DatasetLoader: """ def __init__(self, dataset_name: str = "macrodata") -> None: - """ - Initialize DatasetLoader with a specified dataset. + """Initialize DatasetLoader with a specified dataset. :param dataset_name: The name of the dataset to load. Must be available in AVAILABLE_DATASETS. :raises ValueError: If the specified dataset is not available. @@ -145,8 +145,7 @@ def __init__(self, dataset_name: str = "macrodata") -> None: self.dataset_name = dataset_name def _load_dataset_and_target(self) -> Tuple[pd.DataFrame, str]: - """ - Internal method to load the dataset and its associated target column. 
+ """Internal method to load the dataset and its associated target column. :return: A tuple containing the preprocessed DataFrame and the associated target column name. :rtype: Tuple[pd.DataFrame, str] @@ -181,7 +180,7 @@ def init_timeframes_for_backends( :raises ValueError: If an unsupported backend is specified. Example: - --------- + ------- .. code-block:: python from temporalscope.datasets.datasets import DatasetLoader @@ -225,7 +224,7 @@ def load_and_init_timeframes( :rtype: Dict[str, TimeFrame] Example: - --------- + ------- .. code-block:: python dataset_loader = DatasetLoader(dataset_name="macrodata") diff --git a/src/temporalscope/datasets/synthetic_data_generator.py b/src/temporalscope/datasets/synthetic_data_generator.py index 9f43a1f..8fe2664 100644 --- a/src/temporalscope/datasets/synthetic_data_generator.py +++ b/src/temporalscope/datasets/synthetic_data_generator.py @@ -85,37 +85,30 @@ .. code-block:: python # Generating data for single-step mode - df = create_sample_data(num_samples=100, num_features=3, mode='single_step') + df = create_sample_data(num_samples=100, num_features=3, mode="single_step") print(df.head()) # Shows the generated data with features and a scalar target. # Generating data for multi-step mode - df = create_sample_data(num_samples=100, num_features=3, mode='multi_step') + df = create_sample_data(num_samples=100, num_features=3, mode="multi_step") print(df.head()) # Shows the generated input sequence (`X`) and target sequence (`Y`). """ -import numpy as np -from datetime import datetime -from typing import Any, Optional, Tuple, Callable -import pytest +from typing import Any, Callable, Optional, Tuple +import numpy as np import pandas as pd -import polars as pl -import modin.pandas as mpd - +import pytest from temporalscope.core.core_utils import ( BACKEND_PANDAS, - BACKEND_MODIN, - BACKEND_POLARS, - MODE_SINGLE_STEP, MODE_MULTI_STEP, + MODE_SINGLE_STEP, SUPPORTED_MULTI_STEP_BACKENDS, + SupportedBackendDataFrame, + validate_and_convert_input, validate_backend, - validate_mode + validate_mode, ) -from temporalscope.core.exceptions import UnsupportedBackendError -from temporalscope.core.core_utils import SupportedBackendDataFrame -from temporalscope.core.core_utils import validate_backend, validate_and_convert_input, BACKEND_MODIN, BACKEND_POLARS # Constants DEFAULT_NUM_SAMPLES = 100 @@ -125,12 +118,7 @@ DEFAULT_NULL_INTERVAL = 15 # Default interval for inserting nulls -import numpy as np -from datetime import datetime -import pandas as pd -from temporalscope.core.core_utils import validate_and_convert_input - -def create_sample_data( +def create_sample_data( # noqa: PLR0912 backend: str, num_samples: int = DEFAULT_NUM_SAMPLES, num_features: int = DEFAULT_NUM_FEATURES, @@ -274,7 +262,7 @@ def create_sample_data( if mode == MODE_SINGLE_STEP: data["target"] = np.random.rand(num_samples) elif mode == MODE_MULTI_STEP: - data["target"] = [np.random.rand(10) for _ in range(num_samples)] + data["target"] = np.array([np.random.rand(10) for _ in range(num_samples)]) else: raise ValueError(f"Unsupported mode: {mode}") @@ -289,7 +277,7 @@ def create_sample_data( @pytest.fixture -def sample_df_with_conditions() -> Callable[[Optional[str], Any], Tuple[SupportedBackendDataFrame, str]]: +def sample_df_with_conditions() -> Callable[..., Tuple[SupportedBackendDataFrame, str]]: """Pytest fixture for creating DataFrames for each backend (Pandas, Modin, Polars) with customizable conditions. 
This function generates synthetic data using Pandas and leaves the conversion to the backend @@ -297,7 +285,7 @@ def sample_df_with_conditions() -> Callable[[Optional[str], Any], Tuple[Supporte :return: A function that generates a DataFrame and the backend type based on user-specified conditions. - :rtype: Callable[[Optional[str], Any], Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]] + :rtype: Callable[..., Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]] .. example:: @@ -326,6 +314,9 @@ def _create_sample_df(backend: Optional[str] = None, **kwargs: Any) -> Tuple[Sup A tuple containing the generated DataFrame and the backend type. :rtype: Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str] """ + # Assign a default backend if none is provided + backend = backend or BACKEND_PANDAS + # Generate the sample data using Pandas df = create_sample_data(backend=BACKEND_PANDAS, **kwargs) diff --git a/src/temporalscope/partition/padding.py b/src/temporalscope/partition/padding.py index be0b943..6d87fe0 100644 --- a/src/temporalscope/partition/padding.py +++ b/src/temporalscope/partition/padding.py @@ -42,8 +42,8 @@ partitioning or padding utilities. This module focuses only on numerical and time columns. The only special handling occurs for the `time_col` (if specified), which can be a timestamp or a numeric column. -Examples: ---------- +Examples +-------- .. code-block:: python >>> import pandas as pd @@ -62,13 +62,16 @@ .. seealso:: 1. Dwarampudi, M. and Reddy, N.V., 2019. Effects of padding on LSTMs and CNNs. arXiv preprint arXiv:1903.07288. 2. Lafabregue, B., Weber, J., Gançarski, P. and Forestier, G., 2022. End-to-end deep representation learning for time series clustering: a comparative study. Data Mining and Knowledge Discovery, 36(1), pp.29-81. + """ import warnings -from typing import Union, Optional, cast -import pandas as pd +from typing import Optional, Union + import modin.pandas as mpd +import pandas as pd import polars as pl + from temporalscope.core.core_utils import SupportedBackendDataFrame # Define numeric types for each backend @@ -144,7 +147,7 @@ def sort_dataframe(df: SupportedBackendDataFrame, time_col: str, ascending: bool raise TypeError(f"Unsupported DataFrame type: {type(df)}") -def ensure_type_consistency( +def ensure_type_consistency( # noqa: PLR0912 df: SupportedBackendDataFrame, pad_df: SupportedBackendDataFrame ) -> SupportedBackendDataFrame: """Ensure the column types of `pad_df` match the column types of `df`. @@ -166,16 +169,10 @@ def ensure_type_consistency( from temporalscope.partition.padding import ensure_type_consistency # Original DataFrame - df = pd.DataFrame({ - "a": pd.Series([1.0, 2.0], dtype="float32"), - "b": pd.Series([3, 4], dtype="int64") - }) + df = pd.DataFrame({"a": pd.Series([1.0, 2.0], dtype="float32"), "b": pd.Series([3, 4], dtype="int64")}) # Padded DataFrame - pad_df = pd.DataFrame({ - "a": [0.0, 0.0], - "b": [0, 0] - }) + pad_df = pd.DataFrame({"a": [0.0, 0.0], "b": [0, 0]}) # Ensure type consistency between df and pad_df pad_df = ensure_type_consistency(df, pad_df) @@ -188,8 +185,8 @@ def ensure_type_consistency( - We convert Modin DataFrames to Pandas temporarily to ensure type consistency because Modin’s internal `astype()` can sometimes cause issues when working with mixed data types or `bool` columns. After consistency is ensured, we convert the DataFrame back to Modin to maintain backend consistency. 
- """ + """ # If df is a Modin DataFrame, convert to Pandas if possible is_modin_df = False if isinstance(df, mpd.DataFrame): @@ -228,7 +225,7 @@ def ensure_type_consistency( raise TypeError(f"Unsupported DataFrame type: {type(df)}") -def zero_pad( +def zero_pad( # noqa: PLR0911, PLR0912 df: SupportedBackendDataFrame, target_len: int, time_col: Optional[str] = None, @@ -335,7 +332,7 @@ def zero_pad( return df -def forward_fill_pad( +def forward_fill_pad( # noqa: PLR0911, PLR0912 df: SupportedBackendDataFrame, target_len: int, end: int, @@ -375,8 +372,8 @@ def forward_fill_pad( .. note:: Forward-fill padding is useful in scenarios where missing data is best approximated by the last known valid value, such as financial data or sensor readings in IoT applications. - """ + """ # Validate the padding option if padding not in ["pre", "post"]: raise ValueError(f"Invalid padding option: {padding}. Use 'pre' or 'post'.") @@ -436,7 +433,7 @@ def forward_fill_pad( return df -def backward_fill_pad( +def backward_fill_pad( # noqa: PLR0912 df: SupportedBackendDataFrame, target_len: int, end: int, @@ -476,6 +473,7 @@ def backward_fill_pad( .. note:: Backward-fill padding is often applied when future values are unknown and it's reasonable to assume that the first valid observation represents future unknowns, which is useful in cases like predictive modeling. + """ validate_dataframe(df) @@ -532,10 +530,10 @@ def backward_fill_pad( raise ValueError(f"Invalid padding option: {padding}. Use 'pre' or 'post'.") # This line ensures that MyPy sees a return in all cases, although it's unreachable. - assert False, "This should never be reached" + raise RuntimeError("This should never be reached") -def mean_fill_pad( +def mean_fill_pad( # noqa: PLR0912 df: SupportedBackendDataFrame, target_len: int, end: int, @@ -575,6 +573,7 @@ def mean_fill_pad( .. note:: Mean-fill padding is useful when you want to fill gaps in the data with the mean of the numeric columns. It is commonly used in time-series forecasting and analytics when you want to smooth over missing values. + """ validate_dataframe(df) @@ -654,7 +653,7 @@ def mean_fill_pad( raise TypeError(f"Unsupported DataFrame type: {type(df)}") # This return statement satisfies MyPy's expectation, but should not actually be reachable. 
- assert False, "This should never be reached" + raise RuntimeError("This should never be reached") def pad_dataframe( @@ -703,6 +702,7 @@ def pad_dataframe( 1 2.0 4.0 2021-01-02 2 1.5 3.5 NaT 3 1.5 3.5 NaT + """ validate_dataframe(df) diff --git a/src/temporalscope/partition/partition_validators.py b/src/temporalscope/partition/partition_validators.py index fca36de..2f6abb1 100644 --- a/src/temporalscope/partition/partition_validators.py +++ b/src/temporalscope/partition/partition_validators.py @@ -33,8 +33,7 @@ import pandas as pd import polars as pl -from temporalscope.core.core_utils import validate_backend -from temporalscope.core.core_utils import SupportedBackendDataFrame +from temporalscope.core.core_utils import SupportedBackendDataFrame, validate_backend PandasLike = TypeVar("PandasLike", pd.DataFrame, mpd.DataFrame) diff --git a/src/temporalscope/partition/sliding_window.py b/src/temporalscope/partition/sliding_window.py index 1da5be9..35697d1 100644 --- a/src/temporalscope/partition/sliding_window.py +++ b/src/temporalscope/partition/sliding_window.py @@ -60,15 +60,13 @@ ) from temporalscope.core.temporal_data_loader import TimeFrame from temporalscope.partition.base_protocol import TemporalPartitionerProtocol +from temporalscope.partition.padding import PAD_SCHEMES, backward_fill_pad, forward_fill_pad, mean_fill_pad, zero_pad from temporalscope.partition.partition_validators import ( check_class_balance, check_feature_to_sample_ratio, check_sample_size, ) -from temporalscope.partition.padding import PAD_SCHEMES -from temporalscope.partition.padding import zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad - # Precision constant for floating-point comparisons PRECISION = 1e-6 @@ -236,7 +234,6 @@ def __init__( :param pad_scheme: The padding scheme to use for filling partitions. Defaults to 'forward_fill'. :raises ValueError: If input parameters are invalid or columns (except `time_col`) are not numeric. 
""" - # Validate the backend and pad scheme validate_backend(tf.dataframe_backend) if pad_scheme not in PAD_SCHEMES: @@ -286,7 +283,7 @@ def __init__( self.pad_scheme = pad_scheme # Assign the chosen padding scheme # Precompute percentages - self.train_pct, self.test_pct, self.val_pct = self._precompute_percentages(train_pct, test_pct, val_pct) + self.train_pct, self.test_pct, self.val_pct = self.precompute_percentages(train_pct, test_pct, val_pct) # Sort the data using TimeFrame's sort_data method self.tf.sort_data(ascending=True) @@ -295,7 +292,7 @@ def __init__( self._fit_executed = False self._transform_executed = False - def _precompute_percentages( + def precompute_percentages( self, train_pct: float, test_pct: Optional[float], @@ -322,7 +319,6 @@ def _precompute_percentages( # Validate the train percentage if not (0 <= train_pct <= 1): raise ValueError("train_pct must be between 0 and 1.") - # Handle test_pct and val_pct cases explicitly if test_pct is None and val_pct is None: test_pct = 1.0 - train_pct @@ -337,18 +333,16 @@ def _precompute_percentages( test_pct = 1.0 - train_pct - val_pct else: # Both test_pct and val_pct are provided, ensure they are valid - if not (0 <= test_pct <= 1): + if test_pct is not None and not (0 <= test_pct <= 1): raise ValueError("test_pct must be between 0 and 1.") - if not (0 <= val_pct <= 1): + if val_pct is not None and not (0 <= val_pct <= 1): raise ValueError("val_pct must be between 0 and 1.") - # Ensure they sum to 1.0, handling floating-point imprecision with precision constant total_pct = train_pct + (test_pct or 0) + (val_pct or 0) if not (abs(total_pct - 1.0) < precision): # Compare with the precision constant raise ValueError("Train, test, and validation percentages must sum to 1.0.") - # Ensure test_pct and val_pct are float types, not None - return train_pct, float(test_pct), float(val_pct) + return train_pct, float(test_pct or 0), float(val_pct or 0) def _fit_pandas_modin(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: """Fit method for partitioning using TimeFrame data. @@ -474,10 +468,15 @@ def _transform_pandas_modin(self) -> Iterator[Dict[str, Dict[str, Union[pd.DataF - Ensure that the input DataFrame is not empty to avoid runtime errors. - For very large datasets, the padding process may increase memory usage. Consider using Modin when handling large datasets to take advantage of distributed processing. + """ partition_count = 1 df = self.tf.get_data() # Fetch the data from TimeFrame + # Add a type check to ensure df is a DataFrame + if not isinstance(df, (pd.DataFrame, mpd.DataFrame)): + raise TypeError("Expected df to be a pandas or modin DataFrame") + for partition in self.fit(): # Partition indices generated by fit() partitioned_data = {} @@ -565,6 +564,7 @@ def _transform_polars(self) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: - Polars DataFrames offer better performance with large datasets, especially for complex operations. - For very large datasets, Polars DataFrames are recommended due to their lower memory footprint and faster performance when compared to Pandas. Use Polars for more efficient partitioning and transformations. + """ partition_count = 1 df = self.tf.get_data() # Fetch the data from TimeFrame @@ -655,13 +655,11 @@ def fit(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: .. seealso:: - :meth:`transform`: For generating the actual data slices corresponding to these indices. 
""" - df = self.tf.get_data() # Get the dataset from the TimeFrame - # Call backend-specific partitioning method if self.tf.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: - return self._fit_pandas_modin(df) + return self._fit_pandas_modin() # type: ignore[call-arg] elif self.tf.dataframe_backend == BACKEND_POLARS: - return self._fit_polars(df) + return self._fit_polars() # type: ignore[call-arg] else: raise ValueError(f"Unsupported backend: {self.tf.dataframe_backend}") @@ -711,9 +709,9 @@ def transform(self) -> Iterator[Dict[str, Dict[str, SupportedBackendDataFrame]]] # Call backend-specific transformation method if self.tf.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: - return self._transform_pandas_modin(df) + return self._transform_pandas_modin(df) # type: ignore[call-arg] elif self.tf.dataframe_backend == BACKEND_POLARS: - return self._transform_polars(df) + return self._transform_polars(df) # type: ignore[call-arg] else: raise ValueError(f"Unsupported backend: {self.tf.dataframe_backend}") @@ -741,7 +739,7 @@ def fit_transform(self) -> Iterator[Dict[str, Dict[str, SupportedBackendDataFram Each yielded partition has the following structure: .. code-block:: python - + ` { 'partition_1': { 'full': , diff --git a/test/unit/core/test_core_utils.py b/test/unit/core/test_core_utils.py index c67821a..02af4ba 100644 --- a/test/unit/core/test_core_utils.py +++ b/test/unit/core/test_core_utils.py @@ -17,38 +17,12 @@ # TemporalScope/test/unit/test_core_utils.py -import warnings -import pytest -from unittest.mock import patch -from typing import Optional, Tuple, Union - -import modin.pandas as mpd -import pandas as pd -import polars as pl -import numpy as np # Import core utility functions -from temporalscope.core.core_utils import ( - check_nans, - check_nulls, - get_api_keys, - get_default_backend_cfg, - validate_and_convert_input, - validate_backend, - print_divider, - infer_backend_from_dataframe, - is_timestamp_like, - is_numeric, - has_mixed_frequencies, - sort_dataframe, - check_empty_columns -) # Import exceptions -from temporalscope.core.exceptions import UnsupportedBackendError, MixedFrequencyWarning, MixedTimezonesWarning # Import the sample data generation and fixture from test_data_utils -from temporalscope.datasets.synthetic_data_generator import create_sample_data, sample_df_with_conditions # # Constants # BACKEND_PANDAS = "pd" @@ -233,7 +207,6 @@ # mock_to_pandas.assert_called_once() # Ensure _to_pandas is called - # @pytest.mark.parametrize( # "input_df, expected_backend", # [ diff --git a/test/unit/core/test_exceptions.py b/test/unit/core/test_exceptions.py index ffa5f97..79d292f 100644 --- a/test/unit/core/test_exceptions.py +++ b/test/unit/core/test_exceptions.py @@ -15,29 +15,33 @@ # specific language governing permissions and limitations # under the License. -""" TemporalScope/test/unit/test_core_exceptions.py +"""TemporalScope/test/unit/test_core_exceptions.py This module contains unit tests for the custom exceptions and warnings defined in the TemporalScope package. These tests ensure that the exceptions are raised correctly and the warnings are issued in the appropriate scenarios. 
""" -import pytest import warnings +import pytest + from temporalscope.core.exceptions import ( - TimeFrameError, - TimeColumnError, - MixedTypesWarning, - MixedTimezonesWarning, MixedFrequencyWarning, - UnsupportedBackendError + MixedTimezonesWarning, + MixedTypesWarning, + TimeColumnError, + TimeFrameError, + UnsupportedBackendError, ) + def test_unsupported_backend_error(): """Test that UnsupportedBackendError is raised with the correct message.""" with pytest.raises(UnsupportedBackendError, match="Unsupported backend"): raise UnsupportedBackendError("Unsupported backend 'invalid_backend'") + + def test_time_frame_error_inheritance(): """Test that TimeFrameError is the base class for other exceptions.""" with pytest.raises(TimeFrameError): @@ -66,9 +70,3 @@ def test_mixed_frequency_warning(): """Test that MixedFrequencyWarning is issued when mixed timestamp frequencies are detected.""" with pytest.warns(MixedFrequencyWarning, match="Mixed timestamp frequencies"): warnings.warn("Mixed timestamp frequencies", MixedFrequencyWarning) - - -def test_unsupported_backend_error(): - """Test that UnsupportedBackendError is raised with the correct message.""" - with pytest.raises(UnsupportedBackendError, match="Unsupported backend"): - raise UnsupportedBackendError("Unsupported backend 'invalid_backend'") diff --git a/test/unit/core/test_temporal_data_loader.py b/test/unit/core/test_temporal_data_loader.py index 083328a..83cfb0f 100644 --- a/test/unit/core/test_temporal_data_loader.py +++ b/test/unit/core/test_temporal_data_loader.py @@ -16,23 +16,6 @@ # TemporalScope/test/unit/test_core_temporal_data_loader.py -import pytest -from typing import Dict, Union, Optional, List -from datetime import datetime, timedelta, timezone - -import numpy as np -import pandas as pd -import polars as pl -import modin.pandas as mpd - -from temporalscope.core.temporal_data_loader import TimeFrame - -from temporalscope.core.exceptions import ( - TimeColumnError, - MixedTypesWarning, - MixedFrequencyWarning, - UnsupportedBackendError, -) BACKEND_POLARS = "pl" BACKEND_PANDAS = "pd" @@ -229,7 +212,6 @@ # assert sorted_df["time"].iloc[0] > sorted_df["time"].iloc[-1] - # @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_POLARS, BACKEND_MODIN]) # def test_update_target_col(backend): # """Test `update_target_col` method across backends by updating the target column.""" diff --git a/test/unit/datasets/test_datasets.py b/test/unit/datasets/test_datasets.py index 4703c12..e70db42 100644 --- a/test/unit/datasets/test_datasets.py +++ b/test/unit/datasets/test_datasets.py @@ -15,13 +15,13 @@ # specific language governing permissions and limitations # under the License. 
+import pandas as pd import pytest -from temporalscope.datasets.datasets import DatasetLoader + +from temporalscope.core.core_utils import BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS from temporalscope.core.temporal_data_loader import TimeFrame -from temporalscope.core.core_utils import BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS -import pandas as pd -import modin.pandas as mpd -import polars as pl +from temporalscope.datasets.datasets import DatasetLoader + @pytest.fixture def dataset_loader(): @@ -96,12 +96,11 @@ def test_load_dataset_and_verify_time_column(dataset_loader): assert "ds" in df.columns assert pd.api.types.is_datetime64_any_dtype(df["ds"]) -@pytest.mark.parametrize("backends", [ - (BACKEND_PANDAS,), - (BACKEND_MODIN,), - (BACKEND_POLARS,), - (BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS) -]) + +@pytest.mark.parametrize( + "backends", + [(BACKEND_PANDAS,), (BACKEND_MODIN,), (BACKEND_POLARS,), (BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS)], +) def test_load_and_init_timeframes_return(dataset_loader, backends): """Test that the returned timeframes object is a dictionary and contains the expected backends.""" timeframes = dataset_loader.load_and_init_timeframes(backends=backends) diff --git a/test/unit/datasets/test_synthetic_data_generator.py b/test/unit/datasets/test_synthetic_data_generator.py index 1d19e40..4aaf826 100644 --- a/test/unit/datasets/test_synthetic_data_generator.py +++ b/test/unit/datasets/test_synthetic_data_generator.py @@ -17,35 +17,43 @@ # TemporalScope/test/unit/datasets/test_synthetic_data_generator.py -import pytest +import numpy as np import pandas as pd import polars as pl -import modin.pandas as mpd -import numpy as np +import pytest + from temporalscope.datasets.synthetic_data_generator import ( - create_sample_data, - BACKEND_PANDAS, BACKEND_MODIN, + BACKEND_PANDAS, BACKEND_POLARS, - MODE_SINGLE_STEP, MODE_MULTI_STEP, + MODE_SINGLE_STEP, + create_sample_data, ) + # Skip unsupported backends for multi-step mode and Pandas-to-Polars conversion -@pytest.mark.parametrize("num_samples, num_features, mode", [ - (100, 3, MODE_SINGLE_STEP), # Single-step mode - pytest.param(100, 3, MODE_MULTI_STEP, marks=pytest.mark.xfail(reason="Unsupported multi-step mode for Modin and Polars")), - (0, 0, MODE_SINGLE_STEP), # Zero samples and features - (1000, 10, MODE_SINGLE_STEP) # Large data -]) -@pytest.mark.parametrize("backend", [ - BACKEND_PANDAS, - BACKEND_MODIN, - pytest.param(BACKEND_POLARS, marks=pytest.mark.xfail(reason="Pandas to Polars conversion not supported")) -]) +@pytest.mark.parametrize( + "num_samples, num_features, mode", + [ + (100, 3, MODE_SINGLE_STEP), # Single-step mode + pytest.param( + 100, 3, MODE_MULTI_STEP, marks=pytest.mark.xfail(reason="Unsupported multi-step mode for Modin and Polars") + ), + (0, 0, MODE_SINGLE_STEP), # Zero samples and features + (1000, 10, MODE_SINGLE_STEP), # Large data + ], +) +@pytest.mark.parametrize( + "backend", + [ + BACKEND_PANDAS, + BACKEND_MODIN, + pytest.param(BACKEND_POLARS, marks=pytest.mark.xfail(reason="Pandas to Polars conversion not supported")), + ], +) def test_create_sample_data_basic(num_samples, num_features, mode, backend): """Test that data generation works for both single-step and multi-step modes.""" - # Generate synthetic data df = create_sample_data(backend=backend, num_samples=num_samples, num_features=num_features, mode=mode) @@ -67,21 +75,28 @@ def test_create_sample_data_basic(num_samples, num_features, mode, backend): # Check if target is vector for multi-step mode if mode == 
MODE_MULTI_STEP: - assert isinstance(df["target"][0], (list, np.ndarray)), "Multi-step mode should generate vectorized target values." + assert isinstance( + df["target"][0], (list, np.ndarray) + ), "Multi-step mode should generate vectorized target values." -@pytest.mark.parametrize("timestamp_like, numeric, mixed_frequencies, mixed_timezones", [ - (True, False, False, False), # Timestamp-like time column - (False, True, False, False), # Numeric time column -]) -@pytest.mark.parametrize("backend", [ - BACKEND_PANDAS, - BACKEND_MODIN, - pytest.param(BACKEND_POLARS, marks=pytest.mark.xfail(reason="Pandas to Polars conversion not supported")) -]) +@pytest.mark.parametrize( + "timestamp_like, numeric, mixed_frequencies, mixed_timezones", + [ + (True, False, False, False), # Timestamp-like time column + (False, True, False, False), # Numeric time column + ], +) +@pytest.mark.parametrize( + "backend", + [ + BACKEND_PANDAS, + BACKEND_MODIN, + pytest.param(BACKEND_POLARS, marks=pytest.mark.xfail(reason="Pandas to Polars conversion not supported")), + ], +) def test_time_column_generation(timestamp_like, numeric, mixed_frequencies, mixed_timezones, backend): """Test that time columns are generated with the correct type and properties.""" - num_samples, num_features = 100, 3 df = create_sample_data( backend=backend, @@ -90,7 +105,7 @@ def test_time_column_generation(timestamp_like, numeric, mixed_frequencies, mixe timestamp_like=timestamp_like, numeric=numeric, mixed_frequencies=mixed_frequencies, - mixed_timezones=mixed_timezones + mixed_timezones=mixed_timezones, ) # Validate the type of the time column based on configuration @@ -98,10 +113,10 @@ def test_time_column_generation(timestamp_like, numeric, mixed_frequencies, mixe if backend == BACKEND_POLARS: assert isinstance(df["time"][0], pl.datatypes.Datetime), "Expected a timestamp-like time column" else: - assert isinstance(df['time'].iloc[0], pd.Timestamp), "Expected a timestamp-like time column" + assert isinstance(df["time"].iloc[0], pd.Timestamp), "Expected a timestamp-like time column" if numeric: if backend == BACKEND_POLARS: assert isinstance(df["time"][0], float), "Expected a numeric time column" else: - assert isinstance(df['time'].iloc[0], np.float64), "Expected a numeric time column" + assert isinstance(df["time"].iloc[0], np.float64), "Expected a numeric time column" diff --git a/test/unit/partition/test_partition_padding.py b/test/unit/partition/test_partition_padding.py index eccecf3..94c63b2 100644 --- a/test/unit/partition/test_partition_padding.py +++ b/test/unit/partition/test_partition_padding.py @@ -43,7 +43,6 @@ # np.random.seed(42) # Set a seed for reproducibility - # def generate_test_data(backend, num_samples=5): # """Generate test data with consistent column names across all backends.""" # start_date = pd.to_datetime("2021-01-01") @@ -116,7 +115,6 @@ # return pd.DataFrame(data) - # def check_monotonicity(df: SupportedBackendDataFrame, time_col: str, ascending: bool = True) -> bool: # if isinstance(df, pl.DataFrame): # # Handle Polars DataFrame @@ -141,7 +139,6 @@ # return diffs.lt(0).all() - # # Parametrize tests for ascending and descending order # @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) # @pytest.mark.parametrize("ascending", [True, False]) @@ -190,9 +187,6 @@ # assert check_monotonicity(sorted_df, "ds", ascending=True) - - - # # Padding function tests with Modin and Polars compatibility # @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) 
# @pytest.mark.parametrize("padding_func", [zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad])