From 7a93638816060907c566527a2b4ff8a4f406b9de Mon Sep 17 00:00:00 2001 From: Philip Ndikum Date: Wed, 18 Sep 2024 01:08:32 +0000 Subject: [PATCH 1/7] refactor: Refactoring API design to utilize generator/protocol pattern and core TimeFrame will utilize Universal Time Series model assumptions - assuming end-user will pre-filter or transform data without grouping from TemporalScope Other time series ML/AI time series packages do not explicitly state modelling assumptions and nomenclature - API design allows for flexibility but enforces best-practice standards --- ..._LITERATURE.md => SCIENTIFIC_LITERATURE.md | 0 pyproject.toml | 20 +- src/temporalscope/conf.py | 97 --- src/temporalscope/core/core_utils.py | 201 +++++++ .../core/temporal_data_loader.py | 386 ++++++------ .../core/temporal_model_trainer.py | 113 ---- .../core/temporal_target_shifter.py | 258 ++++++++ src/temporalscope/core/utils.py | 93 --- .../modeling/temporal_model_trainer.py | 97 +++ src/temporalscope/partition/base.py | 179 ------ src/temporalscope/partition/base_protocol.py | 168 ++++++ ...data_checks.py => partition_validators.py} | 331 ++++------- src/temporalscope/partition/sliding_window.py | 555 +++++++++++++----- test/unit/test_core_conf.py | 104 ---- test/unit/test_core_temporal_data_loader.py | 299 ++++++---- .../unit/test_core_temporal_target_shifter.py | 123 ++++ test/unit/test_core_utils.py | 139 ++++- test/unit/test_partion_data_checks.py | 33 +- .../speed_test_generators.ipynb | 116 ++++ 19 files changed, 2007 insertions(+), 1305 deletions(-) rename .github/SCIENTIFIC_LITERATURE.md => SCIENTIFIC_LITERATURE.md (100%) delete mode 100644 src/temporalscope/conf.py create mode 100644 src/temporalscope/core/core_utils.py delete mode 100644 src/temporalscope/core/temporal_model_trainer.py create mode 100644 src/temporalscope/core/temporal_target_shifter.py delete mode 100644 src/temporalscope/core/utils.py create mode 100644 src/temporalscope/modeling/temporal_model_trainer.py delete mode 100644 src/temporalscope/partition/base.py create mode 100644 src/temporalscope/partition/base_protocol.py rename src/temporalscope/partition/{data_checks.py => partition_validators.py} (51%) delete mode 100644 test/unit/test_core_conf.py create mode 100644 test/unit/test_core_temporal_target_shifter.py create mode 100644 tutorial_notebooks/speed_test_generators.ipynb diff --git a/.github/SCIENTIFIC_LITERATURE.md b/SCIENTIFIC_LITERATURE.md similarity index 100% rename from .github/SCIENTIFIC_LITERATURE.md rename to SCIENTIFIC_LITERATURE.md diff --git a/pyproject.toml b/pyproject.toml index 0507900..3db6045 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,11 @@ dependencies = [ "jupyterlab", "notebook", "commitizen==3.29.0", + "mypy", # Include dependencies for QA scripts + "bandit", # Include dependencies for QA scripts + "black", # Include dependencies for QA scripts + "pytest", # Include pytest for running tests + "pytest-cov" # Include pytest-cov for coverage if needed ] [tool.hatch.envs.docs] @@ -154,7 +159,7 @@ ignore = [ "docs/conf.py" = ["A001", "D103"] [tool.mypy] -files = "temporalscope" +files = "src/temporalscope" python_version = "3.10" ignore_missing_imports = true warn_unreachable = true @@ -178,6 +183,19 @@ check = "ruff check {args}" fix = "ruff check --fix" format = "ruff format {args}" format-check = "ruff format --check {args}" +# Automated developer Q&A script +quality-assurance = """ +pytest && +black src/temporalscope && +ruff check src/temporalscope --output-format=full --show-files --show-fixes 
&& +mypy src/temporalscope --ignore-missing-imports --show-error-codes --warn-unreachable && +bandit -r src/temporalscope +""" +generate-kernel = """ +python -m ipykernel install --user --name temporalscope-kernel --display-name "TemporalScope" +echo "Jupyter kernel 'TemporalScope' created. You can now use it in Jupyter notebooks." +""" + [tool.commitizen] version = "0.1.0" diff --git a/src/temporalscope/conf.py b/src/temporalscope/conf.py deleted file mode 100644 index b64f86a..0000000 --- a/src/temporalscope/conf.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Package-level configurations and utilities.""" - -import os - -import modin.pandas as mpd -import pandas as pd -import polars as pl -from dotenv import load_dotenv - -# Load environment variables from the .env file -load_dotenv() - -# Supported backend configuration -TF_DEFAULT_CFG = { - "BACKENDS": {"pl": "polars", "pd": "pandas", "mpd": "modin"}, -} - - -def get_default_backend_cfg() -> dict[str, dict[str, str]]: - """Get the default backend configuration. - - :return: The default backend configuration. - :rtype: dict[str, dict[str, str]] - """ - return TF_DEFAULT_CFG.copy() - - -def validate_backend(backend: str) -> None: - """Validate the backend against the supported backends in the configuration. - - :param backend: The backend to validate ('pl' for Polars, 'pd' for Pandas, 'mpd' - for Modin). - :type backend: str - :raises ValueError: If the backend is not supported. - """ - if backend not in TF_DEFAULT_CFG["BACKENDS"]: - raise ValueError( - f"Unsupported backend '{backend}'. Supported backends are: " - f"{', '.join(TF_DEFAULT_CFG['BACKENDS'].keys())}." - ) - - -def validate_input( - df: pl.DataFrame | pd.DataFrame | mpd.DataFrame, backend: str -) -> None: - """Validate the input DataFrame to ensure it matches the specified backend. - - :param df: The DataFrame to validate. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - :param backend: The backend against which to validate the DataFrame's type - ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin). - :type backend: str - :raises TypeError: If the DataFrame does not match the expected type for the - backend. - """ - if backend == "pl" and not isinstance(df, pl.DataFrame): - raise TypeError("Expected a Polars DataFrame.") - elif backend == "pd" and not isinstance(df, pd.DataFrame): - raise TypeError("Expected a Pandas DataFrame.") - elif backend == "mpd" and not isinstance(df, mpd.DataFrame): - raise TypeError("Expected a Modin DataFrame.") - - -def get_api_keys() -> dict[str, str | None]: - """Retrieve API keys from environment variables. - - :return: A dictionary containing the API keys, or None if not found. 
- :rtype: Dict[str, Optional[str]] - """ - api_keys = { - "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"), - "CLAUDE_API_KEY": os.getenv("CLAUDE_API_KEY"), - } - - # Print warnings if keys are missing - for key, value in api_keys.items(): - if value is None: - print(f"Warning: {key} is not set in the environment variables.") - - return api_keys diff --git a/src/temporalscope/core/core_utils.py b/src/temporalscope/core/core_utils.py new file mode 100644 index 0000000..954e149 --- /dev/null +++ b/src/temporalscope/core/core_utils.py @@ -0,0 +1,201 @@ +""" TemporalScope/src/temporalscope/core/core_utils.py + +This module provides utility functions that can be used throughout the TemporalScope package. +It includes methods for printing dividers, checking for nulls and NaNs, and validating the backend. + +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from typing import Union, cast, Dict, Optional +import os +from dotenv import load_dotenv +import polars as pl +import pandas as pd +import modin.pandas as mpd + +# Load environment variables from the .env file +load_dotenv() + +# Supported backend configuration +TF_DEFAULT_CFG = { + "BACKENDS": {"pl": "polars", "pd": "pandas", "mpd": "modin"}, +} + + +def get_default_backend_cfg() -> Dict[str, Dict[str, str]]: + """Retrieve the application configuration settings. + + :return: A dictionary of configuration settings. + :rtype: Dict[str, Dict[str, str]] + """ + return TF_DEFAULT_CFG.copy() + + +def validate_backend(backend: str) -> None: + """Validate the backend against the supported backends in the configuration. + + :param backend: The backend to validate ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin). + :type backend: str + :raises ValueError: If the backend is not supported. + """ + if backend not in TF_DEFAULT_CFG["BACKENDS"].keys(): + raise ValueError( + f"Unsupported backend '{backend}'. Supported backends are: " + f"{', '.join(TF_DEFAULT_CFG['BACKENDS'].keys())}." + ) + + +def validate_input( + df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], backend: str +) -> None: + """Validates the input DataFrame to ensure it matches the expected type for the specified backend. + + :param df: The DataFrame to validate. + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :param backend: The backend against which to validate the DataFrame's type ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin). + :type backend: str + :raises TypeError: If the DataFrame does not match the expected type for the backend. + """ + if backend == "pl" and not isinstance(df, pl.DataFrame): + raise TypeError("Expected a Polars DataFrame.") + elif backend == "pd" and not isinstance(df, pd.DataFrame): + raise TypeError("Expected a Pandas DataFrame.") + elif backend == "mpd" and not isinstance(df, mpd.DataFrame): + raise TypeError("Expected a Modin DataFrame.") + + +def get_api_keys() -> Dict[str, Optional[str]]: + """Retrieve API keys from environment variables. + + :return: A dictionary containing the API keys, or None if not found. 
+ :rtype: Dict[str, Optional[str]] + """ + api_keys = { + "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"), + "CLAUDE_API_KEY": os.getenv("CLAUDE_API_KEY"), + } + + # Print warnings if keys are missing + for key, value in api_keys.items(): + if value is None: + print(f"Warning: {key} is not set in the environment variables.") + + return api_keys + + +def validate_and_convert_input( + df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], backend: str +) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: + """Validates and converts the input DataFrame to the specified backend type. + + :param df: The input DataFrame to validate and convert. + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :param backend: The desired backend type ('pl', 'pd', or 'mpd'). + :type backend: str + :return: The DataFrame converted to the specified backend type. + :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :raises TypeError: If the input DataFrame type doesn't match the specified backend. + """ + validate_backend(backend) # Use the existing validate_backend function + + if backend == "pl": + if isinstance(df, pl.DataFrame): + return df + elif isinstance(df, pd.DataFrame): + return pl.from_pandas(df) + elif isinstance(df, mpd.DataFrame): + return pl.from_pandas(df._to_pandas()) + elif backend == "pd": + if isinstance(df, pd.DataFrame): + return df + elif isinstance(df, pl.DataFrame): + return df.to_pandas() + elif isinstance(df, mpd.DataFrame): + return df._to_pandas() + elif backend == "mpd": + if isinstance(df, mpd.DataFrame): + return df + elif isinstance(df, pd.DataFrame): + return mpd.DataFrame(df) + elif isinstance(df, pl.DataFrame): + return mpd.DataFrame(df.to_pandas()) + + # If we reach here, the input DataFrame type doesn't match the backend + raise TypeError( + f"Input DataFrame type {type(df)} does not match the specified backend {backend}" + ) + + +def print_divider(char: str = "=", length: int = 70) -> None: + """Prints a divider line made of a specified character and length.""" + print(char * length) + + +def check_nulls( + df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], backend: str +) -> bool: + """Check for null values in the DataFrame using the specified backend. + + :param df: The DataFrame to check for null values. + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :param backend: The backend used for the DataFrame ('polars', 'pandas', 'modin'). + :type backend: str + :return: True if there are null values, False otherwise. + :rtype: bool + :raises ValueError: If the backend is not supported. + """ + validate_backend(backend) + + if backend == "pd": + # Convert NumPy result to Python bool + return bool(cast(pd.DataFrame, df).isnull().values.any()) + elif backend == "pl": + # Polars-specific null check: sum the null counts and return a boolean + polars_df = cast(pl.DataFrame, df) + null_count = polars_df.null_count().select(pl.col("*").sum()).to_numpy().sum() + return bool(null_count > 0) + elif backend == "mpd": + # Convert NumPy result to Python bool + return bool(cast(mpd.DataFrame, df).isnull().values.any()) + else: + raise ValueError(f"Unsupported backend '{backend}'.") + + +def check_nans( + df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], backend: str +) -> bool: + """Check for NaN values in the DataFrame using the specified backend. + + :param df: The DataFrame to check for NaN values. 
+ :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :param backend: The backend used for the DataFrame ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin). + :type backend: str + :return: True if there are NaN values, False otherwise. + :rtype: bool + :raises ValueError: If the backend is not supported. + """ + validate_backend(backend) + + if backend == "pd": + # Convert NumPy result to Python bool + return bool(cast(pd.DataFrame, df).isna().values.any()) + elif backend == "pl": + # Polars-specific NaN check: check if there are any NaNs + polars_df = cast(pl.DataFrame, df) + nan_count = polars_df.select((polars_df == float("nan")).sum()).to_numpy().sum() + return bool(nan_count > 0) + elif backend == "mpd": + # Convert NumPy result to Python bool + return bool(cast(mpd.DataFrame, df).isna().values.any()) + else: + raise ValueError(f"Unsupported backend '{backend}'.") diff --git a/src/temporalscope/core/temporal_data_loader.py b/src/temporalscope/core/temporal_data_loader.py index 3555bd0..36344ed 100644 --- a/src/temporalscope/core/temporal_data_loader.py +++ b/src/temporalscope/core/temporal_data_loader.py @@ -1,110 +1,84 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Flexible and scalable interface for handling time series data. - -This module implements the TimeFrame class, which provides support for: -- Polars as the default backend -- Pandas as a secondary option for traditional data processing -- Modin for scalable Pandas-like data processing with distributed computing - -The TimeFrame class offers a unified interface for working with time series data -across these different backends, allowing users to choose the most appropriate -backend for their specific use case and performance requirements. -""" +"""TemporalScope/src/temporalscope/core/temporal_data_loader.py -from typing import cast +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -import modin.pandas as mpd -import pandas as pd + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from typing import Union, Optional, cast import polars as pl from polars import Expr - -from temporalscope.conf import ( - get_default_backend_cfg, +import pandas as pd +import modin.pandas as mpd +from temporalscope.core.core_utils import ( + validate_and_convert_input, validate_backend, - validate_input, + get_default_backend_cfg, ) class TimeFrame: - """Handle time series data for supported backends. + """Central class for the TemporalScope package, designed to manage time series data + across various backends such as Polars, Pandas, and Modin. This class enables + modular and flexible workflows for machine learning, deep learning, and time + series explainability (XAI) methods like temporal SHAP. - This class provides functionalities to manage time series data with optional - grouping, available masks, and backend flexibility. It can handle large datasets - efficiently. The class is intended for Machine & Deep Learning time series - forecasting, not classical time series forecasting. The implementation assumes - one-step ahead but other classes & modules can be utilized for partitioning for - pre-trained multi-step DL models that are compatible with SHAP & related - tools e.g., PyTorch or TensorFlow forecasting models. + The `TimeFrame` class supports workflows where the target variable can be either 1D scalar data, + typical in classical machine learning, or 3D tensor data, more common in deep learning contexts. + It is an essential component for temporal data analysis, including but not limited to explainability pipelines + like Temporal SHAP and concept drift analysis. + + Designed to be the core data handler in a variety of temporal analysis scenarios, the `TimeFrame` class + integrates seamlessly with other TemporalScope modules and can be extended for more advanced use cases. :param df: The input DataFrame. - :type df: Union[pl.DataFrame, pd.DataFrame, modin.pandas.DataFrame] + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] :param time_col: The column representing time in the DataFrame. :type time_col: str :param target_col: The column representing the target variable in the DataFrame. :type target_col: str - :param id_col: The column representing the ID for grouping, defaults to None. - :type id_col: Optional[str], optional - :param backend: The backend to use ('pl' for Polars, 'pd' for Pandas, or 'mpd' - for Modin), defaults to 'pl'. - :type backend: str, optional - :param sort: Sort the data by `time_col` (and `id_col` if provided) in ascending - order, defaults to True. - :type sort: bool, optional - - :note: The default assumption for the `TimeFrame` class is that the dataset is - cleaned and prepared for one-step-ahead forecasting, where the `target_col` - directly corresponds to the label. The `id_col` is included for grouping and - sorting purposes but is not used in the default model-building process. + :param id_col: Optional. The column representing the ID for grouping. Default is None. + :type id_col: Optional[str] + :param backend: The backend to use ('pl' for Polars, 'pd' for Pandas, or 'mpd' for Modin). Default is 'pl'. + :type backend: str + :param sort: Optional. Sort the data by `time_col` (and `id_col` if provided) in ascending order. Default is True. + :type sort: bool - :warning: Ensure that the `time_col` is properly formatted as a datetime type to - avoid issues with sorting and grouping. + .. note:: + The `TimeFrame` class is designed for workflows where the target label has already been generated. 
+ If your workflow requires generating the target label, consider using the `TemporalTargetShifter` class + from the `TemporalScope` package to shift the target variable appropriately for tasks like forecasting. - :example: - - Example of creating a TimeFrame with Polars DataFrame: + Example Usage: + -------------- .. code-block:: python - data = pl.DataFrame( - { - "time": pl.date_range(start="2021-01-01", periods=100, interval="1d"), - "value": range(100), - } - ) - tf = TimeFrame(data, time_col="time", target_col="value") + # Example of creating a TimeFrame with a Polars DataFrame + data = pl.DataFrame({ + 'time': pl.date_range(start='2021-01-01', periods=100, interval='1d'), + 'value': range(100) + }) + tf = TimeFrame(data, time_col='time', target_col='value') # Accessing the data print(tf.get_data().head()) - Example of creating a TimeFrame with Modin DataFrame: - - .. code-block:: python - + # Example of creating a TimeFrame with a Modin DataFrame import modin.pandas as mpd - - df = mpd.DataFrame( - { - "time": pd.date_range(start="2021-01-01", periods=100, freq="D"), - "value": range(100), - } - ) - tf = TimeFrame(df, time_col="time", target_col="value", backend="mpd") + df = mpd.DataFrame({ + 'time': pd.date_range(start='2021-01-01', periods=100, freq='D'), + 'value': range(100) + }) + tf = TimeFrame(df, time_col='time', target_col='value', backend='mpd') # Accessing the data print(tf.get_data().head()) @@ -112,156 +86,124 @@ class TimeFrame: def __init__( self, - df: pl.DataFrame | pd.DataFrame | mpd.DataFrame, + df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], time_col: str, target_col: str, - id_col: str | None = None, + id_col: Optional[str] = None, backend: str = "pl", sort: bool = True, ): - """Initialize a TimeFrame object. - - :param df: The input DataFrame containing time series data. - :type df: Union[pl.DataFrame, pd.DataFrame, modin.pandas.DataFrame] - :param time_col: The name of the column representing time in the DataFrame. - :type time_col: str - :param target_col: The name of the column representing the target variable in - the DataFrame. - :type target_col: str - :param id_col: The name of the column representing the ID for grouping. - If None, no grouping is performed. - :type id_col: Optional[str] - :param backend: The backend to use for data processing. Options are: - - "pl": Polars (default) - - "pd": Pandas - - "mpd": Modin - :type backend: str - :param sort: Whether to sort the data by `time_col` (and `id_col` if provided) - in ascending order. - :type sort: bool - - :raises ValueError: If required columns are missing or if there are duplicate - time entries within groups. - :raises TypeError: If the input DataFrame type doesn't match the expected type - for the specified backend. - - :note: This method sets up the TimeFrame object by validating inputs, preparing - data, and performing initial sorting if required. - - :example: - - >>> import polars as pl - >>> data = pl.DataFrame( - ... { - ... "time": pl.date_range( - ... start="2021-01-01", periods=100, interval="1d" - ... ), - ... "value": range(100), - ... } - ... 
) - >>> tf = TimeFrame(data, time_col="time", target_col="value") - """ self._cfg = get_default_backend_cfg() self._backend = backend - self._df = df self._time_col = time_col self._target_col = target_col self._id_col = id_col self._sort = sort - # Setup TimeFrame including renaming, sorting, and validations - self.setup_timeframe() - - def setup_timeframe(self) -> None: - """Set up the TimeFrame object by validating and preparing data as required.""" - # Validate the columns are present and correct after potential renaming - self.validate_columns() - - # Now sort data, assuming all columns are correct and exist - if self._sort: - self.sort_data(ascending=True) - - # Final validations after setup - validate_backend(self._backend) - validate_input(self._df, self._backend) + # Convert, validate, and set up the DataFrame + self.df = self._setup_timeframe(df) @property def backend(self) -> str: - """Return the backend used. - - :return: The backend used ('pl' for Polars, 'pd' for Pandas, - or 'mpd' for Modin). - :rtype: str - """ + """Return the backend used ('pl' for Polars, 'pd' for Pandas, or 'mpd' for Modin).""" return self._backend @property def time_col(self) -> str: - """Return the column name representing time. - - :return: The column name representing time. - :rtype: str - """ + """Return the column name representing time.""" return self._time_col @property def target_col(self) -> str: - """Return the column name representing the target variable. - - :return: The column name representing the target variable. - :rtype: str - """ + """Return the column name representing the target variable.""" return self._target_col @property - def id_col(self) -> str | None: - """Return the column name used for grouping or None if not set. - - :return: The column name used for grouping or None if not set. - :rtype: str | None - """ + def id_col(self) -> Optional[str]: + """Return the column name used for grouping or None if not set.""" return self._id_col - def validate_columns(self) -> None: + def _validate_columns( + self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + ) -> None: """Validate the presence and types of required columns in the DataFrame. + :param df: The DataFrame to validate. + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] :raises ValueError: If required columns are missing. """ - # Check for the presence of required columns, ignoring None values required_columns = [self.time_col, self._target_col] + ( [self.id_col] if self.id_col else [] ) missing_columns = [ - col for col in required_columns if col and col not in self._df.columns + col for col in required_columns if col and col not in df.columns ] if missing_columns: raise ValueError(f"Missing required columns: {', '.join(missing_columns)}") - def sort_data(self, ascending: bool = True) -> None: - """Sort the DataFrame based on the backend. + def _sort_data( + self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + ) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: + """Internal method to sort the DataFrame based on the backend. - :param ascending: Specifies whether to sort in ascending order. - :type ascending: bool + :param df: The DataFrame to sort. + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :return: The sorted DataFrame. 
+ :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] """ sort_key = [self.id_col, self.time_col] if self.id_col else [self.time_col] + # Polars backend sorting if self._backend == "pl": - # For Polars, sort the DataFrame - if isinstance(self._df, pl.DataFrame): - if ascending: - # Sort in ascending order - self._df = self._df.sort(sort_key) - else: - # Sort in descending order using tuples with Polars' SORT_DESCENDING - sort_key_desc = [(col, pl.SORT_DESCENDING) for col in sort_key] - self._df = self._df.sort(sort_key_desc) + if isinstance(df, pl.DataFrame): + return df.sort(sort_key) + else: + raise TypeError("Expected a Polars DataFrame for the Polars backend.") + + # Pandas or Modin backend sorting elif self._backend in ["pd", "mpd"]: - # For Pandas/Modin, ensure we have a DataFrame before sorting - if isinstance(self._df, (pd.DataFrame, mpd.DataFrame)): - self._df = self._df.sort_values(by=sort_key, ascending=ascending) + if isinstance(df, (pd.DataFrame, mpd.DataFrame)): + return df.sort_values(by=sort_key) + else: + raise TypeError( + "Expected a Pandas or Modin DataFrame for the Pandas or Modin backend." + ) + + else: + raise ValueError(f"Unsupported backend: {self._backend}") + + def _setup_timeframe( + self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + ) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: + """Sets up the TimeFrame object by converting, validating, and preparing data as required. + + :param df: The input DataFrame to be processed. + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :return: The processed DataFrame. + :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + """ + # Convert and validate the input DataFrame + df = validate_and_convert_input(df, self._backend) + + # Validate the presence of required columns + self._validate_columns(df) + + # Sort data if required + if self._sort: + df = self._sort_data(df) + + return df + + def sort_data(self, ascending: bool = True) -> None: + """Public method to sort the DataFrame by the time column (and ID column if present). + + :param ascending: If True, sort in ascending order; if False, sort in descending order. + :type ascending: bool + """ + self._sort_data(ascending) def check_duplicates(self) -> None: - """Check for duplicate time entries within groups. + """Check for duplicate time entries within groups, handling different data backends. :raises ValueError: If duplicate entries are found. 
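+
+        A minimal usage sketch (hypothetical column names; assumes the
+        ``TimeFrame`` was constructed with an ``id_col``):
+
+        .. code-block:: python
+
+            # Raises ValueError if the same (id, time) pair occurs more than once
+            tf = TimeFrame(df, time_col="time", target_col="value", id_col="id")
+            tf.check_duplicates()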
""" @@ -273,16 +215,16 @@ def check_duplicates(self) -> None: time_duplicated_expr: Expr = pl.col(self._time_col).is_duplicated() # Combine expressions combined_expr: Expr = id_duplicated_expr | time_duplicated_expr - duplicates = self._df.filter(combined_expr) + duplicates = self.df.filter(combined_expr) # type: ignore else: # Only check the time column for duplicates - duplicates = self._df.filter(pl.col(self._time_col).is_duplicated()) + duplicates = self.df.filter(pl.col(self._time_col).is_duplicated()) # type: ignore # Check for duplicates by inspecting the number of rows if duplicates.height > 0: raise ValueError("Duplicate time entries found within the same group.") elif self._backend in ["pd", "mpd"]: # Cast to Pandas DataFrame for Pandas/Modin specific check - pandas_df = cast(pd.DataFrame, self._df) + pandas_df = cast(pd.DataFrame, self.df) duplicates = pandas_df.duplicated( subset=( [self._id_col, self._time_col] if self._id_col else [self._time_col] @@ -292,45 +234,77 @@ def check_duplicates(self) -> None: if duplicates.any(): raise ValueError("Duplicate time entries found within the same group.") - def get_data(self) -> pl.DataFrame | pd.DataFrame: + def get_data(self) -> Union[pl.DataFrame, pd.DataFrame]: """Return the DataFrame in its current state. - :return: The DataFrame in its current state. - :rtype: pl.DataFrame | pd.DataFrame + :return: The DataFrame managed by the TimeFrame instance. + :rtype: Union[pl.DataFrame, pd.DataFrame] """ - return self._df + return self.df - def get_grouped_data(self) -> pl.DataFrame | pd.DataFrame | mpd.DataFrame: - """Group the DataFrame by the ID column. + def get_grouped_data(self) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: + """Return the grouped DataFrame if an ID column is provided. - :return: The grouped DataFrame. - :rtype: pl.DataFrame | pd.DataFrame | mpd.DataFrame - :raises ValueError: If ID column is not set or if the backend is unsupported. - :raises TypeError: If the DataFrame type doesn't match the expected type for the - backend. + :return: Grouped DataFrame by the ID column if it is set, otherwise returns the original DataFrame. + :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :raises ValueError: If the ID column is not set or an unsupported backend is provided. + :raises TypeError: If the DataFrame type does not match the expected type for the specified backend. 
""" if not self.id_col: raise ValueError("ID column is not set; cannot group data.") if self._backend == "pl": # Polars specific group_by with aggregation - if isinstance(self._df, pl.DataFrame): - return self._df.group_by(self.id_col).agg( + if isinstance(self.df, pl.DataFrame): + return self.df.group_by(self.id_col).agg( pl.all() ) # Polars uses `group_by` else: - raise TypeError(f"Expected Polars DataFrame but got {type(self._df)}.") + raise TypeError(f"Expected Polars DataFrame but got {type(self.df)}.") elif self._backend == "pd": # Pandas specific groupby - if isinstance(self._df, pd.DataFrame): - return self._df.groupby(self.id_col).apply(lambda x: x) + if isinstance(self.df, pd.DataFrame): + return self.df.groupby(self.id_col).apply(lambda x: x) else: - raise TypeError(f"Expected Pandas DataFrame but got {type(self._df)}.") + raise TypeError(f"Expected Pandas DataFrame but got {type(self.df)}.") elif self._backend == "mpd": # Modin uses the same API as Pandas for this operation - if isinstance(self._df, mpd.DataFrame): - return self._df.groupby(self.id_col).apply(lambda x: x) + if isinstance(self.df, mpd.DataFrame): + return self.df.groupby(self.id_col).apply(lambda x: x) + else: + raise TypeError(f"Expected Modin DataFrame but got {type(self.df)}.") + else: + raise ValueError(f"Unsupported backend: {self._backend}") + + def update_data( + self, new_df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + ) -> None: + """Updates the internal DataFrame with the provided new DataFrame. + + :param new_df: The new DataFrame to replace the existing one. + :type new_df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + """ + self.df = new_df + + def update_target_col( + self, new_target_col: Union[pl.Series, pd.Series, mpd.Series] + ) -> None: + """Updates the target column in the internal DataFrame with the provided new target column. + + :param new_target_col: The new target column to replace the existing one. + :type new_target_col: Union{pl.Series, pd.Series, mpd.Series} + """ + if self._backend == "pl": + if isinstance(self.df, pl.DataFrame): + self.df = self.df.with_columns([new_target_col.alias(self._target_col)]) + else: + raise TypeError("Expected Polars DataFrame for Polars backend.") + elif self._backend in ["pd", "mpd"]: + if isinstance(self.df, (pd.DataFrame, mpd.DataFrame)): + self.df[self._target_col] = new_target_col else: - raise TypeError(f"Expected Modin DataFrame but got {type(self._df)}.") + raise TypeError( + "Expected Pandas or Modin DataFrame for respective backend." + ) else: raise ValueError(f"Unsupported backend: {self._backend}") diff --git a/src/temporalscope/core/temporal_model_trainer.py b/src/temporalscope/core/temporal_model_trainer.py deleted file mode 100644 index b287239..0000000 --- a/src/temporalscope/core/temporal_model_trainer.py +++ /dev/null @@ -1,113 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Implements the `TemporalModelTrainer` for training on temporally partitioned data. - -This module provides functionality to train machine learning models on data -partitioned by temporal methods. Users can pass their custom models or use a -default lightweight model (LightGBM). -""" - -from typing import Any, Dict, List, Optional, Protocol, Union - -import lightgbm as lgb -import pandas as pd - -from temporalscope.partition.base import BaseTemporalPartitioner - - -class Fittable(Protocol): - """Protocol for objects with `fit` method.""" - - def fit(self, x: Any, y: Any) -> None: - """Model Object should have a fit method.""" - ... - - -class TemporalModelTrainer: - """Train models on temporally partitioned data. - - Users can specify a custom model or use the default LightGBM model. - - :param partitioner: An instance of a class that inherits from - `BaseTemporalPartitioner`. - :type partitioner: BaseTemporalPartitioner - :param model: A custom model with `fit` and `predict` methods. Defaults to LightGBM. - :type model: Fittable, optional - :param model_params: Parameters for the default model (LightGBM). - Ignored if a custom model is provided. - :type model_params: dict, optional - """ - - def __init__( - self, - partitioner: BaseTemporalPartitioner, - model: Optional[Fittable] = None, - model_params: Optional[Dict[str, Union[str, int, float]]] = None, - ): - """Initialize the TemporalModelTrainer with a partitioner and model. - - param partitioner: An instance of a class that inherits from - BaseTemporalPartitioner. - type partitioner: BaseTemporalPartitioner - param model: A custom model with fit and predict methods. Defaults to LightGBM. - type model: Fittable, optional - param model_params: Parameters for the default model (LightGBM). - Ignored if a custom model is provided. - type model_params: dict, optional - """ - self.partitioner = partitioner - self.model = model or self._initialize_default_model(model_params) - - def _initialize_default_model( - self, model_params: Optional[Dict[str, Union[str, int, float]]] - ) -> lgb.LGBMRegressor: - """Initialize a default LightGBM model with specified or default parameters.""" - params = model_params or { - "objective": "regression", - "boosting_type": "gbdt", - "metric": "rmse", - "verbosity": -1, - } - return lgb.LGBMRegressor(**params) - - def train_and_evaluate(self) -> Dict[str, List[float]]: - """Train the model on each temporal partition and return predictions. - - :return: Dictionary containing predictions for each partition. 
- :rtype: Dict[str, List[float]] - """ - # partitioned_data = self.partitioner.get_partition_data() - phase_predictions: Dict[str, List[float]] = {} - - # for i, phase_data in enumerate(partitioned_data): - # trained_model = self.train_model_on_phase(phase_data) - # X_phase = phase_data.drop(columns=[self.partitioner.target]) - # phase_predictions[f"Phase {i}"] = trained_model.predict(X_phase).tolist() - - return phase_predictions - - # TODO: Fix type hints for this method - - def train_model_on_phase(self, phase_data: pd.DataFrame) -> Any: - """Train the model on the provided phase data.""" - # Todo fix - # X = phase_data.drop(columns=[self.partitioner.target]) - # y = phase_data[self.partitioner.target] - # self.model.fit(X, y) - # return self.model - return None diff --git a/src/temporalscope/core/temporal_target_shifter.py b/src/temporalscope/core/temporal_target_shifter.py new file mode 100644 index 0000000..9b1f8f5 --- /dev/null +++ b/src/temporalscope/core/temporal_target_shifter.py @@ -0,0 +1,258 @@ +""" TemporalScope/src/temporalscope/core/temporal_target_shifter.py + +This module provides a transformer-like class to shift the target variable in time series data, either +to a scalar value (for classical machine learning) or to an array (for deep learning). +It is designed to work with the TimeFrame class, supporting multiple backends. + +.. seealso:: + + 1. Torres, J.F., Hadjout, D., Sebaa, A., Martínez-Álvarez, F., & Troncoso, A. (2021). Deep learning for time series forecasting: a survey. Big Data, 9(1), 3-21. https://doi.org/10.1089/big.2020.0159 + 2. Lim, B., & Zohren, S. (2021). Time-series forecasting with deep learning: a survey. Philosophical Transactions of the Royal Society A, 379(2194), 20200209. https://doi.org/10.1098/rsta.2020.0209 + 3. Tang, Y., Song, Z., Zhu, Y., Yuan, H., Hou, M., Ji, J., Tang, C., & Li, J. (2022). A survey on machine learning models for financial time series forecasting. Neurocomputing, 512, 363-380. https://doi.org/10.1016/j.neucom.2022.09.078 + +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from typing import Union, Optional, Any, SupportsIndex +import polars as pl +import pandas as pd +import modin.pandas as mpd +import numpy as np +from temporalscope.conf import validate_backend, validate_input +from temporalscope.core.temporal_data_loader import TimeFrame + + +class TemporalTargetShifter: + """A class for shifting the target variable in time series data for machine learning or deep learning. + + This class works with the `TimeFrame` class to shift the target variable by a specified + number of lags (time steps). It supports multiple backends (Polars, Pandas, Modin) and can + generate output suitable for both machine learning models (scalar) and deep learning models (sequence). + + The class is designed to handle both raw `TimeFrame` data and data that has been partitioned + using a partitioner, such as `SlidingWindowPartitioner`. + + Assumptions: + ------------ + 1. The data is not grouped by any categorical variables. + 2. 
The `time_col` contains unique time points, ensuring predictable and unambiguous shifting. + + :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1. + :type n_lags: int + :param mode: Mode of operation: "machine_learning" for scalar or "deep_learning" for sequences. + Default is "machine_learning". + :type mode: str + :param sequence_length: (Deep Learning Mode Only) The length of the input sequences. Required if mode is "deep_learning". + :type sequence_length: Optional[int] + :raises ValueError: If the backend is unsupported or if validation checks fail. + + .. note:: + For deep learning frameworks like TensorFlow, PyTorch, and JAX/Flax, the expected shape of the target + in "deep_learning" mode generated by this class is `(num_sequences, sequence_length)`. + Batching is typically handled by the data loader or model input pipeline. + + Example Usage: + -------------- + .. code-block:: python + + # Example of creating a TimeFrame with a Polars DataFrame + data = pl.DataFrame({ + 'time': pl.date_range(start='2021-01-01', periods=100, interval='1d'), + 'value': range(100) + }) + tf = TimeFrame(data, time_col='time', target_col='value') + + # Using TemporalTargetShifter in machine_learning mode + shifter_ml = TemporalTargetShifter(n_lags=3, mode='machine_learning') + tf_transformed_ml = shifter_ml.fit_transform(tf) + + # Accessing the shifted data + print("Machine Learning Mode (Scalar):") + print(tf_transformed_ml.get_data().head()) + + # Using TemporalTargetShifter in deep_learning mode + shifter_dl = TemporalTargetShifter(n_lags=3, mode='deep_learning', sequence_length=10) + tf_transformed_dl = shifter_dl.fit_transform(tf) + + # Accessing the shifted data for deep learning + print("Deep Learning Mode (Sequence):") + print(tf_transformed_dl.get_data().head()) + + # Example: Create a TimeFrame with a Modin DataFrame + import modin.pandas as mpd + df = mpd.DataFrame({ + 'time': pd.date_range(start='2021-01-01', periods=100, freq='D'), + 'value': range(100) + }) + tf_modin = TimeFrame(df, time_col='time', target_col='value', backend='mpd') + + # Accessing the data + print("Original Modin DataFrame:") + print(tf_modin.get_data().head()) + """ + + def __init__( + self, + n_lags: int = 1, + mode: str = "machine_learning", + sequence_length: Optional[int] = None, + ): + if mode not in ["machine_learning", "deep_learning"]: + raise ValueError("`mode` must be 'machine_learning' or 'deep_learning'.") + + self.n_lags = n_lags + self.mode = mode + self.sequence_length = sequence_length + + if self.mode == "deep_learning" and not self.sequence_length: + raise ValueError( + "`sequence_length` must be provided when mode is 'deep_learning'." + ) + + def fit(self, tf: TimeFrame) -> "TemporalTargetShifter": + """Validates the input TimeFrame without altering it. + + :param tf: The TimeFrame object to validate. + :type tf: TimeFrame + :rtype: TemporalTargetShifter + :raises ValueError: If data validation fails. + """ + self._validate_data(tf) + self.backend = tf.backend # Store the backend for later use + return self + + def transform(self, tf: TimeFrame) -> TimeFrame: + """Shift the target variable according to the number of lags specified. + + This method can operate on both raw TimeFrame data and partitioned data. + + :param tf: The TimeFrame object to transform. + :type tf: TimeFrame + :rtype: TimeFrame + :raises ValueError: If the backend is unsupported or data validation fails. 
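+
+        A minimal sketch of the expected call order (assumes ``fit`` was called
+        first so the backend is recorded, and that the target column is named
+        ``"value"``):
+
+        .. code-block:: python
+
+            shifter = TemporalTargetShifter(n_lags=1, mode="machine_learning")
+            shifter.fit(tf)             # records tf.backend
+            tf = shifter.transform(tf)  # adds "value_shift_1", drops trailing nulls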
+ """ + df = tf.get_data() + target_col = tf._target_col + + if self.backend == "pl": + df = self._shift_polars(df, target_col) + elif self.backend in ["pd", "mpd"]: + df = self._shift_pandas_modin(df, target_col) + else: + raise ValueError(f"Unsupported backend: {self.backend}") + + tf.update_data(df) + return tf + + def fit_transform(self, tf: TimeFrame) -> TimeFrame: + """Combine fit and transform into a single call. + + :param tf: The TimeFrame object to transform. + :type tf: TimeFrame + :rtype: TimeFrame + """ + return self.fit(tf).transform(tf) + + def _validate_data(self, tf: TimeFrame) -> None: + """Validate the TimeFrame data for consistency.""" + df = tf.get_data() + time_col = tf._time_col + + validate_backend(tf.backend) + validate_input(df, tf.backend) + + # Ensure df[time_col] is treated as a Series, to avoid Mypy errors + if isinstance(df, pl.DataFrame): + if df[time_col].is_duplicated().any(): + raise ValueError( + "The time column contains duplicate values, which is not allowed." + ) + elif isinstance(df, (pd.DataFrame, mpd.DataFrame)): + if df[time_col].duplicated().any(): + raise ValueError( + "The time column contains duplicate values, which is not allowed." + ) + + def _shift_polars(self, df: pl.DataFrame, target_col: str) -> pl.DataFrame: + """Shift the target variable in a Polars DataFrame. + + :param df: The Polars DataFrame containing the time series data. + :type df: pl.DataFrame + :param target_col: The column representing the target variable. + :type target_col: str + :return: The Polars DataFrame with the shifted target variable. + :rtype: pl.DataFrame + :raises ValueError: If the backend is unsupported or data validation fails. + """ + if self.mode == "deep_learning": + if self.sequence_length is None: + raise ValueError( + "`sequence_length` must be provided in deep_learning mode." + ) + + shifted_columns = [ + df[target_col].shift(-i).alias(f"{target_col}_shift_{i}") + for i in range(self.sequence_length) + ] + df = df.with_columns(shifted_columns) + df = df.with_columns( + [ + pl.concat_list( + [ + pl.col(f"{target_col}_shift_{i}") + for i in range(self.sequence_length) + ] + ).alias(f"{target_col}_sequence") + ] + ) + df = df.drop( + [f"{target_col}_shift_{i}" for i in range(self.sequence_length)] + ) + else: # Machine Learning Mode (Scalar) + df = df.with_columns( + [ + df[target_col] + .shift(-self.n_lags) + .alias(f"{target_col}_shift_{self.n_lags}") + ] + ) + + return df.drop_nulls() + + def _shift_pandas_modin( + self, df: Union[pd.DataFrame, mpd.DataFrame], target_col: str + ) -> Union[pd.DataFrame, mpd.DataFrame]: + """Shift the target variable in a Pandas or Modin DataFrame. + + :param df: The Pandas or Modin DataFrame containing the time series data. + :type df: Union[pd.DataFrame, mpd.DataFrame] + :param target_col: The column representing the target variable. + :type target_col: str + :return: The DataFrame with the shifted target variable. + :rtype: Union[pd.DataFrame, mpd.DataFrame] + :raises ValueError: If the backend is unsupported or data validation fails. + """ + if self.mode == "deep_learning": + if self.sequence_length is None: + raise ValueError( + "`sequence_length` must be an integer when mode is 'deep_learning'." 
+ ) + + shifted_columns = [ + df[target_col].shift(-i) for i in range(self.sequence_length) + ] + df[f"{target_col}_sequence"] = np.stack(shifted_columns, axis=1).tolist() + else: # Machine Learning Mode (Scalar) + df[f"{target_col}_shift_{self.n_lags}"] = df[target_col].shift(-self.n_lags) + + return df.dropna() diff --git a/src/temporalscope/core/utils.py b/src/temporalscope/core/utils.py deleted file mode 100644 index e74ace3..0000000 --- a/src/temporalscope/core/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Utility functions for the TemporalScope package. - -This module provides various utility functions that can be used throughout the -TemporalScope package. It includes methods for printing dividers, checking for -null and NaN values in DataFrames, and validating the backend. -""" - -from typing import cast - -import modin.pandas as mpd -import pandas as pd -import polars as pl - -from temporalscope.conf import validate_backend - - -def print_divider(char: str = "=", length: int = 70) -> None: - """Print a divider line made of a specified character and length.""" - print(char * length) - - -def check_nulls(df: pl.DataFrame | pd.DataFrame | mpd.DataFrame, backend: str) -> bool: - """Check for null values in the DataFrame using the specified backend. - - :param df: The DataFrame to check for null values. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - :param backend: The backend used for the DataFrame ('polars', 'pandas', 'modin'). - :type backend: str - :return: True if there are null values, False otherwise. - :rtype: bool - :raises ValueError: If the backend is not supported. - """ - validate_backend(backend) - - if backend == "pd": - # Convert NumPy result to Python bool - return bool(cast(pd.DataFrame, df).isnull().values.any()) - elif backend == "pl": - # Polars-specific null check: sum the null counts and return a boolean - polars_df = cast(pl.DataFrame, df) - null_count = polars_df.null_count().select(pl.col("*").sum()).to_numpy().sum() - return bool(null_count > 0) - elif backend == "mpd": - # Convert NumPy result to Python bool - return bool(cast(mpd.DataFrame, df).isnull().values.any()) - else: - raise ValueError(f"Unsupported backend '{backend}'.") - - -def check_nans(df: pl.DataFrame | pd.DataFrame | mpd.DataFrame, backend: str) -> bool: - """Check for NaN values in the DataFrame using the specified backend. - - :param df: The DataFrame to check for NaN values. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - :param backend: The backend used for the DataFrame - ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin). - :type backend: str - :return: True if there are NaN values, False otherwise. - :rtype: bool - :raises ValueError: If the backend is not supported. 
- """ - validate_backend(backend) - - if backend == "pd": - # Convert NumPy result to Python bool - return bool(cast(pd.DataFrame, df).isna().values.any()) - elif backend == "pl": - # Polars-specific NaN check: check if there are any NaNs - polars_df = cast(pl.DataFrame, df) - nan_count = polars_df.select((polars_df == float("nan")).sum()).to_numpy().sum() - return bool(nan_count > 0) - elif backend == "mpd": - # Convert NumPy result to Python bool - return bool(cast(mpd.DataFrame, df).isna().values.any()) - else: - raise ValueError(f"Unsupported backend '{backend}'.") diff --git a/src/temporalscope/modeling/temporal_model_trainer.py b/src/temporalscope/modeling/temporal_model_trainer.py new file mode 100644 index 0000000..10efea8 --- /dev/null +++ b/src/temporalscope/modeling/temporal_model_trainer.py @@ -0,0 +1,97 @@ +# """Implements the `TemporalModelTrainer` for training on temporally partitioned data. + +# This module provides functionality to train machine learning models on data +# partitioned by temporal methods. Users can pass their custom models or use a +# default lightweight model (LightGBM). +# """ + +# from typing import Any, Dict, List, Optional, Protocol, Union + +# import lightgbm as lgb +# import pandas as pd + +# from temporalscope.partition.base import BaseTemporalPartitioner + +# TODO: Update class + +# class Fittable(Protocol): +# """Protocol for objects with `fit` method.""" + +# def fit(self, x: Any, y: Any) -> None: +# """Model Object should have a fit method.""" +# ... + + +# class TemporalModelTrainer: +# """Train models on temporally partitioned data. + +# Users can specify a custom model or use the default LightGBM model. + +# :param partitioner: An instance of a class that inherits from +# `BaseTemporalPartitioner`. +# :type partitioner: BaseTemporalPartitioner +# :param model: A custom model with `fit` and `predict` methods. Defaults to LightGBM. +# :type model: Fittable, optional +# :param model_params: Parameters for the default model (LightGBM). +# Ignored if a custom model is provided. +# :type model_params: dict, optional +# """ + +# def __init__( +# self, +# partitioner: BaseTemporalPartitioner, +# model: Optional[Fittable] = None, +# model_params: Optional[Dict[str, Union[str, int, float]]] = None, +# ): +# """Initialize the TemporalModelTrainer with a partitioner and model. + +# param partitioner: An instance of a class that inherits from +# BaseTemporalPartitioner. +# type partitioner: BaseTemporalPartitioner +# param model: A custom model with fit and predict methods. Defaults to LightGBM. +# type model: Fittable, optional +# param model_params: Parameters for the default model (LightGBM). +# Ignored if a custom model is provided. +# type model_params: dict, optional +# """ +# self.partitioner = partitioner +# self.model = model or self._initialize_default_model(model_params) + +# def _initialize_default_model( +# self, model_params: Optional[Dict[str, Union[str, int, float]]] +# ) -> lgb.LGBMRegressor: +# """Initialize a default LightGBM model with specified or default parameters.""" +# params = model_params or { +# "objective": "regression", +# "boosting_type": "gbdt", +# "metric": "rmse", +# "verbosity": -1, +# } +# return lgb.LGBMRegressor(**params) + +# def train_and_evaluate(self) -> Dict[str, List[float]]: +# """Train the model on each temporal partition and return predictions. + +# :return: Dictionary containing predictions for each partition. 
+# :rtype: Dict[str, List[float]] +# """ +# # partitioned_data = self.partitioner.get_partition_data() +# phase_predictions: Dict[str, List[float]] = {} + +# # for i, phase_data in enumerate(partitioned_data): +# # trained_model = self.train_model_on_phase(phase_data) +# # X_phase = phase_data.drop(columns=[self.partitioner.target]) +# # phase_predictions[f"Phase {i}"] = trained_model.predict(X_phase).tolist() + +# return phase_predictions + +# # TODO: Fix type hints for this method + +# def train_model_on_phase(self, phase_data: pd.DataFrame) -> Any: +# """Train the model on the provided phase data.""" +# # Todo fix +# # X = phase_data.drop(columns=[self.partitioner.target]) +# # y = phase_data[self.partitioner.target] +# # self.model.fit(X, y) +# # return self.model +# return None diff --git a/src/temporalscope/partition/base.py b/src/temporalscope/partition/base.py deleted file mode 100644 index f241614..0000000 --- a/src/temporalscope/partition/base.py +++ /dev/null @@ -1,179 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Base Temporal Partitioner. - -This module defines the BaseTemporalPartitioner class, an abstract base class for all -temporal partitioning methods. Each partitioning method must inherit from this class -and implement the required methods. -""" - -from abc import ABC, abstractmethod -from typing import Optional - -import modin.pandas as mpd -import pandas as pd -import polars as pl - -from temporalscope.core.temporal_data_loader import TimeFrame - - -class BaseTemporalPartitioner(ABC): - """Abstract base class for temporal partitioning methods. - - The BaseTemporalPartitioner operates on a TimeFrame object and provides core - functionality for retrieving partition indices and data. Subclasses must implement - partitioning logic and optionally perform data validation checks. - - :param tf: TimeFrame object with sorted time-series data. - :type tf: TimeFrame - - :param enable_warnings: Enable warnings for partition validations, defaults to False - :type enable_warnings: bool, optional - - :ivar tf: The TimeFrame object containing the data to be partitioned. - :ivar df: The DataFrame extracted from the TimeFrame. - :ivar enable_warnings: Flag to enable or disable warnings during partition - validation. - - .. note:: - The partitions returned by each partitioning method will always include a "full" - partition with index ranges. The "train", "test", and "validation" partitions are - supported, and at least "train" and "test" must be defined for logical - consistency. - """ - - def __init__(self, tf: TimeFrame, enable_warnings: Optional[bool] = False): - """Initialize the partitioner with the TimeFrame object and optional warnings. - - :param tf: TimeFrame object with sorted time-series data. 
- :type tf: TimeFrame - :param enable_warnings: Enable warnings for partition validations, - defaults to False - :type enable_warnings: bool, optional - """ - self.tf = tf - self.df = self.tf.get_data() # Retrieve DataFrame from TimeFrame - self.enable_warnings = enable_warnings - - @abstractmethod - def get_partition_indices(self) -> dict[str, dict[str, tuple[int, int]]]: - """Abstract method to generate partition indices. - - Includes 'full', 'train', 'test', 'validation'. - - :return: Dictionary of partitions with partition indices. - :rtype: Dict[str, Dict[str, Tuple[int, int]]] - - .. note:: - - Each partition dictionary should contain "full", "train", "test", - and optionally "validation" keys, where at least "train" and "test" - must be defined for logical partitioning. - - "validation" may be None if not required. - - .. rubric:: Example - - Example of a partition dictionary: - - .. code-block:: python - - { - "partition_1": { - "full": (0, 10), - "train": (0, 8), - "test": (8, 10), - "validation": None, - }, - "partition_2": { - "full": (5, 15), - "train": (5, 13), - "test": (13, 15), - "validation": None, - }, - } - """ - pass - - @abstractmethod - def data_checks(self) -> None: - """Perform data validation checks. - - This abstract method should be implemented by subclasses to perform - specific data validation logic. - - Implementations should consider checks such as: - - - Ensuring sufficient sample size - - Checking for window overlaps - - Validating feature count - - Any other relevant checks for the specific partitioning method - - :raises ValueError: If any validation check fails - :raises NotImplementedError: If the method is not implemented by a subclass - - .. note:: - Subclasses must override this method with their own implementation. - - .. warning:: - Failure to implement proper data checks may lead to invalid partitions - or unexpected behavior in downstream analysis. - """ - pass - - def get_partition_data( - self, - ) -> dict[str, dict[str, pd.DataFrame | mpd.DataFrame | pl.DataFrame]]: - """Return the data for each partition based on the partition indices. - - :return: Dictionary of partition names and their respective data slices. - :rtype: dict[str, dict[str, pd.DataFrame | mpd.DataFrame | pl.DataFrame]] - - .. note:: - This method returns the actual data slices for each partition based on the - indices generated by `get_partition_indices`. The returned structure mirrors - the same dictionary format but contains actual data instead of index ranges. - - .. rubric:: Example - - Example of the returned data structure: - - .. 
code-block:: python - - { - "partition_1": { - "full": DataFrame(...), - "train": DataFrame(...), - "test": DataFrame(...), - "validation": None, - }, - "partition_2": { - "full": DataFrame(...), - "train": DataFrame(...), - "test": DataFrame(...), - "validation": None, - }, - } - """ - partitions = self.get_partition_indices() - partitioned_data = {} - for key, partition_dict in partitions.items(): - partitioned_data[key] = { - part_name: self.df[start:end] # Direct slicing here - for part_name, (start, end) in partition_dict.items() - if start is not None and end is not None - } - return partitioned_data diff --git a/src/temporalscope/partition/base_protocol.py b/src/temporalscope/partition/base_protocol.py new file mode 100644 index 0000000..0b69839 --- /dev/null +++ b/src/temporalscope/partition/base_protocol.py @@ -0,0 +1,168 @@ +""" TemporalScope/src/temporalscope/partition/base_protocol.py + +This module defines the TemporalPartitionerProtocol, a protocol for all +temporal partitioning methods. Each partitioning method must implement +the required methods to comply with this protocol. + +Core Functionality: +------------------- +1. fit: Must generate the partition indices (row ranges) for the + partitions ('train', 'test', 'validation', etc.) in a memory-efficient manner. + Implementations should leverage lazy-loading techniques to ensure that + large datasets are handled efficiently, minimizing memory usage. +2. transform: Must use the indices from fit to return the actual partitioned data. + This method should apply the calculated indices to retrieve specific data slices, + maintaining the efficiency gained from lazy-loading in the fit stage. +3. check_data: Optional method to perform data validation checks. + +Each implementing class must provide its own logic for partitioning the data and +any necessary validation, while adhering to the design principles of lazy-loading +and memory efficiency. + +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from typing import Protocol, Dict, Tuple, Union, Iterator +import pandas as pd +import polars as pl +import modin.pandas as mpd +from temporalscope.core.temporal_data_loader import TimeFrame + + +class TemporalPartitionerProtocol(Protocol): + """Protocol for temporal partitioning methods. + + The `TemporalPartitionerProtocol` operates on a `TimeFrame` object and provides core + functionality for retrieving partition indices and data. Implementing classes must + provide partitioning logic and optionally perform data validation checks, with a + strong emphasis on memory efficiency through lazy-loading techniques. + + :ivar tf: The `TimeFrame` object containing the pre-sorted time series data to be partitioned. + :vartype tf: TimeFrame + :ivar df: The data extracted from the `TimeFrame`. + :vartype df: Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame] + :ivar enable_warnings: Whether to enable warnings during partition validation. + :vartype enable_warnings: bool + + .. 
note:: + The partitions returned by each partitioning method will always include a + "full" partition with index ranges. The "train", "test", and "validation" + partitions are supported, and at least "train" and "test" must be defined + for logical consistency. To manage large datasets efficiently, implementations + should focus on generating indices lazily to reduce memory footprint. + """ + + tf: TimeFrame + df: Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame] + enable_warnings: bool + + def fit( + self, + ) -> Union[ + Dict[str, Dict[str, Tuple[int, int]]], + Iterator[Dict[str, Dict[str, Tuple[int, int]]]], + ]: + """Generate partition indices. + + This method generates partition indices with keys like 'full', 'train', + 'test', and 'validation', utilizing lazy-loading techniques to ensure memory efficiency. + + :return: A dictionary of partitions with their respective indices, or an iterator over them. + :rtype: Union[Dict[str, Dict[str, Tuple[int, int]]], Iterator[Dict[str, Dict[str, Tuple[int, int]]]]] + + .. note:: + Each partition dictionary should contain "full", "train", "test", and + optionally "validation" keys, where at least "train" and "test" must + be defined for logical partitioning. + + "Validation" may be ``None`` if not required. + + Implementations should focus on generating these indices lazily to + optimize memory usage, particularly with large datasets. + + :example: + + .. code-block:: python + + { + "partition_1": { + "full": (0, 10), + "train": (0, 8), + "test": (8, 10), + "validation": None + }, + "partition_2": { + "full": (5, 15), + "train": (5, 13), + "test": (13, 15), + "validation": None + } + } + """ + pass + + def transform( + self, + ) -> Union[ + Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]], + Iterator[ + Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]] + ], + ]: + """Return the data for each partition. + + This method returns the data slices for each partition based on the + partition indices generated by the `fit` method. + + :return: A dictionary containing the data slices for each partition, + or an iterator over them. + :rtype: Union[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]], + Iterator[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]]]] + + .. note:: + This method returns the actual data slices for each partition + based on the indices generated by `fit`. The returned structure + mirrors the same dictionary format but contains actual data + instead of index ranges. The `transform` method should continue + to optimize for memory efficiency by using the pre-calculated + lazy indices to access only the necessary data. + + :example: + + .. code-block:: python + + { + "partition_1": { + "full": DataFrame(...), + "train": DataFrame(...), + "test": DataFrame(...), + "validation": None + }, + "partition_2": { + "full": DataFrame(...), + "train": DataFrame(...), + "test": DataFrame(...), + "validation": None + } + } + """ + pass + + def check_data(self) -> None: + """Perform data validation checks. + + Implementing classes must provide their own data validation logic, such as ensuring + sample size is sufficient, checking for window overlaps, or validating the + feature count. 
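+
+        .. note::
+            The example below is an illustrative sketch only; ``MyPartitioner``
+            and its warning text are hypothetical and not part of TemporalScope's API.
+
+        :example:
+
+        .. code-block:: python
+
+            import warnings
+
+            class MyPartitioner:
+                """Hypothetical class implementing the protocol."""
+
+                def check_data(self) -> None:
+                    # Warn rather than raise so users can proceed with caution.
+                    if len(self.df) < 3000:
+                        warnings.warn("Dataset has fewer than 3000 samples.")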
+ """ + pass diff --git a/src/temporalscope/partition/data_checks.py b/src/temporalscope/partition/partition_validators.py similarity index 51% rename from src/temporalscope/partition/data_checks.py rename to src/temporalscope/partition/partition_validators.py index 399ce30..0e6f910 100644 --- a/src/temporalscope/partition/data_checks.py +++ b/src/temporalscope/partition/partition_validators.py @@ -1,52 +1,38 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Validates dataset partitions using heuristics from key literature. +""" TemporalScope/temporalscope/partition/partition_validators.py + +This module provides functions to validate dataset partitions against +a set of heuristics derived from key literature in the field. .. seealso:: + 1. Shwartz-Ziv, R. and Armon, A., 2022. Tabular data: Deep learning is not all you need. Information Fusion, 81, pp.84-90. + 2. Grinsztajn, L., Oyallon, E. and Varoquaux, G., 2022. Why do tree-based models still outperform deep learning on typical tabular data? + 3. Gorishniy, Y., Rubachev, I., Khrulkov, V. and Babenko, A., 2021. Revisiting deep learning models for tabular data. - .. Shwartz-Ziv, R. and Armon, A. (2022). - Tabular data: Deep learning is not all you need. - *Information Fusion*, 81, 84-90. +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - .. Grinsztajn, L., Oyallon, E. and Varoquaux, G. (2022). - Why do tree-based models still outperform deep learning on typical tabular data? - arXiv preprint arXiv:2207.08815. + http://www.apache.org/licenses/LICENSE-2.0 - .. Gorishniy, Y., Rubachev, I., Khrulkov, V. and Babenko, A. (2021). - Revisiting deep learning models for tabular data. - In *Advances in Neural Information Processing Systems* (pp. 18932-18943). +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
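+
+Example (illustrative sketch; the 3000-row minimum used here is the function default):
+
+.. code-block:: python
+
+    import pandas as pd
+    from temporalscope.partition.partition_validators import check_sample_size
+
+    df = pd.DataFrame({"feature": range(100), "target": range(100)})
+    # Returns False and warns: 100 rows is below the default minimum of 3000.
+    check_sample_size(df, backend="pd", enable_warnings=True)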
""" +from typing import Union, TypeVar, Any, Dict, cast import warnings -from typing import Any, TypeVar, cast - -import modin.pandas as mpd import pandas as pd import polars as pl - +import modin.pandas as mpd from temporalscope.conf import validate_backend PandasLike = TypeVar("PandasLike", pd.DataFrame, mpd.DataFrame) -_BINARY_THRESHOLD = 2 def check_sample_size( - df: pd.DataFrame | pl.DataFrame | mpd.DataFrame, + df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str = "pl", min_samples: int = 3000, max_samples: int = 50000, @@ -55,27 +41,20 @@ def check_sample_size( """Check if the dataset meets the minimum and maximum sample size requirements. This function checks if the dataset contains an appropriate number of samples - for training machine learning models. If the dataset has too few or - too many samples, warnings can be triggered depending on the `enable_warnings` flag. + for training machine learning models. If the dataset has too few or too many samples, + warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. - :type df: pd.DataFrame or pl.DataFrame or mpd.DataFrame - - :param backend: The backend used for processing ('pd', 'pl', 'mpd'), - defaults to "pl" + :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str - - :param min_samples: Minimum number of samples required, defaults to 3000 + :param min_samples: Minimum number of samples required. :type min_samples: int - - :param max_samples: Maximum number of samples allowed, defaults to 50000 + :param max_samples: Maximum number of samples allowed. :type max_samples: int - - :param enable_warnings: Flag to enable warnings, defaults to False + :param enable_warnings: Flag to enable warnings, defaults to False. :type enable_warnings: bool - - :return: True if the dataset meets the sample size requirements, otherwise False - + :return: True if the dataset meets the sample size requirements, otherwise False. :rtype: bool """ validate_backend(backend) @@ -85,20 +64,17 @@ def check_sample_size( if num_samples < min_samples: if enable_warnings: warnings.warn( - f"""Dataset has fewer than {min_samples} samples. - Based on heuristics from the literature, this dataset size may not be - suitable for complex machine learning models. Consider alternative - approaches such as Linear, Bayesian, or other models that work well with - smaller datasets.""" + f"Dataset has fewer than {min_samples} samples. " + "Based on heuristics from the literature, this dataset size may not be suitable for complex machine learning models. " + "Consider alternative approaches such as Linear, Bayesian, or other models that work well with smaller datasets." ) return False if num_samples > max_samples: if enable_warnings: warnings.warn( - f"""Dataset has more than {max_samples} samples. - Larger datasets like this might benefit from scalable implementations of - classical models or deep learning techniques.""" + f"Dataset has more than {max_samples} samples. " + "Larger datasets like this might benefit from scalable implementations of classical models or deep learning techniques." 
) return False @@ -106,7 +82,7 @@ def check_sample_size( def check_feature_count( - df: pd.DataFrame | pl.DataFrame | mpd.DataFrame, + df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str = "pl", min_features: int = 4, max_features: int = 500, @@ -114,28 +90,21 @@ def check_feature_count( ) -> bool: """Check if the dataset meets the minimum and maximum feature count requirements. - This function ensures the dataset has an appropriate number of features - for modeling. If the feature count is too low or too high, warnings can be - triggered depending on the `enable_warnings` flag. + This function ensures the dataset has an appropriate number of features for modeling. + If the feature count is too low or too high, warnings can be triggered depending on the + `enable_warnings` flag. :param df: The dataset to check. - :type df: pd.DataFrame or pl.DataFrame or mpd.DataFrame - - :param backend: The backend used for processing ('pd', 'pl', 'mpd'), - defaults to "pl" + :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str - - :param min_features: Minimum number of features required, defaults to 4 + :param min_features: Minimum number of features required. :type min_features: int - - :param max_features: Maximum number of features allowed, defaults to 500 + :param max_features: Maximum number of features allowed. :type max_features: int - - :param enable_warnings: Flag to enable warnings, defaults to False + :param enable_warnings: Flag to enable warnings, defaults to False. :type enable_warnings: bool - - :return: True if the dataset meets the feature count requirements, otherwise False - + :return: True if the dataset meets the feature count requirements, otherwise False. :rtype: bool """ validate_backend(backend) @@ -145,18 +114,17 @@ def check_feature_count( if num_features < min_features: if enable_warnings: warnings.warn( - f"""Dataset has fewer than {min_features} features. - Having too few features can oversimplify the model and reduce its ability to - capture data complexity. Consider adding more informative features.""" + f"Dataset has fewer than {min_features} features. " + "According to best practices, having too few features may result in an oversimplified model, " + "which may not capture the complexity of the data. Consider adding more informative features." ) return False if num_features > max_features: if enable_warnings: warnings.warn( - f"""Dataset has more than {max_features} features. - High dimensionality may lead to overfitting. Consider dimensionality - reduction or feature selection techniques.""" + f"Dataset has more than {max_features} features. " + "High-dimensional data can cause issues like overfitting. Consider dimensionality reduction techniques such as PCA or feature selection." ) return False @@ -164,32 +132,25 @@ def check_feature_count( def check_feature_to_sample_ratio( - df: pd.DataFrame | pl.DataFrame | mpd.DataFrame, + df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str = "pl", max_ratio: float = 0.1, enable_warnings: bool = False, ) -> bool: """Check if the feature-to-sample ratio is within acceptable limits. - This function verifies if the dataset's feature-to-sample ratio exceeds the maximum - allowable ratio, which may increase the risk of overfitting. - Warnings can be triggered depending on the `enable_warnings` flag. 
+ This function verifies if the dataset's feature-to-sample ratio exceeds the maximum allowable ratio, + which may increase the risk of overfitting. Warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. - :type df: pd.DataFrame or pl.DataFrame or mpd.DataFrame - - :param backend: The backend used for processing ('pd', 'pl', 'mpd'), - defaults to "pl" + :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str - - :param max_ratio: Maximum allowable feature-to-sample ratio, defaults to 0.1 + :param max_ratio: Maximum allowable feature-to-sample ratio. :type max_ratio: float - - :param enable_warnings: Flag to enable warnings, defaults to False + :param enable_warnings: Flag to enable warnings, defaults to False. :type enable_warnings: bool - - :return: True if the feature-to-sample ratio is within limits, otherwise False - + :return: True if the feature-to-sample ratio is within limits, otherwise False. :rtype: bool """ validate_backend(backend) @@ -201,10 +162,9 @@ def check_feature_to_sample_ratio( if ratio > max_ratio: if enable_warnings: warnings.warn( - f"""Feature-to-sample ratio exceeds {max_ratio}. - This can increase the risk of overfitting. Consider using regularization - techniques such as L2 regularization, or applying feature selection - methods to reduce the dimensionality of the dataset.""" + f"Feature-to-sample ratio exceeds {max_ratio}. " + "This can increase the risk of overfitting. Consider using regularization techniques such as L2 regularization, " + "or applying feature selection methods to reduce the dimensionality of the dataset." ) return False @@ -212,37 +172,28 @@ def check_feature_to_sample_ratio( def check_categorical_feature_cardinality( - df: pd.DataFrame | pl.DataFrame | mpd.DataFrame, + df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str = "pl", max_unique_values: int = 20, enable_warnings: bool = False, ) -> bool: """Check that categorical features do not have too many unique values. - This function ensures that categorical features have an acceptable number - of unique values. High-cardinality categorical features can complicate model - training and increase the risk of overfitting. Warnings can be triggered depending - on the `enable_warnings` flag. + This function ensures that categorical features have an acceptable number of unique values. + High-cardinality categorical features can complicate model training and increase the risk of overfitting. + Warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. - :type df: pd.DataFrame or pl.DataFrame or mpd.DataFrame - - :param backend: The backend used for processing ('pd', 'pl', 'mpd'), - defaults to "pl" + :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str - - :param max_unique_values: Maximum number of unique values allowed for categorical - features,defaults to 20 + :param max_unique_values: Maximum number of unique values allowed for categorical features. :type max_unique_values: int - - :param enable_warnings: Flag to enable warnings, defaults to False + :param enable_warnings: Flag to enable warnings, defaults to False. 
:type enable_warnings: bool - - :return: True if the categorical features meet the cardinality limits, - otherwise False - + :return: True if the categorical features meet the cardinality limits, otherwise False. :rtype: bool - :raises ValueError: If backend is not supported. + :raises: ValueError if backend is not supported. """ validate_backend(backend) @@ -255,10 +206,8 @@ def check_categorical_feature_cardinality( if polars_df[col].n_unique() > max_unique_values: if enable_warnings: warnings.warn( - f"""Categorical feature '{col}' has more than - {max_unique_values} unique values. Consider using encoding - techniques such as target encoding, one-hot encoding, - or embeddings to handle high-cardinality features.""" + f"Categorical feature '{col}' has more than {max_unique_values} unique values. " + "Consider using encoding techniques such as target encoding, one-hot encoding, or embeddings to handle high-cardinality features." ) return False @@ -273,45 +222,36 @@ def check_categorical_feature_cardinality( for col in categorical_columns: if pandas_df[col].nunique() > max_unique_values: if enable_warnings: - message = f"""Categorical feature '{col}' has more than - {max_unique_values} unique values. Consider using encoding - techniques such as target encoding, one-hot encoding, or - embeddings to handle high-cardinality features.""" - warnings.warn(message) + warnings.warn( + f"Categorical feature '{col}' has more than {max_unique_values} unique values. " + "Consider using encoding techniques such as target encoding, one-hot encoding, or embeddings to handle high-cardinality features." + ) return False return True def check_numerical_feature_uniqueness( - df: pd.DataFrame | pl.DataFrame | mpd.DataFrame, + df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str = "pl", min_unique_values: int = 10, enable_warnings: bool = False, ) -> bool: """Check that numerical features have a sufficient number of unique values. - This function ensures that numerical features contain a minimum number of - unique values. Features with too few unique values may lack variability, - reducing model expressiveness. Warnings can be triggered depending on the - `enable_warnings` flag. + This function ensures that numerical features contain a minimum number of unique values. + Features with too few unique values may lack variability, reducing model expressiveness. + Warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. - :type df: pd.DataFrame or pl.DataFrame or mpd.DataFrame - - :param backend: The backend used for processing ('pd', 'pl', 'mpd'), - defaults to "pl" + :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str - - :param min_unique_values: Minimum number of unique values required for numerical - features, defaults to 10 + :param min_unique_values: Minimum number of unique values required for numerical features. :type min_unique_values: int - - :param enable_warnings: Flag to enable warnings, defaults to False + :param enable_warnings: Flag to enable warnings, defaults to False. :type enable_warnings: bool - - :return: True if all numerical features have at least `min_unique_values` unique - values, otherwise False + :return: True if all numerical features have at least `min_unique_values` unique values, otherwise False. 
:rtype: bool """ validate_backend(backend) @@ -324,32 +264,22 @@ def check_numerical_feature_uniqueness( if pandas_df[col].nunique() < min_unique_values: if enable_warnings: warnings.warn( - f"""Numerical feature '{col}' has fewer than - {min_unique_values} unique values. - Low variability can limit model performance. - Consider feature engineering or transformations - (e.g., log transformation, interaction terms).""" + f"Numerical feature '{col}' has fewer than {min_unique_values} unique values. " + "Low feature variability can limit model expressiveness and accuracy. " + "Consider feature engineering or transformations (e.g., log transformation, interaction terms) to increase variability." ) return False elif backend == "pl": polars_df = df # Type narrowing for mypy if isinstance(polars_df, pl.DataFrame): for col in polars_df.columns: - if polars_df[col].dtype in [ - pl.Int32, - pl.Int64, - pl.Float32, - pl.Float64, - ]: + if polars_df[col].dtype in [pl.Int32, pl.Int64, pl.Float32, pl.Float64]: if polars_df[col].n_unique() < min_unique_values: if enable_warnings: warnings.warn( - f"""Numerical feature '{col}' has fewer than - {min_unique_values} unique values. Low feature - variability can limit model expressiveness and accuracy. - Consider feature engineering or transformations - (e.g., log transformation, interaction terms) to - increase variability.""" + f"Numerical feature '{col}' has fewer than {min_unique_values} unique values. " + "Low feature variability can limit model expressiveness and accuracy. " + "Consider feature engineering or transformations (e.g., log transformation, interaction terms) to increase variability." ) return False @@ -357,28 +287,22 @@ def check_numerical_feature_uniqueness( def check_binary_numerical_features( - df: pd.DataFrame | pl.DataFrame | mpd.DataFrame, + df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str = "pl", enable_warnings: bool = False, ) -> bool: - """Check for binary numerical features and suggest conversion to categorical. + """Check if any numerical features are binary and suggest converting them to categorical. - Binary numerical features (i.e., features with only two unique values) are often - better represented as categorical features. This function detects such - features and suggests conversion. Warnings can be triggered depending on - the `enable_warnings` flag. + Binary numerical features (i.e., features with only two unique values) are often better represented as categorical features. + This function detects such features and suggests conversion. Warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. - :type df: pd.DataFrame or pl.DataFrame or mpd.DataFrame - - :param backend: The backend used for processing ('pd', 'pl', 'mpd'), - defaults to "pl" + :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str - - :param enable_warnings: Flag to enable warnings, defaults to False + :param enable_warnings: Flag to enable warnings, defaults to False. :type enable_warnings: bool - - :return: True if no binary numerical features are found, otherwise False + :return: True if no binary numerical features are found, otherwise False. 
:rtype: bool """ validate_backend(backend) @@ -388,13 +312,11 @@ def check_binary_numerical_features( if isinstance(pandas_df, (pd.DataFrame, mpd.DataFrame)): numerical_columns = pandas_df.select_dtypes(include=["number"]).columns for col in numerical_columns: - if pandas_df[col].nunique() == _BINARY_THRESHOLD: + if pandas_df[col].nunique() == 2: if enable_warnings: warnings.warn( - f"""Numerical feature '{col}' has only 2 unique values. - "Binary numerical features should typically be converted - to categorical for better model performance and - interpretability.""" + f"Numerical feature '{col}' has only 2 unique values. " + "Binary numerical features should typically be converted to categorical for better model performance and interpretability." ) return False @@ -402,19 +324,12 @@ def check_binary_numerical_features( polars_df = df # Type narrowing for mypy if isinstance(polars_df, pl.DataFrame): for col in polars_df.columns: - if polars_df[col].dtype in [ - pl.Int32, - pl.Int64, - pl.Float32, - pl.Float64, - ]: - if polars_df[col].n_unique() == _BINARY_THRESHOLD: + if polars_df[col].dtype in [pl.Int32, pl.Int64, pl.Float32, pl.Float64]: + if polars_df[col].n_unique() == 2: if enable_warnings: warnings.warn( - f"""Numerical feature '{col}' has only 2 unique values. - "Binary numerical features should typically be converted - to categorical for better model performance - and interpretability.""" + f"Numerical feature '{col}' has only 2 unique values. " + "Binary numerical features should typically be converted to categorical for better model performance and interpretability." ) return False @@ -422,39 +337,32 @@ def check_binary_numerical_features( def check_class_balance( - df: pd.DataFrame | pl.DataFrame | mpd.DataFrame, + df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], target_col: str, backend: str = "pl", enable_warnings: bool = False, ) -> bool: """Check that classes in a classification dataset are balanced. - This function checks the class distribution in the target column of a classification - dataset. If the ratio between the largest and smallest classes exceeds 1.5, - the dataset is considered imbalanced. Warnings can be triggered depending on the - `enable_warnings` flag. + This function checks the class distribution in the target column of a classification dataset. + If the ratio between the largest and smallest classes exceeds 1.5, the dataset is considered imbalanced. + Warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. - :type df: pd.DataFrame or pl.DataFrame or mpd.DataFrame - + :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] :param target_col: The column containing the target labels. :type target_col: str - - :param backend: The backend used for processing ('pd', 'pl', 'mpd'), - defaults to "pl" + :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str - - :param enable_warnings: Flag to enable warnings, defaults to False + :param enable_warnings: Flag to enable warnings, defaults to False. :type enable_warnings: bool - - :return: True if classes are balanced (ratio <= 1.5), otherwise False + :return: True if classes are balanced (ratio <= 1.5), otherwise False. :rtype: bool - - :raises ValueError: If backend is not supported. + :raises: ValueError if backend is not supported. 
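+
+    .. note::
+        The example below is an illustrative sketch; the column name and class
+        counts are arbitrary.
+
+    .. code-block:: python
+
+        import pandas as pd
+
+        df = pd.DataFrame({"label": [0] * 90 + [1] * 10})
+        # The 90:10 ratio exceeds 1.5, so this returns False and emits a warning.
+        check_class_balance(df, target_col="label", backend="pd", enable_warnings=True)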
""" validate_backend(backend) - class_counts: dict[Any, int] = {} + class_counts: Dict[Any, int] = {} if backend in ["pd", "mpd"]: # Explicitly cast to Pandas/Modin DataFrame @@ -476,13 +384,12 @@ def check_class_balance( count_values = list(class_counts.values()) max_count = max(count_values) min_count = min(count_values) - MIN_RATIO = 1.5 - if max_count / min_count > MIN_RATIO: + if max_count / min_count > 1.5: if enable_warnings: - message = """Classes are imbalanced. Consider using techniques like \n - class weighting, SMOTE, or resampling to address class imbalance.""" - warnings.warn(message) + warnings.warn( + "Classes are imbalanced. Consider using techniques like class weighting, SMOTE, or resampling to address class imbalance." + ) return False return True diff --git a/src/temporalscope/partition/sliding_window.py b/src/temporalscope/partition/sliding_window.py index 2419ae3..78d556e 100644 --- a/src/temporalscope/partition/sliding_window.py +++ b/src/temporalscope/partition/sliding_window.py @@ -1,212 +1,477 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Implements the SlidingWindowPartitioner for creating temporal data partitions. - -This module defines the SlidingWindowPartitioner class, which extends -BaseTemporalPartitioner to divide datasets into non-overlapping partitions using -a sliding window approach. It uses a fixed window size and optional stride to create -partitions, allowing for potential gaps between consecutive partitions. -Each partition can be further divided into train, test, and validation sets -based on user-defined percentages. +""" TemporalScope/temporalscope/partitioning/sliding_window.py + +This module defines the SlidingWindowPartitioner class, a specific implementation of the +TemporalPartitionerProtocol for creating contiguous, non-overlapping partitions using a sliding window mechanism. + +Core Functionality: +------------------- +The SlidingWindowPartitioner divides a dataset into non-overlapping partitions using a fixed window size and +optional stride. The stride determines how far to move between the starting points of consecutive partitions, +which can introduce gaps between them. Each partition can be further split into train, test, and validation sets. + +This class utilizes the generator pattern for memory efficiency, yielding partition indices and data slices one at a time. + +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. """ +from typing import Dict, Tuple, Optional, Union, Iterator +import itertools +import warnings +import pandas as pd +import polars as pl +import modin.pandas as mpd from temporalscope.core.temporal_data_loader import TimeFrame -from temporalscope.partition.base import BaseTemporalPartitioner +from temporalscope.partition.base_protocol import TemporalPartitionerProtocol +from temporalscope.partition.partition_validators import ( + check_sample_size, + check_feature_to_sample_ratio, + check_class_balance, +) + + +class SlidingWindowPartitioner(TemporalPartitionerProtocol): + """Sliding Window Partitioner for dividing time series data into contiguous, non-overlapping partitions. + This class splits a dataset into partitions of a fixed window size. Users can define a stride to introduce gaps + between consecutive partitions. Each partition can be further divided into train, test, and validation sets + based on provided percentages. -class SlidingWindowPartitioner(BaseTemporalPartitioner): - """Divide time series data into contiguous, non-overlapping partitions. + Assumptions: + ------------ + - `train_pct` must be specified. + - `test_pct` is optional, and if not provided, the remaining percentage after `train_pct` will implicitly be assigned to `test_pct`. + - `val_pct` is also optional, and if provided, the sum of `train_pct`, `test_pct`, and `val_pct` must equal 1.0. + - The total of `train_pct`, `test_pct`, and `val_pct` must sum to 1.0 exactly. - This class splits a dataset into partitions of a fixed window size. Users can define - a stride to introduce gaps between consecutive partitions. Each partition can be - further divided into train, test, and validation sets based on provided percentages. + The class uses a generator pattern for `fit` and `transform` methods to yield partition indices and data slices + one at a time, promoting memory efficiency and lazy loading. :param tf: The TimeFrame object containing the data to be partitioned. :type tf: TimeFrame + :param window_size: The size of each partition (number of rows). + :type window_size: Optional[int] + :param stride: The number of rows to skip between the start points of consecutive partitions. + A stride larger than the window size creates gaps, while a stride equal to the window size results in no gaps. + :type stride: int + :param reverse: Whether the sliding window should move in reverse (from the end to the start of the dataset). + If set to True, the window slides in reverse; if False (default), it slides forward. + :type reverse: bool + :param truncate: Whether to truncate the last partition if its size is smaller than the window size. + :type truncate: bool + :param train_pct: Percentage of data allocated for training within each partition. Must be provided. + :type train_pct: float + :param test_pct: Percentage of data allocated for testing within each partition. Optional. + :type test_pct: Optional[float] + :param val_pct: Optional percentage of data allocated for validation within each partition. If provided, the sum of `train_pct`, + `test_pct`, and `val_pct` must equal 1.0. 
+ :type val_pct: Optional[float] + :param enable_warnings: Enable warnings for uneven partition sizes. + :type enable_warnings: bool + :param verbose: If set to True, print partitioning details. + :type verbose: bool + + :raises ValueError: + - If `window_size` is not provided or is not a positive integer. + - If `stride` is not a positive integer. + - If `train_pct`, `test_pct`, or `val_pct` are not within the range [0, 1]. + - If `train_pct`, `test_pct`, and `val_pct` do not sum to 1.0. + - If `train_pct` is provided without `test_pct` or `val_pct` summing to 1.0. + - If the dataset cannot be sorted or retrieved properly from the TimeFrame. + - If any required data is missing or invalid during the partitioning process. + + Example Usage: + -------------- + .. code-block:: python + + import pandas as pd + from temporalscope.core.temporal_data_loader import TimeFrame + from temporalscope.partition.sliding_window import SlidingWindowPartitioner + + # Create a sample dataset using Pandas + data_df = pd.DataFrame({ + 'time': pd.date_range(start='2021-01-01', periods=20, freq='D'), + 'value': range(20) + }) + + # Create a TimeFrame object + data_tf = TimeFrame(data_df, time_col='time', target_col='value', backend='pd') + + # Create a SlidingWindowPartitioner with window_size=5 and stride=5 + partitioner = SlidingWindowPartitioner( + tf=data_tf, window_size=5, stride=5, truncate=True, train_pct=0.8, test_pct=0.2, reverse=False + ) - :param window_size: The size of each partition (number of rows). If not provided, - `num_partitions` is required. - :type window_size: int, optional + # Iterate over partition indices + for partition in partitioner.fit(): + print(partition) - :param num_partitions: The number of partitions to divide the data into. - If `window_size` is not provided, this parameter is used to - split the data evenly. - :type num_partitions: int, optional + # Iterate over data slices for each partition + for partition_data in partitioner.transform(): + print(partition_data) - :param stride: The number of rows to skip between the start points of consecutive - partitions. A stride larger than the window size creates gaps, while - a stride equal to the window size results in no gaps. - :type stride: int, default 1 + Notes + ----- + The sliding window can operate in two modes, depending on the `reverse` parameter: - :param truncate: Whether to truncate the last partition if its size is smaller than - the window size. - :type truncate: bool, default True + .. note:: - :param train_pct: Percentage of data allocated for training within each partition. - :type train_pct: float, default 0.7 + **Forward Sliding Window (reverse=False):** - :param test_pct: Percentage of data allocated for testing within each partition. - :type test_pct: float, optional, default 0.2 + The sliding window starts from the beginning of the dataset and moves forward. - :param val_pct: Percentage of data allocated for validation within each partition. - :type val_pct: float, optional, default 0.1 + Example: - :param enable_warnings: Enable warnings for uneven partition sizes. - :type enable_warnings: bool, default False + .. code-block:: text - :raises ValueError: If neither `window_size` nor `num_partitions` is provided, or if - train, test, and validation percentages do not sum to 1.0. + Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ] + Window Size: 4, Stride: 3 - :example: - Create a sample dataset and partition it using `SlidingWindowPartitioner`. 
+ Window 1: [ 1, 2, 3, 4 ] + Window 2: [ 4, 5, 6, 7 ] + Window 3: [ 7, 8, 9, 10 ] - .. code-block:: python + .. seealso:: - import pandas as pd - from temporalscope.core.temporal_data_loader import TimeFrame - from temporalscope.partition.sliding_window import SlidingWindowPartitioner + **Reverse Sliding Window (reverse=True):** - # Create a sample dataset using Pandas - data = pd.DataFrame( - { - "time": pd.date_range(start="2021-01-01", periods=20, freq="D"), - "value": range(20), - } - ) + The sliding window starts from the end of the dataset and moves backward. - # Create a TimeFrame object - tf = TimeFrame(data, time_col="time", target_col="value", backend="pd") - - # Create a SlidingWindowPartitioner with window_size=5 and stride=5 - partitioner = SlidingWindowPartitioner( - tf=tf, - window_size=5, - stride=5, - truncate=True, - train_pct=0.6, - test_pct=0.3, - val_pct=0.1, - ) + Example: - # Retrieve the partition indices - partitions = partitioner.get_partition_indices() - print(partitions) + .. code-block:: text - # Retrieve the actual data slices for each partition - partitioned_data = partitioner.get_partition_data() - print(partitioned_data) + Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ] + Window Size: 4, Stride: 3 + + Window 1: [ 7, 8, 9, 10 ] + Window 2: [ 4, 5, 6, 7 ] + Window 3: [ 1, 2, 3, 4 ] """ def __init__( self, tf: TimeFrame, - window_size: int | None = None, - num_partitions: int | None = None, + window_size: Optional[int] = None, stride: int = 1, + reverse: bool = False, truncate: bool = True, train_pct: float = 0.7, - test_pct: float | None = 0.2, - val_pct: float | None = 0.1, + test_pct: Optional[float] = 0.2, + val_pct: Optional[float] = None, enable_warnings: bool = False, + verbose: bool = False, ): - """Initialize the SlidingWindowPartitioner with a TimeFrame.""" - super().__init__(tf, enable_warnings=enable_warnings) + if window_size is None or window_size <= 0: + raise ValueError("`window_size` must be a positive integer.") + if stride <= 0: + raise ValueError("`stride` must be a positive integer.") + if not (0 <= train_pct <= 1): + raise ValueError("`train_pct` must be between 0 and 1.") + if test_pct is not None and not (0 <= test_pct <= 1): + raise ValueError("`test_pct` must be between 0 and 1.") + if val_pct is not None and not (0 <= val_pct <= 1): + raise ValueError("`val_pct` must be between 0 and 1.") + if train_pct + (test_pct or 0) + (val_pct or 0) != 1.0: + raise ValueError("Train, test, and validation percentages must sum to 1.0.") + + self.tf = tf # Use TimeFrame directly self.window_size = window_size - self.num_partitions = num_partitions self.stride = stride + self.reverse = reverse self.truncate = truncate - self.train_pct = train_pct - self.test_pct = test_pct - self.val_pct = val_pct + self.verbose = verbose + self.train_pct, self.test_pct, self.val_pct = self._precompute_percentages( + train_pct, test_pct, val_pct + ) - # Check that train, test, and validation percentages sum to 1 - if not ( - 0 <= train_pct <= 1 and (test_pct or 0) + (val_pct or 0) + train_pct == 1.0 - ): - raise ValueError("Train, test, and validation percentages must sum to 1.") + # Sort data by time column using TimeFrame method + self.tf.sort_data(ascending=True) + + self._fit_executed = False + self._transform_executed = False + + def _precompute_percentages( + self, train_pct: float, test_pct: Optional[float], val_pct: Optional[float] + ) -> Tuple[float, Optional[float], Optional[float]]: + """Calculate and validate the percentages for train, test, and validation 
splits. + + This method checks that the provided percentages for training, testing, and validation + add up to 100%. It ensures that if a validation percentage is specified, both training + and testing percentages are also provided. The method also prints out the calculated + percentages if `verbose` mode is enabled. - if not window_size and not num_partitions: + :param train_pct: The percentage of data allocated for training within each partition. + :type train_pct: float + :param test_pct: The percentage of data allocated for testing within each partition. If not provided, + it defaults to 1.0 minus `train_pct` and `val_pct`. + :type test_pct: Optional[float] + :param val_pct: The percentage of data allocated for validation within each partition, if any. + :type val_pct: Optional[float] + + :return: A tuple containing the validated percentages for training, testing, and validation. + :rtype: Tuple[float, Optional[float], Optional[float]] + + :raises ValueError: If the sum of `train_pct`, `test_pct`, and `val_pct` does not equal 100%, or + if `val_pct` is specified without both `train_pct` and `test_pct`. + """ + total_pct = (train_pct or 0) + (test_pct or 0) + (val_pct or 0) + if total_pct != 1.0: + raise ValueError("Train, test, and validation percentages must sum to 1.0.") + if val_pct is not None and (train_pct is None or test_pct is None): raise ValueError( - "Either `window_size` or `num_partitions` must be specified." + "Validation percentage requires both train and test percentages to be provided." ) + if self.verbose: + print(f"Train percentage: {train_pct}") + print(f"Test percentage: {test_pct}") + print(f"Validation percentage: {val_pct}") + return train_pct, test_pct, val_pct - # Sort the data using the TimeFrame class - self.tf.sort_data(ascending=True) - - def _calculate_window_size(self, num_rows: int) -> int: - """Calculate window size based on the number of partitions or provided value. + def _validate_partitioning(self, num_rows: int, window_size: int) -> None: + """Validate the feasibility of partitioning the dataset with the given window size and stride. - If `self.num_partitions` is set, we calculate the window size by dividing the - total number of rows by the number of partitions, ensuring the window size is - at least 1. If `self.window_size` is provided, it will be used directly. - Otherwise, a fallback value will be provided. + This method checks if the dataset can be properly partitioned based on the provided `window_size` and `stride`. + It ensures that: + - The stride is not larger than the window size, which would cause partitions to be skipped. + - The stride is a positive integer. + - The dataset has enough rows to create at least one partition. :param num_rows: The total number of rows in the dataset. :type num_rows: int - - :return: The calculated window size. - :rtype: int + :param window_size: The window size to be used for each partition. + :type window_size: int + :raises ValueError: If partitioning is not possible due to any of the following conditions: + - The stride is larger than the window size. + - The stride is not a positive integer. + - The dataset is too small to create even a single partition with the given window size and stride. """ - # calculate window size based on number of partitions - if self.num_partitions: - # Ensure the window size is at least 1 (to avoid windows of size 0) - return max(1, num_rows // self.num_partitions) - - # If no partitions are specified, return the predefined window size. 
- # If `self.window_size` is None, use fallback (e.g., num_rows // 10 as default) - return ( - self.window_size if self.window_size is not None else max(1, num_rows // 10) - ) + # Ensure the stride is not larger than the window size + if self.stride > window_size: + raise ValueError( + f"Stride ({self.stride}) is larger than the window size ({window_size}). " + "This would cause partitions to be skipped." + ) + + # Ensure the stride is a positive integer + if self.stride <= 0: + raise ValueError("Stride must be a positive integer.") - def get_partition_indices(self) -> dict[str, dict[str, tuple[int, int]]]: - """Generate partition indices based on the window size or number of partitions. + # Calculate the number of possible partitions + num_possible_partitions = (num_rows - window_size) // self.stride + 1 - :return: Dictionary of partitions with indices for 'full', 'train', 'test', - and optionally 'validation'. - :rtype: Dict[str, Dict[str, Tuple[int, int]]] + # Ensure there are enough rows in the dataset for at least one partition + if num_possible_partitions < 1: + raise ValueError( + f"Not enough rows ({num_rows}) to create partitions with window size {window_size} " + f"and stride {self.stride}. Try reducing the number of partitions or adjusting the window size and stride." + ) + + # Print validation success message if verbose mode is enabled + if self.verbose: + print( + f"Partitioning validated: {num_possible_partitions} possible partitions." + ) + + def _get_data_shape(self) -> Tuple[int, int]: + """Get the number of rows and features from the dataset, ensuring compatibility with different backends. + + :return: A tuple containing the number of rows and features in the dataset. + :rtype: Tuple[int, int] + :raises ValueError: If the backend is unsupported. """ - num_rows = self.df.shape[0] - window_size = self._calculate_window_size(num_rows) - partitions = {} + backend = self.tf.backend # Access the backend from the TimeFrame object + if backend in ["pd", "mpd"]: + num_rows, num_features = self.df.shape + elif backend == "pl": + num_rows = self.df.height + num_features = self.df.width + else: + raise ValueError(f"Unsupported backend: {backend}") + return num_rows, num_features + + def fit(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: + """Generate partition indices for the dataset, lazily yielding them one at a time. + + This method divides the dataset into partitions based on the specified window size and stride. + It generates indices for the entire partition as well as for the training, testing, and validation splits + within each partition. + + The method operates in a memory-efficient manner, generating and yielding each partition's indices + only when needed. + + :yield: A dictionary where each key corresponds to a partition (e.g., 'partition_1'), and the value is another + dictionary with keys 'full', 'train', 'test', and optionally 'validation', each mapping to a tuple of indices. + :rtype: Iterator[Dict[str, Dict[str, Tuple[int, int]]]] + :raises ValueError: If `window_size` is larger than the dataset or if the total number of partitions is insufficient. 
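+
+        .. note::
+            The example below is an illustrative sketch of a single yielded item for a
+            5-row window with ``train_pct=0.8`` and ``test_pct=0.2`` and no validation
+            split; exact indices depend on your data length, stride, and truncation.
+
+        :example:
+
+        .. code-block:: python
+
+            {
+                "partition_1": {
+                    "full": (0, 5),
+                    "train": (0, 4),
+                    "test": (4, 5),
+                    "validation": (0, 0),
+                }
+            }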
+ """ + num_rows, _ = ( + self._get_data_shape() + ) # Retrieve the shape using backend-specific method + window_size = self.window_size + + # Validate that the partitioning is possible with the given window size and stride + self._validate_partitioning(num_rows, window_size) + partition_count = 1 - for start in range(0, num_rows, self.stride): + # Ensure start_range is always a list to avoid type conflicts + start_range = list(range(0, num_rows, self.stride)) + if self.reverse: + start_range.reverse() + + # Iterate over the dataset to generate partition indices + for start in start_range: end = start + window_size + # Adjust the end if it exceeds the number of rows and truncate is False if end > num_rows: if self.truncate: - break - end = num_rows # Include remaining data if truncate is False + break # Stop iteration if the last partition is smaller than the window size and truncate is True + end = num_rows # Adjust to include the remaining data + # Compute the split points for train, test, and validation train_end = start + int(self.train_pct * (end - start)) test_end = ( train_end + int(self.test_pct * (end - start)) if self.test_pct else train_end ) - validation_end = end if self.test_pct else train_end - - partitions[f"partition_{partition_count}"] = { - "full": (start, end), - "train": (start, train_end), - "test": (train_end, test_end), - "validation": (test_end, validation_end), + validation_end = end if self.val_pct else test_end + + # Yield the partition indices + yield { + f"partition_{partition_count}": { + "full": (start, end), + "train": (start, train_end), + "test": (train_end, test_end), + "validation": ( + (test_end, validation_end) if self.val_pct else (0, 0) + ), + } } + + # If verbose is enabled, print details of the current partition + if self.verbose: + print(f"Partition {partition_count}: {start} to {end}") + print( + f"Training: {start} to {train_end}, Testing: {train_end} to {test_end}" + ) + partition_count += 1 - return partitions + # Track that fit has been run + self._fit_executed = True + + def transform( + self, + ) -> Iterator[ + Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]] + ]: + """Generate and yield the data slices for each partition. + + This method utilizes the partition indices generated by the `fit` method to extract and return + the corresponding data slices from the original dataset. The data is returned for each partition, + including the full partition as well as the training, testing, and validation subsets. + + The method is designed to be memory-efficient, generating and yielding each partition's data + only when required. + + :yield: A dictionary where each key corresponds to a partition (e.g., 'partition_1'), and the value is another + dictionary with keys 'full', 'train', 'test', and optionally 'validation', each mapping to a DataFrame slice. + :rtype: Iterator[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]]] + :raises ValueError: If data slicing fails for any partition, which could occur if the indices are out of bounds. 
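+
+        .. note::
+            Illustrative usage sketch; ``data_tf`` is assumed to be a ``TimeFrame``
+            constructed as in the class-level example above.
+
+        :example:
+
+        .. code-block:: python
+
+            partitioner = SlidingWindowPartitioner(
+                tf=data_tf, window_size=5, stride=5, train_pct=0.8, test_pct=0.2
+            )
+            for partition_data in partitioner.transform():
+                for name, splits in partition_data.items():
+                    print(name, splits["train"].shape)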
+ """ + # Generate partition indices using the fit method + for partition in self.fit(): + partitioned_data = {} + + # Iterate over each partition and its corresponding indices + for key, partition_dict in partition.items(): + partitioned_data[key] = { + # Slice the data using the appropriate backend method (pandas, Modin, or Polars) + part_name: ( + self.df.iloc[start:end] + if isinstance(self.df, (pd.DataFrame, mpd.DataFrame)) + else self.df.slice(start, end - start) + ) # Polars-specific slicing + for part_name, (start, end) in partition_dict.items() + if start is not None + and end is not None # Ensure valid start and end indices + } + + # Yield the partitioned data + yield partitioned_data + + # Track that transform has been run + self._transform_executed = True + + def check_data(self, partition_index: Optional[int] = None) -> None: + """Perform data checks on the entire TimeFrame or a specific partition. + + This method validates whether the dataset or a specific partition meets + recommended criteria based on sample size, feature-to-sample ratio, and class balance. + + - If `partition_index` is provided, checks are performed on the specified partition. + - If `partition_index` is None, checks are performed on the entire TimeFrame. + + Assumptions: + ------------ + - If the method is called without running `fit`, it checks the full dataset. + - If `fit` has been run and `partition_index` is provided, it checks the specific partition. + + Warnings are raised instead of errors to allow users to proceed with caution. + + :param partition_index: Index of the partition to check, or None to check the entire dataset. + :type partition_index: Optional[int] + """ + if partition_index is not None: + # Generate the required partition directly without assuming prior fit() call + partition = next(itertools.islice(self.fit(), partition_index, None)) + start, end = partition[f"partition_{partition_index + 1}"]["full"] + df_to_check = self.df[start:end] + context = f"Partition {partition_index + 1}" + min_samples = 100 # Lower threshold for partitions + else: + df_to_check = self.df + context = "Full dataset" + min_samples = 3000 # Higher threshold for the full dataset + + num_rows, num_features = df_to_check.shape + target_col = self.tf.target_col + + # Perform checks with warnings enabled + check_sample_size( + df_to_check, + backend=self.tf.backend, + min_samples=min_samples, + max_samples=100000, # Standard large threshold + enable_warnings=True, + ) + + check_feature_to_sample_ratio( + df_to_check, + backend=self.tf.backend, + max_ratio=0.2, # Standard ratio for features to samples + enable_warnings=True, + ) + + if target_col: + check_class_balance( + df_to_check, + target_col=target_col, + backend=self.tf.backend, + enable_warnings=True, + ) + + if self.verbose: + print(f"{context} checks completed with warnings where applicable.") diff --git a/test/unit/test_core_conf.py b/test/unit/test_core_conf.py deleted file mode 100644 index 126cdf2..0000000 --- a/test/unit/test_core_conf.py +++ /dev/null @@ -1,104 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from unittest.mock import patch - -import modin.pandas as mpd -import pandas as pd -import polars as pl -import pytest - -from temporalscope.conf import ( - get_api_keys, - get_default_backend_cfg, - validate_backend, - validate_input, -) - -# Define mock API key constants to make it clear these are not real secrets -MOCK_OPENAI_API_KEY = "mock_openai_key" -MOCK_CLAUDE_API_KEY = "mock_claude_key" - - -def test_get_api_keys(): - """Test that get_api_keys retrieves environment variables correctly.""" - # Mock environment variables using the defined constants - with patch.dict( - "os.environ", - { - "OPENAI_API_KEY": MOCK_OPENAI_API_KEY, - "CLAUDE_API_KEY": MOCK_CLAUDE_API_KEY, - }, - ): - api_keys = get_api_keys() - assert api_keys["OPENAI_API_KEY"] == MOCK_OPENAI_API_KEY - assert api_keys["CLAUDE_API_KEY"] == MOCK_CLAUDE_API_KEY - - # Test when no environment variables are set - with patch.dict("os.environ", {}, clear=True): - api_keys = get_api_keys() - assert api_keys["OPENAI_API_KEY"] is None - assert api_keys["CLAUDE_API_KEY"] is None - - -def test_get_default_backend_cfg(): - """Test that the default backend configuration is returned correctly.""" - expected_cfg = { - "BACKENDS": {"pl": "polars", "pd": "pandas", "mpd": "modin"}, - } - result = get_default_backend_cfg() - assert result == expected_cfg - - -@pytest.mark.parametrize("backend", ["pl", "pd", "mpd"]) -def test_validate_backend_supported(backend): - """Test that supported backends are validated successfully.""" - validate_backend(backend) - - -@pytest.mark.parametrize("invalid_backend", ["tf", "spark", "unknown"]) -def test_validate_backend_unsupported(invalid_backend): - """Test that unsupported backends raise a ValueError.""" - with pytest.raises(ValueError, match="Unsupported backend"): - validate_backend(invalid_backend) - - -@pytest.mark.parametrize( - "backend, df", - [ - ("pl", pl.DataFrame({"a": [1, 2, 3]})), # Polars DataFrame - ("pd", pd.DataFrame({"a": [1, 2, 3]})), # Pandas DataFrame - ("mpd", mpd.DataFrame({"a": [1, 2, 3]})), # Modin DataFrame - ], -) -def test_validate_input_valid(backend, df): - """Test that valid DataFrame types are accepted based on the backend.""" - validate_input(df, backend) - - -@pytest.mark.parametrize( - "backend, df", - [ - ("pl", pd.DataFrame({"a": [1, 2, 3]})), # Invalid Polars input - ("pd", pl.DataFrame({"a": [1, 2, 3]})), # Invalid Pandas input - ("mpd", pd.DataFrame({"a": [1, 2, 3]})), # Invalid Modin input - ], -) -def test_validate_input_invalid(backend, df): - """Test that invalid DataFrame types raise a TypeError based on the backend.""" - with pytest.raises(TypeError, match="Expected a .* DataFrame"): - validate_input(df, backend) diff --git a/test/unit/test_core_temporal_data_loader.py b/test/unit/test_core_temporal_data_loader.py index b51d3eb..72da3aa 100644 --- a/test/unit/test_core_temporal_data_loader.py +++ b/test/unit/test_core_temporal_data_loader.py @@ -1,124 +1,183 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. +""" TemporalScope/test/unit/test_core_temporal_data_loader.py -import modin.pandas as mpd -import pandas as pd -import polars as pl -import pytest +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -from temporalscope.core.temporal_data_loader import TimeFrame + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" -@pytest.fixture -def sample_pandas_df(): - """Fixture for creating a sample Pandas DataFrame.""" - return pd.DataFrame( - { - "time": pd.date_range(start="2021-01-01", periods=10, freq="D"), - "value": range(10), - } - ) - - -@pytest.fixture -def sample_polars_df(): - """Fixture for creating a sample Polars DataFrame.""" - end_date = pl.Series( - "time", pd.date_range(start="2021-01-01", periods=10, freq="D") - ) - return pl.DataFrame({"time": end_date, "value": range(10)}) - - -@pytest.fixture -def sample_modin_df(): - """Fixture for creating a sample Modin DataFrame.""" - return mpd.DataFrame( - { - "time": pd.date_range(start="2021-01-01", periods=10, freq="D"), - "value": range(10), - } - ) - - -def test_initialize_pandas(sample_pandas_df): - """Test TimeFrame initialization with Pandas backend.""" - tf = TimeFrame(sample_pandas_df, time_col="time", target_col="value", backend="pd") - if tf.backend != "pd" or tf.time_col != "time" or tf.target_col != "value": - pytest.fail("Initialization with Pandas backend failed.") - - -def test_initialize_polars(sample_polars_df): - """Test TimeFrame initialization with Polars backend.""" - tf = TimeFrame(sample_polars_df, time_col="time", target_col="value", backend="pl") - if tf.backend != "pl" or tf.time_col != "time" or tf.target_col != "value": - pytest.fail("Initialization with Polars backend failed.") - - -def test_initialize_modin(sample_modin_df): - """Test TimeFrame initialization with Modin backend.""" - tf = TimeFrame(sample_modin_df, time_col="time", target_col="value", backend="mpd") - if tf.backend != "mpd" or tf.time_col != "time" or tf.target_col != "value": - pytest.fail("Initialization with Modin backend failed.") - - -def test_invalid_backend(sample_pandas_df): - """Test TimeFrame with an invalid backend.""" - with pytest.raises(ValueError): - TimeFrame( - sample_pandas_df, - time_col="time", - target_col="value", - backend="invalid_backend", - ) - - -def test_missing_columns(sample_pandas_df): - """Test TimeFrame initialization with missing required columns.""" - # Missing time column - with 
pytest.raises(ValueError): - TimeFrame( - sample_pandas_df.drop(columns=["time"]), - time_col="time", - target_col="value", - backend="pd", - ) - # Missing target column - with pytest.raises(ValueError): - TimeFrame( - sample_pandas_df.drop(columns=["value"]), - time_col="time", - target_col="value", - backend="pd", - ) - - -def test_duplicate_time_entries(sample_pandas_df): - """Test handling of duplicate time entries.""" - sample_pandas_df.loc[1, "time"] = sample_pandas_df.loc[0, "time"] - tf = TimeFrame(sample_pandas_df, time_col="time", target_col="value", backend="pd") - with pytest.raises( - ValueError, match="Duplicate time entries found within the same group." - ): - tf.check_duplicates() - - -def test_get_data(sample_pandas_df): - """Test get_data method.""" - tf = TimeFrame(sample_pandas_df, time_col="time", target_col="value", backend="pd") - df = tf.get_data() - if not isinstance(df, pd.DataFrame) or df.shape != sample_pandas_df.shape: - pytest.fail("get_data method failed to return correct DataFrame.") +import pytest +import numpy as np +import polars as pl +import pandas as pd +import modin.pandas as mpd +from temporalscope.core.temporal_data_loader import TimeFrame +from typing import Union, Dict, Any, List +from datetime import date, timedelta + +def create_sample_data(num_samples: int = 100, num_features: int = 3) -> Dict[str, Union[List[date], List[float], List[str]]]: + """Create a sample data dictionary representative of a time series ML dataset.""" + start_date = date(2021, 1, 1) + + data = { + "time": [start_date + timedelta(days=i) for i in range(num_samples)], + "id": [f"ID_{i%3}" for i in range(num_samples)], # 3 different IDs cycling + } + + # Add feature columns + for i in range(num_features): + data[f"feature_{i+1}"] = np.random.rand(num_samples).tolist() + + # Add a target column (let's assume it's a function of the features plus some noise) + data["target"] = [sum(data[f"feature_{j+1}"][i] for j in range(num_features)) + np.random.normal(0, 0.1) + for i in range(num_samples)] + + return data + +@pytest.fixture(params=["pd", "pl", "mpd"]) +def sample_df(request): + """Fixture for creating sample DataFrames for each backend.""" + data = create_sample_data() + if request.param == "pd": + return pd.DataFrame(data) + elif request.param == "pl": + return pl.DataFrame(data) + elif request.param == "mpd": + return mpd.DataFrame(data) + +@pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +def test_initialize(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): + """Test TimeFrame initialization with various backends.""" + tf = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) + + assert tf.backend == backend + assert tf.time_col == "time" + assert tf.target_col == "target" + assert tf.id_col is None + + if backend == "pd": + assert isinstance(tf.get_data(), pd.DataFrame) + elif backend == "pl": + assert isinstance(tf.get_data(), pl.DataFrame) + elif backend == "mpd": + assert isinstance(tf.get_data(), mpd.DataFrame) + +# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +# def test_initialize_with_id(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): +# """Test TimeFrame initialization with ID column.""" +# tf = TimeFrame(sample_df, time_col="time", target_col="target", id_col="id", backend=backend) + +# assert tf.id_col == "id" + +# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +# def test_validate_columns(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], 
backend: str): +# """Test column validation.""" +# tf = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) +# tf.validate_columns() # Should not raise an error + +# with pytest.raises(ValueError): +# TimeFrame(sample_df, time_col="non_existent", target_col="target", backend=backend) + + +# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +# def test_get_data(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): +# """Test get_data method.""" +# tf = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) +# assert tf.get_data().shape == sample_df.shape + + + +# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +# def test_update_data(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): +# """Test update_data method.""" +# tf = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) + +# new_data = create_sample_data() +# new_data["target"] = [x * 2 for x in new_data["target"]] # Double the target values + +# if backend == "pd": +# new_df = pd.DataFrame(new_data) +# elif backend == "pl": +# new_df = pl.DataFrame(new_data) +# else: +# new_df = mpd.DataFrame(new_data) + +# tf.update_data(new_df) + +# if backend == "pl": +# assert tf.get_data()["target"].to_list() == new_df["target"].to_list() +# else: +# assert tf.get_data()["target"].tolist() == new_df["target"].tolist() + +# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +# def test_update_target_col(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): +# """Test update_target_col method.""" +# tf = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) + +# new_target = [x * 3 for x in range(100)] # Triple the values + +# if backend == "pd": +# new_target_series = pd.Series(new_target) +# elif backend == "pl": +# new_target_series = pl.Series(new_target) +# else: +# new_target_series = mpd.Series(new_target) + +# tf.update_target_col(new_target_series) + +# if backend == "pl": +# assert tf.get_data()["target"].to_list() == new_target +# else: +# assert tf.get_data()["target"].tolist() == new_target + + +# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +# def test_sort_data(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): +# tf = TimeFrame(sample_df, time_col="time", target_col="target", id_col="id", backend=backend, sort=True) +# sorted_df = tf.get_data() + +# if backend == "pl": +# time_values = sorted_df["time"].to_list() +# else: +# time_values = sorted_df["time"].tolist() + +# assert time_values == sorted(time_values) + +# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +# def test_get_grouped_data(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): +# tf = TimeFrame(sample_df, time_col="time", target_col="target", id_col="id", backend=backend) +# grouped_data = tf.get_grouped_data() + +# if backend == "pl": +# assert grouped_data.shape[0] == len(set(sample_df["id"].to_list())) +# else: +# assert grouped_data.shape[0] == len(set(sample_df["id"].tolist())) + +# with pytest.raises(ValueError): +# tf_without_id = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) +# tf_without_id.get_grouped_data() + + +# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +# def test_check_duplicates(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): +# tf = TimeFrame(sample_df, time_col="time", target_col="target", id_col="id", backend=backend) +# 
tf.check_duplicates() # Should not raise an error + +# # Create a DataFrame with duplicates +# duplicate_data = sample_df.copy() +# if backend == "pl": +# duplicate_data = duplicate_data.with_columns(pl.col("time").shift(-1)) +# else: +# duplicate_data.loc[1:, "time"] = duplicate_data.loc[:98, "time"].values + +# tf_with_duplicates = TimeFrame(duplicate_data, time_col="time", target_col="target", id_col="id", backend=backend) + +# with pytest.raises(ValueError): +# tf_with_duplicates.check_duplicates() diff --git a/test/unit/test_core_temporal_target_shifter.py b/test/unit/test_core_temporal_target_shifter.py new file mode 100644 index 0000000..d28f9cf --- /dev/null +++ b/test/unit/test_core_temporal_target_shifter.py @@ -0,0 +1,123 @@ +""" TemporalScope/test/unit/test_core_temporal_target_shifter.py + +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import modin.pandas as mpd +import pandas as pd +import polars as pl +import pytest +from temporalscope.core.temporal_target_shifter import TemporalTargetShifter +from temporalscope.core.temporal_data_loader import TimeFrame +from typing import Union + +# Test DataFrames +pd_df = pd.DataFrame({ + "time": pd.date_range(start="2023-01-01", periods=5, freq="D"), + "target": [10, 20, 30, 40, 50], +}) + +pl_df = pl.DataFrame({ + "time": ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"], + "target": [10, 20, 30, 40, 50], +}) + +mpd_df = mpd.DataFrame({ + "time": pd.date_range(start="2023-01-01", periods=5, freq="D"), + "target": [10, 20, 30, 40, 50], +}) + +@pytest.mark.parametrize("backend, df", [ + ("pd", pd_df), + ("pl", pl_df), + ("mpd", mpd_df), +]) +def test_shift_target_scalar_output(backend: str, df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame]) -> None: + """Test shifting target to scalar output for each backend.""" + tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) + shifter = TemporalTargetShifter(shift_steps=1, array_output=False) + tf_transformed = shifter.transform(tf) + + expected_target = [20, 30, 40, 50, None] + + if backend == "pl": + actual_target = tf_transformed.get_data()["target_shift_1"].to_list() + else: + actual_target = tf_transformed.get_data()["target_shift_1"].tolist() + + assert actual_target == expected_target[:-1] # Comparing excluding the last item due to `None` handling + +@pytest.mark.parametrize("backend, df", [ + ("pd", pd_df), + ("pl", pl_df), + ("mpd", mpd_df), +]) +def test_shift_target_array_output(backend: str, df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame]) -> None: + """Test shifting target to array output for each backend.""" + tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) + shifter = TemporalTargetShifter(shift_steps=2, array_output=True) + tf_transformed = shifter.transform(tf) + + expected_target_array = [[20, 30], [30, 40], [40, 50], [50, None], [None, None]] + + if backend == "pl": + actual_target = tf_transformed.get_data()["target_array_2"].to_list() + else: + actual_target = 
tf_transformed.get_data()["target_array_2"].tolist() + + assert actual_target == expected_target_array + +@pytest.mark.parametrize("backend, df", [ + ("pd", pd_df), + ("pl", pl_df), + ("mpd", mpd_df), +]) +def test_shift_target_with_nonstandard_names(backend: str, df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame]) -> None: + """Test shifting target with non-standardized names.""" + tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) + shifter = TemporalTargetShifter(shift_steps=1, array_output=False) + tf_transformed = shifter.transform(tf) + + expected_target = [20, 30, 40, 50, None] + + if backend == "pl": + actual_target = tf_transformed.get_data()["target_shift_1"].to_list() + else: + actual_target = tf_transformed.get_data()["target_shift_1"].tolist() + + assert actual_target == expected_target[:-1] + +@pytest.mark.parametrize("backend, df", [ + ("pd", pd_df), + ("pl", pl_df), + ("mpd", mpd_df), +]) +def test_shift_target_invalid_backend(backend: str, df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame]) -> None: + """Test shifting target with an invalid backend.""" + tf = TimeFrame(df, time_col="time", target_col="target", backend="invalid_backend") + shifter = TemporalTargetShifter(shift_steps=1, array_output=False) + with pytest.raises(ValueError, match="Unsupported backend"): + shifter.transform(tf) + +@pytest.mark.parametrize("backend, df", [ + ("pd", pd_df), + ("pl", pl_df), + ("mpd", mpd_df), +]) +def test_shift_target_type_error(backend: str, df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame]) -> None: + """Test shifting target with an incorrect DataFrame type.""" + # Intentionally using an incorrect type (dictionary) instead of a DataFrame + with pytest.raises(TypeError): + tf = TimeFrame(df.to_dict(), time_col="time", target_col="target", backend=backend) + shifter = TemporalTargetShifter(shift_steps=1, array_output=False) + shifter.transform(tf) diff --git a/test/unit/test_core_utils.py b/test/unit/test_core_utils.py index 6bc6299..f5cf819 100644 --- a/test/unit/test_core_utils.py +++ b/test/unit/test_core_utils.py @@ -1,31 +1,132 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. +""" TemporalScope/test/unit/test_core_utils.py -import warnings +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
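For intuition about the scalar case exercised in these tests, the shift can be written directly in pandas; this is only a sketch of the expected semantics, with the `target_shift_1` name mirroring the column asserted above.

```python
import pandas as pd

# A 1-step shift pairs each row with the next observation's target; the last
# row has no future value and becomes NaN, which is why the assertions above
# drop the final element.
df = pd.DataFrame({
    "time": pd.date_range("2023-01-01", periods=5, freq="D"),
    "target": [10, 20, 30, 40, 50],
})
df["target_shift_1"] = df["target"].shift(-1)
print(df["target_shift_1"].tolist())  # [20.0, 30.0, 40.0, 50.0, nan]
```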
+""" +from unittest.mock import patch import modin.pandas as mpd import pandas as pd import polars as pl import pytest - -from temporalscope.core.utils import check_nans, check_nulls, print_divider +from temporalscope.core.core_utils import ( + get_api_keys, + get_default_backend_cfg, + validate_backend, + validate_input, + validate_and_convert_input, + check_nans, + check_nulls, + print_divider +) +import warnings warnings.filterwarnings("ignore", message=".*defaulting to pandas.*") +# Define mock API key constants +MOCK_OPENAI_API_KEY = "mock_openai_key" +MOCK_CLAUDE_API_KEY = "mock_claude_key" + +def create_sample_data(): + """Create a sample data dictionary.""" + return {"a": [1, 2, 3]} + +@pytest.fixture(params=["pd", "pl", "mpd"]) +def sample_df(request): + """Fixture for creating sample DataFrames for each backend.""" + data = create_sample_data() + backend = request.param + if backend == "pd": + return pd.DataFrame(data), backend + elif backend == "pl": + return pl.DataFrame(data), backend + elif backend == "mpd": + return mpd.DataFrame(data), backend + +def test_get_api_keys(): + """Test that get_api_keys retrieves environment variables correctly.""" + with patch.dict("os.environ", {"OPENAI_API_KEY": MOCK_OPENAI_API_KEY, "CLAUDE_API_KEY": MOCK_CLAUDE_API_KEY}): + api_keys = get_api_keys() + assert api_keys["OPENAI_API_KEY"] == MOCK_OPENAI_API_KEY + assert api_keys["CLAUDE_API_KEY"] == MOCK_CLAUDE_API_KEY + + with patch.dict("os.environ", {}, clear=True): + api_keys = get_api_keys() + assert api_keys["OPENAI_API_KEY"] is None + assert api_keys["CLAUDE_API_KEY"] is None + +def test_get_default_backend_cfg(): + """Test that the default backend configuration is returned correctly.""" + expected_cfg = {"BACKENDS": {"pl": "polars", "pd": "pandas", "mpd": "modin"}} + result = get_default_backend_cfg() + assert result == expected_cfg + +@pytest.mark.parametrize("backend", ["pl", "pd", "mpd"]) +def test_validate_backend_supported(backend): + """Test that supported backends are validated successfully.""" + validate_backend(backend) + +@pytest.mark.parametrize("invalid_backend", ["tf", "spark", "unknown"]) +def test_validate_backend_unsupported(invalid_backend): + """Test that unsupported backends raise a ValueError.""" + with pytest.raises(ValueError, match="Unsupported backend"): + validate_backend(invalid_backend) + +def test_validate_input_valid(sample_df): + """Test that valid DataFrame types are accepted based on the backend.""" + df, backend = sample_df + validate_input(df, backend) + +@pytest.mark.parametrize("backend, df_type", [ + ("pl", pd.DataFrame), ("pd", pl.DataFrame), ("mpd", pd.DataFrame) +]) +def test_validate_input_invalid(backend, df_type): + """Test that invalid DataFrame types raise a TypeError based on the backend.""" + with pytest.raises(TypeError, match="Expected a .* DataFrame"): + validate_input(df_type(create_sample_data()), backend) + +@pytest.mark.parametrize("output_backend", ["pd", "pl", "mpd"]) +def test_validate_and_convert_input(sample_df, output_backend): + """Test that validate_and_convert_input correctly converts DataFrames.""" + df, input_backend = sample_df + result = validate_and_convert_input(df, output_backend) + + if output_backend == "pd": + assert isinstance(result, pd.DataFrame) + elif output_backend == "pl": + assert isinstance(result, pl.DataFrame) + elif output_backend == "mpd": + assert isinstance(result, mpd.DataFrame) + + assert result.shape == df.shape + + if output_backend == "pl": + assert result["a"].to_list() == [1, 2, 3] + else: + 
assert result["a"].tolist() == [1, 2, 3] + +def test_validate_and_convert_input_invalid_backend(sample_df): + """Test that an invalid backend raises a ValueError.""" + df, _ = sample_df + with pytest.raises(ValueError, match="Unsupported backend"): + validate_and_convert_input(df, "invalid_backend") + +def test_validate_and_convert_input_invalid_df_type(): + """Test that an invalid DataFrame type raises a TypeError.""" + with pytest.raises(TypeError, match="Input DataFrame type .* does not match the specified backend"): + validate_and_convert_input({"a": [1, 2, 3]}, "pd") # Not a DataFrame + + + # Test data for check_nulls test_nulls_data = [ diff --git a/test/unit/test_partion_data_checks.py b/test/unit/test_partion_data_checks.py index fb115dc..05f93fc 100644 --- a/test/unit/test_partion_data_checks.py +++ b/test/unit/test_partion_data_checks.py @@ -1,20 +1,21 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+""" import modin.pandas as mpd import pandas as pd import polars as pl diff --git a/tutorial_notebooks/speed_test_generators.ipynb b/tutorial_notebooks/speed_test_generators.ipynb new file mode 100644 index 0000000..c3b9637 --- /dev/null +++ b/tutorial_notebooks/speed_test_generators.ipynb @@ -0,0 +1,116 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "id": "c34c2376-3c80-4983-99db-ea52a0de3323", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (5, 2)\n", + "┌─────┬─────┐\n", + "│ A ┆ B │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═════╪═════╡\n", + "│ 1 ┆ a │\n", + "│ 2 ┆ b │\n", + "│ 3 ┆ c │\n", + "│ 4 ┆ d │\n", + "│ 5 ┆ e │\n", + "└─────┴─────┘\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "\n", + "# Create a simple DataFrame\n", + "df = pl.DataFrame({\n", + " \"A\": [1, 2, 3, 4, 5],\n", + " \"B\": [\"a\", \"b\", \"c\", \"d\", \"e\"]\n", + "})\n", + "\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5e882bf0-74f5-4e8f-812a-ccedbe902cd2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (10, 2)\n", + "┌────────────┬───────┐\n", + "│ date ┆ value │\n", + "│ --- ┆ --- │\n", + "│ date ┆ i64 │\n", + "╞════════════╪═══════╡\n", + "│ 2021-01-01 ┆ 0 │\n", + "│ 2021-01-02 ┆ 1 │\n", + "│ 2021-01-03 ┆ 2 │\n", + "│ 2021-01-04 ┆ 3 │\n", + "│ 2021-01-05 ┆ 4 │\n", + "│ 2021-01-06 ┆ 5 │\n", + "│ 2021-01-07 ┆ 6 │\n", + "│ 2021-01-08 ┆ 7 │\n", + "│ 2021-01-09 ┆ 8 │\n", + "│ 2021-01-10 ┆ 9 │\n", + "└────────────┴───────┘\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "from datetime import date\n", + "\n", + "df = pl.DataFrame({\n", + " \"date\": [date(2021, 1, i) for i in range(1, 11)],\n", + " \"value\": list(range(10))\n", + "})\n", + "\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f24851c-443d-4c2e-9251-6f83fffdf8e1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TemporalScope", + "language": "python", + "name": "temporalscope-kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ef2198d7ad66eef2b81ca514cf9d9bddd2b98673 Mon Sep 17 00:00:00 2001 From: Philip Ndikum Date: Fri, 20 Sep 2024 18:28:36 +0000 Subject: [PATCH 2/7] refactor: refactor sliding window partitioner and core utils for consistent dataframe handlin --- README.md | 48 +- SCIENTIFIC_LITERATURE.md | 29 +- src/temporalscope/core/core_utils.py | 140 +- .../core/temporal_data_loader.py | 321 ++-- .../core/temporal_target_shifter.py | 554 +++++-- src/temporalscope/partition/sliding_window.py | 890 ++++++++--- test/unit/test_core_temporal_data_loader.py | 382 +++-- .../unit/test_core_temporal_target_shifter.py | 274 ++-- test/unit/test_core_utils.py | 293 ++-- test/unit/test_partion_data_checks.py | 638 ++++---- ...data.ipynb => 0_load_data_timeframe.ipynb} | 50 +- .../1_load_data_target_shifter.ipynb | 1387 +++++++++++++++++ .../1_tutorial_partion_data.ipynb | 159 -- .../speed_test_generators.ipynb | 116 -- 14 files changed, 3635 insertions(+), 1646 deletions(-) rename tutorial_notebooks/introduction/{0_load_data.ipynb => 
0_load_data_timeframe.ipynb} (94%) create mode 100644 tutorial_notebooks/introduction/1_load_data_target_shifter.ipynb delete mode 100644 tutorial_notebooks/introduction/1_tutorial_partion_data.ipynb delete mode 100644 tutorial_notebooks/speed_test_generators.ipynb diff --git a/README.md b/README.md index 6654fac..a5296ad 100644 --- a/README.md +++ b/README.md @@ -29,14 +29,46 @@ --- -| | | -| --- | --- | -| Compatibility | ![Python Version](https://img.shields.io/badge/python-3.10%2B-blue) ![Linux Compatible](https://img.shields.io/badge/OS-Linux-blue) | -| License | ![License](https://img.shields.io/badge/License-Apache%202.0-green) | -| Code Quality | [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://docs.astral.sh/ruff/) ![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)| -| Build Tools | [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://hatch.pypa.io/latest/) | -| CI/CD | [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/philip-ndikum/TemporalScope/main.svg)](https://results.pre-commit.ci/latest/github/philip-ndikum/TemporalScope/main) [![codecov](https://codecov.io/gh/philip-ndikum/TemporalScope/branch/main/graph/badge.svg)](https://codecov.io/gh/philip-ndikum/TemporalScope)| -| Security | [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/9424/badge)](https://www.bestpractices.dev/projects/9424) [![Security: Bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)| +
+<table>
+  <thead>
+    <tr>
+      <th>Compatibility</th>
+      <th>License</th>
+      <th>Code Quality</th>
+      <th>Build Tools</th>
+      <th>CI/CD</th>
+      <th>Security</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>
+        <img src="https://img.shields.io/badge/python-3.10%2B-blue" alt="Python Version">
+        <img src="https://img.shields.io/badge/OS-Linux-blue" alt="Linux Compatible">
+      </td>
+      <td>
+        <img src="https://img.shields.io/badge/License-Apache%202.0-green" alt="License">
+      </td>
+      <td>
+        <a href="https://docs.astral.sh/ruff/"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json" alt="Ruff"></a>
+        <img src="https://www.mypy-lang.org/static/mypy_badge.svg" alt="Checked with mypy">
+      </td>
+      <td>
+        <a href="https://hatch.pypa.io/latest/"><img src="https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg" alt="Hatch project"></a>
+      </td>
+      <td>
+        <a href="https://results.pre-commit.ci/latest/github/philip-ndikum/TemporalScope/main"><img src="https://results.pre-commit.ci/badge/github/philip-ndikum/TemporalScope/main.svg" alt="pre-commit.ci status"></a>
+        <a href="https://codecov.io/gh/philip-ndikum/TemporalScope"><img src="https://codecov.io/gh/philip-ndikum/TemporalScope/branch/main/graph/badge.svg" alt="codecov"></a>
+      </td>
+      <td>
+        <a href="https://www.bestpractices.dev/projects/9424"><img src="https://www.bestpractices.dev/projects/9424/badge" alt="OpenSSF Best Practices"></a>
+        <a href="https://github.com/PyCQA/bandit"><img src="https://img.shields.io/badge/security-bandit-yellow.svg" alt="Security: Bandit"></a>
+      </td>
+    </tr>
+  </tbody>
+</table>
--- **TemporalScope** is an open-source Python package designed to bridge the gap between scientific research and practical industry applications for analyzing the temporal dynamics of feature importance in AI & ML time series models. Developed in alignment with Linux Foundation standards and licensed under Apache 2.0, it builds on tools such as Boruta-SHAP and SHAP, using modern window partitioning algorithms to tackle challenges like non-stationarity and concept drift. The tool is flexible and extensible, allowing for bespoke enhancements and algorithms, and supports frameworks like Pandas, Polars, and Modin. Additionally, the optional *Clara LLM* modules (etymology from the word _Clarity_) are intended to serve as a model-validation tool to support explainability efforts (XAI). **Note**: TemporalScope is currently in **beta and pre-release** phase so some installation methods may not work as expected on all platforms. Please check the `CONTRIBUTIONS.md` for the full roadmap. diff --git a/SCIENTIFIC_LITERATURE.md b/SCIENTIFIC_LITERATURE.md index d83a689..6039f3f 100644 --- a/SCIENTIFIC_LITERATURE.md +++ b/SCIENTIFIC_LITERATURE.md @@ -1,15 +1,20 @@ -### SCIENTIFIC_LITERATURE.md - -This document lists key literature that has informed the development of this package. Please note that this is not a conclusive list but highlights the most relevant works. - -| **Category** | **Title** | **Authors** | **Publication** | **Summary** | -|--------------|-----------|-------------|-----------------|-------------| -| **Regulatory Literature** | [Machine learning algorithms for financial asset price forecasting](https://arxiv.org/abs/2004.01504) | Ndikum, P. | arXiv preprint, 2020 | Discusses the application of machine learning algorithms for forecasting financial asset prices, with implications for regulatory frameworks. | -| **Regulatory Literature** | [Advancing Investment Frontiers: Industry-grade Deep Reinforcement Learning for Portfolio Optimization](https://arxiv.org/abs/2403.07916) | Ndikum, P., & Ndikum, S. | arXiv preprint, 2024 | Explores deep reinforcement learning approaches for portfolio optimization, emphasizing industry-grade applications and regulatory considerations. | -| **Scientific Literature** | [SHAP-based insights for aerospace PHM: Temporal feature importance, dependencies, robustness, and interaction analysis](https://www.sciencedirect.com/science/article/pii/S2590123024000872) | Alomari, Y., & Andó, M. | Results in Engineering, 2024 | This paper explores SHAP-based methods for analyzing temporal feature importance in aerospace predictive health management. | -| **Scientific Literature** | [Feature importance explanations for temporal black-box models](https://arxiv.org/pdf/2102.11934) | Sood, A., & Craven, M. | AAAI Conference on Artificial Intelligence, 2022 | Introduces the TIME framework for explaining temporal black-box models using feature importance. | -| **Scientific Literature** | [WindowSHAP: An efficient framework for explaining time-series classifiers based on Shapley values](https://doi.org/10.1016/j.jbi.2023.104438) | Nayebi, A., Tipirneni, S., Reddy, C. K., Foreman, B., & Subbian, V. | Journal of Biomedical Informatics, 2023 | Proposes the WindowSHAP framework to explain time-series classifiers, improving both computational efficiency and explanation quality. 
| -| **Scientific Literature** | [The sliding window and SHAP theory—an improved system with a long short-term memory network model for state of charge prediction in electric vehicle application](https://doi.org/10.3390/en14123692) | Gu, X., See, K. W., Wang, Y., Zhao, L., & Pu, W. | Energies, 2021 | Combines sliding window and SHAP theories to enhance LSTM-based SOC prediction models for electric vehicles. | +## Engineering Design + +This document lists key literature that has informed the development of this package. Please note that this is not a conclusive list but highlights the most relevant works. Our design is explicitly built for flexibility, unlike other time series machine learning and deep learning packages that often enforce rigid preprocessing constraints. We intentionally adopt familiar software engineering patterns, inspired by scikit-learn, to provide a modular and adaptable framework. The only assumption we impose is that features must be organized in a context window prior to the target variable. This allows users to focus on their core applications while ensuring compatibility with SHAP and other explainability methods. + + +| **Category** | **Title** | **Authors** | **Publication** | **Summary** | +|-------------------------|------------------------------------------------------------------------------------------------------------------|----------------------------------------------------|---------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Regulatory Literature** | [Machine learning algorithms for financial asset price forecasting](https://arxiv.org/abs/2004.01504) | Ndikum, P. | arXiv preprint, 2020 | Discusses the application of machine learning algorithms for forecasting financial asset prices, with implications for regulatory frameworks. | +| **Regulatory Literature** | [Advancing Investment Frontiers: Industry-grade Deep Reinforcement Learning for Portfolio Optimization](https://arxiv.org/abs/2403.07916) | Ndikum, P., & Ndikum, S. | arXiv preprint, 2024 | Explores deep reinforcement learning approaches for portfolio optimization, emphasizing industry-grade applications and regulatory considerations. | +| **Scientific Literature** | [SHAP-based insights for aerospace PHM: Temporal feature importance, dependencies, robustness, and interaction analysis](https://www.sciencedirect.com/science/article/pii/S2590123024000872) | Alomari, Y., & Andó, M. | Results in Engineering, 2024 | This paper explores SHAP-based methods for analyzing temporal feature importance in aerospace predictive health management. | +| **Scientific Literature** | [Feature importance explanations for temporal black-box models](https://arxiv.org/pdf/2102.11934) | Sood, A., & Craven, M. | AAAI Conference on Artificial Intelligence, 2022 | Introduces the TIME framework for explaining temporal black-box models using feature importance. | +| **Scientific Literature** | [WindowSHAP: An efficient framework for explaining time-series classifiers based on Shapley values](https://doi.org/10.1016/j.jbi.2023.104438) | Nayebi, A., Tipirneni, S., Reddy, C. K., et al. | Journal of Biomedical Informatics, 2023 | Proposes the WindowSHAP framework to explain time-series classifiers, improving both computational efficiency and explanation quality. 
| +| **Scientific Literature** | [The sliding window and SHAP theory—an improved system with a long short-term memory network model for state of charge prediction in electric vehicle application](https://doi.org/10.3390/en14123692) | Gu, X., See, K. W., Wang, Y., et al. | Energies, 2021 | Combines sliding window and SHAP theories to enhance LSTM-based SOC prediction models for electric vehicles. | +| **Scientific Literature** | [Cross-Frequency Time Series Meta-Forecasting](https://arxiv.org/abs/2302.02077) | Van Ness, M., Shen, H., Wang, H., et al. | arXiv preprint, 2023 | Proposes the CFA model, capable of handling varying frequencies in time series data, supporting flexible universal model assumptions in time series forecasting. | +| **Scientific Literature** | [Unified Training of Universal Time Series Forecasting Transformers](https://arxiv.org/abs/2402.02592) | Woo, G., Liu, C., Kumar, A., et al. | arXiv preprint, 2024 | Introduces Moirai, a transformer model that scales universally across multiple time series forecasting tasks without heavy preprocessing constraints. | +| **Scientific Literature** | [Universal Time-Series Representation Learning: A Survey](https://arxiv.org/abs/2401.03717) | Trirat, P., Shin, Y., Kang, J., et al. | arXiv preprint, 2024 | Provides a comprehensive survey of universal models for time series, outlining how generalization across datasets is achieved with minimal assumptions. | + ### Partitioning Guidelines diff --git a/src/temporalscope/core/core_utils.py b/src/temporalscope/core/core_utils.py index 954e149..28ce5cf 100644 --- a/src/temporalscope/core/core_utils.py +++ b/src/temporalscope/core/core_utils.py @@ -16,7 +16,7 @@ limitations under the License. """ -from typing import Union, cast, Dict, Optional +from typing import Union, cast, Dict, Optional, NoReturn import os from dotenv import load_dotenv import polars as pl @@ -26,11 +26,24 @@ # Load environment variables from the .env file load_dotenv() -# Supported backend configuration +# Backend abbreviations +BACKEND_POLARS = "pl" +BACKEND_PANDAS = "pd" +BACKEND_MODIN = "mpd" + +# Mapping of backend keys to their full names or module references +BACKENDS = { + BACKEND_POLARS: "polars", + BACKEND_PANDAS: "pandas", + BACKEND_MODIN: "modin", +} + TF_DEFAULT_CFG = { - "BACKENDS": {"pl": "polars", "pd": "pandas", "mpd": "modin"}, + "BACKENDS": BACKENDS, } +# Define a type alias for DataFrames that support Pandas, Modin, and Polars backends +SupportedBackendDataFrame = Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame] def get_default_backend_cfg() -> Dict[str, Dict[str, str]]: """Retrieve the application configuration settings. @@ -48,51 +61,42 @@ def validate_backend(backend: str) -> None: :type backend: str :raises ValueError: If the backend is not supported. """ - if backend not in TF_DEFAULT_CFG["BACKENDS"].keys(): + if backend not in TF_DEFAULT_CFG["BACKENDS"]: raise ValueError( f"Unsupported backend '{backend}'. Supported backends are: " f"{', '.join(TF_DEFAULT_CFG['BACKENDS'].keys())}." ) +def raise_invalid_backend(backend: str) -> NoReturn: + """Raise a ValueError for an invalid backend. + + :param backend: The backend to validate ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin). + :type backend: str + :raises ValueError: If the backend is not supported. 
+ """ + raise ValueError(f"Unsupported backend: {backend}") + + def validate_input( df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], backend: str ) -> None: - """Validates the input DataFrame to ensure it matches the expected type for the specified backend. + """Validate that the DataFrame matches the expected type for the specified backend. :param df: The DataFrame to validate. :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - :param backend: The backend against which to validate the DataFrame's type ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin). + :param backend: The backend against which to validate the DataFrame's type ('pl', 'pd', 'mpd'). :type backend: str :raises TypeError: If the DataFrame does not match the expected type for the backend. """ - if backend == "pl" and not isinstance(df, pl.DataFrame): + if backend == BACKEND_POLARS and not isinstance(df, pl.DataFrame): raise TypeError("Expected a Polars DataFrame.") - elif backend == "pd" and not isinstance(df, pd.DataFrame): + elif backend == BACKEND_PANDAS and not isinstance(df, pd.DataFrame): raise TypeError("Expected a Pandas DataFrame.") - elif backend == "mpd" and not isinstance(df, mpd.DataFrame): + elif backend == BACKEND_MODIN and not isinstance(df, mpd.DataFrame): raise TypeError("Expected a Modin DataFrame.") -def get_api_keys() -> Dict[str, Optional[str]]: - """Retrieve API keys from environment variables. - - :return: A dictionary containing the API keys, or None if not found. - :rtype: Dict[str, Optional[str]] - """ - api_keys = { - "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"), - "CLAUDE_API_KEY": os.getenv("CLAUDE_API_KEY"), - } - - # Print warnings if keys are missing - for key, value in api_keys.items(): - if value is None: - print(f"Warning: {key} is not set in the environment variables.") - - return api_keys - - def validate_and_convert_input( df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], backend: str ) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: @@ -104,40 +108,66 @@ def validate_and_convert_input( :type backend: str :return: The DataFrame converted to the specified backend type. :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - :raises TypeError: If the input DataFrame type doesn't match the specified backend. + :raises TypeError: If the input DataFrame type doesn't match the specified backend or conversion fails. + :raises ValueError: If the backend is not supported. 
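As a quick reference, a minimal sketch of the conversion helper in use; the import path follows the unit tests added in this patch.

```python
import pandas as pd
from temporalscope.core.core_utils import validate_and_convert_input, BACKEND_POLARS

# Pandas input with Polars requested: the helper converts via pl.from_pandas,
# as implemented above, and returns a Polars DataFrame.
pdf = pd.DataFrame({"a": [1, 2, 3]})
pl_df = validate_and_convert_input(pdf, BACKEND_POLARS)
print(pl_df.shape)  # (3, 1)
```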
""" - validate_backend(backend) # Use the existing validate_backend function + validate_backend(backend) # Validates if backend is supported - if backend == "pl": + if backend == BACKEND_POLARS: if isinstance(df, pl.DataFrame): return df elif isinstance(df, pd.DataFrame): - return pl.from_pandas(df) + return pl.from_pandas(df) # Convert Pandas to Polars elif isinstance(df, mpd.DataFrame): - return pl.from_pandas(df._to_pandas()) - elif backend == "pd": + return pl.from_pandas(df._to_pandas()) # Modin to Pandas to Polars + elif backend == BACKEND_PANDAS: if isinstance(df, pd.DataFrame): return df elif isinstance(df, pl.DataFrame): - return df.to_pandas() + return df.to_pandas() # Convert Polars to Pandas elif isinstance(df, mpd.DataFrame): - return df._to_pandas() - elif backend == "mpd": + return df._to_pandas() # Convert Modin to Pandas + elif backend == BACKEND_MODIN: if isinstance(df, mpd.DataFrame): return df elif isinstance(df, pd.DataFrame): - return mpd.DataFrame(df) + return mpd.DataFrame(df) # Convert Pandas to Modin elif isinstance(df, pl.DataFrame): - return mpd.DataFrame(df.to_pandas()) + return mpd.DataFrame(df.to_pandas()) # Polars to Pandas to Modin - # If we reach here, the input DataFrame type doesn't match the backend + # If none of the types match, raise a TypeError raise TypeError( - f"Input DataFrame type {type(df)} does not match the specified backend {backend}" + f"Input DataFrame type {type(df)} does not match the specified backend '{backend}'" ) +def get_api_keys() -> Dict[str, Optional[str]]: + """Retrieve API keys from environment variables. + + :return: A dictionary containing the API keys, or None if not found. + :rtype: Dict[str, Optional[str]] + """ + api_keys = { + "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"), + "CLAUDE_API_KEY": os.getenv("CLAUDE_API_KEY"), + } + + # Print warnings if keys are missing + for key, value in api_keys.items(): + if value is None: + print(f"Warning: {key} is not set in the environment variables.") + + return api_keys + + def print_divider(char: str = "=", length: int = 70) -> None: - """Prints a divider line made of a specified character and length.""" + """Prints a divider line made of a specified character and length. + + :param char: The character to use for the divider, defaults to '=' + :type char: str, optional + :param length: The length of the divider, defaults to 70 + :type length: int, optional + """ print(char * length) @@ -148,7 +178,7 @@ def check_nulls( :param df: The DataFrame to check for null values. :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - :param backend: The backend used for the DataFrame ('polars', 'pandas', 'modin'). + :param backend: The backend used for the DataFrame ('pl', 'pd', 'mpd'). :type backend: str :return: True if there are null values, False otherwise. 
:rtype: bool @@ -156,19 +186,16 @@ def check_nulls( """ validate_backend(backend) - if backend == "pd": - # Convert NumPy result to Python bool + if backend == BACKEND_PANDAS: return bool(cast(pd.DataFrame, df).isnull().values.any()) - elif backend == "pl": - # Polars-specific null check: sum the null counts and return a boolean + elif backend == BACKEND_POLARS: polars_df = cast(pl.DataFrame, df) null_count = polars_df.null_count().select(pl.col("*").sum()).to_numpy().sum() return bool(null_count > 0) - elif backend == "mpd": - # Convert NumPy result to Python bool + elif backend == BACKEND_MODIN: return bool(cast(mpd.DataFrame, df).isnull().values.any()) else: - raise ValueError(f"Unsupported backend '{backend}'.") + raise_invalid_backend(backend) def check_nans( @@ -178,7 +205,7 @@ def check_nans( :param df: The DataFrame to check for NaN values. :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - :param backend: The backend used for the DataFrame ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin). + :param backend: The backend used for the DataFrame ('pl', 'pd', 'mpd'). :type backend: str :return: True if there are NaN values, False otherwise. :rtype: bool @@ -186,16 +213,13 @@ def check_nans( """ validate_backend(backend) - if backend == "pd": - # Convert NumPy result to Python bool + if backend == BACKEND_PANDAS: return bool(cast(pd.DataFrame, df).isna().values.any()) - elif backend == "pl": - # Polars-specific NaN check: check if there are any NaNs + elif backend == BACKEND_POLARS: polars_df = cast(pl.DataFrame, df) nan_count = polars_df.select((polars_df == float("nan")).sum()).to_numpy().sum() return bool(nan_count > 0) - elif backend == "mpd": - # Convert NumPy result to Python bool + elif backend == BACKEND_MODIN: return bool(cast(mpd.DataFrame, df).isna().values.any()) else: - raise ValueError(f"Unsupported backend '{backend}'.") + raise_invalid_backend(backend) diff --git a/src/temporalscope/core/temporal_data_loader.py b/src/temporalscope/core/temporal_data_loader.py index 36344ed..df099cf 100644 --- a/src/temporalscope/core/temporal_data_loader.py +++ b/src/temporalscope/core/temporal_data_loader.py @@ -1,27 +1,39 @@ -"""TemporalScope/src/temporalscope/core/temporal_data_loader.py +""" TemporalScope/src/temporalscope/core/temporal_data_loader.py -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +This module provides a flexible data loader for time series forecasting, allowing users to define their own +preprocessing, loss functions, and explainability workflows. The core assumption is that features are organized +in a context window prior to the target column, making the system compatible with SHAP and other explainability methods. +Given the variance in pre-processing techniques, meta-learning & loss-functions TemporalScope explicitly does not +impose constraints on the end-user in the engineering design. + +.. seealso:: + + 1. Van Ness, M., Shen, H., Wang, H., Jin, X., Maddix, D.C., & Gopalswamy, K. (2023). Cross-Frequency Time Series Meta-Forecasting. arXiv preprint arXiv:2302.02077. + 2. Woo, G., Liu, C., Kumar, A., Xiong, C., Savarese, S., & Sahoo, D. (2024). Unified training of universal time series forecasting transformers. arXiv preprint arXiv:2402.02592. + 3. Trirat, P., Shin, Y., Kang, J., Nam, Y., Na, J., Bae, M., Kim, J., Kim, B., & Lee, J.-G. (2024). Universal time-series representation learning: A survey. 
arXiv preprint arXiv:2401.03717. + +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed +on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License +for the specific language governing permissions and limitations under the License. """ -from typing import Union, Optional, cast +from typing import Union, Optional import polars as pl -from polars import Expr import pandas as pd import modin.pandas as mpd from temporalscope.core.core_utils import ( - validate_and_convert_input, validate_backend, + validate_input, + validate_and_convert_input, get_default_backend_cfg, + BACKEND_POLARS, + BACKEND_PANDAS, + BACKEND_MODIN, ) @@ -39,23 +51,12 @@ class TimeFrame: Designed to be the core data handler in a variety of temporal analysis scenarios, the `TimeFrame` class integrates seamlessly with other TemporalScope modules and can be extended for more advanced use cases. - :param df: The input DataFrame. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - :param time_col: The column representing time in the DataFrame. - :type time_col: str - :param target_col: The column representing the target variable in the DataFrame. - :type target_col: str - :param id_col: Optional. The column representing the ID for grouping. Default is None. - :type id_col: Optional[str] - :param backend: The backend to use ('pl' for Polars, 'pd' for Pandas, or 'mpd' for Modin). Default is 'pl'. - :type backend: str - :param sort: Optional. Sort the data by `time_col` (and `id_col` if provided) in ascending order. Default is True. - :type sort: bool - - .. note:: - The `TimeFrame` class is designed for workflows where the target label has already been generated. - If your workflow requires generating the target label, consider using the `TemporalTargetShifter` class - from the `TemporalScope` package to shift the target variable appropriately for tasks like forecasting. + Assumptions: + -------------- + - This package does not impose constraints on grouping or handling duplicates. + - We assume users will build universal models and handle preprocessing (e.g., grouping, deduplication) with + TemporalScope modules or external methods. + - The only requirement is that features are arranged in a context window prior to the target column. Example Usage: -------------- @@ -78,7 +79,7 @@ class TimeFrame: 'time': pd.date_range(start='2021-01-01', periods=100, freq='D'), 'value': range(100) }) - tf = TimeFrame(df, time_col='time', target_col='value', backend='mpd') + tf = TimeFrame(df, time_col='time', target_col='value', backend=BACKEND_MODIN) # Accessing the data print(tf.get_data().head()) @@ -89,15 +90,45 @@ def __init__( df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], time_col: str, target_col: str, - id_col: Optional[str] = None, - backend: str = "pl", + backend: Optional[str] = None, sort: bool = True, ): + """Initialize a TimeFrame object. 
+ + :param df: The input DataFrame. + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :param time_col: The name of the column representing time in the DataFrame. + :type time_col: str + :param target_col: The name of the column representing the target variable in the DataFrame. + :type target_col: str + :param backend: The backend to use. If not provided, it will be inferred from the DataFrame type. + Supported backends are: + - `BACKEND_POLARS` ('pl') for Polars + - `BACKEND_PANDAS` ('pd') for Pandas + - `BACKEND_MODIN` ('mpd') for Modin + Default is to infer from the DataFrame. + :type backend: Optional[str] + :param sort: Optional. If True, sort the data by `time_col` in ascending order. Default is True. + :type sort: bool + :raises ValueError: + - If `time_col` or `target_col` is not a non-empty string. + - If required columns are missing in the DataFrame. + - If the inferred or specified backend is not supported. + :raises TypeError: + - If the DataFrame type does not match the specified backend. + """ + if not isinstance(time_col, str) or not time_col: + raise ValueError("time_col must be a non-empty string.") + if not isinstance(target_col, str) or not target_col: + raise ValueError("target_col must be a non-empty string.") + + # Infer the backend if not explicitly provided + self._backend = backend or self._infer_backend(df) + validate_backend(self._backend) + self._cfg = get_default_backend_cfg() - self._backend = backend self._time_col = time_col self._target_col = target_col - self._id_col = id_col self._sort = sort # Convert, validate, and set up the DataFrame @@ -105,71 +136,96 @@ def __init__( @property def backend(self) -> str: - """Return the backend used ('pl' for Polars, 'pd' for Pandas, or 'mpd' for Modin).""" + """Return the backend used. + + :return: The backend identifier (e.g., 'pl', 'pd', 'mpd'). + :rtype: str + """ return self._backend @property def time_col(self) -> str: - """Return the column name representing time.""" + """Return the column name representing time. + + :return: The name of the time column. + :rtype: str + """ return self._time_col @property def target_col(self) -> str: - """Return the column name representing the target variable.""" + """Return the column name representing the target variable. + + :return: The name of the target column. + :rtype: str + """ return self._target_col - @property - def id_col(self) -> Optional[str]: - """Return the column name used for grouping or None if not set.""" - return self._id_col + def _infer_backend( + self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + ) -> str: + """Infer the backend from the DataFrame type. + + :param df: The input DataFrame. + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :return: The inferred backend ('pl', 'pd', or 'mpd'). + :rtype: str + :raises ValueError: If the DataFrame type is unsupported. + """ + if isinstance(df, pl.DataFrame): + return BACKEND_POLARS + elif isinstance(df, pd.DataFrame): + return BACKEND_PANDAS + elif isinstance(df, mpd.DataFrame): + return BACKEND_MODIN + else: + raise ValueError(f"Unsupported DataFrame type: {type(df)}") def _validate_columns( self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] ) -> None: - """Validate the presence and types of required columns in the DataFrame. + """Validate the presence of required columns in the DataFrame. :param df: The DataFrame to validate. :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] :raises ValueError: If required columns are missing. 
""" - required_columns = [self.time_col, self._target_col] + ( - [self.id_col] if self.id_col else [] - ) - missing_columns = [ - col for col in required_columns if col and col not in df.columns - ] + required_columns = [self._time_col, self._target_col] + missing_columns = [col for col in required_columns if col not in df.columns] if missing_columns: raise ValueError(f"Missing required columns: {', '.join(missing_columns)}") def _sort_data( - self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + self, + df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], + ascending: bool = True, ) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: """Internal method to sort the DataFrame based on the backend. :param df: The DataFrame to sort. :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :param ascending: If True, sort in ascending order; if False, sort in descending order. + :type ascending: bool :return: The sorted DataFrame. :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :raises TypeError: If the DataFrame type does not match the backend. + :raises ValueError: If the backend is unsupported. """ - sort_key = [self.id_col, self.time_col] if self.id_col else [self.time_col] - - # Polars backend sorting - if self._backend == "pl": - if isinstance(df, pl.DataFrame): - return df.sort(sort_key) - else: - raise TypeError("Expected a Polars DataFrame for the Polars backend.") - - # Pandas or Modin backend sorting - elif self._backend in ["pd", "mpd"]: - if isinstance(df, (pd.DataFrame, mpd.DataFrame)): - return df.sort_values(by=sort_key) - else: - raise TypeError( - "Expected a Pandas or Modin DataFrame for the Pandas or Modin backend." - ) + # Validate the DataFrame type + validate_input(df, self._backend) - else: + sort_key = [self._time_col] + + # Mapping of backends to their sort functions + sort_functions = { + BACKEND_POLARS: lambda df: df.sort(by=sort_key, descending=not ascending), + BACKEND_PANDAS: lambda df: df.sort_values(by=sort_key, ascending=ascending), + BACKEND_MODIN: lambda df: df.sort_values(by=sort_key, ascending=ascending), + } + + try: + return sort_functions[self._backend](df) + except KeyError: raise ValueError(f"Unsupported backend: {self._backend}") def _setup_timeframe( @@ -181,6 +237,10 @@ def _setup_timeframe( :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] :return: The processed DataFrame. :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :raises ValueError: + - If required columns are missing. + - If the specified backend is not supported. + :raises TypeError: If the DataFrame type does not match the backend. """ # Convert and validate the input DataFrame df = validate_and_convert_input(df, self._backend) @@ -195,86 +255,22 @@ def _setup_timeframe( return df def sort_data(self, ascending: bool = True) -> None: - """Public method to sort the DataFrame by the time column (and ID column if present). + """Public method to sort the DataFrame by the time column. :param ascending: If True, sort in ascending order; if False, sort in descending order. :type ascending: bool + :raises TypeError: If the DataFrame type does not match the backend. + :raises ValueError: If the backend is unsupported. """ - self._sort_data(ascending) - - def check_duplicates(self) -> None: - """Check for duplicate time entries within groups, handling different data backends. - - :raises ValueError: If duplicate entries are found. 
- """ - if self._backend == "pl": - # Polars specific check: Use boolean masks - if self._id_col: - # Create unique expressions for id and time columns - id_duplicated_expr: Expr = pl.col(self._id_col).is_duplicated() - time_duplicated_expr: Expr = pl.col(self._time_col).is_duplicated() - # Combine expressions - combined_expr: Expr = id_duplicated_expr | time_duplicated_expr - duplicates = self.df.filter(combined_expr) # type: ignore - else: - # Only check the time column for duplicates - duplicates = self.df.filter(pl.col(self._time_col).is_duplicated()) # type: ignore - # Check for duplicates by inspecting the number of rows - if duplicates.height > 0: - raise ValueError("Duplicate time entries found within the same group.") - elif self._backend in ["pd", "mpd"]: - # Cast to Pandas DataFrame for Pandas/Modin specific check - pandas_df = cast(pd.DataFrame, self.df) - duplicates = pandas_df.duplicated( - subset=( - [self._id_col, self._time_col] if self._id_col else [self._time_col] - ) - ) + self.df = self._sort_data(self.df, ascending=ascending) - if duplicates.any(): - raise ValueError("Duplicate time entries found within the same group.") - - def get_data(self) -> Union[pl.DataFrame, pd.DataFrame]: + def get_data(self) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: """Return the DataFrame in its current state. :return: The DataFrame managed by the TimeFrame instance. - :rtype: Union[pl.DataFrame, pd.DataFrame] - """ - return self.df - - def get_grouped_data(self) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: - """Return the grouped DataFrame if an ID column is provided. - - :return: Grouped DataFrame by the ID column if it is set, otherwise returns the original DataFrame. :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - :raises ValueError: If the ID column is not set or an unsupported backend is provided. - :raises TypeError: If the DataFrame type does not match the expected type for the specified backend. """ - if not self.id_col: - raise ValueError("ID column is not set; cannot group data.") - - if self._backend == "pl": - # Polars specific group_by with aggregation - if isinstance(self.df, pl.DataFrame): - return self.df.group_by(self.id_col).agg( - pl.all() - ) # Polars uses `group_by` - else: - raise TypeError(f"Expected Polars DataFrame but got {type(self.df)}.") - elif self._backend == "pd": - # Pandas specific groupby - if isinstance(self.df, pd.DataFrame): - return self.df.groupby(self.id_col).apply(lambda x: x) - else: - raise TypeError(f"Expected Pandas DataFrame but got {type(self.df)}.") - elif self._backend == "mpd": - # Modin uses the same API as Pandas for this operation - if isinstance(self.df, mpd.DataFrame): - return self.df.groupby(self.id_col).apply(lambda x: x) - else: - raise TypeError(f"Expected Modin DataFrame but got {type(self.df)}.") - else: - raise ValueError(f"Unsupported backend: {self._backend}") + return self.df def update_data( self, new_df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] @@ -283,7 +279,13 @@ def update_data( :param new_df: The new DataFrame to replace the existing one. :type new_df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :raises TypeError: If the new DataFrame type does not match the backend. + :raises ValueError: If required columns are missing in the new DataFrame. 
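+
+        Example Usage:
+        --------------
+        A minimal sketch, assuming a Pandas-backed ``TimeFrame`` named ``tf`` whose target
+        column is ``value``:
+
+        .. code-block:: python
+
+            new_df = tf.get_data().copy()
+            new_df["value"] = new_df["value"] * 2
+            tf.update_data(new_df)  # re-validated before replacing the internal DataFrame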
""" + # Validate and convert the new DataFrame + new_df = validate_and_convert_input(new_df, self._backend) + # Validate required columns + self._validate_columns(new_df) self.df = new_df def update_target_col( @@ -292,19 +294,38 @@ def update_target_col( """Updates the target column in the internal DataFrame with the provided new target column. :param new_target_col: The new target column to replace the existing one. - :type new_target_col: Union{pl.Series, pd.Series, mpd.Series} + :type new_target_col: Union[pl.Series, pd.Series, mpd.Series] + :raises TypeError: If the target column type does not match the backend. + :raises ValueError: If the length of the new target column does not match the DataFrame. """ - if self._backend == "pl": - if isinstance(self.df, pl.DataFrame): - self.df = self.df.with_columns([new_target_col.alias(self._target_col)]) - else: - raise TypeError("Expected Polars DataFrame for Polars backend.") - elif self._backend in ["pd", "mpd"]: - if isinstance(self.df, (pd.DataFrame, mpd.DataFrame)): - self.df[self._target_col] = new_target_col - else: - raise TypeError( - "Expected Pandas or Modin DataFrame for respective backend." - ) + # Validate the target column type + if self._backend == BACKEND_POLARS: + if not isinstance(new_target_col, pl.Series): + raise TypeError("Expected a Polars Series for the Polars backend.") + elif self._backend == BACKEND_PANDAS: + if not isinstance(new_target_col, pd.Series): + raise TypeError("Expected a Pandas Series for the Pandas backend.") + elif self._backend == BACKEND_MODIN: + if not isinstance(new_target_col, mpd.Series): + raise TypeError("Expected a Modin Series for the Modin backend.") else: raise ValueError(f"Unsupported backend: {self._backend}") + + # Check if the new target column length matches the DataFrame length + if len(new_target_col) != len(self.df): + raise ValueError( + "The new target column must have the same number of rows as the DataFrame." + ) + + # Update the target column based on the backend + if self._backend == BACKEND_POLARS: + # Polars uses the alias method for column renaming + self.df = self.df.with_columns([new_target_col.alias(self._target_col)]) + elif self._backend == BACKEND_PANDAS: + # Pandas series has .values + assert isinstance(new_target_col, pd.Series) # For mypy + self.df[self._target_col] = new_target_col.values + elif self._backend == BACKEND_MODIN: + # Modin series has .to_numpy + assert isinstance(new_target_col, mpd.Series) # For mypy + self.df[self._target_col] = new_target_col.to_numpy() diff --git a/src/temporalscope/core/temporal_target_shifter.py b/src/temporalscope/core/temporal_target_shifter.py index 9b1f8f5..60cdaa5 100644 --- a/src/temporalscope/core/temporal_target_shifter.py +++ b/src/temporalscope/core/temporal_target_shifter.py @@ -23,29 +23,78 @@ limitations under the License. """ -from typing import Union, Optional, Any, SupportsIndex +from typing import Union, Optional import polars as pl import pandas as pd import modin.pandas as mpd -import numpy as np -from temporalscope.conf import validate_backend, validate_input +import warnings from temporalscope.core.temporal_data_loader import TimeFrame +from temporalscope.core.core_utils import ( + validate_backend, + validate_input, + BACKEND_POLARS, + BACKEND_PANDAS, + BACKEND_MODIN, +) class TemporalTargetShifter: """A class for shifting the target variable in time series data for machine learning or deep learning. 
- This class works with the `TimeFrame` class to shift the target variable by a specified - number of lags (time steps). It supports multiple backends (Polars, Pandas, Modin) and can - generate output suitable for both machine learning models (scalar) and deep learning models (sequence). - - The class is designed to handle both raw `TimeFrame` data and data that has been partitioned - using a partitioner, such as `SlidingWindowPartitioner`. + This class works with the `TimeFrame` and partitioned datasets (e.g., from `SlidingWindowPartitioner`) + to shift the target variable by a specified number of lags (time steps). It supports multiple backends + (Polars, Pandas, Modin) and can generate output suitable for both machine learning models (scalar) + and deep learning models (sequence). Assumptions: ------------ - 1. The data is not grouped by any categorical variables. - 2. The `time_col` contains unique time points, ensuring predictable and unambiguous shifting. + 1. The class applies time shifting globally, without grouping by entities (e.g., tickers or SKUs). + Users should handle any entity-specific grouping outside of this class. + 2. The time shifting is applied to the target column, which may have varying data structures + depending on the backend (Polars, Pandas, Modin). + + Examples: + --------- + **Using `TimeFrame`:** + + .. code-block:: python + + from temporalscope.core.temporal_data_loader import TimeFrame + from temporalscope.core.temporal_target_shifter import TemporalTargetShifter + + # Create a sample Pandas DataFrame + data = { + 'time': pd.date_range(start='2022-01-01', periods=100), + 'target': np.random.rand(100), + 'feature_1': np.random.rand(100) + } + df = pd.DataFrame(data) + + # Create a TimeFrame object + tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") + + # Apply target shifting + shifter = TemporalTargetShifter(n_lags=1, target_col="target") + shifted_df = shifter.fit_transform(tf) + + **Using `SlidingWindowPartitioner`:** + + .. code-block:: python + + from temporalscope.partition.sliding_window import SlidingWindowPartitioner + from temporalscope.core.temporal_data_loader import TimeFrame + from temporalscope.core.temporal_target_shifter import TemporalTargetShifter + + # Create a sample TimeFrame + tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") + + # Create a SlidingWindowPartitioner + partitioner = SlidingWindowPartitioner(tf=tf, window_size=10, stride=1) + + # Apply TemporalTargetShifter on each partition + shifter = TemporalTargetShifter(n_lags=1, target_col="target") + for partition in partitioner.fit_transform(): + shifted_partition = shifter.fit_transform(partition) :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1. :type n_lags: int @@ -54,134 +103,118 @@ class TemporalTargetShifter: :type mode: str :param sequence_length: (Deep Learning Mode Only) The length of the input sequences. Required if mode is "deep_learning". :type sequence_length: Optional[int] + :param target_col: The column representing the target variable (mandatory). + :type target_col: str + :param drop_target: Whether to drop the original target column after shifting. Default is True. + :type drop_target: bool + :param verbose: If True, prints information about the number of dropped rows during transformation. + :type verbose: bool :raises ValueError: If the backend is unsupported or if validation checks fail. - - .. 
note:: - For deep learning frameworks like TensorFlow, PyTorch, and JAX/Flax, the expected shape of the target - in "deep_learning" mode generated by this class is `(num_sequences, sequence_length)`. - Batching is typically handled by the data loader or model input pipeline. - - Example Usage: - -------------- - .. code-block:: python - - # Example of creating a TimeFrame with a Polars DataFrame - data = pl.DataFrame({ - 'time': pl.date_range(start='2021-01-01', periods=100, interval='1d'), - 'value': range(100) - }) - tf = TimeFrame(data, time_col='time', target_col='value') - - # Using TemporalTargetShifter in machine_learning mode - shifter_ml = TemporalTargetShifter(n_lags=3, mode='machine_learning') - tf_transformed_ml = shifter_ml.fit_transform(tf) - - # Accessing the shifted data - print("Machine Learning Mode (Scalar):") - print(tf_transformed_ml.get_data().head()) - - # Using TemporalTargetShifter in deep_learning mode - shifter_dl = TemporalTargetShifter(n_lags=3, mode='deep_learning', sequence_length=10) - tf_transformed_dl = shifter_dl.fit_transform(tf) - - # Accessing the shifted data for deep learning - print("Deep Learning Mode (Sequence):") - print(tf_transformed_dl.get_data().head()) - - # Example: Create a TimeFrame with a Modin DataFrame - import modin.pandas as mpd - df = mpd.DataFrame({ - 'time': pd.date_range(start='2021-01-01', periods=100, freq='D'), - 'value': range(100) - }) - tf_modin = TimeFrame(df, time_col='time', target_col='value', backend='mpd') - - # Accessing the data - print("Original Modin DataFrame:") - print(tf_modin.get_data().head()) """ + MODE_MACHINE_LEARNING = "machine_learning" + MODE_DEEP_LEARNING = "deep_learning" + def __init__( self, n_lags: int = 1, - mode: str = "machine_learning", + mode: str = MODE_MACHINE_LEARNING, sequence_length: Optional[int] = None, + target_col: Optional[str] = None, + drop_target: bool = True, + verbose: bool = False, ): - if mode not in ["machine_learning", "deep_learning"]: - raise ValueError("`mode` must be 'machine_learning' or 'deep_learning'.") + """Initialize the TemporalTargetShifter. + + :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1. + :param mode: Mode of operation: "machine_learning" or "deep_learning". Default is "machine_learning". + :param sequence_length: (Deep Learning Mode Only) Length of the input sequences. Required if mode is "deep_learning". + :param target_col: Column representing the target variable (mandatory). + :param drop_target: Whether to drop the original target column after shifting. Default is True. + :param verbose: Whether to print detailed information about transformations. + :raises ValueError: If the target column is not provided or if an invalid mode is selected. + """ + if mode not in [self.MODE_MACHINE_LEARNING, self.MODE_DEEP_LEARNING]: + raise ValueError( + f"`mode` must be '{self.MODE_MACHINE_LEARNING}' or '{self.MODE_DEEP_LEARNING}'." + ) + + if target_col is None: + raise ValueError( + "`target_col` must be explicitly provided for TemporalTargetShifter." 
+ ) + + if n_lags <= 0: + raise ValueError("`n_lags` must be greater than 0.") self.n_lags = n_lags self.mode = mode self.sequence_length = sequence_length + self.target_col = target_col + self.drop_target = drop_target + self.verbose = verbose + self.backend: Optional[str] = None # Backend will be set during fit - if self.mode == "deep_learning" and not self.sequence_length: + if self.mode == self.MODE_DEEP_LEARNING and not self.sequence_length: raise ValueError( "`sequence_length` must be provided when mode is 'deep_learning'." ) - def fit(self, tf: TimeFrame) -> "TemporalTargetShifter": - """Validates the input TimeFrame without altering it. - - :param tf: The TimeFrame object to validate. - :type tf: TimeFrame - :rtype: TemporalTargetShifter - :raises ValueError: If data validation fails. - """ - self._validate_data(tf) - self.backend = tf.backend # Store the backend for later use - return self - - def transform(self, tf: TimeFrame) -> TimeFrame: - """Shift the target variable according to the number of lags specified. - - This method can operate on both raw TimeFrame data and partitioned data. + def _infer_backend( + self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + ) -> str: + """Infer the backend from the DataFrame type. - :param tf: The TimeFrame object to transform. - :type tf: TimeFrame - :rtype: TimeFrame - :raises ValueError: If the backend is unsupported or data validation fails. + :param df: The input DataFrame. + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :return: The inferred backend ('pl', 'pd', or 'mpd'). + :raises ValueError: If the DataFrame type is unsupported. """ - df = tf.get_data() - target_col = tf._target_col - - if self.backend == "pl": - df = self._shift_polars(df, target_col) - elif self.backend in ["pd", "mpd"]: - df = self._shift_pandas_modin(df, target_col) + if isinstance(df, pl.DataFrame): + return BACKEND_POLARS + elif isinstance(df, pd.DataFrame): + return BACKEND_PANDAS + elif isinstance(df, mpd.DataFrame): + return BACKEND_MODIN else: - raise ValueError(f"Unsupported backend: {self.backend}") + raise ValueError(f"Unsupported DataFrame type: {type(df)}") - tf.update_data(df) - return tf + def _set_backend( + self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + ) -> None: + """Set or infer the backend based on the DataFrame. - def fit_transform(self, tf: TimeFrame) -> TimeFrame: - """Combine fit and transform into a single call. - - :param tf: The TimeFrame object to transform. - :type tf: TimeFrame - :rtype: TimeFrame + :param df: The input DataFrame. + :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :raises ValueError: If the backend is not supported. """ - return self.fit(tf).transform(tf) - - def _validate_data(self, tf: TimeFrame) -> None: - """Validate the TimeFrame data for consistency.""" - df = tf.get_data() - time_col = tf._time_col - - validate_backend(tf.backend) - validate_input(df, tf.backend) - - # Ensure df[time_col] is treated as a Series, to avoid Mypy errors - if isinstance(df, pl.DataFrame): - if df[time_col].is_duplicated().any(): - raise ValueError( - "The time column contains duplicate values, which is not allowed." - ) - elif isinstance(df, (pd.DataFrame, mpd.DataFrame)): - if df[time_col].duplicated().any(): - raise ValueError( - "The time column contains duplicate values, which is not allowed." 
- ) + if self.backend is None: + self.backend = self._infer_backend(df) + validate_backend(self.backend) + + def _validate_data( + self, tf: Optional[Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]] + ) -> None: + """Validate the TimeFrame or partitioned data for consistency. + + :param tf: The `TimeFrame` object or a DataFrame (`pandas`, `modin`, or `polars`) that contains the time series data. + :type tf: Optional[Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]] + :raises ValueError: If the data is invalid or empty. + """ + if isinstance(tf, TimeFrame): + df = tf.get_data() + else: + df = tf + + # Check if the DataFrame is empty based on the backend + if isinstance(df, pd.DataFrame) or isinstance(df, mpd.DataFrame): + if df is None or df.empty: + raise ValueError("Input DataFrame is empty.") + elif isinstance(df, pl.DataFrame): + if df is None or df.is_empty(): + raise ValueError("Input DataFrame is empty.") + else: + raise ValueError("Unsupported DataFrame type.") def _shift_polars(self, df: pl.DataFrame, target_col: str) -> pl.DataFrame: """Shift the target variable in a Polars DataFrame. @@ -192,42 +225,44 @@ def _shift_polars(self, df: pl.DataFrame, target_col: str) -> pl.DataFrame: :type target_col: str :return: The Polars DataFrame with the shifted target variable. :rtype: pl.DataFrame - :raises ValueError: If the backend is unsupported or data validation fails. + :raises ValueError: If `sequence_length` or `n_lags` are not properly set. """ - if self.mode == "deep_learning": - if self.sequence_length is None: - raise ValueError( - "`sequence_length` must be provided in deep_learning mode." - ) - + if self.mode == self.MODE_DEEP_LEARNING: + if not isinstance(self.sequence_length, int): + raise ValueError("`sequence_length` must be an integer.") shifted_columns = [ df[target_col].shift(-i).alias(f"{target_col}_shift_{i}") for i in range(self.sequence_length) ] df = df.with_columns(shifted_columns) df = df.with_columns( - [ - pl.concat_list( - [ - pl.col(f"{target_col}_shift_{i}") - for i in range(self.sequence_length) - ] - ).alias(f"{target_col}_sequence") - ] + pl.concat_list( + [ + pl.col(f"{target_col}_shift_{i}") + for i in range(self.sequence_length) + ] + ).alias(f"{target_col}_sequence") ) df = df.drop( [f"{target_col}_shift_{i}" for i in range(self.sequence_length)] ) - else: # Machine Learning Mode (Scalar) + df = df.drop_nulls() + df = df.slice(0, len(df) - self.sequence_length + 1) + else: df = df.with_columns( - [ - df[target_col] - .shift(-self.n_lags) - .alias(f"{target_col}_shift_{self.n_lags}") - ] + df[target_col] + .shift(-self.n_lags) + .alias(f"{target_col}_shift_{self.n_lags}") ) + df = df.drop_nulls() + + if df.is_empty(): + raise ValueError("DataFrame is empty after shifting operation.") - return df.drop_nulls() + if self.drop_target: + df = df.drop(target_col) + + return df def _shift_pandas_modin( self, df: Union[pd.DataFrame, mpd.DataFrame], target_col: str @@ -240,19 +275,242 @@ def _shift_pandas_modin( :type target_col: str :return: The DataFrame with the shifted target variable. :rtype: Union[pd.DataFrame, mpd.DataFrame] - :raises ValueError: If the backend is unsupported or data validation fails. + :raises ValueError: If `sequence_length` or `n_lags` are not properly set. """ - if self.mode == "deep_learning": - if self.sequence_length is None: - raise ValueError( - "`sequence_length` must be an integer when mode is 'deep_learning'." 
- ) - + if self.mode == self.MODE_DEEP_LEARNING: + if not isinstance(self.sequence_length, int): + raise ValueError("`sequence_length` must be an integer.") shifted_columns = [ df[target_col].shift(-i) for i in range(self.sequence_length) ] - df[f"{target_col}_sequence"] = np.stack(shifted_columns, axis=1).tolist() - else: # Machine Learning Mode (Scalar) + df[f"{target_col}_sequence"] = list(zip(*shifted_columns)) + df = df.dropna() + df = df.iloc[: -self.sequence_length + 1] + else: df[f"{target_col}_shift_{self.n_lags}"] = df[target_col].shift(-self.n_lags) + df = df.dropna() + + if df.empty: + raise ValueError("DataFrame is empty after shifting operation.") + + if self.drop_target: + df = df.drop(columns=[target_col]) - return df.dropna() + return df + + def _transform_pandas_modin( + self, df: Union[pd.DataFrame, mpd.DataFrame] + ) -> Union[pd.DataFrame, mpd.DataFrame]: + """Handle shifting for Pandas or Modin backends. + + :param df: The input DataFrame (Pandas or Modin). + :type df: Union[pd.DataFrame, mpd.DataFrame] + :return: The transformed DataFrame with the target column shifted. + :rtype: Union[pd.DataFrame, mpd.DataFrame] + :raises ValueError: If `target_col` is not set. + """ + # Ensure target_col is not None + if self.target_col is None: + raise ValueError("`target_col` must be set before transformation.") + + df = self._shift_pandas_modin(df, self.target_col) + + rows_before = len(df) + df = df.dropna() # Handle missing values + rows_after = len(df) + + if rows_after == 0: + raise ValueError("All rows were dropped during transformation.") + + self._print_dropped_rows(rows_before, rows_after) + return df + + def _transform_polars(self, df: pl.DataFrame) -> pl.DataFrame: + """Handle shifting for Polars backend. + + :param df: The input Polars DataFrame. + :type df: pl.DataFrame + :return: The transformed Polars DataFrame with the target column shifted. + :rtype: pl.DataFrame + :raises ValueError: If `target_col` is not set. + """ + # Ensure target_col is not None + if self.target_col is None: + raise ValueError("`target_col` must be set before transformation.") + + df = self._shift_polars(df, self.target_col) + + rows_before = df.shape[0] + df = df.drop_nulls() + rows_after = df.shape[0] + + if rows_after == 0: + raise ValueError("All rows were dropped during transformation.") + + self._print_dropped_rows(rows_before, rows_after) + return df + + def _print_dropped_rows(self, rows_before: int, rows_after: int) -> None: + """Print information about dropped rows if verbose mode is enabled. + + :param rows_before: Number of rows before dropping nulls. + :type rows_before: int + :param rows_after: Number of rows after dropping nulls. + :type rows_after: int + """ + if self.verbose: + rows_dropped = rows_before - rows_after + print( + f"Rows before shift: {rows_before}; Rows after shift: {rows_after}; Rows dropped: {rows_dropped}" + ) + + def fit( + self, tf: Optional[Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]] + ) -> "TemporalTargetShifter": + """Validate and prepare the target data for transformation based on the specified backend. + + The `fit` method initializes the backend and validates the input data, ensuring the target column is consistent with the input data. + It does not alter the data but sets up the necessary configuration for later transformations. + + :param tf: The `TimeFrame` object, or a DataFrame (`pandas`, `modin`, or `polars`) that contains the time series data. 
+ The DataFrame must have a target column defined or the `target_col` attribute set during initialization. + :type tf: Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame], optional + :raises ValueError: If the target column is not provided, the data is invalid, or the backend is unsupported. + :raises Warning: If the target column provided in `TemporalTargetShifter` differs from the one in the `TimeFrame`. + :return: The fitted `TemporalTargetShifter` instance, ready for transforming the data. + :rtype: TemporalTargetShifter + + Example Usage: + -------------- + .. code-block:: python + + shifter = TemporalTargetShifter(n_lags=2, target_col="target") + shifter.fit(time_frame) + + """ + self._validate_data(tf) + + if isinstance(tf, TimeFrame): + # Set backend and handle target column for TimeFrame input + self.backend = tf.backend + if not self.target_col: + self.target_col = tf._target_col + elif self.target_col != tf._target_col: + warnings.warn( + f"The `target_col` in TemporalTargetShifter ('{self.target_col}') " + f"differs from the TimeFrame's target_col ('{tf._target_col}').", + UserWarning, + ) + elif tf is not None: + # Infer backend for non-TimeFrame input + self.backend = self._infer_backend(tf) + else: + raise ValueError("Input data is None.") + + return self + + def transform( + self, tf: Optional[Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]] + ) -> Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]: + """Transform the input time series data by shifting the target variable according to the specified number of lags. + + The `transform` method shifts the target variable in the input data according to the `n_lags` or `sequence_length` set during initialization. + This method works directly on either a `TimeFrame` or a raw DataFrame (Pandas, Modin, or Polars), applying the appropriate backend-specific transformation. + + :param tf: The `TimeFrame` object or a DataFrame (Pandas, Modin, or Polars) that contains the time series data to be transformed. + The data should contain a target column that will be shifted. + :type tf: Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame], optional + :raises ValueError: If the input data is invalid, unsupported, or lacks columns. + :raises ValueError: If the backend is unsupported or data validation fails. + :return: A transformed DataFrame or `TimeFrame` with the target variable shifted by the specified lags or sequence length. + If a `TimeFrame` is provided, the returned object will be a `TimeFrame`. Otherwise, a DataFrame will be returned. + :rtype: Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame] + + Example Usage: + -------------- + .. 
code-block:: python + + shifter = TemporalTargetShifter(n_lags=2, target_col="target") + transformed_data = shifter.transform(time_frame) + + """ + if isinstance(tf, TimeFrame): + tf.sort_data() # Ensure the data is sorted before shifting + df = tf.get_data() + if not self.target_col: + self.target_col = tf._target_col + self.backend = tf.backend + elif tf is not None: + df = tf + if not self.target_col: + if hasattr(df, "columns"): + self.target_col = df.columns[-1] + else: + raise ValueError("The input DataFrame does not have columns.") + self._set_backend(df) + else: + raise ValueError("Input data is None.") + + # Delegate the transformation to backend-specific methods + if self.backend == BACKEND_PANDAS or self.backend == BACKEND_MODIN: + transformed_df = self._transform_pandas_modin(df) + elif self.backend == BACKEND_POLARS: + transformed_df = self._transform_polars(df) + else: + raise ValueError(f"Unsupported backend: {self.backend}") + + # If the input was a TimeFrame, return a transformed TimeFrame + if isinstance(tf, TimeFrame): + return TimeFrame( + transformed_df, + time_col=tf.time_col, + target_col=( + f"{self.target_col}_shift_{self.n_lags}" + if self.mode == self.MODE_MACHINE_LEARNING + else f"{self.target_col}_sequence" + ), + backend=self.backend, + ) + + return transformed_df + + def fit_transform( + self, tf: Optional[Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]] + ) -> Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]: + """Fit and transform the input data in a single step. + + This method combines the functionality of the `fit` and `transform` methods. It first validates and prepares the input data (fitting), + then applies the target variable shifting (transformation) based on the `n_lags` or `sequence_length` specified during initialization. + + :param tf: The `TimeFrame` object or a DataFrame (`pandas`, `modin`, or `polars`) to be transformed. + The data should contain a target column that will be shifted according to the `n_lags` or `sequence_length`. + :type tf: Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame], optional + :raises ValueError: If the input data is invalid or the backend is unsupported. + :raises ValueError: If the target column is not set, or is incompatible with the data. + :return: A transformed DataFrame or TimeFrame with the target variable shifted by the specified lags or sequence length. + :rtype: Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame] + + Example Usage: + -------------- + .. code-block:: python + + shifter = TemporalTargetShifter(n_lags=2, target_col="target") + shifted_data = shifter.fit_transform(time_frame) + + """ + self.fit(tf) + transformed = self.transform(tf) + + # Return TimeFrame if input was TimeFrame, otherwise return DataFrame + if isinstance(tf, TimeFrame): + return TimeFrame( + transformed, + time_col=tf.time_col, + target_col=( + f"{self.target_col}_shift_{self.n_lags}" + if self.mode == self.MODE_MACHINE_LEARNING + else f"{self.target_col}_sequence" + ), + backend=self.backend, + ) + return transformed diff --git a/src/temporalscope/partition/sliding_window.py b/src/temporalscope/partition/sliding_window.py index 78d556e..20660a1 100644 --- a/src/temporalscope/partition/sliding_window.py +++ b/src/temporalscope/partition/sliding_window.py @@ -11,6 +11,9 @@ This class utilizes the generator pattern for memory efficiency, yielding partition indices and data slices one at a time. 
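+
+A minimal sketch of the generator-based workflow (illustrative only; `data_tf` is assumed to be a
+prepared `TimeFrame`):
+
+.. code-block:: python
+
+    partitioner = SlidingWindowPartitioner(
+        tf=data_tf, window_size=10, stride=10, train_pct=0.7, test_pct=0.3
+    )
+    for indices in partitioner.fit():        # partition indices are generated lazily
+        print(indices)
+    for slices in partitioner.transform():   # DataFrame slices are generated lazily
+        print(slices)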
+The `SlidingWindowPartitioner` is intended for universal models, which assume flat partitioning across all entities. +Users are responsible for preprocessing steps such as deduplication or transforming `time_col` to numerical features. + TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -24,9 +27,8 @@ limitations under the License. """ -from typing import Dict, Tuple, Optional, Union, Iterator import itertools -import warnings +from typing import Dict, Tuple, Optional, Union, Iterator import pandas as pd import polars as pl import modin.pandas as mpd @@ -37,14 +39,31 @@ check_feature_to_sample_ratio, check_class_balance, ) +from temporalscope.core.core_utils import ( + validate_backend, + BACKEND_POLARS, + BACKEND_PANDAS, + BACKEND_MODIN, + SupportedBackendDataFrame +) + class SlidingWindowPartitioner(TemporalPartitionerProtocol): """Sliding Window Partitioner for dividing time series data into contiguous, non-overlapping partitions. - This class splits a dataset into partitions of a fixed window size. Users can define a stride to introduce gaps - between consecutive partitions. Each partition can be further divided into train, test, and validation sets - based on provided percentages. + This class splits a dataset into partitions using either a specified `window_size` or a calculated `window_size` + based on the desired `num_partitions`. Users can define a stride to introduce gaps between consecutive partitions. + Each partition can be further divided into train, test, and validation sets based on provided percentages. + + This class supports workflows for both machine learning (ML) and deep learning (DL) models. For ML, truncation or + varying window sizes may be acceptable. However, in DL pipelines (e.g., TensorFlow, PyTorch, JAX), padding is often + required to ensure uniform input shapes across batches, making the `truncate` parameter and padding behavior critical. + + The partitioning occurs globally across the entire dataset, maintaining the temporal order without grouping by entity. + This design ensures compatibility with universal models, where the entire dataset is treated as a single unit for + partitioning, aligning with the flexibility of the `TimeFrame` class. Users are responsible for any necessary preprocessing + (e.g., deduplication or transformation of `time_col`). Assumptions: ------------ @@ -52,43 +71,12 @@ class SlidingWindowPartitioner(TemporalPartitionerProtocol): - `test_pct` is optional, and if not provided, the remaining percentage after `train_pct` will implicitly be assigned to `test_pct`. - `val_pct` is also optional, and if provided, the sum of `train_pct`, `test_pct`, and `val_pct` must equal 1.0. - The total of `train_pct`, `test_pct`, and `val_pct` must sum to 1.0 exactly. + - Partitioning occurs globally across the dataset, and users are responsible for preprocessing, such as deduplication + or transformation of `time_col`. The class uses a generator pattern for `fit` and `transform` methods to yield partition indices and data slices one at a time, promoting memory efficiency and lazy loading. - :param tf: The TimeFrame object containing the data to be partitioned. - :type tf: TimeFrame - :param window_size: The size of each partition (number of rows). - :type window_size: Optional[int] - :param stride: The number of rows to skip between the start points of consecutive partitions. 
- A stride larger than the window size creates gaps, while a stride equal to the window size results in no gaps. - :type stride: int - :param reverse: Whether the sliding window should move in reverse (from the end to the start of the dataset). - If set to True, the window slides in reverse; if False (default), it slides forward. - :type reverse: bool - :param truncate: Whether to truncate the last partition if its size is smaller than the window size. - :type truncate: bool - :param train_pct: Percentage of data allocated for training within each partition. Must be provided. - :type train_pct: float - :param test_pct: Percentage of data allocated for testing within each partition. Optional. - :type test_pct: Optional[float] - :param val_pct: Optional percentage of data allocated for validation within each partition. If provided, the sum of `train_pct`, - `test_pct`, and `val_pct` must equal 1.0. - :type val_pct: Optional[float] - :param enable_warnings: Enable warnings for uneven partition sizes. - :type enable_warnings: bool - :param verbose: If set to True, print partitioning details. - :type verbose: bool - - :raises ValueError: - - If `window_size` is not provided or is not a positive integer. - - If `stride` is not a positive integer. - - If `train_pct`, `test_pct`, or `val_pct` are not within the range [0, 1]. - - If `train_pct`, `test_pct`, and `val_pct` do not sum to 1.0. - - If `train_pct` is provided without `test_pct` or `val_pct` summing to 1.0. - - If the dataset cannot be sorted or retrieved properly from the TimeFrame. - - If any required data is missing or invalid during the partitioning process. - Example Usage: -------------- .. code-block:: python @@ -99,16 +87,16 @@ class SlidingWindowPartitioner(TemporalPartitionerProtocol): # Create a sample dataset using Pandas data_df = pd.DataFrame({ - 'time': pd.date_range(start='2021-01-01', periods=20, freq='D'), - 'value': range(20) + 'time': pd.date_range(start='2021-01-01', periods=6, freq='D'), + 'value': range(6) }) # Create a TimeFrame object data_tf = TimeFrame(data_df, time_col='time', target_col='value', backend='pd') - # Create a SlidingWindowPartitioner with window_size=5 and stride=5 + # Create a SlidingWindowPartitioner with window_size=2 and stride=1 partitioner = SlidingWindowPartitioner( - tf=data_tf, window_size=5, stride=5, truncate=True, train_pct=0.8, test_pct=0.2, reverse=False + tf=data_tf, window_size=2, stride=1, truncate=True, train_pct=0.7, test_pct=0.3 ) # Iterate over partition indices @@ -119,48 +107,85 @@ class SlidingWindowPartitioner(TemporalPartitionerProtocol): for partition_data in partitioner.transform(): print(partition_data) - Notes - ----- - The sliding window can operate in two modes, depending on the `reverse` parameter: - + Visualization: + -------------- .. note:: - **Forward Sliding Window (reverse=False):** - - The sliding window starts from the beginning of the dataset and moves forward. - - Example: + Here's a conceptual 2D visualization of how the sliding window and stride work with a `time_col`: .. code-block:: text - Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ] - Window Size: 4, Stride: 3 - - Window 1: [ 1, 2, 3, 4 ] - Window 2: [ 4, 5, 6, 7 ] - Window 3: [ 7, 8, 9, 10 ] - - .. seealso:: - - **Reverse Sliding Window (reverse=True):** - - The sliding window starts from the end of the dataset and moves backward. - - Example: - - .. 
code-block:: text + time value + ------- ------ + 2021-01-01 0 + 2021-01-02 1 + 2021-01-03 2 + 2021-01-04 3 + 2021-01-05 4 + 2021-01-06 5 + + Partitioning with `window_size=2` and `stride=1`: + + - First partition: + time value + ------- ------ + 2021-01-01 0 + 2021-01-02 1 + + - Second partition: + time value + ------- ------ + 2021-01-02 1 + 2021-01-03 2 + + - Third partition: + time value + ------- ------ + 2021-01-03 2 + 2021-01-04 3 + + The sliding window moves across the entire dataset, maintaining the temporal order within each partition. - Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ] - Window Size: 4, Stride: 3 + :param tf: The TimeFrame object containing the data to be partitioned. + :type tf: TimeFrame + :param num_partitions: The desired number of partitions to create. If `window_size` is specified, this is ignored. + :type num_partitions: Optional[int] + :param window_size: The size of each partition (number of rows). If specified, it takes precedence over `num_partitions`. + :type window_size: Optional[int] + :param stride: The number of rows to skip between the start points of consecutive partitions. + A stride larger than the window size creates gaps, while a stride equal to the window size results in no gaps. + :type stride: int + :param reverse: Whether the sliding window should move in reverse (from the end to the start of the dataset). + If set to True, the window slides in reverse; if False (default), it slides forward. + :type reverse: bool + :param truncate: Whether to truncate the last partition if its size is smaller than the window size. + Note: For deep learning models, truncation can lead to varying input sizes and should be avoided. + :type truncate: bool + :param train_pct: Percentage of data allocated for training within each partition. Must be provided. + :type train_pct: float + :param test_pct: Percentage of data allocated for testing within each partition. Optional. + :type test_pct: Optional[float] + :param val_pct: Optional percentage of data allocated for validation within each partition. If provided, the sum of `train_pct`, + `test_pct`, and `val_pct` must equal 1.0. + :type val_pct: Optional[float] + :param enable_warnings: Enable warnings for uneven partition sizes. + :type enable_warnings: bool + :param verbose: If set to True, print partitioning details. + :type verbose: bool - Window 1: [ 7, 8, 9, 10 ] - Window 2: [ 4, 5, 6, 7 ] - Window 3: [ 1, 2, 3, 4 ] + :raises ValueError: + - If neither `window_size` nor `num_partitions` is provided or valid. + - If `stride` is not a positive integer. + - If `train_pct`, `test_pct`, or `val_pct` are not within the range [0, 1]. + - If `train_pct`, `test_pct`, and `val_pct` do not sum to 1.0. + - If the dataset cannot be sorted or retrieved properly from the TimeFrame. + - If any required data is missing or invalid during the partitioning process. """ def __init__( self, tf: TimeFrame, + num_partitions: Optional[int] = 2, window_size: Optional[int] = None, stride: int = 1, reverse: bool = False, @@ -171,10 +196,34 @@ def __init__( enable_warnings: bool = False, verbose: bool = False, ): - if window_size is None or window_size <= 0: + """Initialize the SlidingWindowPartitioner with the given parameters. + + :param tf: TimeFrame object to partition. + :param num_partitions: Number of partitions to create (ignored if `window_size` is provided). + :param window_size: Size of each partition. + :param stride: Number of rows to skip between partitions. + :param reverse: Whether the sliding window should move in reverse. 
+ :param truncate: Whether to truncate the last partition if smaller than `window_size`. + :param train_pct: Percentage of data allocated for training. + :param test_pct: Percentage of data allocated for testing. + :param val_pct: Percentage of data allocated for validation. + :param enable_warnings: Enable warnings for uneven partition sizes. + :param verbose: Enable verbose output. + :raises ValueError: If input parameters are invalid. + """ + validate_backend(tf.backend) + num_rows = tf.get_data().shape[0] + if window_size is None: + if num_partitions is None or num_partitions <= 0: + raise ValueError("`num_partitions` must be a positive integer.") + window_size = num_rows // num_partitions + + if window_size <= 0: raise ValueError("`window_size` must be a positive integer.") if stride <= 0: raise ValueError("`stride` must be a positive integer.") + + # Validate percentage values if not (0 <= train_pct <= 1): raise ValueError("`train_pct` must be between 0 and 1.") if test_pct is not None and not (0 <= test_pct <= 1): @@ -184,7 +233,7 @@ def __init__( if train_pct + (test_pct or 0) + (val_pct or 0) != 1.0: raise ValueError("Train, test, and validation percentages must sum to 1.0.") - self.tf = tf # Use TimeFrame directly + self.tf = tf self.window_size = window_size self.stride = stride self.reverse = reverse @@ -202,144 +251,185 @@ def __init__( def _precompute_percentages( self, train_pct: float, test_pct: Optional[float], val_pct: Optional[float] - ) -> Tuple[float, Optional[float], Optional[float]]: - """Calculate and validate the percentages for train, test, and validation splits. + ) -> Tuple[float, float, float]: + """Precompute and validate train, test, and validation percentages. - This method checks that the provided percentages for training, testing, and validation - add up to 100%. It ensures that if a validation percentage is specified, both training - and testing percentages are also provided. The method also prints out the calculated - percentages if `verbose` mode is enabled. + This function ensures that the sum of train, test, and validation percentages equals 1.0. + If `test_pct` is not provided, it will be set to the remaining percentage after the train percentage. - :param train_pct: The percentage of data allocated for training within each partition. + :param train_pct: Percentage of data allocated for training. :type train_pct: float - :param test_pct: The percentage of data allocated for testing within each partition. If not provided, - it defaults to 1.0 minus `train_pct` and `val_pct`. + :param test_pct: Optional. Percentage of data allocated for testing. :type test_pct: Optional[float] - :param val_pct: The percentage of data allocated for validation within each partition, if any. + :param val_pct: Optional. Percentage of data allocated for validation. :type val_pct: Optional[float] - :return: A tuple containing the validated percentages for training, testing, and validation. - :rtype: Tuple[float, Optional[float], Optional[float]] - - :raises ValueError: If the sum of `train_pct`, `test_pct`, and `val_pct` does not equal 100%, or - if `val_pct` is specified without both `train_pct` and `test_pct`. + :rtype: Tuple[float, float, float] + :raises ValueError: If the percentages do not sum to 1.0 or are not within the valid range (0 to 1). 
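+
+        For example, ``(train_pct=0.75, test_pct=None, val_pct=None)`` resolves to ``(0.75, 0.25, 0.0)``,
+        while ``(train_pct=0.5, test_pct=0.25, val_pct=None)`` assigns the remainder to validation and
+        resolves to ``(0.5, 0.25, 0.25)``.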
""" - total_pct = (train_pct or 0) + (test_pct or 0) + (val_pct or 0) - if total_pct != 1.0: + # Validate the train percentage + if not (0 <= train_pct <= 1): + raise ValueError("`train_pct` must be between 0 and 1.") + + # Ensure test_pct and val_pct are set correctly + if test_pct is None and val_pct is None: + test_pct = 1.0 - train_pct + val_pct = 0.0 + elif test_pct is not None and val_pct is None: + if not (0 <= test_pct <= 1): + raise ValueError("`test_pct` must be between 0 and 1.") + val_pct = 1.0 - train_pct - test_pct + elif test_pct is None and val_pct is not None: + if not (0 <= val_pct <= 1): + raise ValueError("`val_pct` must be between 0 and 1.") + test_pct = 1.0 - train_pct - val_pct + else: + # Both test_pct and val_pct are provided, ensure they are valid before comparison + if test_pct is None or val_pct is None: + raise ValueError("`test_pct` and `val_pct` cannot be None.") + if not (0 <= test_pct <= 1): + raise ValueError("`test_pct` must be between 0 and 1.") + if not (0 <= val_pct <= 1): + raise ValueError("`val_pct` must be between 0 and 1.") + + # Ensure they sum to 1.0 + total_pct = train_pct + (test_pct or 0) + (val_pct or 0) + if not (abs(total_pct - 1.0) < 1e-6): # Allow for floating-point imprecision raise ValueError("Train, test, and validation percentages must sum to 1.0.") - if val_pct is not None and (train_pct is None or test_pct is None): - raise ValueError( - "Validation percentage requires both train and test percentages to be provided." - ) - if self.verbose: - print(f"Train percentage: {train_pct}") - print(f"Test percentage: {test_pct}") - print(f"Validation percentage: {val_pct}") - return train_pct, test_pct, val_pct - - def _validate_partitioning(self, num_rows: int, window_size: int) -> None: - """Validate the feasibility of partitioning the dataset with the given window size and stride. - - This method checks if the dataset can be properly partitioned based on the provided `window_size` and `stride`. - It ensures that: - - The stride is not larger than the window size, which would cause partitions to be skipped. - - The stride is a positive integer. - - The dataset has enough rows to create at least one partition. - - :param num_rows: The total number of rows in the dataset. - :type num_rows: int - :param window_size: The window size to be used for each partition. + + # Ensure test_pct and val_pct are float types, not None + return train_pct, float(test_pct), float(val_pct) + + def _pad_partition( + self, + df: Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame], + window_size: int, + end: int, + reverse: bool, + ) -> SupportedBackendDataFrame: + """Pad the partition to the required window size by repeating the last row. + + This function ensures that the partition is padded to the full window size + by repeating the last row of the partition until the desired window size is achieved. + + :param df: The DataFrame (Pandas, Modin, or Polars) to pad. + :type df: Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame] + :param window_size: The target window size to pad the partition to. :type window_size: int - :raises ValueError: If partitioning is not possible due to any of the following conditions: - - The stride is larger than the window size. - - The stride is not a positive integer. - - The dataset is too small to create even a single partition with the given window size and stride. + :param end: The index indicating the end of the current partition. 
+ :type end: int + :param reverse: If True, the padding is added to the start; otherwise, it's added at the end. + :type reverse: bool + :return: A DataFrame padded to the specified window size. + :rtype: Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame] """ - # Ensure the stride is not larger than the window size - if self.stride > window_size: - raise ValueError( - f"Stride ({self.stride}) is larger than the window size ({window_size}). " - "This would cause partitions to be skipped." + # Calculate how many rows to pad + num_to_pad = window_size - df.shape[0] + + if num_to_pad <= 0: + return df # No need to pad + + # Handle Pandas or Modin DataFrames + if isinstance(df, (pd.DataFrame, mpd.DataFrame)): + # Select the row to use for padding + pad_row = df.iloc[[end - 1]] if not reverse else df.iloc[[0]] + + # Repeat the selected row for the required number of times + pad_rows = pd.concat([pad_row] * num_to_pad, ignore_index=True) + + # Concatenate the original DataFrame with the padding + if reverse: + return pd.concat([pad_rows, df], ignore_index=True) + else: + return pd.concat([df, pad_rows], ignore_index=True) + + # Handle Polars DataFrames + elif isinstance(df, pl.DataFrame): + # Select the row to use for padding + pad_row = df.slice(end - 1, 1) if not reverse else df.slice(0, 1) + + # Repeat the selected row for the required number of times + pad_rows = pl.DataFrame( + [pad_row.to_dict(as_series=False)[0] for _ in range(num_to_pad)] ) - # Ensure the stride is a positive integer - if self.stride <= 0: - raise ValueError("Stride must be a positive integer.") + # Concatenate the original DataFrame with the padding + if reverse: + return pad_rows.vstack(df) + else: + return df.vstack(pad_rows) - # Calculate the number of possible partitions - num_possible_partitions = (num_rows - window_size) // self.stride + 1 + raise TypeError("Unsupported DataFrame type.") - # Ensure there are enough rows in the dataset for at least one partition - if num_possible_partitions < 1: - raise ValueError( - f"Not enough rows ({num_rows}) to create partitions with window size {window_size} " - f"and stride {self.stride}. Try reducing the number of partitions or adjusting the window size and stride." - ) + def _fit_pandas_modin( + self, df: Union[pd.DataFrame, mpd.DataFrame] + ) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: + """Fit method specific to Pandas or Modin backends. - # Print validation success message if verbose mode is enabled - if self.verbose: - print( - f"Partitioning validated: {num_possible_partitions} possible partitions." - ) + :param df: Input DataFrame. + :return: Iterator yielding partition indices for Pandas/Modin. + """ + partition_count = 1 - def _get_data_shape(self) -> Tuple[int, int]: - """Get the number of rows and features from the dataset, ensuring compatibility with different backends. + num_rows = df.shape[0] + start_range = list(range(0, num_rows, self.stride)) - :return: A tuple containing the number of rows and features in the dataset. - :rtype: Tuple[int, int] - :raises ValueError: If the backend is unsupported. 
- """ - backend = self.tf.backend # Access the backend from the TimeFrame object - if backend in ["pd", "mpd"]: - num_rows, num_features = self.df.shape - elif backend == "pl": - num_rows = self.df.height - num_features = self.df.width - else: - raise ValueError(f"Unsupported backend: {backend}") - return num_rows, num_features + if self.reverse: + start_range.reverse() - def fit(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: - """Generate partition indices for the dataset, lazily yielding them one at a time. + for start in start_range: + end = start + self.window_size - This method divides the dataset into partitions based on the specified window size and stride. - It generates indices for the entire partition as well as for the training, testing, and validation splits - within each partition. + if end > num_rows: + if self.truncate: + break + end = num_rows - The method operates in a memory-efficient manner, generating and yielding each partition's indices - only when needed. + train_end = start + int(self.train_pct * (end - start)) + test_end = ( + train_end + int(self.test_pct * (end - start)) + if self.test_pct + else train_end + ) + validation_end = end if self.val_pct else test_end - :yield: A dictionary where each key corresponds to a partition (e.g., 'partition_1'), and the value is another - dictionary with keys 'full', 'train', 'test', and optionally 'validation', each mapping to a tuple of indices. - :rtype: Iterator[Dict[str, Dict[str, Tuple[int, int]]]] - :raises ValueError: If `window_size` is larger than the dataset or if the total number of partitions is insufficient. - """ - num_rows, _ = ( - self._get_data_shape() - ) # Retrieve the shape using backend-specific method - window_size = self.window_size + # Yield the partition indices + yield { + f"partition_{partition_count}": { + "full": (start, end), + "train": (start, train_end), + "test": (train_end, test_end), + "validation": ( + (test_end, validation_end) if self.val_pct else (0, 0) + ), + } + } + partition_count += 1 - # Validate that the partitioning is possible with the given window size and stride - self._validate_partitioning(num_rows, window_size) + def _fit_polars( + self, df: pl.DataFrame + ) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: + """Fit method specific to Polars backend. + :param df: Input DataFrame. + :return: Iterator yielding partition indices for Polars. 
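+
+        Example of a single yielded item (illustrative; ``window_size=4``, ``train_pct=0.75``,
+        ``test_pct=0.25`` and no validation split):
+
+        .. code-block:: python
+
+            {"partition_1": {"full": (0, 4), "train": (0, 3), "test": (3, 4), "validation": (0, 0)}}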
+ """ partition_count = 1 - # Ensure start_range is always a list to avoid type conflicts + num_rows = df.height start_range = list(range(0, num_rows, self.stride)) + if self.reverse: start_range.reverse() - # Iterate over the dataset to generate partition indices for start in start_range: - end = start + window_size + end = start + self.window_size - # Adjust the end if it exceeds the number of rows and truncate is False if end > num_rows: if self.truncate: - break # Stop iteration if the last partition is smaller than the window size and truncate is True - end = num_rows # Adjust to include the remaining data + break + end = num_rows - # Compute the split points for train, test, and validation train_end = start + int(self.train_pct * (end - start)) test_end = ( train_end + int(self.test_pct * (end - start)) @@ -359,116 +449,412 @@ def fit(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: ), } } + partition_count += 1 - # If verbose is enabled, print details of the current partition - if self.verbose: - print(f"Partition {partition_count}: {start} to {end}") - print( - f"Training: {start} to {train_end}, Testing: {train_end} to {test_end}" - ) + def _transform_pandas_modin( + self, df: Union[pd.DataFrame, mpd.DataFrame] + ) -> Iterator[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame]]]]: + """Transform method for Pandas/Modin backend. - partition_count += 1 + This method transforms the partitioned dataset into slices, yielding the data slices corresponding to + the partition indices generated by the `fit` method. - # Track that fit has been run - self._fit_executed = True + It processes each partition and splits it into train, test, and optionally validation sets. + If a partition's size is smaller than the specified `window_size`, padding is applied to ensure + uniform size across partitions, unless `truncate` is set to True. - def transform( - self, - ) -> Iterator[ - Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]] - ]: - """Generate and yield the data slices for each partition. - - This method utilizes the partition indices generated by the `fit` method to extract and return - the corresponding data slices from the original dataset. The data is returned for each partition, - including the full partition as well as the training, testing, and validation subsets. - - The method is designed to be memory-efficient, generating and yielding each partition's data - only when required. - - :yield: A dictionary where each key corresponds to a partition (e.g., 'partition_1'), and the value is another - dictionary with keys 'full', 'train', 'test', and optionally 'validation', each mapping to a DataFrame slice. - :rtype: Iterator[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]]] - :raises ValueError: If data slicing fails for any partition, which could occur if the indices are out of bounds. + :param df: Input DataFrame. This can be either Pandas or Modin DataFrame, depending on the backend. + :type df: Union[pd.DataFrame, mpd.DataFrame] + :return: Iterator yielding partitioned DataFrame slices for Pandas/Modin backends. + :rtype: Iterator[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame]]]] + + Example Usage: + -------------- + .. 
code-block:: python + + partitioner = SlidingWindowPartitioner( + tf=data_tf, window_size=5, stride=2, train_pct=0.7, test_pct=0.3 + ) + + for partition_data in partitioner._transform_pandas_modin(df): + print(partition_data) + + Output Format: + -------------- + Each yielded partition has the following structure: + + .. code-block:: python + + { + 'partition_1': { + 'full': , + 'train': , + 'test': , + 'validation': # (Optional, if val_pct is provided) + } + } + + Notes: + ------ + - Padding is applied when the size of a partition is smaller than the `window_size`, unless truncation is enabled. + - Ensure that the input DataFrame is not empty to avoid runtime errors. + + Performance Considerations: + --------------------------- + - For very large datasets, the padding process may increase memory usage. Consider using Modin when handling + large datasets to take advantage of distributed processing. """ - # Generate partition indices using the fit method + partition_count = 1 + for partition in self.fit(): partitioned_data = {} - # Iterate over each partition and its corresponding indices - for key, partition_dict in partition.items(): - partitioned_data[key] = { - # Slice the data using the appropriate backend method (pandas, Modin, or Polars) - part_name: ( - self.df.iloc[start:end] - if isinstance(self.df, (pd.DataFrame, mpd.DataFrame)) - else self.df.slice(start, end - start) - ) # Polars-specific slicing - for part_name, (start, end) in partition_dict.items() - if start is not None - and end is not None # Ensure valid start and end indices + # Ensure partition is a dictionary + if isinstance(partition, dict): + for key, partition_dict in partition.items(): + partitioned_data[key] = { + part_name: df.iloc[start:end] + for part_name, (start, end) in partition_dict.items() + if start is not None and end is not None + } + + # If the partition size is smaller than the window size, pad it + if ( + partition_dict["full"][1] - partition_dict["full"][0] + < self.window_size + and not self.truncate + ): + partitioned_data[key]["full"] = self._pad_partition( + partitioned_data[key]["full"], + self.window_size, + partition_dict["full"][1], + self.reverse, + ) + yield partitioned_data + + partition_count += 1 + + def _transform_polars( + self, df: pl.DataFrame + ) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: + """Transform method for Polars backend. + + This method generates partitioned data slices for the Polars backend, yielding the data slices corresponding + to the partition indices generated by the `fit` method. If the size of a partition is smaller than the + specified `window_size`, padding is applied unless `truncate` is set to True. + + :param df: Input Polars DataFrame. + :type df: pl.DataFrame + :return: Iterator yielding partitioned DataFrame slices for Polars backend. + :rtype: Iterator[Dict[str, Dict[str, pl.DataFrame]]] + + Example Usage: + -------------- + .. code-block:: python + + partitioner = SlidingWindowPartitioner( + tf=data_tf, window_size=5, stride=2, train_pct=0.7, test_pct=0.3 + ) + + for partition_data in partitioner._transform_polars(df): + print(partition_data) + + Output Format: + -------------- + Each yielded partition has the following structure: + + .. 
code-block:: python + + { + 'partition_1': { + 'full': , + 'train': , + 'test': , + 'validation': # (Optional, if val_pct is provided) } + } - # Yield the partitioned data - yield partitioned_data + Notes: + ------ + - Padding is applied when the size of a partition is smaller than the `window_size`, unless truncation is enabled. + - Polars DataFrames offer better performance with large datasets, especially for complex operations. - # Track that transform has been run - self._transform_executed = True + Performance Considerations: + --------------------------- + - For very large datasets, Polars DataFrames are recommended due to their lower memory footprint and faster + performance when compared to Pandas. Use Polars for more efficient partitioning and transformations. + """ + partition_count = 1 - def check_data(self, partition_index: Optional[int] = None) -> None: - """Perform data checks on the entire TimeFrame or a specific partition. + num_rows = df.height + start_range = list(range(0, num_rows, self.stride)) + + if self.reverse: + start_range.reverse() + + for start in start_range: + end = start + self.window_size + + if end > num_rows: + if self.truncate: + break + end = num_rows + + train_end = start + int(self.train_pct * (end - start)) + test_end = ( + train_end + int(self.test_pct * (end - start)) + if self.test_pct + else train_end + ) + validation_end = end if self.val_pct else test_end + + # Yield the partitioned data slices + partitioned_data = { + part_name: df.slice(start, end - start) + for part_name, (start, end) in { + "full": (start, end), + "train": (start, train_end), + "test": (train_end, test_end), + "validation": (test_end, validation_end), + }.items() + } + + # If partition size is smaller than window size, pad it + if partitioned_data["full"].height < self.window_size and not self.truncate: + partitioned_data["full"] = self._pad_partition( + partitioned_data["full"], + self.window_size, + partitioned_data["full"].height, + self.reverse, + ) + + # Wrap the partitioned_data in a dictionary to match the expected return type + yield {f"partition_{partition_count}": partitioned_data} + partition_count += 1 + + def fit(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: + """Generate partition indices for the dataset. + + This method creates indices for sliding window partitions based on the specified `window_size`, `stride`, + and other parameters. It yields the start and end indices for each partition, as well as train, test, + and validation splits within each partition. + + :return: Iterator that yields partition indices for training, testing, and validation. + :rtype: Iterator[Dict[str, Dict[str, Tuple[int, int]]]] + :raises ValueError: If an unsupported backend is encountered. + + Example Usage: + -------------- + .. code-block:: python + + partitioner = SlidingWindowPartitioner( + tf=data_tf, window_size=5, stride=2, train_pct=0.7, test_pct=0.3 + ) + + for partition in partitioner.fit(): + print(partition) + + Output Format: + -------------- + Each yielded partition has the following structure: + + .. code-block:: python + + { + 'partition_1': { + 'full': (start_index, end_index), + 'train': (train_start, train_end), + 'test': (test_start, test_end), + 'validation': (validation_start, validation_end) # (Optional, if val_pct is provided) + } + } + + .. note:: + - The indices refer to row indices in the dataset, and the format remains the same regardless of the backend. 
+ - The partitioning occurs in a sliding window fashion with optional gaps, as specified by the stride. + + .. seealso:: + - :meth:`transform`: For generating the actual data slices corresponding to these indices. + """ + + df = self.tf.get_data() # Get the dataset from the TimeFrame + + # Call backend-specific partitioning method + if self.tf.backend in [BACKEND_PANDAS, BACKEND_MODIN]: + return self._fit_pandas_modin(df) + elif self.tf.backend == BACKEND_POLARS: + return self._fit_polars(df) + else: + raise ValueError(f"Unsupported backend: {self.tf.backend}") - This method validates whether the dataset or a specific partition meets - recommended criteria based on sample size, feature-to-sample ratio, and class balance. + def transform(self) -> Iterator[Dict[str, Dict[str, SupportedBackendDataFrame]]]: + """Generate partitioned data slices for the dataset. + + This method yields the actual data slices corresponding to the partition indices generated by the `fit` method. + The slices are returned as generic DataFrames, regardless of the backend (e.g., Pandas, Modin, or Polars). + + :return: Iterator yielding partitioned DataFrame slices. + :rtype: Iterator[Dict[str, Dict[str, DataFrame]]] + :raises ValueError: If an unsupported backend is encountered. + + Example Usage: + -------------- + .. code-block:: python + + partitioner = SlidingWindowPartitioner( + tf=data_tf, window_size=5, stride=2, train_pct=0.7, test_pct=0.3 + ) + + for partition_data in partitioner.transform(): + print(partition_data) + + Output Format: + -------------- + Each yielded partition has the following structure: + + .. code-block:: python + + { + 'partition_1': { + 'full': , + 'train': , + 'test': , + 'validation': # (Optional, if val_pct is provided) + } + } + + .. note:: + - This method transforms the dataset into partitioned slices based on indices created by `fit`. + - Ensure the dataset is preprocessed properly to avoid errors during slicing. + - The DataFrame format is agnostic to the backend. + + .. seealso:: + - :meth:`fit`: For generating the partition indices that are sliced in this method. + """ + df = self.tf.get_data() # Get the dataset from the TimeFrame + + # Call backend-specific transformation method + if self.tf.backend in [BACKEND_PANDAS, BACKEND_MODIN]: + return self._transform_pandas_modin(df) + elif self.tf.backend == BACKEND_POLARS: + return self._transform_polars(df) + else: + raise ValueError(f"Unsupported backend: {self.tf.backend}") - - If `partition_index` is provided, checks are performed on the specified partition. - - If `partition_index` is None, checks are performed on the entire TimeFrame. + def fit_transform(self) -> Iterator[Dict[str, Dict[str, SupportedBackendDataFrame]]]: + """Fit and transform the dataset in a single step. - Assumptions: - ------------ - - If the method is called without running `fit`, it checks the full dataset. - - If `fit` has been run and `partition_index` is provided, it checks the specific partition. + This method combines the functionality of the `fit` and `transform` methods. It first generates partition indices + using `fit`, and then returns the partitioned data slices using `transform`. The DataFrame format is backend-agnostic. - Warnings are raised instead of errors to allow users to proceed with caution. + :return: Iterator yielding partitioned DataFrame slices. + :rtype: Iterator[Dict[str, Dict[str, DataFrame]]] + :raises ValueError: If an unsupported backend is encountered. 
-        :param partition_index: Index of the partition to check, or None to check the entire dataset.
+        Example Usage:
+        --------------
+        .. code-block:: python
+
+            partitioner = SlidingWindowPartitioner(
+                tf=data_tf, window_size=5, stride=2, train_pct=0.7, test_pct=0.3
+            )
+
+            for partition_data in partitioner.fit_transform():
+                print(partition_data)
+
+        Output Format:
+        --------------
+        Each yielded partition has the following structure:
+
+        .. code-block:: python
+
+            {
+                'partition_1': {
+                    'full': <DataFrame slice>,
+                    'train': <DataFrame slice>,
+                    'test': <DataFrame slice>,
+                    'validation': <DataFrame slice>  # (Optional, if val_pct is provided)
+                }
+            }
+
+        .. note::
+           - This method is a convenient way to generate partition indices and their corresponding data slices in one step.
+           - Ensure that the dataset is preprocessed properly to avoid issues during partitioning.
+
+        .. seealso::
+           - :meth:`fit`: For generating partition indices.
+           - :meth:`transform`: For generating the actual partitioned slices.
+        """
+        # transform() already iterates fit() internally, so each partition is yielded exactly once.
+        yield from self.transform()
+
+    def check_data(self, partition_index: Optional[int] = None) -> None:
+        """Perform data checks on the entire dataset or a specific partition.
+
+        This method performs validation checks on the dataset or a specific partition, ensuring that
+        the sample size, feature-to-sample ratio, and class balance (if applicable) meet the expected criteria.
+
+        If a partition index is provided, it checks only that partition; otherwise, it checks the entire dataset.
+
+        :param partition_index: Index of the partition to check, or `None` to check the full dataset.
         :type partition_index: Optional[int]
+        :raises ValueError: If the dataset or a partition fails validation checks.
+
+        Example Usage:
+        --------------
+        .. code-block:: python
+
+            partitioner = SlidingWindowPartitioner(
+                tf=data_tf, window_size=5, stride=2, train_pct=0.7, test_pct=0.3
+            )
+
+            # Perform checks on the full dataset
+            partitioner.check_data()
+
+            # Perform checks on the first partition
+            partitioner.check_data(partition_index=0)
+
+        .. note::
+           - This method ensures that the data's structure and integrity (sample size, feature ratio, class balance)
+             meet expectations for further processing.
+           - Ensure the dataset or partition is not empty to avoid runtime errors.
""" + df = self.tf.get_data() # Get the DataFrame (could be Pandas, Modin, or Polars) + if partition_index is not None: - # Generate the required partition directly without assuming prior fit() call partition = next(itertools.islice(self.fit(), partition_index, None)) start, end = partition[f"partition_{partition_index + 1}"]["full"] - df_to_check = self.df[start:end] + + # Slice the DataFrame based on its type (Pandas/Modin vs Polars) + if isinstance(df, (pd.DataFrame, mpd.DataFrame)): + df_to_check = df.iloc[start:end] + elif isinstance(df, pl.DataFrame): + df_to_check = df.slice(start, end - start) + else: + raise ValueError(f"Unsupported DataFrame type: {type(df)}") + context = f"Partition {partition_index + 1}" - min_samples = 100 # Lower threshold for partitions + min_samples = 100 else: - df_to_check = self.df + df_to_check = df context = "Full dataset" - min_samples = 3000 # Higher threshold for the full dataset - - num_rows, num_features = df_to_check.shape - target_col = self.tf.target_col + min_samples = 3000 - # Perform checks with warnings enabled + # Perform sample size, feature ratio, and class balance checks check_sample_size( df_to_check, backend=self.tf.backend, min_samples=min_samples, - max_samples=100000, # Standard large threshold + max_samples=100000, enable_warnings=True, ) - check_feature_to_sample_ratio( - df_to_check, - backend=self.tf.backend, - max_ratio=0.2, # Standard ratio for features to samples - enable_warnings=True, + df_to_check, backend=self.tf.backend, max_ratio=0.2, enable_warnings=True ) - - if target_col: + if self.tf.target_col: check_class_balance( df_to_check, - target_col=target_col, + target_col=self.tf.target_col, backend=self.tf.backend, enable_warnings=True, ) diff --git a/test/unit/test_core_temporal_data_loader.py b/test/unit/test_core_temporal_data_loader.py index 72da3aa..1d7f78b 100644 --- a/test/unit/test_core_temporal_data_loader.py +++ b/test/unit/test_core_temporal_data_loader.py @@ -1,16 +1,12 @@ """ TemporalScope/test/unit/test_core_temporal_data_loader.py -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
""" import pytest @@ -19,165 +15,257 @@ import pandas as pd import modin.pandas as mpd from temporalscope.core.temporal_data_loader import TimeFrame -from typing import Union, Dict, Any, List +from temporalscope.core.core_utils import ( + BACKEND_POLARS, + BACKEND_PANDAS, + BACKEND_MODIN, +) +from typing import Union, Dict, List from datetime import date, timedelta -def create_sample_data(num_samples: int = 100, num_features: int = 3) -> Dict[str, Union[List[date], List[float], List[str]]]: - """Create a sample data dictionary representative of a time series ML dataset.""" +def create_sample_data( + num_samples: int = 100, num_features: int = 3 +) -> Dict[str, Union[List[date], List[float]]]: + """Create a sample data dictionary for testing. + + :param num_samples: Number of samples to generate, defaults to 100 + :type num_samples: int, optional + :param num_features: Number of feature columns to generate, defaults to 3 + :type num_features: int, optional + :return: A dictionary containing generated data with keys 'time', 'feature_1', ..., 'feature_n', and 'target' + :rtype: Dict[str, Union[List[date], List[float]]] + """ start_date = date(2021, 1, 1) - data = { "time": [start_date + timedelta(days=i) for i in range(num_samples)], - "id": [f"ID_{i%3}" for i in range(num_samples)], # 3 different IDs cycling } - - # Add feature columns - for i in range(num_features): - data[f"feature_{i+1}"] = np.random.rand(num_samples).tolist() - - # Add a target column (let's assume it's a function of the features plus some noise) - data["target"] = [sum(data[f"feature_{j+1}"][i] for j in range(num_features)) + np.random.normal(0, 0.1) - for i in range(num_samples)] - + + # Generate feature columns + for i in range(1, num_features + 1): + data[f"feature_{i}"] = np.random.rand(num_samples).tolist() + + # Generate target column (e.g., sum of features plus noise) + data["target"] = [ + sum(data[f"feature_{j}"][i] for j in range(1, num_features + 1)) + np.random.normal(0, 0.1) + for i in range(num_samples) + ] + return data -@pytest.fixture(params=["pd", "pl", "mpd"]) -def sample_df(request): - """Fixture for creating sample DataFrames for each backend.""" +@pytest.fixture(params=[BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +def sample_dataframe(request): + """Fixture to create sample DataFrames for each backend. + + :param request: Pytest fixture request object containing the backend parameter. + :type request: _pytest.fixtures.SubRequest + :return: A tuple of the DataFrame and the backend identifier. 
+ :rtype: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ data = create_sample_data() - if request.param == "pd": - return pd.DataFrame(data) - elif request.param == "pl": - return pl.DataFrame(data) - elif request.param == "mpd": - return mpd.DataFrame(data) - -@pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -def test_initialize(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): - """Test TimeFrame initialization with various backends.""" - tf = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) - + backend = request.param + + if backend == BACKEND_POLARS: + # Ensure 'time' column is properly typed + data['time'] = pl.Series(data['time']) + df = pl.DataFrame(data) + elif backend == BACKEND_PANDAS: + df = pd.DataFrame(data) + elif backend == BACKEND_MODIN: + df = mpd.DataFrame(data) + else: + raise ValueError(f"Unsupported backend: {backend}") + return df, backend + +def test_timeframe_initialization(sample_dataframe): + """Test the initialization of TimeFrame with various backends. + + :param sample_dataframe: Fixture providing the DataFrame and backend. + :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ + df, backend = sample_dataframe + tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) assert tf.backend == backend assert tf.time_col == "time" assert tf.target_col == "target" - assert tf.id_col is None - - if backend == "pd": - assert isinstance(tf.get_data(), pd.DataFrame) - elif backend == "pl": - assert isinstance(tf.get_data(), pl.DataFrame) - elif backend == "mpd": - assert isinstance(tf.get_data(), mpd.DataFrame) - -# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -# def test_initialize_with_id(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): -# """Test TimeFrame initialization with ID column.""" -# tf = TimeFrame(sample_df, time_col="time", target_col="target", id_col="id", backend=backend) - -# assert tf.id_col == "id" + assert len(tf.get_data()) == len(df) -# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -# def test_validate_columns(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): -# """Test column validation.""" -# tf = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) -# tf.validate_columns() # Should not raise an error +def test_sort_data(sample_dataframe): + """Test the sort_data method. -# with pytest.raises(ValueError): -# TimeFrame(sample_df, time_col="non_existent", target_col="target", backend=backend) + :param sample_dataframe: Fixture providing the DataFrame and backend. + :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ + df, backend = sample_dataframe + tf = TimeFrame(df, time_col="time", target_col="target", backend=backend, sort=False) + # Shuffle the data + if backend == BACKEND_POLARS: + shuffled_df = tf.get_data().sample(fraction=1.0) + else: + shuffled_df = tf.get_data().sample(frac=1).reset_index(drop=True) + tf.update_data(shuffled_df) + tf.sort_data(ascending=True) + sorted_df = tf.get_data() + # Verify that data is sorted + times = sorted_df[tf.time_col].to_list() if backend == BACKEND_POLARS else sorted_df[tf.time_col].tolist() + assert times == sorted(times) +def test_update_data(sample_dataframe): + """Test the update_data method. 
-# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -# def test_get_data(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): -# """Test get_data method.""" -# tf = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) -# assert tf.get_data().shape == sample_df.shape + :param sample_dataframe: Fixture providing the DataFrame and backend. + :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ + df, backend = sample_dataframe + tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) + new_data = create_sample_data(num_samples=50) + if backend == BACKEND_POLARS: + new_data['time'] = pl.Series(new_data['time']) + new_df = pl.DataFrame(new_data) + elif backend == BACKEND_PANDAS: + new_df = pd.DataFrame(new_data) + elif backend == BACKEND_MODIN: + new_df = mpd.DataFrame(new_data) + tf.update_data(new_df) + assert len(tf.get_data()) == 50 +def test_update_target_col(sample_dataframe): + """Test the update_target_col method. + :param sample_dataframe: Fixture providing the DataFrame and backend. + :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ + df, backend = sample_dataframe + tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) + new_target = np.random.rand(len(df)) + if backend == BACKEND_POLARS: + new_target_col = pl.Series(new_target) + elif backend == BACKEND_PANDAS: + new_target_col = pd.Series(new_target) + elif backend == BACKEND_MODIN: + new_target_col = mpd.Series(new_target) + tf.update_target_col(new_target_col) + updated_target = tf.get_data()[tf.target_col].to_numpy() if backend == BACKEND_POLARS else tf.get_data()[tf.target_col].values + np.testing.assert_array_almost_equal(updated_target, new_target) + +def test_missing_columns(sample_dataframe): + """Test initialization with missing required columns. + + :param sample_dataframe: Fixture providing the DataFrame and backend. + :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ + df, backend = sample_dataframe + # Remove the target column + if backend == BACKEND_POLARS: + df = df.drop(["target"]) + else: + df = df.drop(columns=["target"]) + with pytest.raises(ValueError) as excinfo: + TimeFrame(df, time_col="time", target_col="target", backend=backend) + assert "Missing required columns" in str(excinfo.value) + +def test_invalid_backend(sample_dataframe): + """Test initialization with an invalid backend. + + :param sample_dataframe: Fixture providing the DataFrame. + :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ + df, _ = sample_dataframe + invalid_backend = "invalid_backend" + with pytest.raises(ValueError) as excinfo: + TimeFrame(df, time_col="time", target_col="target", backend=invalid_backend) + assert f"Unsupported backend '{invalid_backend}'" in str(excinfo.value) + +def test_invalid_time_col_type(sample_dataframe): + """Test initialization with invalid time_col type. + + :param sample_dataframe: Fixture providing the DataFrame and backend. + :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ + df, backend = sample_dataframe + with pytest.raises(ValueError) as excinfo: + TimeFrame(df, time_col=123, target_col="target", backend=backend) + assert "time_col must be a non-empty string." in str(excinfo.value) + +def test_invalid_target_col_type(sample_dataframe): + """Test initialization with invalid target_col type. 
+ + :param sample_dataframe: Fixture providing the DataFrame and backend. + :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ + df, backend = sample_dataframe + with pytest.raises(ValueError) as excinfo: + TimeFrame(df, time_col="time", target_col=None, backend=backend) + assert "target_col must be a non-empty string." in str(excinfo.value) + +def test_invalid_dataframe_type(): + """Test initialization with an invalid DataFrame type.""" + invalid_df = "This is not a DataFrame" + with pytest.raises(TypeError): + TimeFrame(invalid_df, time_col="time", target_col="target", backend=BACKEND_POLARS) + +def test_sort_data_invalid_backend(): + """Test initialization with an unsupported backend.""" + data = create_sample_data() + df = pd.DataFrame(data) + with pytest.raises(ValueError) as excinfo: + TimeFrame(df, time_col="time", target_col="target", backend="unsupported_backend") + assert "Unsupported backend" in str(excinfo.value) + +def test_update_target_col_invalid_length(sample_dataframe): + """Test update_target_col with mismatched length. + + :param sample_dataframe: Fixture providing the DataFrame and backend. + :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ + df, backend = sample_dataframe + tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) + new_target = np.random.rand(len(df) - 1) + if backend == BACKEND_POLARS: + new_target_col = pl.Series(new_target) + elif backend == BACKEND_PANDAS: + new_target_col = pd.Series(new_target) + elif backend == BACKEND_MODIN: + new_target_col = mpd.Series(new_target) + with pytest.raises(ValueError) as excinfo: + tf.update_target_col(new_target_col) + assert "The new target column must have the same number of rows as the DataFrame." in str(excinfo.value) + +def test_update_target_col_invalid_type(sample_dataframe): + """Test update_target_col with invalid Series type. + + :param sample_dataframe: Fixture providing the DataFrame and backend. 
+ :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] + """ + df, backend = sample_dataframe + tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) + invalid_series = "This is not a Series" + with pytest.raises(TypeError) as excinfo: + tf.update_target_col(invalid_series) + assert "Expected a" in str(excinfo.value) -# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -# def test_update_data(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): -# """Test update_data method.""" -# tf = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) - -# new_data = create_sample_data() -# new_data["target"] = [x * 2 for x in new_data["target"]] # Double the target values - -# if backend == "pd": -# new_df = pd.DataFrame(new_data) -# elif backend == "pl": -# new_df = pl.DataFrame(new_data) -# else: -# new_df = mpd.DataFrame(new_data) - -# tf.update_data(new_df) - -# if backend == "pl": -# assert tf.get_data()["target"].to_list() == new_df["target"].to_list() -# else: -# assert tf.get_data()["target"].tolist() == new_df["target"].tolist() - -# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -# def test_update_target_col(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): -# """Test update_target_col method.""" -# tf = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) - -# new_target = [x * 3 for x in range(100)] # Triple the values - -# if backend == "pd": -# new_target_series = pd.Series(new_target) -# elif backend == "pl": -# new_target_series = pl.Series(new_target) -# else: -# new_target_series = mpd.Series(new_target) - -# tf.update_target_col(new_target_series) - -# if backend == "pl": -# assert tf.get_data()["target"].to_list() == new_target -# else: -# assert tf.get_data()["target"].tolist() == new_target - - -# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -# def test_sort_data(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): -# tf = TimeFrame(sample_df, time_col="time", target_col="target", id_col="id", backend=backend, sort=True) -# sorted_df = tf.get_data() - -# if backend == "pl": -# time_values = sorted_df["time"].to_list() -# else: -# time_values = sorted_df["time"].tolist() - -# assert time_values == sorted(time_values) - -# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -# def test_get_grouped_data(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): -# tf = TimeFrame(sample_df, time_col="time", target_col="target", id_col="id", backend=backend) -# grouped_data = tf.get_grouped_data() -# if backend == "pl": -# assert grouped_data.shape[0] == len(set(sample_df["id"].to_list())) -# else: -# assert grouped_data.shape[0] == len(set(sample_df["id"].tolist())) - -# with pytest.raises(ValueError): -# tf_without_id = TimeFrame(sample_df, time_col="time", target_col="target", backend=backend) -# tf_without_id.get_grouped_data() - - -# @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -# def test_check_duplicates(sample_df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], backend: str): -# tf = TimeFrame(sample_df, time_col="time", target_col="target", id_col="id", backend=backend) -# tf.check_duplicates() # Should not raise an error - -# # Create a DataFrame with duplicates -# duplicate_data = sample_df.copy() -# if backend == "pl": -# duplicate_data = duplicate_data.with_columns(pl.col("time").shift(-1)) -# else: -# duplicate_data.loc[1:, "time"] = 
duplicate_data.loc[:98, "time"].values +@pytest.mark.parametrize("df_backend,expected_backend", [ + (BACKEND_POLARS, BACKEND_POLARS), + (BACKEND_PANDAS, BACKEND_PANDAS), + (BACKEND_MODIN, BACKEND_MODIN) +]) +def test_infer_backend(sample_dataframe, df_backend, expected_backend): + """Test that the backend is correctly inferred for Polars, Pandas, and Modin DataFrames.""" + df, backend = sample_dataframe + if backend == df_backend: + tf = TimeFrame(df, time_col="time", target_col="target") + inferred_backend = tf._infer_backend(df) + assert inferred_backend == expected_backend + +def test_infer_backend_invalid(): + """Test that a ValueError is raised for unsupported DataFrame types.""" + invalid_df = "This is not a DataFrame" -# tf_with_duplicates = TimeFrame(duplicate_data, time_col="time", target_col="target", id_col="id", backend=backend) + # Creating a valid TimeFrame object first to avoid column validation + valid_df = pd.DataFrame({"time": [1, 2, 3], "target": [1, 2, 3]}) + tf = TimeFrame(valid_df, time_col="time", target_col="target") # Placeholder -# with pytest.raises(ValueError): -# tf_with_duplicates.check_duplicates() + # Now test the _infer_backend method directly on the invalid data + with pytest.raises(ValueError) as excinfo: + tf._infer_backend(invalid_df) + assert "Unsupported DataFrame type" in str(excinfo.value) diff --git a/test/unit/test_core_temporal_target_shifter.py b/test/unit/test_core_temporal_target_shifter.py index d28f9cf..45c1a33 100644 --- a/test/unit/test_core_temporal_target_shifter.py +++ b/test/unit/test_core_temporal_target_shifter.py @@ -1,5 +1,8 @@ """ TemporalScope/test/unit/test_core_temporal_target_shifter.py +This file contains unit tests for the TemporalTargetShifter class to ensure it behaves correctly across different +backends (pandas, modin, polars), modes of operation (machine_learning, deep_learning), and various configurations. + TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -13,111 +16,194 @@ limitations under the License. 
""" -import modin.pandas as mpd -import pandas as pd -import polars as pl import pytest +import polars as pl +import pandas as pd +import modin.pandas as mpd +import numpy as np from temporalscope.core.temporal_target_shifter import TemporalTargetShifter from temporalscope.core.temporal_data_loader import TimeFrame -from typing import Union - -# Test DataFrames -pd_df = pd.DataFrame({ - "time": pd.date_range(start="2023-01-01", periods=5, freq="D"), - "target": [10, 20, 30, 40, 50], -}) - -pl_df = pl.DataFrame({ - "time": ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"], - "target": [10, 20, 30, 40, 50], -}) - -mpd_df = mpd.DataFrame({ - "time": pd.date_range(start="2023-01-01", periods=5, freq="D"), - "target": [10, 20, 30, 40, 50], -}) - -@pytest.mark.parametrize("backend, df", [ - ("pd", pd_df), - ("pl", pl_df), - ("mpd", mpd_df), +from temporalscope.core.core_utils import BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN + +# Fixture to generate sample dataframes for different backends +@pytest.fixture(params=[BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +def sample_dataframe(request): + """Fixture to generate sample dataframes for different backends.""" + data = { + "time": pd.date_range(start="2022-01-01", periods=100), + "target": np.random.rand(100), + "feature_1": np.random.rand(100), + "feature_2": np.random.rand(100), + } + backend = request.param + if backend == BACKEND_POLARS: + df = pl.DataFrame(data) + elif backend == BACKEND_PANDAS: + df = pd.DataFrame(data) + elif backend == BACKEND_MODIN: + df = mpd.DataFrame(data) + return df, backend, "target" + +# Parametrized Test for Backend Inference, n_lags, and Modes +@pytest.mark.parametrize("n_lags, mode, sequence_length", [ + (1, TemporalTargetShifter.MODE_MACHINE_LEARNING, None), + (3, TemporalTargetShifter.MODE_MACHINE_LEARNING, None), + (1, TemporalTargetShifter.MODE_DEEP_LEARNING, 5) ]) -def test_shift_target_scalar_output(backend: str, df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame]) -> None: - """Test shifting target to scalar output for each backend.""" - tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) - shifter = TemporalTargetShifter(shift_steps=1, array_output=False) - tf_transformed = shifter.transform(tf) +@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) # Parametrizing backends as well +def test_backend_inference(backend, n_lags, mode, sequence_length): + """Test backend inference and shifting functionality across all backends.""" + + # Generate data for the current backend + data = { + "time": pd.date_range(start="2022-01-01", periods=100), + "target": np.random.rand(100), + "feature_1": np.random.rand(100), + "feature_2": np.random.rand(100), + } + + if backend == BACKEND_POLARS: + df = pl.DataFrame(data) + elif backend == BACKEND_PANDAS: + df = pd.DataFrame(data) + elif backend == BACKEND_MODIN: + df = mpd.DataFrame(data) - expected_target = [20, 30, 40, 50, None] + # Initialize shifter + shifter = TemporalTargetShifter(n_lags=n_lags, mode=mode, sequence_length=sequence_length, target_col="target") - if backend == "pl": - actual_target = tf_transformed.get_data()["target_shift_1"].to_list() - else: - actual_target = tf_transformed.get_data()["target_shift_1"].tolist() + # Test fitting the dataframe and checking the inferred backend + shifter.fit(df) + assert shifter.backend == backend - assert actual_target == expected_target[:-1] # Comparing excluding the last item due to `None` handling + # Test transformation (ensure no 
crashes) + transformed = shifter.transform(df) + assert transformed is not None -@pytest.mark.parametrize("backend, df", [ - ("pd", pd_df), - ("pl", pl_df), - ("mpd", mpd_df), +# Parametrized test for invalid data and expected errors across backends +@pytest.mark.parametrize("invalid_data", [ + None, # Null input should raise an error + pd.DataFrame(), # Empty DataFrame should raise an error ]) -def test_shift_target_array_output(backend: str, df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame]) -> None: - """Test shifting target to array output for each backend.""" - tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) - shifter = TemporalTargetShifter(shift_steps=2, array_output=True) - tf_transformed = shifter.transform(tf) - - expected_target_array = [[20, 30], [30, 40], [40, 50], [50, None], [None, None]] +@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +def test_invalid_data_handling(backend, invalid_data): + """Test invalid data handling for empty or None DataFrames across backends.""" + + shifter = TemporalTargetShifter(n_lags=1, target_col="target") - if backend == "pl": - actual_target = tf_transformed.get_data()["target_array_2"].to_list() - else: - actual_target = tf_transformed.get_data()["target_array_2"].tolist() + with pytest.raises(ValueError): + shifter.fit(invalid_data) - assert actual_target == expected_target_array +# Parametrized test for TimeFrame inputs and transformation across all backends +@pytest.mark.parametrize("n_lags", [1, 2]) +@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +def test_time_frame_input(backend, n_lags): + """Test TimeFrame input handling and transformation across all backends.""" + + # Generate data for the current backend + data = { + "time": pd.date_range(start="2022-01-01", periods=100), + "target": np.random.rand(100), + "feature_1": np.random.rand(100), + "feature_2": np.random.rand(100), + } + + if backend == BACKEND_POLARS: + df = pl.DataFrame(data) + elif backend == BACKEND_PANDAS: + df = pd.DataFrame(data) + elif backend == BACKEND_MODIN: + df = mpd.DataFrame(data) -@pytest.mark.parametrize("backend, df", [ - ("pd", pd_df), - ("pl", pl_df), - ("mpd", mpd_df), -]) -def test_shift_target_with_nonstandard_names(backend: str, df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame]) -> None: - """Test shifting target with non-standardized names.""" tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) - shifter = TemporalTargetShifter(shift_steps=1, array_output=False) - tf_transformed = shifter.transform(tf) - - expected_target = [20, 30, 40, 50, None] - - if backend == "pl": - actual_target = tf_transformed.get_data()["target_shift_1"].to_list() - else: - actual_target = tf_transformed.get_data()["target_shift_1"].tolist() - - assert actual_target == expected_target[:-1] + shifter = TemporalTargetShifter(n_lags=n_lags, target_col="target") + + # Test fitting and transforming TimeFrame + shifter.fit(tf) + transformed = shifter.transform(tf) + assert transformed is not None + +# Parametrized test for deep learning mode with different sequence lengths across all backends +@pytest.mark.parametrize("sequence_length", [3, 5]) +@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +def test_deep_learning_mode(backend, sequence_length): + """Test deep learning mode sequence generation across all backends.""" + + # Generate data for the current backend + data = { + "time": 
pd.date_range(start="2022-01-01", periods=100), + "target": np.random.rand(100), + "feature_1": np.random.rand(100), + "feature_2": np.random.rand(100), + } + + if backend == BACKEND_POLARS: + df = pl.DataFrame(data) + elif backend == BACKEND_PANDAS: + df = pd.DataFrame(data) + elif backend == BACKEND_MODIN: + df = mpd.DataFrame(data) + + shifter = TemporalTargetShifter( + n_lags=1, mode=TemporalTargetShifter.MODE_DEEP_LEARNING, sequence_length=sequence_length, target_col="target" + ) + + shifter.fit(df) + transformed = shifter.transform(df) + assert transformed is not None + +# Test verbose mode with stdout capture +@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +def test_verbose_mode(backend, capfd): + """Test verbose mode output and row dropping information.""" + + # Generate data for the current backend + data = { + "time": pd.date_range(start="2022-01-01", periods=100), + "target": np.random.rand(100), + "feature_1": np.random.rand(100), + "feature_2": np.random.rand(100), + } + + if backend == BACKEND_POLARS: + df = pl.DataFrame(data) + elif backend == BACKEND_PANDAS: + df = pd.DataFrame(data) + elif backend == BACKEND_MODIN: + df = mpd.DataFrame(data) + + shifter = TemporalTargetShifter(n_lags=1, target_col="target", verbose=True) + + shifter.fit(df) + shifter.transform(df) + + # Capture stdout and check for printed verbose information + captured = capfd.readouterr() + assert "Rows before shift" in captured.out + +# Parametrized test for fit_transform method for all backends +@pytest.mark.parametrize("n_lags", [1, 2]) +@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +def test_fit_transform(backend, n_lags): + """Test fit_transform() method for all backends.""" + + # Generate data for the current backend + data = { + "time": pd.date_range(start="2022-01-01", periods=100), + "target": np.random.rand(100), + "feature_1": np.random.rand(100), + "feature_2": np.random.rand(100), + } + + if backend == BACKEND_POLARS: + df = pl.DataFrame(data) + elif backend == BACKEND_PANDAS: + df = pd.DataFrame(data) + elif backend == BACKEND_MODIN: + df = mpd.DataFrame(data) + + shifter = TemporalTargetShifter(n_lags=n_lags, target_col="target") + + transformed = shifter.fit_transform(df) + assert transformed is not None -@pytest.mark.parametrize("backend, df", [ - ("pd", pd_df), - ("pl", pl_df), - ("mpd", mpd_df), -]) -def test_shift_target_invalid_backend(backend: str, df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame]) -> None: - """Test shifting target with an invalid backend.""" - tf = TimeFrame(df, time_col="time", target_col="target", backend="invalid_backend") - shifter = TemporalTargetShifter(shift_steps=1, array_output=False) - with pytest.raises(ValueError, match="Unsupported backend"): - shifter.transform(tf) - -@pytest.mark.parametrize("backend, df", [ - ("pd", pd_df), - ("pl", pl_df), - ("mpd", mpd_df), -]) -def test_shift_target_type_error(backend: str, df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame]) -> None: - """Test shifting target with an incorrect DataFrame type.""" - # Intentionally using an incorrect type (dictionary) instead of a DataFrame - with pytest.raises(TypeError): - tf = TimeFrame(df.to_dict(), time_col="time", target_col="target", backend=backend) - shifter = TemporalTargetShifter(shift_steps=1, array_output=False) - shifter.transform(tf) diff --git a/test/unit/test_core_utils.py b/test/unit/test_core_utils.py index f5cf819..a538ee1 100644 --- a/test/unit/test_core_utils.py +++ 
b/test/unit/test_core_utils.py @@ -13,10 +13,11 @@ limitations under the License. """ -from unittest.mock import patch -import modin.pandas as mpd +from typing import Optional, Union, Tuple +import numpy as np import pandas as pd import polars as pl +import modin.pandas as mpd import pytest from temporalscope.core.core_utils import ( get_api_keys, @@ -24,33 +25,79 @@ validate_backend, validate_input, validate_and_convert_input, - check_nans, check_nulls, + check_nans, print_divider ) +from unittest.mock import patch import warnings warnings.filterwarnings("ignore", message=".*defaulting to pandas.*") -# Define mock API key constants +# Mock API key constants MOCK_OPENAI_API_KEY = "mock_openai_key" MOCK_CLAUDE_API_KEY = "mock_claude_key" -def create_sample_data(): - """Create a sample data dictionary.""" - return {"a": [1, 2, 3]} - -@pytest.fixture(params=["pd", "pl", "mpd"]) -def sample_df(request): - """Fixture for creating sample DataFrames for each backend.""" - data = create_sample_data() - backend = request.param - if backend == "pd": - return pd.DataFrame(data), backend - elif backend == "pl": - return pl.DataFrame(data), backend - elif backend == "mpd": - return mpd.DataFrame(data), backend +# --- Data Generation Functions --- +def create_sample_data(num_samples: int = 100, with_nulls=False, with_nans=False): + """Create sample data with options for introducing nulls and NaNs.""" + data = { + "feature_1": np.random.rand(num_samples).tolist(), + "feature_2": np.random.rand(num_samples).tolist(), + "feature_3": np.random.rand(num_samples).tolist(), + } + + if with_nans: + for i in range(0, num_samples, 10): + data["feature_2"][i] = float("nan") # Every 10th value is NaN + + if with_nulls: + for i in range(0, num_samples, 15): + data["feature_3"][i] = None # Every 15th value is Null + + return data + +# Unified fixture for data with nulls and NaNs +@pytest.fixture +def sample_df_with_conditions(): + """Fixture for creating DataFrames for each backend. + + Provides a function to generate sample DataFrames with optional nulls or NaNs. + + :return: A function that generates a DataFrame and backend identifier based on the specified conditions. + :rtype: Callable[[Optional[str], bool, bool], Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]] + """ + def _create_sample_df( + backend: Optional[str] = None, + with_nulls: bool = False, + with_nans: bool = False + ) -> Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]: + """Creates a sample DataFrame for the specified backend with optional nulls and NaNs. + + :param backend: The backend to use ('pd', 'pl', 'mpd'). Defaults to 'pd' if None. + :type backend: Optional[str] + :param with_nulls: Whether to include null values in the data. Defaults to False. + :type with_nulls: bool + :param with_nans: Whether to include NaN values in the data. Defaults to False. + :type with_nans: bool + :return: A tuple containing the DataFrame and the backend string. + :rtype: Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str] + :raises ValueError: If an unsupported backend is specified. 
+ """ + data = create_sample_data(with_nulls=with_nulls, with_nans=with_nans) + if backend is None: + backend = "pd" # Default to pandas for backward compatibility + if backend == "pd": + return pd.DataFrame(data), "pd" + elif backend == "pl": + return pl.DataFrame(data), "pl" + elif backend == "mpd": + return mpd.DataFrame(data), "mpd" + else: + raise ValueError(f"Unsupported backend '{backend}'") + return _create_sample_df + +# --- Tests --- def test_get_api_keys(): """Test that get_api_keys retrieves environment variables correctly.""" @@ -64,6 +111,15 @@ def test_get_api_keys(): assert api_keys["OPENAI_API_KEY"] is None assert api_keys["CLAUDE_API_KEY"] is None +@pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +@pytest.mark.parametrize("with_nans", [True, False]) +def test_check_nans(backend, sample_df_with_conditions, with_nans): + """Test check_nans for both NaNs present and no NaNs across backends.""" + df, _ = sample_df_with_conditions(backend=backend, with_nans=with_nans) + result = check_nans(df, backend) + expected = with_nans # True if NaNs were introduced, else False + assert result == expected, f"Expected {expected} but got {result} for backend {backend}" + def test_get_default_backend_cfg(): """Test that the default backend configuration is returned correctly.""" expected_cfg = {"BACKENDS": {"pl": "polars", "pd": "pandas", "mpd": "modin"}} @@ -81,152 +137,77 @@ def test_validate_backend_unsupported(invalid_backend): with pytest.raises(ValueError, match="Unsupported backend"): validate_backend(invalid_backend) -def test_validate_input_valid(sample_df): - """Test that valid DataFrame types are accepted based on the backend.""" - df, backend = sample_df - validate_input(df, backend) +@pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +@pytest.mark.parametrize("target_backend", ["pl", "pd", "mpd"]) +def test_validate_and_convert_input(sample_df_with_conditions, backend, target_backend): + """Test that DataFrame conversion between backends works correctly.""" + df, _ = sample_df_with_conditions(backend=backend, with_nulls=False) + result = validate_and_convert_input(df, target_backend) -@pytest.mark.parametrize("backend, df_type", [ - ("pl", pd.DataFrame), ("pd", pl.DataFrame), ("mpd", pd.DataFrame) -]) -def test_validate_input_invalid(backend, df_type): - """Test that invalid DataFrame types raise a TypeError based on the backend.""" - with pytest.raises(TypeError, match="Expected a .* DataFrame"): - validate_input(df_type(create_sample_data()), backend) - -@pytest.mark.parametrize("output_backend", ["pd", "pl", "mpd"]) -def test_validate_and_convert_input(sample_df, output_backend): - """Test that validate_and_convert_input correctly converts DataFrames.""" - df, input_backend = sample_df - result = validate_and_convert_input(df, output_backend) - - if output_backend == "pd": - assert isinstance(result, pd.DataFrame) - elif output_backend == "pl": - assert isinstance(result, pl.DataFrame) - elif output_backend == "mpd": - assert isinstance(result, mpd.DataFrame) - - assert result.shape == df.shape - - if output_backend == "pl": - assert result["a"].to_list() == [1, 2, 3] - else: - assert result["a"].tolist() == [1, 2, 3] + if target_backend == "pd": + assert isinstance(result, pd.DataFrame), f"Expected Pandas DataFrame but got {type(result)}" + elif target_backend == "pl": + assert isinstance(result, pl.DataFrame), f"Expected Polars DataFrame but got {type(result)}" + elif target_backend == "mpd": + assert isinstance(result, mpd.DataFrame), f"Expected Modin 
DataFrame but got {type(result)}" -def test_validate_and_convert_input_invalid_backend(sample_df): - """Test that an invalid backend raises a ValueError.""" - df, _ = sample_df - with pytest.raises(ValueError, match="Unsupported backend"): - validate_and_convert_input(df, "invalid_backend") - -def test_validate_and_convert_input_invalid_df_type(): - """Test that an invalid DataFrame type raises a TypeError.""" - with pytest.raises(TypeError, match="Input DataFrame type .* does not match the specified backend"): - validate_and_convert_input({"a": [1, 2, 3]}, "pd") # Not a DataFrame - - - - -# Test data for check_nulls -test_nulls_data = [ - ("pd", pd.DataFrame({"FEATURE_1": [1, None, 3]}), True), - ( - "pl", - pl.DataFrame({"FEATURE_1": [1, None, 3]}, schema={"FEATURE_1": pl.Float64}), - True, - ), - ("mpd", mpd.DataFrame({"FEATURE_1": [1, None, 3]}), True), - ("pd", pd.DataFrame({"FEATURE_1": [1, 2, 3]}), False), - ( - "pl", - pl.DataFrame({"FEATURE_1": [1, 2, 3]}, schema={"FEATURE_1": pl.Float64}), - False, - ), - ("mpd", mpd.DataFrame({"FEATURE_1": [1, 2, 3]}), False), - ("pd", pd.DataFrame(), False), # Empty DataFrame for Pandas - ( - "pl", - pl.DataFrame({"FEATURE_1": []}, schema={"FEATURE_1": pl.Float64}), - False, - ), # Empty DataFrame for Polars - ("mpd", mpd.DataFrame(), False), # Empty DataFrame for Modin -] - -# Test data for check_nans -test_nans_data = [ - ("pd", pd.DataFrame({"FEATURE_1": [1, float("nan"), 3]}), True), - ( - "pl", - pl.DataFrame( - {"FEATURE_1": [1, float("nan"), 3]}, - schema={"FEATURE_1": pl.Float64}, - ), - True, - ), - ("mpd", mpd.DataFrame({"FEATURE_1": [1, float("nan"), 3]}), True), - ("pd", pd.DataFrame({"FEATURE_1": [1, 2, 3]}), False), - ( - "pl", - pl.DataFrame({"FEATURE_1": [1, 2, 3]}, schema={"FEATURE_1": pl.Float64}), - False, - ), - ("mpd", mpd.DataFrame({"FEATURE_1": [1, 2, 3]}), False), - ("pd", pd.DataFrame(), False), # Empty DataFrame for Pandas - ( - "pl", - pl.DataFrame({"FEATURE_1": []}, schema={"FEATURE_1": pl.Float64}), - False, - ), # Empty DataFrame for Polars - ("mpd", mpd.DataFrame(), False), # Empty DataFrame for Modin -] +@pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +def test_validate_and_convert_input_invalid_type(backend): + """Test that validate_and_convert_input raises TypeError when given an invalid DataFrame type.""" + invalid_df = "This is not a DataFrame" + with pytest.raises(TypeError, match="Input DataFrame type"): + validate_and_convert_input(invalid_df, backend) def test_print_divider(capsys): - """Test that print_divider prints without error.""" - print_divider() + """Test the print_divider function outputs the correct string.""" + print_divider("-", 50) captured = capsys.readouterr() - # Ensure that print was called and output is non-empty - assert len(captured.out.strip()) > 0 - + assert captured.out == "-" * 50 + "\n" -@pytest.mark.parametrize("backend, df, expected", test_nulls_data) -def test_check_nulls(backend, df, expected): - """Test that check_nulls detects null values correctly across backends.""" - if backend == "pl": - # Polars-specific null check: check if there are any null values - null_count = df.null_count().select(pl.col("*").sum()).to_numpy()[0][0] - assert (null_count > 0) == expected - else: - # Pandas/Modin null check using the utils function - result = check_nulls(df, backend) - assert result == expected - - -@pytest.mark.parametrize("backend, df, expected", test_nans_data) -def test_check_nans(backend, df, expected): - """Test that check_nans detects NaN values correctly across 
backends.""" - if backend == "pl": - # Polars-specific NaN check: convert to boolean, count NaN values - nan_count = df.select(pl.col("FEATURE_1").is_nan()).sum().to_numpy()[0][0] - assert (nan_count > 0) == expected - else: - # Pandas/Modin NaN check using the utils function - result = check_nans(df, backend) - assert result == expected +def test_check_nans_invalid_backend(sample_df_with_conditions): + """Test that an unsupported backend raises a ValueError in check_nans.""" + df, _ = sample_df_with_conditions(with_nans=True) + with pytest.raises(ValueError, match="Unsupported backend"): + check_nans(df, "invalid_backend") -@pytest.mark.parametrize("backend", ["unsupported_backend"]) -def test_check_nulls_invalid_backend(backend): - """Test that check_nulls raises ValueError for unsupported backends.""" - df = pd.DataFrame({"FEATURE_1": [1, 2, 3]}) - with pytest.raises(ValueError, match="Unsupported backend"): - check_nulls(df, backend) +@pytest.mark.parametrize("backend, expected_type", [ + ("pl", pl.DataFrame), + ("pd", pd.DataFrame), + ("mpd", mpd.DataFrame), +]) +def test_validate_input_correct_backend(sample_df_with_conditions, backend, expected_type): + """Test that validate_input passes when the DataFrame matches the backend.""" + df, _ = sample_df_with_conditions(backend=backend, with_nulls=False) + validate_input(df, backend) +@pytest.mark.parametrize("df_backend", ["pd", "pl", "mpd"]) +@pytest.mark.parametrize("validate_backend", ["pd", "pl", "mpd"]) +def test_validate_input_mismatched_backend(sample_df_with_conditions, df_backend, validate_backend): + """Test that validate_input raises TypeError when the DataFrame does not match the backend.""" + df, _ = sample_df_with_conditions(backend=df_backend, with_nulls=False) -@pytest.mark.parametrize("backend", ["unsupported_backend"]) -def test_check_nans_invalid_backend(backend): - """Test that check_nans raises ValueError for unsupported backends.""" - df = pd.DataFrame({"FEATURE_1": [1, 2, 3]}) + if df_backend != validate_backend: + # Expect TypeError when backends don't match + with pytest.raises(TypeError, match="Expected a"): + validate_input(df, validate_backend) + else: + # Should pass when backends match + validate_input(df, validate_backend) + +@pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) +@pytest.mark.parametrize("with_nulls", [True, False]) +def test_check_nulls(backend, sample_df_with_conditions, with_nulls): + """Test check_nulls for both nulls present and no nulls across backends.""" + df, _ = sample_df_with_conditions(backend=backend, with_nulls=with_nulls) + result = check_nulls(df, backend) + expected = with_nulls # True if nulls were introduced, else False + assert result == expected, f"Expected {expected} but got {result} for backend {backend}" + +# Test for invalid backend handling +def test_check_nulls_invalid_backend(sample_df_with_conditions): + """Test that check_nulls raises ValueError when given an unsupported backend.""" + df, _ = sample_df_with_conditions(with_nulls=True) with pytest.raises(ValueError, match="Unsupported backend"): - check_nans(df, backend) + check_nulls(df, "invalid_backend") \ No newline at end of file diff --git a/test/unit/test_partion_data_checks.py b/test/unit/test_partion_data_checks.py index 05f93fc..1336ca4 100644 --- a/test/unit/test_partion_data_checks.py +++ b/test/unit/test_partion_data_checks.py @@ -1,336 +1,336 @@ -""" -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. 
See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at +# """ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. -""" -import modin.pandas as mpd -import pandas as pd -import polars as pl -import pytest +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# """ +# import modin.pandas as mpd +# import pandas as pd +# import polars as pl +# import pytest -from temporalscope.partition.data_checks import ( - check_binary_numerical_features, - check_categorical_feature_cardinality, - check_class_balance, - check_feature_count, - check_feature_to_sample_ratio, - check_numerical_feature_uniqueness, - check_sample_size, -) +# from temporalscope.partition.data_checks import ( +# check_binary_numerical_features, +# check_categorical_feature_cardinality, +# check_class_balance, +# check_feature_count, +# check_feature_to_sample_ratio, +# check_numerical_feature_uniqueness, +# check_sample_size, +# ) -@pytest.mark.parametrize( - "dataframe,backend,min_samples,max_samples,expected_result", - [ - (pd.DataFrame({"feature1": range(100)}), "pd", 3000, 50000, False), - ( - pl.DataFrame({"feature1": pl.Series(range(100))}), - "pl", - 3000, - 50000, - False, - ), - ( - mpd.DataFrame({"feature1": range(100000)}), - "mpd", - 3000, - 50000, - False, - ), - ], -) -def test_check_sample_size( - dataframe, backend, min_samples, max_samples, expected_result -): - """Test sample size check for various dataframes and backends.""" - assert ( - check_sample_size( - dataframe, - backend=backend, - min_samples=min_samples, - max_samples=max_samples, - ) - == expected_result - ) +# @pytest.mark.parametrize( +# "dataframe,backend,min_samples,max_samples,expected_result", +# [ +# (pd.DataFrame({"feature1": range(100)}), "pd", 3000, 50000, False), +# ( +# pl.DataFrame({"feature1": pl.Series(range(100))}), +# "pl", +# 3000, +# 50000, +# False, +# ), +# ( +# mpd.DataFrame({"feature1": range(100000)}), +# "mpd", +# 3000, +# 50000, +# False, +# ), +# ], +# ) +# def test_check_sample_size( +# dataframe, backend, min_samples, max_samples, expected_result +# ): +# """Test sample size check for various dataframes and backends.""" +# assert ( +# check_sample_size( +# dataframe, +# backend=backend, +# min_samples=min_samples, +# max_samples=max_samples, +# ) +# == 
expected_result +# ) -@pytest.mark.parametrize( - "dataframe,backend,min_features,expected_result", - [ - # Pandas DataFrame - ( - pd.DataFrame({"feature1": range(100)}), - "pd", - 4, - False, - ), # Too few features - Pandas - # Polars DataFrame - ( - pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), - "pl", - 4, - True, - ), # Enough features - Polars - # Modin DataFrame - ( - mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), - "mpd", - 4, - True, - ), # Enough features - Modin - ], -) -def test_check_feature_count(dataframe, backend, min_features, expected_result): - """Tests check_feature_count for various dataframes and backends.""" - assert ( - check_feature_count(dataframe, backend=backend, min_features=min_features) - == expected_result - ) +# @pytest.mark.parametrize( +# "dataframe,backend,min_features,expected_result", +# [ +# # Pandas DataFrame +# ( +# pd.DataFrame({"feature1": range(100)}), +# "pd", +# 4, +# False, +# ), # Too few features - Pandas +# # Polars DataFrame +# ( +# pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), +# "pl", +# 4, +# True, +# ), # Enough features - Polars +# # Modin DataFrame +# ( +# mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), +# "mpd", +# 4, +# True, +# ), # Enough features - Modin +# ], +# ) +# def test_check_feature_count(dataframe, backend, min_features, expected_result): +# """Tests check_feature_count for various dataframes and backends.""" +# assert ( +# check_feature_count(dataframe, backend=backend, min_features=min_features) +# == expected_result +# ) -@pytest.mark.parametrize( - "dataframe,backend,max_ratio,expected_result", - [ - ( - pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), - "pl", - 0.1, - True, - ), - ( - mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), - "mpd", - 0.1, - True, - ), - ( - pd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), - "pd", - 0.1, - True, - ), - ], -) -def test_check_feature_to_sample_ratio(dataframe, backend, max_ratio, expected_result): - """Tests check_feature_to_sample_ratio for various dataframes and backends.""" - assert ( - check_feature_to_sample_ratio(dataframe, backend=backend, max_ratio=max_ratio) - == expected_result - ) +# @pytest.mark.parametrize( +# "dataframe,backend,max_ratio,expected_result", +# [ +# ( +# pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), +# "pl", +# 0.1, +# True, +# ), +# ( +# mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), +# "mpd", +# 0.1, +# True, +# ), +# ( +# pd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), +# "pd", +# 0.1, +# True, +# ), +# ], +# ) +# def test_check_feature_to_sample_ratio(dataframe, backend, max_ratio, expected_result): +# """Tests check_feature_to_sample_ratio for various dataframes and backends.""" +# assert ( +# check_feature_to_sample_ratio(dataframe, backend=backend, max_ratio=max_ratio) +# == expected_result +# ) -@pytest.mark.parametrize( - "dataframe,backend,max_unique_values,expected_result", - [ - # Pandas DataFrames - ( - pd.DataFrame({"category1": [str(i) for i in range(25)]}), - "pd", - 20, - False, - ), # Too many unique values - Pandas - ( - pd.DataFrame({"category1": ["A", "B", "C"] * 100}), - "pd", - 20, - True, - ), # Normal unique values - Pandas - # Polars DataFrames - ( - pl.DataFrame({"category1": pl.Series([str(i) for i in range(25)])}), - "pl", - 20, - False, - ), # Too many unique values - Polars - ( - 
pl.DataFrame({"category1": pl.Series(["A", "B", "C"] * 100)}), - "pl", - 20, - True, - ), # Normal unique values - Polars - # Modin DataFrames - ( - mpd.DataFrame({"category1": [str(i) for i in range(25)]}), - "mpd", - 20, - False, - ), # Too many unique values - Modin - ( - mpd.DataFrame({"category1": ["A", "B", "C"] * 100}), - "mpd", - 20, - True, - ), # Normal unique values - Modin - ], -) -def test_check_categorical_feature_cardinality( - dataframe, backend, max_unique_values, expected_result -): - """Tests check_categorical_feature_cardinality for various dataframe backends.""" - assert ( - check_categorical_feature_cardinality( - dataframe, backend=backend, max_unique_values=max_unique_values - ) - == expected_result - ) +# @pytest.mark.parametrize( +# "dataframe,backend,max_unique_values,expected_result", +# [ +# # Pandas DataFrames +# ( +# pd.DataFrame({"category1": [str(i) for i in range(25)]}), +# "pd", +# 20, +# False, +# ), # Too many unique values - Pandas +# ( +# pd.DataFrame({"category1": ["A", "B", "C"] * 100}), +# "pd", +# 20, +# True, +# ), # Normal unique values - Pandas +# # Polars DataFrames +# ( +# pl.DataFrame({"category1": pl.Series([str(i) for i in range(25)])}), +# "pl", +# 20, +# False, +# ), # Too many unique values - Polars +# ( +# pl.DataFrame({"category1": pl.Series(["A", "B", "C"] * 100)}), +# "pl", +# 20, +# True, +# ), # Normal unique values - Polars +# # Modin DataFrames +# ( +# mpd.DataFrame({"category1": [str(i) for i in range(25)]}), +# "mpd", +# 20, +# False, +# ), # Too many unique values - Modin +# ( +# mpd.DataFrame({"category1": ["A", "B", "C"] * 100}), +# "mpd", +# 20, +# True, +# ), # Normal unique values - Modin +# ], +# ) +# def test_check_categorical_feature_cardinality( +# dataframe, backend, max_unique_values, expected_result +# ): +# """Tests check_categorical_feature_cardinality for various dataframe backends.""" +# assert ( +# check_categorical_feature_cardinality( +# dataframe, backend=backend, max_unique_values=max_unique_values +# ) +# == expected_result +# ) -@pytest.mark.parametrize( - "dataframe,backend,min_unique_values,expected_result", - [ - # Pandas DataFrame - ( - pd.DataFrame({"feature1": range(100)}), - "pd", - 10, - True, - ), # Enough unique values - Pandas - # Polars DataFrame - ( - pl.DataFrame({"feature1": pl.Series(range(100))}), - "pl", - 10, - True, - ), # Enough unique values - Polars - # Modin DataFrame - ( - mpd.DataFrame({"feature1": [1, 1, 1, 2, 2, 2, 3, 3]}), - "mpd", - 10, - False, - ), # Too few unique values - Modin - ( - mpd.DataFrame({"feature1": range(100)}), - "mpd", - 10, - True, - ), # Enough unique values - Modin - ], -) -def test_check_numerical_feature_uniqueness( - dataframe, backend, min_unique_values, expected_result -): - """Tests check_numerical_feature_uniqueness for various dataframes and backends.""" - assert ( - check_numerical_feature_uniqueness( - dataframe, backend=backend, min_unique_values=min_unique_values - ) - == expected_result - ) +# @pytest.mark.parametrize( +# "dataframe,backend,min_unique_values,expected_result", +# [ +# # Pandas DataFrame +# ( +# pd.DataFrame({"feature1": range(100)}), +# "pd", +# 10, +# True, +# ), # Enough unique values - Pandas +# # Polars DataFrame +# ( +# pl.DataFrame({"feature1": pl.Series(range(100))}), +# "pl", +# 10, +# True, +# ), # Enough unique values - Polars +# # Modin DataFrame +# ( +# mpd.DataFrame({"feature1": [1, 1, 1, 2, 2, 2, 3, 3]}), +# "mpd", +# 10, +# False, +# ), # Too few unique values - Modin +# ( +# mpd.DataFrame({"feature1": 
range(100)}), +# "mpd", +# 10, +# True, +# ), # Enough unique values - Modin +# ], +# ) +# def test_check_numerical_feature_uniqueness( +# dataframe, backend, min_unique_values, expected_result +# ): +# """Tests check_numerical_feature_uniqueness for various dataframes and backends.""" +# assert ( +# check_numerical_feature_uniqueness( +# dataframe, backend=backend, min_unique_values=min_unique_values +# ) +# == expected_result +# ) -@pytest.mark.parametrize( - "dataframe,backend,expected_result", - [ - # Pandas DataFrame - ( - pd.DataFrame({"binary_feature": [0, 1] * 50}), - "pd", - False, - ), # Binary numerical feature - Pandas - ( - pd.DataFrame({"feature1": range(100)}), - "pd", - True, - ), # No binary feature - Pandas - # Polars DataFrame - ( - pl.DataFrame({"binary_feature": pl.Series([0, 1] * 50)}), - "pl", - False, - ), # Binary numerical feature - Polars - ( - pl.DataFrame({"feature1": pl.Series(range(100))}), - "pl", - True, - ), # No binary feature - Polars - # Modin DataFrame - ( - mpd.DataFrame({"binary_feature": [0, 1] * 50}), - "mpd", - False, - ), # Binary numerical feature - Modin - ( - mpd.DataFrame({"feature1": range(100)}), - "mpd", - True, - ), # No binary feature - Modin - ], -) -def test_check_binary_numerical_features(dataframe, backend, expected_result): - """Tests check_binary_numerical_features for various dataframes and backends.""" - assert ( - check_binary_numerical_features(dataframe, backend=backend) == expected_result - ) +# @pytest.mark.parametrize( +# "dataframe,backend,expected_result", +# [ +# # Pandas DataFrame +# ( +# pd.DataFrame({"binary_feature": [0, 1] * 50}), +# "pd", +# False, +# ), # Binary numerical feature - Pandas +# ( +# pd.DataFrame({"feature1": range(100)}), +# "pd", +# True, +# ), # No binary feature - Pandas +# # Polars DataFrame +# ( +# pl.DataFrame({"binary_feature": pl.Series([0, 1] * 50)}), +# "pl", +# False, +# ), # Binary numerical feature - Polars +# ( +# pl.DataFrame({"feature1": pl.Series(range(100))}), +# "pl", +# True, +# ), # No binary feature - Polars +# # Modin DataFrame +# ( +# mpd.DataFrame({"binary_feature": [0, 1] * 50}), +# "mpd", +# False, +# ), # Binary numerical feature - Modin +# ( +# mpd.DataFrame({"feature1": range(100)}), +# "mpd", +# True, +# ), # No binary feature - Modin +# ], +# ) +# def test_check_binary_numerical_features(dataframe, backend, expected_result): +# """Tests check_binary_numerical_features for various dataframes and backends.""" +# assert ( +# check_binary_numerical_features(dataframe, backend=backend) == expected_result +# ) -@pytest.mark.parametrize( - "dataframe,target_col,backend,expected_result", - [ - ( - pd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), - "target", - "pd", - False, - ), - ( - pd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), - "target", - "pd", - True, - ), - ( - pl.DataFrame( - { - "feature1": pl.Series(range(100)), - "target": pl.Series([1] * 90 + [0] * 10), - } - ), - "target", - "pl", - False, - ), - ( - pl.DataFrame( - { - "feature1": pl.Series(range(100)), - "target": pl.Series([0, 1] * 50), - } - ), - "target", - "pl", - True, - ), - ( - mpd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), - "target", - "mpd", - False, - ), - ( - mpd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), - "target", - "mpd", - True, - ), - ], -) -def test_check_class_balance(dataframe, target_col, backend, expected_result): - """Tests check_class_balance for various dataframes and backends.""" - result = 
check_class_balance(dataframe, target_col=target_col, backend=backend) - assert ( - result == expected_result - ), f"Expected {expected_result}, but got {result} for backend {backend}" +# @pytest.mark.parametrize( +# "dataframe,target_col,backend,expected_result", +# [ +# ( +# pd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), +# "target", +# "pd", +# False, +# ), +# ( +# pd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), +# "target", +# "pd", +# True, +# ), +# ( +# pl.DataFrame( +# { +# "feature1": pl.Series(range(100)), +# "target": pl.Series([1] * 90 + [0] * 10), +# } +# ), +# "target", +# "pl", +# False, +# ), +# ( +# pl.DataFrame( +# { +# "feature1": pl.Series(range(100)), +# "target": pl.Series([0, 1] * 50), +# } +# ), +# "target", +# "pl", +# True, +# ), +# ( +# mpd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), +# "target", +# "mpd", +# False, +# ), +# ( +# mpd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), +# "target", +# "mpd", +# True, +# ), +# ], +# ) +# def test_check_class_balance(dataframe, target_col, backend, expected_result): +# """Tests check_class_balance for various dataframes and backends.""" +# result = check_class_balance(dataframe, target_col=target_col, backend=backend) +# assert ( +# result == expected_result +# ), f"Expected {expected_result}, but got {result} for backend {backend}" diff --git a/tutorial_notebooks/introduction/0_load_data.ipynb b/tutorial_notebooks/introduction/0_load_data_timeframe.ipynb similarity index 94% rename from tutorial_notebooks/introduction/0_load_data.ipynb rename to tutorial_notebooks/introduction/0_load_data_timeframe.ipynb index 7f52cae..39c8f4d 100644 --- a/tutorial_notebooks/introduction/0_load_data.ipynb +++ b/tutorial_notebooks/introduction/0_load_data_timeframe.ipynb @@ -41,7 +41,8 @@ "======================================================================\n", "Loaded DataFrame shape: (202, 14)\n", "======================================================================\n", - "Shifted 'realgdp' to create a new target column 'target_realgdp' for future prediction.\n", + "Shifted 'realgdp' to create a new target column 'target_realgdp'\n", + " for future prediction.\n", "======================================================================\n" ] }, @@ -49,7 +50,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-09-08 22:54:57,047\tINFO worker.py:1783 -- Started a local Ray instance.\n" + "2024-09-20 02:19:57,110\tINFO worker.py:1786 -- Started a local Ray instance.\n" ] }, { @@ -74,7 +75,7 @@ "4 3.50 5.2 180.007 2.31 1.19 2834.390 \n", "======================================================================\n", "Metadata for Modin TimeFrame object:\n", - "{'_cfg': {'BACKENDS': {'pl': 'polars', 'pd': 'pandas', 'mpd': 'modin'}}, '_backend': 'mpd', '_df': ds realgdp realcons realinv realgovt realdpi cpi \\\n", + "{'_backend': 'mpd', '_cfg': {'BACKENDS': {'pl': 'polars', 'pd': 'pandas', 'mpd': 'modin'}}, '_time_col': 'ds', '_target_col': 'target_realgdp', '_sort': True, 'df': ds realgdp realcons realinv realgovt realdpi cpi \\\n", "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.980 \n", "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.150 \n", "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.350 \n", @@ -100,7 +101,7 @@ "200 1592.8 0.22 8.1 306.547 0.94 -0.71 12901.504 \n", "201 1653.6 0.18 9.2 307.226 3.37 -3.19 12990.341 \n", "\n", - "[202 rows x 14 columns], '_time_col': 'ds', '_target_col': 'target_realgdp', '_id_col': None, 
'_sort': True}\n", + "[202 rows x 14 columns]}\n", "======================================================================\n" ] } @@ -112,7 +113,7 @@ "from statsmodels.datasets import macrodata\n", "\n", "from temporalscope.core.temporal_data_loader import TimeFrame as tf\n", - "from temporalscope.core.utils import print_divider\n", + "from temporalscope.core.core_utils import print_divider\n", "\n", "\n", "def load_macrodata(target_col: str = \"realgdp\"):\n", @@ -516,9 +517,12 @@ { "data": { "text/plain": [ - "{'_cfg': {'BACKENDS': {'pl': 'polars', 'pd': 'pandas', 'mpd': 'modin'}},\n", - " '_backend': 'mpd',\n", - " '_df': ds realgdp realcons realinv realgovt realdpi cpi \\\n", + "{'_backend': 'mpd',\n", + " '_cfg': {'BACKENDS': {'pl': 'polars', 'pd': 'pandas', 'mpd': 'modin'}},\n", + " '_time_col': 'ds',\n", + " '_target_col': 'target_realgdp',\n", + " '_sort': True,\n", + " 'df': ds realgdp realcons realinv realgovt realdpi cpi \\\n", " 0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.980 \n", " 1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.150 \n", " 2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.350 \n", @@ -544,40 +548,32 @@ " 200 1592.8 0.22 8.1 306.547 0.94 -0.71 12901.504 \n", " 201 1653.6 0.18 9.2 307.226 3.37 -3.19 12990.341 \n", " \n", - " [202 rows x 14 columns],\n", - " '_time_col': 'ds',\n", - " '_target_col': 'target_realgdp',\n", - " '_id_col': None,\n", - " '_sort': True}" + " [202 rows x 14 columns]}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "*** SIGTERM received at time=1725836417 on cpu 4 ***\n", - "PC: @ 0x7f893b497c3e (unknown) epoll_wait\n", - " @ 0x7f893b3b4520 (unknown) (unknown)\n", - "[2024-09-08 23:00:17,466 E 5047 5047] logging.cc:440: *** SIGTERM received at time=1725836417 on cpu 4 ***\n", - "[2024-09-08 23:00:17,467 E 5047 5047] logging.cc:440: PC: @ 0x7f893b497c3e (unknown) epoll_wait\n", - "[2024-09-08 23:00:17,467 E 5047 5047] logging.cc:440: @ 0x7f893b3b4520 (unknown) (unknown)\n" - ] } ], "source": [ "macro_modin_tf.__dict__" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7d45fd7-3773-4d8b-ba66-af01b4aa2dd3", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python (temporalscope_hatch)", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "temporalscope_hatch" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/tutorial_notebooks/introduction/1_load_data_target_shifter.ipynb b/tutorial_notebooks/introduction/1_load_data_target_shifter.ipynb new file mode 100644 index 0000000..740f6cb --- /dev/null +++ b/tutorial_notebooks/introduction/1_load_data_target_shifter.ipynb @@ -0,0 +1,1387 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c8aefe6f-489e-42fe-9cb9-a20426652424", + "metadata": {}, + "source": [ + "# Temporal Scope Tutorial: Utilizing Target Shifter\n", + "\n", + "## Overview\n", + "\n", + "This tutorial demonstrates how to load macroeconomic data and apply the **TemporalTargetShifter** using the **Modin** backend. The tutorial shows how to shift the target variable in **machine learning** and **deep learning** modes for forecasting tasks. 
The tool supports flexible configurations for different forecasting needs.\n", + "\n", + "### Summary\n", + "\n", + "| **Step** | **Description** |\n", + "|-----------|---------------------------------------------------------------------------------|\n", + "| **1** | **Data Loading**: Load macroeconomic data and create a datetime column (`ds`). |\n", + "| **2** | **Modin Backend Initialization**: Initialize a `TimeFrame` for scalable data processing with Modin. |\n", + "| **3** | **Target Shifting (ML Mode)**: Shift the target variable (`realgdp`) for one-step-ahead forecasting in **machine learning mode**. |\n", + "| **4** | **Target Shifting (DL Mode)**: Shift the target variable for sequence-based forecasting in **deep learning mode**. |\n", + "\n", + "### Key Concepts\n", + "\n", + "- **One-step ahead forecasting**: Shifting the target variable to predict the next time step for machine learning models.\n", + "- **Sequence forecasting**: Generating sequences of target variables for deep learning models.\n", + "- **Modin Backend**: Scalable version of Pandas for large datasets.\n", + "- **TemporalTargetShifter**: A tool to shift target variables for forecasting tasks, supporting both machine learning and deep learning modes.\n", + "\n", + "### Steps\n", + "\n", + "1. **Load the macroeconomic dataset** using the `statsmodels` library.\n", + "2. **Initialize a TimeFrame** for the Modin backend.\n", + "3. **Apply the Target Shifter** in machine learning mode to shift the target variable by one step (for simple one-step-ahead forecasting).\n", + "4. **Apply the Target Shifter** in deep learning mode to create sequences for sequence-based forecasting tasks.\n" + ] + }, + { + "cell_type": "markdown", + "id": "b9b71cc0-f882-40b6-933d-d38cbe3a56cd", + "metadata": {}, + "source": [ + "# Part 1: Load Macro-Economic Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4b56016b-7609-4e26-bb0b-5d6e4f864c18", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "Loading the 'macrodata' dataset from statsmodels.\n", + "Using 'realgdp' as the target column for future prediction.\n", + "======================================================================\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dsrealgdprealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealint
01959-01-012710.3491707.4286.898470.0451886.928.980139.72.825.8177.1460.000.00
11959-04-012778.8011733.7310.859481.3011919.729.150141.73.085.1177.8302.340.74
21959-07-012775.4881751.8289.226491.2601916.429.350140.53.825.3178.6572.741.09
31959-10-012785.2041753.7299.356484.0521931.329.370140.04.335.6179.3860.274.06
41960-01-012847.6991770.5331.722462.1991955.529.540139.63.505.2180.0072.311.19
..........................................
1982008-07-0113324.6009267.71990.693991.5519838.3216.8891474.71.176.0305.270-3.164.33
1992008-10-0113141.9209195.31857.6611007.2739920.4212.1741576.50.126.9305.952-8.798.91
2002009-01-0112925.4109209.21558.494996.2879926.4212.6711592.80.228.1306.5470.94-0.71
2012009-04-0112901.5049189.01456.6781023.52810077.5214.4691653.60.189.2307.2263.37-3.19
2022009-07-0112990.3419256.01486.3981044.08810040.6216.3851673.90.129.6308.0133.56-3.44
\n", + "

203 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " ds realgdp realcons realinv realgovt realdpi cpi \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.980 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.150 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.350 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.370 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.540 \n", + ".. ... ... ... ... ... ... ... \n", + "198 2008-07-01 13324.600 9267.7 1990.693 991.551 9838.3 216.889 \n", + "199 2008-10-01 13141.920 9195.3 1857.661 1007.273 9920.4 212.174 \n", + "200 2009-01-01 12925.410 9209.2 1558.494 996.287 9926.4 212.671 \n", + "201 2009-04-01 12901.504 9189.0 1456.678 1023.528 10077.5 214.469 \n", + "202 2009-07-01 12990.341 9256.0 1486.398 1044.088 10040.6 216.385 \n", + "\n", + " m1 tbilrate unemp pop infl realint \n", + "0 139.7 2.82 5.8 177.146 0.00 0.00 \n", + "1 141.7 3.08 5.1 177.830 2.34 0.74 \n", + "2 140.5 3.82 5.3 178.657 2.74 1.09 \n", + "3 140.0 4.33 5.6 179.386 0.27 4.06 \n", + "4 139.6 3.50 5.2 180.007 2.31 1.19 \n", + ".. ... ... ... ... ... ... \n", + "198 1474.7 1.17 6.0 305.270 -3.16 4.33 \n", + "199 1576.5 0.12 6.9 305.952 -8.79 8.91 \n", + "200 1592.8 0.22 8.1 306.547 0.94 -0.71 \n", + "201 1653.6 0.18 9.2 307.226 3.37 -3.19 \n", + "202 1673.9 0.12 9.6 308.013 3.56 -3.44 \n", + "\n", + "[203 rows x 13 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from statsmodels.datasets import macrodata\n", + "from temporalscope.core.core_utils import print_divider\n", + "\n", + "# Constants for modes\n", + "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "\n", + "def load_macrodata(target_col: str = \"realgdp\"):\n", + " \"\"\"Preprocess the dataset with a combined column for time target.\n", + " \n", + " :param target_col: The column to be used as the target for prediction.\n", + " :type target_col: str, optional\n", + " :return: Preprocessed DataFrame with target column.\n", + " :rtype: pd.DataFrame\n", + " \"\"\"\n", + " print_divider()\n", + " print(\"Loading the 'macrodata' dataset from statsmodels.\")\n", + " print(f\"Using '{target_col}' as the target column for future prediction.\")\n", + " print_divider()\n", + "\n", + " # Load macrodata dataset\n", + " macro_df = macrodata.load_pandas().data.copy()\n", + "\n", + " # Create 'ds' column combining 'year' and 'quarter'\n", + " macro_df[\"ds\"] = pd.to_datetime(\n", + " macro_df[\"year\"].astype(int).astype(str) + \"-\" + ((macro_df[\"quarter\"] - 1) * 3 + 1).astype(int).astype(str) + \"-01\"\n", + " )\n", + "\n", + " # Drop the 'year' and 'quarter' columns\n", + " macro_df = macro_df.drop(columns=[\"year\", \"quarter\"])\n", + "\n", + " # Reorder columns to put 'ds' (datetime) first\n", + " cols = [\"ds\"] + [col for col in macro_df.columns if col != \"ds\"]\n", + " macro_df = macro_df[cols].copy()\n", + "\n", + " return macro_df, target_col\n", + "\n", + "\n", + "# Load the macrodata dataset and preprocess\n", + "macro_df, target_col = load_macrodata()\n", + "macro_df" + ] + }, + { + "cell_type": "markdown", + "id": "5bddbc46-e8cf-421c-8561-363aeef1143c", + "metadata": {}, + "source": [ + "## Part 2: Shifting for Machine Learning" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "051a47f4-b8dd-46e3-92c1-39b49ee04f51", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"======================================================================\n", + "Loading the 'macrodata' dataset from statsmodels.\n", + "Using 'realgdp' as the target column for future prediction.\n", + "======================================================================\n", + "======================================================================\n", + "Initializing TimeFrame for the Modin backend...\n", + "Original DataFrame:\n", + " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", + "\n", + " tbilrate unemp pop infl realint \n", + "0 2.82 5.8 177.146 0.00 0.00 \n", + "1 3.08 5.1 177.830 2.34 0.74 \n", + "2 3.82 5.3 178.657 2.74 1.09 \n", + "3 4.33 5.6 179.386 0.27 4.06 \n", + "4 3.50 5.2 180.007 2.31 1.19 \n", + "======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n" + ] + } + ], + "source": [ + "import modin.pandas as mpd\n", + "from statsmodels.datasets import macrodata\n", + "from temporalscope.core.temporal_data_loader import TimeFrame\n", + "from temporalscope.core.core_utils import print_divider, BACKEND_MODIN\n", + "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", + "\n", + "# Constants for modes\n", + "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "\n", + "# Step 1: Load the macrodata dataset and preprocess\n", + "macro_df, target_col = load_macrodata()\n", + "\n", + "# Step 2: Initialize Modin TimeFrame for Modin backend\n", + "print_divider()\n", + "print(\"Initializing TimeFrame for the Modin backend...\")\n", + "macro_modin_df = mpd.DataFrame(macro_df)\n", + "modin_tf = TimeFrame(macro_modin_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_MODIN)\n", + "\n", + "# Step 3: Preview the original data\n", + "print(\"Original DataFrame:\")\n", + "print(modin_tf.get_data().head())\n", + "print_divider()\n", + "\n", + "# Step 4: Apply the TemporalTargetShifter in machine learning mode\n", + "print(f\"\\nApplying Target Shifter in {MODE_MACHINE_LEARNING} mode...\")\n", + "\n", + "# Setup the TemporalTargetShifter\n", + "shifter = TemporalTargetShifter(n_lags=1, mode=MODE_MACHINE_LEARNING, verbose=True)\n", + 
"\n", + "# Apply the shifter\n", + "shifted_df = shifter.fit_transform(modin_tf)\n", + "\n", + "# Print the shifted data\n", + "print(\"Shifted data:\")\n", + "print(shifted_df.head())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5ff95236-87eb-487e-9a65-fce69340d3f6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dsrealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealintrealgdp_shift_1
01959-01-011707.4286.898470.0451886.928.98139.72.825.8177.1460.000.002778.801
11959-04-011733.7310.859481.3011919.729.15141.73.085.1177.8302.340.742775.488
21959-07-011751.8289.226491.2601916.429.35140.53.825.3178.6572.741.092785.204
31959-10-011753.7299.356484.0521931.329.37140.04.335.6179.3860.274.062847.699
41960-01-011770.5331.722462.1991955.529.54139.63.505.2180.0072.311.192834.390
\n", + "
" + ], + "text/plain": [ + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "shifted_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f4efe10f-e4ca-4b61-821d-87959557a51e", + "metadata": {}, + "source": [ + "## Part 2: Shifting for Deep Learning" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9c6ef6be-d13b-4576-bdef-fa4afbb687a5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Applying Target Shifter in deep_learning mode...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n", + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data for deep learning mode (sequences):\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \\\n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "\n", + " realgdp_sequence \n", + "0 [2710.349, 2778.801, 2775.488] \n", + "1 [2778.801, 2775.488, 2785.204] \n", + "2 [2775.488, 2785.204, 2847.699] \n", + "3 [2785.204, 2847.699, 2834.39] \n", + "4 [2847.699, 2834.39, 2839.022] \n" + ] + } + ], + "source": [ + "# Step 5: Apply the TemporalTargetShifter in deep learning mode\n", + "MODE_DEEP_LEARNING = \"deep_learning\"\n", + "\n", + "print(f\"\\nApplying Target Shifter in {MODE_DEEP_LEARNING} mode...\")\n", + "\n", + "# Setup the TemporalTargetShifter for deep learning mode with a sequence length\n", + "sequence_length = 3 # Length of sequence for deep learning\n", + "shifter_dl = TemporalTargetShifter(n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=sequence_length, verbose=True)\n", + "\n", + "# Apply the shifter\n", + "shifted_dl_df = shifter_dl.fit_transform(modin_tf)\n", + "\n", + "# Print the shifted data with sequences\n", + "print(\"Shifted data for deep learning mode (sequences):\")\n", + "print(shifted_dl_df.head())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "369d0213-0bca-4c05-af9e-42daa260b3fe", + "metadata": { + "tags": [] + }, + "outputs": [ 
+ { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dsrealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealintrealgdp_shift_1realgdp_sequence
01959-01-011707.4286.898470.0451886.928.980139.72.825.8177.1460.000.002778.801[2710.349, 2778.801, 2775.488]
11959-04-011733.7310.859481.3011919.729.150141.73.085.1177.8302.340.742775.488[2778.801, 2775.488, 2785.204]
21959-07-011751.8289.226491.2601916.429.350140.53.825.3178.6572.741.092785.204[2775.488, 2785.204, 2847.699]
31959-10-011753.7299.356484.0521931.329.370140.04.335.6179.3860.274.062847.699[2785.204, 2847.699, 2834.39]
41960-01-011770.5331.722462.1991955.529.540139.63.505.2180.0072.311.192834.390[2847.699, 2834.39, 2839.022]
.............................................
1972008-04-019351.02026.518961.28010059.0218.6101409.31.745.4304.4838.53-6.7913324.600[13415.266, 13324.6, 13141.92]
1982008-07-019267.71990.693991.5519838.3216.8891474.71.176.0305.270-3.164.3313141.920[13324.6, 13141.92, 12925.41]
1992008-10-019195.31857.6611007.2739920.4212.1741576.50.126.9305.952-8.798.9112925.410[13141.92, 12925.41, 12901.504]
2002009-01-019209.21558.494996.2879926.4212.6711592.80.228.1306.5470.94-0.7112901.504[12925.41, 12901.504, 12990.341]
2012009-04-019189.01456.6781023.52810077.5214.4691653.60.189.2307.2263.37-3.1912990.341[12901.504, 12990.341, nan]
\n", + "

202 rows x 14 columns

\n", + "
" + ], + "text/plain": [ + " ds realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.980 139.7 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.150 141.7 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.350 140.5 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.370 140.0 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.540 139.6 \n", + ".. ... ... ... ... ... ... ... \n", + "197 2008-04-01 9351.0 2026.518 961.280 10059.0 218.610 1409.3 \n", + "198 2008-07-01 9267.7 1990.693 991.551 9838.3 216.889 1474.7 \n", + "199 2008-10-01 9195.3 1857.661 1007.273 9920.4 212.174 1576.5 \n", + "200 2009-01-01 9209.2 1558.494 996.287 9926.4 212.671 1592.8 \n", + "201 2009-04-01 9189.0 1456.678 1023.528 10077.5 214.469 1653.6 \n", + "\n", + " tbilrate unemp pop infl realint realgdp_shift_1 \\\n", + "0 2.82 5.8 177.146 0.00 0.00 2778.801 \n", + "1 3.08 5.1 177.830 2.34 0.74 2775.488 \n", + "2 3.82 5.3 178.657 2.74 1.09 2785.204 \n", + "3 4.33 5.6 179.386 0.27 4.06 2847.699 \n", + "4 3.50 5.2 180.007 2.31 1.19 2834.390 \n", + ".. ... ... ... ... ... ... \n", + "197 1.74 5.4 304.483 8.53 -6.79 13324.600 \n", + "198 1.17 6.0 305.270 -3.16 4.33 13141.920 \n", + "199 0.12 6.9 305.952 -8.79 8.91 12925.410 \n", + "200 0.22 8.1 306.547 0.94 -0.71 12901.504 \n", + "201 0.18 9.2 307.226 3.37 -3.19 12990.341 \n", + "\n", + " realgdp_sequence \n", + "0 [2710.349, 2778.801, 2775.488] \n", + "1 [2778.801, 2775.488, 2785.204] \n", + "2 [2775.488, 2785.204, 2847.699] \n", + "3 [2785.204, 2847.699, 2834.39] \n", + "4 [2847.699, 2834.39, 2839.022] \n", + ".. ... \n", + "197 [13415.266, 13324.6, 13141.92] \n", + "198 [13324.6, 13141.92, 12925.41] \n", + "199 [13141.92, 12925.41, 12901.504] \n", + "200 [12925.41, 12901.504, 12990.341] \n", + "201 [12901.504, 12990.341, nan] \n", + "\n", + "[202 rows x 14 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "shifted_dl_df" + ] + }, + { + "cell_type": "markdown", + "id": "b0cbc6e3-a665-45f2-a9aa-60b9057d5540", + "metadata": {}, + "source": [ + "## Part 4: Shifting for all backends" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "170bad23-b236-4837-b042-7218622c4e62", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "Loading the 'macrodata' dataset from the open-license statsmodels package.\n", + "Using 'realgdp' as the target column for future prediction.\n", + "======================================================================\n", + "======================================================================\n", + "Demonstrating Target Shifter for backend: pd\n", + "Preview of the TimeFrame DataFrame:\n", + " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", + "\n", + " tbilrate unemp pop infl realint \n", + "0 2.82 5.8 177.146 0.00 0.00 \n", + "1 3.08 5.1 177.830 2.34 0.74 \n", + "2 3.82 5.3 178.657 2.74 1.09 \n", + "3 4.33 5.6 179.386 0.27 4.06 \n", + "4 3.50 5.2 180.007 2.31 1.19 \n", + 
"======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode...\n", + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "\n", + "Applying Target Shifter in deep_learning mode...\n", + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \\\n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "\n", + " realgdp_sequence \n", + "0 [2710.349, 2778.801, 2775.488] \n", + "1 [2778.801, 2775.488, 2785.204] \n", + "2 [2775.488, 2785.204, 2847.699] \n", + "3 [2785.204, 2847.699, 2834.39] \n", + "4 [2847.699, 2834.39, 2839.022] \n", + "======================================================================\n", + "Demonstrating Target Shifter for backend: pl\n", + "Preview of the TimeFrame DataFrame:\n", + "shape: (5, 13)\n", + "┌─────────────────────┬──────────┬──────────┬─────────┬───┬───────┬─────────┬──────┬─────────┐\n", + "│ ds ┆ realgdp ┆ realcons ┆ realinv ┆ … ┆ unemp ┆ pop ┆ infl ┆ realint │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═════════════════════╪══════════╪══════════╪═════════╪═══╪═══════╪═════════╪══════╪═════════╡\n", + "│ 1959-01-01 00:00:00 ┆ 2710.349 ┆ 1707.4 ┆ 286.898 ┆ … ┆ 5.8 ┆ 177.146 ┆ 0.0 ┆ 0.0 │\n", + "│ 1959-04-01 00:00:00 ┆ 2778.801 ┆ 1733.7 ┆ 310.859 ┆ … ┆ 5.1 ┆ 177.83 ┆ 2.34 ┆ 0.74 │\n", + "│ 1959-07-01 00:00:00 ┆ 2775.488 ┆ 1751.8 ┆ 289.226 ┆ … ┆ 5.3 ┆ 178.657 ┆ 2.74 ┆ 1.09 │\n", + "│ 1959-10-01 00:00:00 ┆ 2785.204 ┆ 1753.7 ┆ 299.356 ┆ … ┆ 5.6 ┆ 179.386 ┆ 0.27 ┆ 4.06 │\n", + "│ 1960-01-01 00:00:00 ┆ 2847.699 ┆ 1770.5 ┆ 331.722 ┆ … ┆ 5.2 ┆ 180.007 ┆ 2.31 ┆ 1.19 │\n", + "└─────────────────────┴──────────┴──────────┴─────────┴───┴───────┴─────────┴──────┴─────────┘\n", + "======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode...\n", + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + "shape: (5, 13)\n", + "┌──────────────┬──────────┬─────────┬──────────┬───┬─────────┬──────┬─────────┬─────────────────┐\n", + "│ ds ┆ realcons ┆ realinv ┆ realgovt ┆ … ┆ pop ┆ 
infl ┆ realint ┆ realgdp_shift_1 │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════════╪══════════╪═════════╪══════════╪═══╪═════════╪══════╪═════════╪═════════════════╡\n", + "│ 1959-01-01 ┆ 1707.4 ┆ 286.898 ┆ 470.045 ┆ … ┆ 177.146 ┆ 0.0 ┆ 0.0 ┆ 2778.801 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1959-04-01 ┆ 1733.7 ┆ 310.859 ┆ 481.301 ┆ … ┆ 177.83 ┆ 2.34 ┆ 0.74 ┆ 2775.488 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1959-07-01 ┆ 1751.8 ┆ 289.226 ┆ 491.26 ┆ … ┆ 178.657 ┆ 2.74 ┆ 1.09 ┆ 2785.204 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1959-10-01 ┆ 1753.7 ┆ 299.356 ┆ 484.052 ┆ … ┆ 179.386 ┆ 0.27 ┆ 4.06 ┆ 2847.699 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1960-01-01 ┆ 1770.5 ┆ 331.722 ┆ 462.199 ┆ … ┆ 180.007 ┆ 2.31 ┆ 1.19 ┆ 2834.39 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└──────────────┴──────────┴─────────┴──────────┴───┴─────────┴──────┴─────────┴─────────────────┘\n", + "\n", + "Applying Target Shifter in deep_learning mode...\n", + "Rows before shift: 203; Rows after shift: 203; Rows dropped: 0\n", + "Shifted data:\n", + "shape: (5, 13)\n", + "┌──────────────┬──────────┬─────────┬──────────┬───┬─────────┬──────┬─────────┬────────────────────┐\n", + "│ ds ┆ realcons ┆ realinv ┆ realgovt ┆ … ┆ pop ┆ infl ┆ realint ┆ realgdp_sequence │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ list[f64] │\n", + "╞══════════════╪══════════╪═════════╪══════════╪═══╪═════════╪══════╪═════════╪════════════════════╡\n", + "│ 1959-01-01 ┆ 1707.4 ┆ 286.898 ┆ 470.045 ┆ … ┆ 177.146 ┆ 0.0 ┆ 0.0 ┆ [2710.349, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2778.801, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2775.488] │\n", + "│ 1959-04-01 ┆ 1733.7 ┆ 310.859 ┆ 481.301 ┆ … ┆ 177.83 ┆ 2.34 ┆ 0.74 ┆ [2778.801, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2775.488, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2785.204] │\n", + "│ 1959-07-01 ┆ 1751.8 ┆ 289.226 ┆ 491.26 ┆ … ┆ 178.657 ┆ 2.74 ┆ 1.09 ┆ [2775.488, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2785.204, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2847.699] │\n", + "│ 1959-10-01 ┆ 1753.7 ┆ 299.356 ┆ 484.052 ┆ … ┆ 179.386 ┆ 0.27 ┆ 4.06 ┆ [2785.204, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2847.699, 2834.39] │\n", + "│ 1960-01-01 ┆ 1770.5 ┆ 331.722 ┆ 462.199 ┆ … ┆ 180.007 ┆ 2.31 ┆ 1.19 ┆ [2847.699, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2834.39, 2839.022] │\n", + "└──────────────┴──────────┴─────────┴──────────┴───┴─────────┴──────┴─────────┴────────────────────┘\n", + "======================================================================\n", + "Demonstrating Target Shifter for backend: mpd\n", + "Preview of the TimeFrame DataFrame:\n", + " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", + "\n", + " tbilrate unemp pop infl realint \n", + "0 2.82 5.8 177.146 0.00 0.00 \n", + "1 3.08 5.1 177.830 2.34 0.74 \n", + "2 3.82 5.3 178.657 2.74 1.09 \n", + "3 4.33 5.6 179.386 0.27 4.06 \n", + "4 3.50 5.2 180.007 2.31 1.19 \n", + "======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode...\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "\n", + "Applying Target Shifter in deep_learning mode...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n", + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \\\n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "\n", + " realgdp_sequence \n", + "0 [2710.349, 2778.801, 2775.488] \n", + "1 [2778.801, 2775.488, 2785.204] \n", + "2 [2775.488, 2785.204, 2847.699] \n", + "3 [2785.204, 2847.699, 2834.39] \n", + "4 [2847.699, 2834.39, 2839.022] \n" + ] + } + ], + "source": [ + "import modin.pandas as mpd\n", + "import pandas as pd\n", + "import polars as pl\n", + "from statsmodels.datasets import macrodata\n", + "\n", + "from temporalscope.core.temporal_data_loader import TimeFrame as tf\n", + "from temporalscope.core.core_utils import print_divider, BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN\n", + "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", + "\n", + "# Constants for modes\n", + "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "MODE_DEEP_LEARNING = \"deep_learning\"\n", + "\n", + "def load_macrodata(target_col: str = \"realgdp\"):\n", + " \"\"\"Preprocess the dataset with a combined column for time & shifted target.\n", + "\n", + " :param target_col: The column to be used as the target for prediction\n", + " :type target_col: str, optional\n", + " :default target_col: 'realgdp'\n", + "\n", + " :return: Preprocessed DataFrame with shifted target\n", + " :rtype: pd.DataFrame\n", + " \"\"\"\n", + " print_divider()\n", + " print(\"Loading the 'macrodata' dataset from the open-license statsmodels package.\")\n", + " print(f\"Using '{target_col}' as the target column for future 
prediction.\")\n", + " print_divider()\n", + "\n", + " # Load macrodata dataset\n", + " macro_df = macrodata.load_pandas().data.copy()\n", + "\n", + " # Create 'ds' column by combining 'year' and 'quarter'\n", + " macro_df[\"ds\"] = pd.to_datetime(\n", + " macro_df[\"year\"].astype(int).astype(str)\n", + " + \"-\"\n", + " + ((macro_df[\"quarter\"] - 1) * 3 + 1).astype(int).astype(str)\n", + " + \"-01\"\n", + " )\n", + "\n", + " # Drop the 'year' and 'quarter' columns\n", + " macro_df = macro_df.drop(columns=[\"year\", \"quarter\"])\n", + "\n", + " # Reorder columns to place 'ds' first\n", + " cols = [\"ds\"] + [col for col in macro_df.columns if col != \"ds\"]\n", + " macro_df = macro_df[cols].copy()\n", + "\n", + " return macro_df, target_col\n", + "\n", + "\n", + "def init_timeframes_for_backends(macro_df, target_col: str):\n", + " \"\"\"Initialize TimeFrame objects for all backends (Pandas, Polars, Modin) using constants.\n", + "\n", + " :param macro_df: Preprocessed macro dataset.\n", + " :type macro_df: pd.DataFrame\n", + " :param target_col: The target column for prediction.\n", + " :type target_col: str\n", + " :return: A dictionary containing TimeFrame objects for Pandas, Polars, and Modin.\n", + " :rtype: dict\n", + " \"\"\"\n", + " timeframes = {}\n", + "\n", + " # Pandas backend\n", + " macro_pandas_df = pd.DataFrame(macro_df)\n", + " timeframes[BACKEND_PANDAS] = tf(\n", + " macro_pandas_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_PANDAS\n", + " )\n", + "\n", + " # Polars backend\n", + " macro_polars_df = pl.DataFrame(macro_df)\n", + " timeframes[BACKEND_POLARS] = tf(\n", + " macro_polars_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_POLARS\n", + " )\n", + "\n", + " # Modin backend\n", + " macro_modin_df = mpd.DataFrame(macro_df)\n", + " timeframes[BACKEND_MODIN] = tf(\n", + " macro_modin_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_MODIN\n", + " )\n", + "\n", + " return timeframes\n", + "\n", + "\n", + "def apply_target_shifter(tf_obj, mode: str):\n", + " \"\"\"Apply the TemporalTargetShifter in the specified mode.\n", + "\n", + " :param tf_obj: TimeFrame object to apply the shifter to.\n", + " :param mode: Mode of operation (machine_learning or deep_learning).\n", + " \"\"\"\n", + " print(f\"\\nApplying Target Shifter in {mode} mode...\")\n", + "\n", + " # Setup the TemporalTargetShifter\n", + " if mode == MODE_MACHINE_LEARNING:\n", + " shifter = TemporalTargetShifter(n_lags=1, mode=MODE_MACHINE_LEARNING, verbose=True)\n", + " elif mode == MODE_DEEP_LEARNING:\n", + " # In deep learning mode, sequence_length must be provided\n", + " shifter = TemporalTargetShifter(n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=3, verbose=True)\n", + " else:\n", + " raise ValueError(f\"Invalid mode: {mode}\")\n", + "\n", + " # Apply the shifter\n", + " shifted_df = shifter.fit_transform(tf_obj)\n", + "\n", + " # Print the result (since it's already a DataFrame, no need for get_data())\n", + " print(\"Shifted data:\")\n", + " print(shifted_df.head())\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " # Load the macrodata dataset and preprocess\n", + " macro_df, target_col = load_macrodata()\n", + "\n", + " # Initialize TimeFrame objects for various backends using constants\n", + " timeframes = init_timeframes_for_backends(macro_df, target_col)\n", + "\n", + " # Apply and demonstrate shifting for all backends\n", + " for backend, tf_obj in timeframes.items():\n", + " print_divider()\n", + " print(f\"Demonstrating Target Shifter for backend: 
{backend}\")\n", + " print(\"Preview of the TimeFrame DataFrame:\")\n", + " print(tf_obj.get_data().head())\n", + " print_divider()\n", + "\n", + " # Apply target shifting in machine learning mode\n", + " apply_target_shifter(tf_obj, MODE_MACHINE_LEARNING)\n", + "\n", + " # Apply target shifting in deep learning mode\n", + " apply_target_shifter(tf_obj, MODE_DEEP_LEARNING)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7d45fd7-3773-4d8b-ba66-af01b4aa2dd3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TemporalScope", + "language": "python", + "name": "temporalscope-kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorial_notebooks/introduction/1_tutorial_partion_data.ipynb b/tutorial_notebooks/introduction/1_tutorial_partion_data.ipynb deleted file mode 100644 index 0967c79..0000000 --- a/tutorial_notebooks/introduction/1_tutorial_partion_data.ipynb +++ /dev/null @@ -1,159 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import modin.pandas as mpd\n", - "import pandas as pd\n", - "from statsmodels.datasets import macrodata\n", - "\n", - "from temporalscope.core.temporal_data_loader import TimeFrame as tf\n", - "from temporalscope.core.utils import print_divider\n", - "from temporalscope.partition.sliding_window import SlidingWindowPartitioner as SWP\n", - "\n", - "\n", - "def load_macrodata(target_col: str = \"realgdp\"):\n", - " \"\"\"Preprocess the dataset with a combined column for time & shifted target.\n", - "\n", - " :param target_col: The column to be used as the target for prediction.\n", - " Defaults to 'realgdp'.\n", - "\n", - " :type target_col: str, optional\n", - "\n", - " :returns: Preprocessed DataFrame with shifted target.\n", - " :rtype: pd.DataFrame\n", - " \"\"\"\n", - " print_divider()\n", - " print(\"Loading the 'macrodata' dataset from the open-license statsmodels package.\")\n", - " print(f\"Using '{target_col}' as the target column for future prediction.\")\n", - " print_divider()\n", - "\n", - " # Load macrodata dataset\n", - " macro_df = macrodata.load_pandas().data.copy()\n", - "\n", - " # Create 'ds' column by combining 'year' and 'quarter'\n", - " macro_df[\"ds\"] = pd.to_datetime(\n", - " macro_df[\"year\"].astype(int).astype(str)\n", - " + \"-\"\n", - " + ((macro_df[\"quarter\"] - 1) * 3 + 1).astype(int).astype(str)\n", - " + \"-01\"\n", - " )\n", - "\n", - " # Drop the 'year' and 'quarter' columns\n", - " macro_df = macro_df.drop(columns=[\"year\", \"quarter\"])\n", - "\n", - " # Reorder columns to place 'ds' first\n", - " cols = [\"ds\"] + [col for col in macro_df.columns if col != \"ds\"]\n", - " macro_df = macro_df[cols].copy()\n", - "\n", - " # Shift the target column for future prediction and rename it\n", - " shifted_target_col = f\"target_{target_col}\"\n", - " macro_df[shifted_target_col] = macro_df[target_col].shift(-1)\n", - "\n", - " # Drop any rows with NaN (due to shifting)\n", - " macro_df = macro_df.dropna().copy()\n", - "\n", - " # Print the shape of the DataFrame\n", - " print(f\"Loaded DataFrame shape: {macro_df.shape}\")\n", - "\n", - " print_divider()\n", - " print(\n", - " 
f\"\"\"Shifted '{target_col}' to create a new target column '{shifted_target_col}'\n", - " for future prediction.\"\"\"\n", - " )\n", - " print_divider()\n", - "\n", - " return macro_df, shifted_target_col\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " # Load the macrodata dataset and preprocess\n", - " macro_df, shifted_target_col = load_macrodata()\n", - "\n", - " # Initialize the TimeFrame using the Modin backend\n", - " print_divider()\n", - " print(\"Using Modin backend for Sliding Window Partitioning:\")\n", - " macro_modin_df = mpd.DataFrame(macro_df)\n", - " macro_modin_tf = tf(\n", - " macro_modin_df, time_col=\"ds\", target_col=shifted_target_col, backend=\"mpd\"\n", - " )\n", - "\n", - " print(\"Preview of the Modin DataFrame (macrodata):\")\n", - " print(macro_modin_tf.get_data().head())\n", - " print_divider()\n", - "\n", - " # Initialize SlidingWindowPartitioner with the TimeFrame object\n", - " print(\"Applying Sliding Window Partitioner:\")\n", - "\n", - " partitioner = SWP(\n", - " tf=macro_modin_tf, # TimeFrame object\n", - " window_size=20, # Fixed window size of 20\n", - " stride=10, # Step size between windows of 10\n", - " truncate=True, # Skip the last partition if it doesn't meet the window size\n", - " expand_last=False, # Do not expand the last partition to match the window size\n", - " enable_warnings=True, # Enable warnings for uneven partitions\n", - " )\n", - "\n", - " # Get the partition indices dictionary.\n", - " # 60% training, 20% testing, and 20% validation splits\n", - " partitions_dict = partitioner.get_partitions_indices_dict(\n", - " train_pct=0.6, test_pct=0.2, val_pct=0.2\n", - " )\n", - "\n", - " # Print the partitioned indices\n", - " print(\"Partitioned Indices with 60% train, 20% test, and 20% validation split:\")\n", - " for partition_name, partition_indices in partitions_dict.items():\n", - " print(f\"{partition_name}: {partition_indices}\")\n", - " print_divider()\n", - "\n", - " # Verify the partitions by printing the training data from the first partition\n", - " print(\"Training data of the first partition:\")\n", - " first_partition = partitioner.apply_partition(\n", - " partitions_dict[\"partition_1\"][\"train\"]\n", - " )\n", - " print(first_partition)\n", - " print_divider()\n", - "\n", - " # Verify the test data from the first partition\n", - " print(\"Test data of the first partition:\")\n", - " test_partition = partitioner.apply_partition(partitions_dict[\"partition_1\"][\"test\"])\n", - " print(test_partition)\n", - " print_divider()\n", - "\n", - " # Verify the validation data from the first partition\n", - " print(\"Validation data of the first partition:\")\n", - " validation_partition = partitioner.apply_partition(\n", - " partitions_dict[\"partition_1\"][\"validation\"]\n", - " )\n", - " print(validation_partition)\n", - " print_divider()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (temporalscope_hatch)", - "language": "python", - "name": "temporalscope_hatch" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tutorial_notebooks/speed_test_generators.ipynb b/tutorial_notebooks/speed_test_generators.ipynb deleted file mode 100644 index c3b9637..0000000 --- a/tutorial_notebooks/speed_test_generators.ipynb +++ /dev/null @@ 
-1,116 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 10, - "id": "c34c2376-3c80-4983-99db-ea52a0de3323", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (5, 2)\n", - "┌─────┬─────┐\n", - "│ A ┆ B │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═════╪═════╡\n", - "│ 1 ┆ a │\n", - "│ 2 ┆ b │\n", - "│ 3 ┆ c │\n", - "│ 4 ┆ d │\n", - "│ 5 ┆ e │\n", - "└─────┴─────┘\n" - ] - } - ], - "source": [ - "import polars as pl\n", - "\n", - "# Create a simple DataFrame\n", - "df = pl.DataFrame({\n", - " \"A\": [1, 2, 3, 4, 5],\n", - " \"B\": [\"a\", \"b\", \"c\", \"d\", \"e\"]\n", - "})\n", - "\n", - "print(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "5e882bf0-74f5-4e8f-812a-ccedbe902cd2", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (10, 2)\n", - "┌────────────┬───────┐\n", - "│ date ┆ value │\n", - "│ --- ┆ --- │\n", - "│ date ┆ i64 │\n", - "╞════════════╪═══════╡\n", - "│ 2021-01-01 ┆ 0 │\n", - "│ 2021-01-02 ┆ 1 │\n", - "│ 2021-01-03 ┆ 2 │\n", - "│ 2021-01-04 ┆ 3 │\n", - "│ 2021-01-05 ┆ 4 │\n", - "│ 2021-01-06 ┆ 5 │\n", - "│ 2021-01-07 ┆ 6 │\n", - "│ 2021-01-08 ┆ 7 │\n", - "│ 2021-01-09 ┆ 8 │\n", - "│ 2021-01-10 ┆ 9 │\n", - "└────────────┴───────┘\n" - ] - } - ], - "source": [ - "import polars as pl\n", - "from datetime import date\n", - "\n", - "df = pl.DataFrame({\n", - " \"date\": [date(2021, 1, i) for i in range(1, 11)],\n", - " \"value\": list(range(10))\n", - "})\n", - "\n", - "print(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f24851c-443d-4c2e-9251-6f83fffdf8e1", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "TemporalScope", - "language": "python", - "name": "temporalscope-kernel" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 756ca18b6b6abbc9343b9f1bb2ad1a1c9a4b247e Mon Sep 17 00:00:00 2001 From: Philip Ndikum Date: Sat, 21 Sep 2024 02:42:39 +0000 Subject: [PATCH 3/7] refactor(refactor:-fix-pre-commit-linting-and-CI/CD-errors---Fixed-linting-errors-from-ruff,-including-docstring-format-and-magic-values---Resolved-issues-with-pre-commit-hooks-for-linting,-security,-and-style-checks---Ensured-codespell,-bandit,-mypy,-and-other-pre-commit-checks-pass-cleanly---Updated-and-cleaned-up-CI/CD-pipeline-to-ensure-smooth-code-integration): fix pre-commit linting errors and update ci/cd pipeline --- .pre-commit-config.yaml | 7 + pyproject.toml | 30 +++-- src/temporalscope/core/core_utils.py | 101 +++++++-------- .../core/temporal_data_loader.py | 101 +++++++-------- .../core/temporal_target_shifter.py | 121 +++++++----------- src/temporalscope/metrics/masv.py | 18 +-- src/temporalscope/partition/base_protocol.py | 23 ++-- .../partition/partition_validators.py | 90 +++++++------ src/temporalscope/partition/sliding_window.py | 114 +++++++---------- test/unit/test_core_temporal_data_loader.py | 73 ++++++----- .../unit/test_core_temporal_target_shifter.py | 55 ++++---- test/unit/test_core_utils.py | 56 +++++--- test/unit/test_partion_data_checks.py | 2 +- .../introduction/0_load_data_timeframe.ipynb | 2 +- .../1_load_data_target_shifter.ipynb | 
9 +- 15 files changed, 392 insertions(+), 410 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ae6be35..be44c13 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,12 @@ repos: rev: v0.6.5 hooks: - id: ruff + # Exclude tests and tutorials + exclude: "^(test/|tutorial_notebooks/)" + # No args needed, uses pyproject.toml settings - id: ruff-format + args: ["--line-length=120"] + # No need for --ignore options here, as ruff-format is for applying automatic fixes. - repo: https://github.com/codespell-project/codespell rev: v2.3.0 @@ -33,6 +38,8 @@ repos: - id: codespell additional_dependencies: - tomli + args: ["--ignore-words-list=Nam"] + - repo: https://github.com/rhysd/actionlint rev: v1.7.1 diff --git a/pyproject.toml b/pyproject.toml index 4d3f64f..2fbe849 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,8 @@ dependencies = [ "bandit", # Include dependencies for QA scripts "black", # Include dependencies for QA scripts "pytest", # Include pytest for running tests - "pytest-cov" # Include pytest-cov for coverage if needed + "pytest-cov", # Include pytest-cov for coverage if needed + "docformatter", # Add docformatter for docstring formatting "commitizen", ] @@ -106,14 +107,13 @@ log_date_format = "%Y-%m-%d %H:%M:%S" minversion = "6.0" filterwarnings = "ignore" +[tool.black] +line-length = 120 # Set Black's line length to 120 for consistency + [tool.ruff] -extend-exclude = ["*.pyc"] +extend-exclude = ["*.pyc", "test/*", "tutorial_notebooks/*"] target-version = "py310" -line-length = 88 - - -[tool.ruff.format] -docstring-code-format = true +line-length = 120 # Consistent line length across all tools [tool.ruff.lint] select = [ @@ -139,7 +139,12 @@ select = [ # docstring rules "D", # flake8-docstrings ] + ignore = [ + "D400", # Ignore "First line should end with a period" for docstrings. + "D401", # Ignore "First line should be in imperative mood" for docstrings. + "D415", # Ignore "First line should end with a period, question mark, or exclamation point." + "E501", # Ignore "Line too long" in docstrings/comments for exceeding 120 characters. 
"PERF203", # `try`-`except` within a loop incurs performance overhead "PERF401", # Use a list comprehension to create a transformed list "PLR1714", # repeated-equality-comparison @@ -160,7 +165,7 @@ ignore = [ "docs/conf.py" = ["A001", "D103"] [tool.mypy] -files = "src/temporalscope" +files = "src/temporalscope" python_version = "3.10" ignore_missing_imports = true warn_unreachable = true @@ -184,10 +189,15 @@ check = "ruff check {args}" fix = "ruff check --fix" format = "ruff format {args}" format-check = "ruff format --check {args}" -# Automated developer Q&A script +docformat = """ +docformatter --check --recursive --wrap-summaries 120 --wrap-descriptions 120 src/temporalscope || \ +docformatter --in-place --recursive --wrap-summaries 120 --wrap-descriptions 120 src/temporalscope +""" +# Automated developer Q&A script quality-assurance = """ pytest && -black src/temporalscope && +docformatter --check --recursive --wrap-summaries 120 --wrap-descriptions 120 src/temporalscope || \ +docformatter --in-place --recursive --wrap-summaries 120 --wrap-descriptions 120 src/temporalscope ruff check src/temporalscope --output-format=full --show-files --show-fixes && mypy src/temporalscope --ignore-missing-imports --show-error-codes --warn-unreachable && bandit -r src/temporalscope diff --git a/src/temporalscope/core/core_utils.py b/src/temporalscope/core/core_utils.py index 28ce5cf..e3f2f0a 100644 --- a/src/temporalscope/core/core_utils.py +++ b/src/temporalscope/core/core_utils.py @@ -1,13 +1,11 @@ -""" TemporalScope/src/temporalscope/core/core_utils.py +"""TemporalScope/src/temporalscope/core/core_utils.py. -This module provides utility functions that can be used throughout the TemporalScope package. -It includes methods for printing dividers, checking for nulls and NaNs, and validating the backend. +This module provides utility functions that can be used throughout the TemporalScope package. It includes methods for +printing dividers, checking for nulls and NaNs, and validating the backend. -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in +compliance with the License. You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -16,12 +14,13 @@ limitations under the License. """ -from typing import Union, cast, Dict, Optional, NoReturn import os -from dotenv import load_dotenv -import polars as pl -import pandas as pd +from typing import Dict, NoReturn, Optional, Union, cast + import modin.pandas as mpd +import pandas as pd +import polars as pl +from dotenv import load_dotenv # Load environment variables from the .env file load_dotenv() @@ -45,6 +44,7 @@ # Define a type alias for DataFrames that support Pandas, Modin, and Polars backends SupportedBackendDataFrame = Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame] + def get_default_backend_cfg() -> Dict[str, Dict[str, str]]: """Retrieve the application configuration settings. 
@@ -78,13 +78,11 @@ def raise_invalid_backend(backend: str) -> NoReturn: raise ValueError(f"Unsupported backend: {backend}") -def validate_input( - df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], backend: str -) -> None: +def validate_input(df: SupportedBackendDataFrame, backend: str) -> None: """Validate that the DataFrame matches the expected type for the specified backend. :param df: The DataFrame to validate. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param backend: The backend against which to validate the DataFrame's type ('pl', 'pd', 'mpd'). :type backend: str :raises TypeError: If the DataFrame does not match the expected type for the backend. @@ -97,48 +95,47 @@ def validate_input( raise TypeError("Expected a Modin DataFrame.") -def validate_and_convert_input( - df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], backend: str -) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: +def validate_and_convert_input(df: SupportedBackendDataFrame, backend: str) -> SupportedBackendDataFrame: """Validates and converts the input DataFrame to the specified backend type. :param df: The input DataFrame to validate and convert. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param backend: The desired backend type ('pl', 'pd', or 'mpd'). :type backend: str :return: The DataFrame converted to the specified backend type. - :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :rtype: SupportedBackendDataFrame :raises TypeError: If the input DataFrame type doesn't match the specified backend or conversion fails. :raises ValueError: If the backend is not supported. """ validate_backend(backend) # Validates if backend is supported - if backend == BACKEND_POLARS: - if isinstance(df, pl.DataFrame): - return df - elif isinstance(df, pd.DataFrame): - return pl.from_pandas(df) # Convert Pandas to Polars - elif isinstance(df, mpd.DataFrame): - return pl.from_pandas(df._to_pandas()) # Modin to Pandas to Polars - elif backend == BACKEND_PANDAS: - if isinstance(df, pd.DataFrame): - return df - elif isinstance(df, pl.DataFrame): - return df.to_pandas() # Convert Polars to Pandas - elif isinstance(df, mpd.DataFrame): - return df._to_pandas() # Convert Modin to Pandas - elif backend == BACKEND_MODIN: - if isinstance(df, mpd.DataFrame): - return df - elif isinstance(df, pd.DataFrame): - return mpd.DataFrame(df) # Convert Pandas to Modin - elif isinstance(df, pl.DataFrame): - return mpd.DataFrame(df.to_pandas()) # Polars to Pandas to Modin + # Mapping for backends and conversion functions + backend_conversion_map = { + BACKEND_POLARS: { + pl.DataFrame: lambda x: x, + pd.DataFrame: pl.from_pandas, + mpd.DataFrame: lambda x: pl.from_pandas(x._to_pandas()), + }, + BACKEND_PANDAS: { + pd.DataFrame: lambda x: x, + pl.DataFrame: lambda x: x.to_pandas(), + mpd.DataFrame: lambda x: x._to_pandas(), + }, + BACKEND_MODIN: { + mpd.DataFrame: lambda x: x, + pd.DataFrame: lambda x: mpd.DataFrame(x), + pl.DataFrame: lambda x: mpd.DataFrame(x.to_pandas()), + }, + } + + if backend not in backend_conversion_map: + raise ValueError(f"Unsupported backend: {backend}") + + for dataframe_type, conversion_func in backend_conversion_map[backend].items(): + if isinstance(df, dataframe_type): + return conversion_func(df) - # If none of the types match, raise a TypeError - raise TypeError( - f"Input DataFrame type {type(df)} does not match the specified backend '{backend}'" - ) + raise TypeError(f"Input DataFrame type 
{type(df)} does not match the specified backend '{backend}'") def get_api_keys() -> Dict[str, Optional[str]]: @@ -171,13 +168,11 @@ def print_divider(char: str = "=", length: int = 70) -> None: print(char * length) -def check_nulls( - df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], backend: str -) -> bool: +def check_nulls(df: SupportedBackendDataFrame, backend: str) -> bool: """Check for null values in the DataFrame using the specified backend. :param df: The DataFrame to check for null values. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param backend: The backend used for the DataFrame ('pl', 'pd', 'mpd'). :type backend: str :return: True if there are null values, False otherwise. @@ -198,13 +193,11 @@ def check_nulls( raise_invalid_backend(backend) -def check_nans( - df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], backend: str -) -> bool: +def check_nans(df: SupportedBackendDataFrame, backend: str) -> bool: """Check for NaN values in the DataFrame using the specified backend. :param df: The DataFrame to check for NaN values. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param backend: The backend used for the DataFrame ('pl', 'pd', 'mpd'). :type backend: str :return: True if there are NaN values, False otherwise. diff --git a/src/temporalscope/core/temporal_data_loader.py b/src/temporalscope/core/temporal_data_loader.py index df099cf..785a223 100644 --- a/src/temporalscope/core/temporal_data_loader.py +++ b/src/temporalscope/core/temporal_data_loader.py @@ -1,9 +1,9 @@ -""" TemporalScope/src/temporalscope/core/temporal_data_loader.py +"""TemporalScope/src/temporalscope/core/temporal_data_loader.py. -This module provides a flexible data loader for time series forecasting, allowing users to define their own -preprocessing, loss functions, and explainability workflows. The core assumption is that features are organized +This module provides a flexible data loader for time series forecasting, allowing users to define their own +preprocessing, loss functions, and explainability workflows. The core assumption is that features are organized in a context window prior to the target column, making the system compatible with SHAP and other explainability methods. -Given the variance in pre-processing techniques, meta-learning & loss-functions TemporalScope explicitly does not +Given the variance in pre-processing techniques, meta-learning & loss-functions TemporalScope explicitly does not impose constraints on the end-user in the engineering design. .. seealso:: @@ -12,36 +12,40 @@ 2. Woo, G., Liu, C., Kumar, A., Xiong, C., Savarese, S., & Sahoo, D. (2024). Unified training of universal time series forecasting transformers. arXiv preprint arXiv:2402.02592. 3. Trirat, P., Shin, Y., Kang, J., Nam, Y., Na, J., Bae, M., Kim, J., Kim, B., & Lee, J.-G. (2024). Universal time-series representation learning: A survey. arXiv preprint arXiv:2401.03717. -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software distributed under the License is distributed -on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed +on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ -from typing import Union, Optional -import polars as pl -import pandas as pd +from typing import Optional, Union + import modin.pandas as mpd +import pandas as pd +import polars as pl + from temporalscope.core.core_utils import ( + BACKEND_MODIN, + BACKEND_PANDAS, + BACKEND_POLARS, + SupportedBackendDataFrame, + get_default_backend_cfg, + validate_and_convert_input, validate_backend, validate_input, - validate_and_convert_input, - get_default_backend_cfg, - BACKEND_POLARS, - BACKEND_PANDAS, - BACKEND_MODIN, ) class TimeFrame: - """Central class for the TemporalScope package, designed to manage time series data - across various backends such as Polars, Pandas, and Modin. This class enables - modular and flexible workflows for machine learning, deep learning, and time - series explainability (XAI) methods like temporal SHAP. + """Central class for the TemporalScope package. + + Designed to manage time series data across various backends such as + Polars, Pandas, and Modin. This class enables modular and flexible workflows for machine learning, deep learning, + and time series explainability (XAI) methods like temporal SHAP. The `TimeFrame` class supports workflows where the target variable can be either 1D scalar data, typical in classical machine learning, or 3D tensor data, more common in deep learning contexts. @@ -87,7 +91,7 @@ class TimeFrame: def __init__( self, - df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], + df: SupportedBackendDataFrame, time_col: str, target_col: str, backend: Optional[str] = None, @@ -96,7 +100,7 @@ def __init__( """Initialize a TimeFrame object. :param df: The input DataFrame. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param time_col: The name of the column representing time in the DataFrame. :type time_col: str :param target_col: The name of the column representing the target variable in the DataFrame. @@ -161,13 +165,11 @@ def target_col(self) -> str: """ return self._target_col - def _infer_backend( - self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - ) -> str: + def _infer_backend(self, df: SupportedBackendDataFrame) -> str: """Infer the backend from the DataFrame type. :param df: The input DataFrame. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :return: The inferred backend ('pl', 'pd', or 'mpd'). :rtype: str :raises ValueError: If the DataFrame type is unsupported. @@ -181,13 +183,11 @@ def _infer_backend( else: raise ValueError(f"Unsupported DataFrame type: {type(df)}") - def _validate_columns( - self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - ) -> None: + def _validate_columns(self, df: SupportedBackendDataFrame) -> None: """Validate the presence of required columns in the DataFrame. :param df: The DataFrame to validate. 
- :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :raises ValueError: If required columns are missing. """ required_columns = [self._time_col, self._target_col] @@ -197,17 +197,17 @@ def _validate_columns( def _sort_data( self, - df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], + df: SupportedBackendDataFrame, ascending: bool = True, - ) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: + ) -> SupportedBackendDataFrame: """Internal method to sort the DataFrame based on the backend. :param df: The DataFrame to sort. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param ascending: If True, sort in ascending order; if False, sort in descending order. :type ascending: bool :return: The sorted DataFrame. - :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :rtype: SupportedBackendDataFrame :raises TypeError: If the DataFrame type does not match the backend. :raises ValueError: If the backend is unsupported. """ @@ -228,15 +228,13 @@ def _sort_data( except KeyError: raise ValueError(f"Unsupported backend: {self._backend}") - def _setup_timeframe( - self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - ) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: + def _setup_timeframe(self, df: SupportedBackendDataFrame) -> SupportedBackendDataFrame: """Sets up the TimeFrame object by converting, validating, and preparing data as required. :param df: The input DataFrame to be processed. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :return: The processed DataFrame. - :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :rtype: SupportedBackendDataFrame :raises ValueError: - If required columns are missing. - If the specified backend is not supported. @@ -264,21 +262,19 @@ def sort_data(self, ascending: bool = True) -> None: """ self.df = self._sort_data(self.df, ascending=ascending) - def get_data(self) -> Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame]: + def get_data(self) -> SupportedBackendDataFrame: """Return the DataFrame in its current state. :return: The DataFrame managed by the TimeFrame instance. - :rtype: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :rtype: SupportedBackendDataFrame """ return self.df - def update_data( - self, new_df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - ) -> None: + def update_data(self, new_df: SupportedBackendDataFrame) -> None: """Updates the internal DataFrame with the provided new DataFrame. :param new_df: The new DataFrame to replace the existing one. - :type new_df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type new_df: SupportedBackendDataFrame :raises TypeError: If the new DataFrame type does not match the backend. :raises ValueError: If required columns are missing in the new DataFrame. """ @@ -288,9 +284,7 @@ def update_data( self._validate_columns(new_df) self.df = new_df - def update_target_col( - self, new_target_col: Union[pl.Series, pd.Series, mpd.Series] - ) -> None: + def update_target_col(self, new_target_col: Union[pl.Series, pd.Series, mpd.Series]) -> None: """Updates the target column in the internal DataFrame with the provided new target column. :param new_target_col: The new target column to replace the existing one. 
@@ -313,19 +307,12 @@ def update_target_col( # Check if the new target column length matches the DataFrame length if len(new_target_col) != len(self.df): - raise ValueError( - "The new target column must have the same number of rows as the DataFrame." - ) + raise ValueError("The new target column must have the same number of rows as the DataFrame.") # Update the target column based on the backend if self._backend == BACKEND_POLARS: - # Polars uses the alias method for column renaming self.df = self.df.with_columns([new_target_col.alias(self._target_col)]) elif self._backend == BACKEND_PANDAS: - # Pandas series has .values - assert isinstance(new_target_col, pd.Series) # For mypy - self.df[self._target_col] = new_target_col.values + self.df[self._target_col] = new_target_col.to_numpy() # Convert to NumPy for Pandas elif self._backend == BACKEND_MODIN: - # Modin series has .to_numpy - assert isinstance(new_target_col, mpd.Series) # For mypy - self.df[self._target_col] = new_target_col.to_numpy() + self.df[self._target_col] = new_target_col.to_numpy() # Use .to_numpy() for Modin diff --git a/src/temporalscope/core/temporal_target_shifter.py b/src/temporalscope/core/temporal_target_shifter.py index 60cdaa5..ff9b762 100644 --- a/src/temporalscope/core/temporal_target_shifter.py +++ b/src/temporalscope/core/temporal_target_shifter.py @@ -1,4 +1,4 @@ -""" TemporalScope/src/temporalscope/core/temporal_target_shifter.py +"""TemporalScope/src/temporalscope/core/temporal_target_shifter.py. This module provides a transformer-like class to shift the target variable in time series data, either to a scalar value (for classical machine learning) or to an array (for deep learning). @@ -23,19 +23,21 @@ limitations under the License. """ -from typing import Union, Optional -import polars as pl -import pandas as pd -import modin.pandas as mpd import warnings -from temporalscope.core.temporal_data_loader import TimeFrame +from typing import Optional, Union + +import modin.pandas as mpd +import pandas as pd +import polars as pl + from temporalscope.core.core_utils import ( - validate_backend, - validate_input, - BACKEND_POLARS, - BACKEND_PANDAS, BACKEND_MODIN, + BACKEND_PANDAS, + BACKEND_POLARS, + SupportedBackendDataFrame, + validate_backend, ) +from temporalscope.core.temporal_data_loader import TimeFrame class TemporalTargetShifter: @@ -53,8 +55,8 @@ class TemporalTargetShifter: 2. The time shifting is applied to the target column, which may have varying data structures depending on the backend (Polars, Pandas, Modin). - Examples: - --------- + Examples + -------- **Using `TimeFrame`:** .. code-block:: python @@ -110,6 +112,7 @@ class TemporalTargetShifter: :param verbose: If True, prints information about the number of dropped rows during transformation. :type verbose: bool :raises ValueError: If the backend is unsupported or if validation checks fail. + """ MODE_MACHINE_LEARNING = "machine_learning" @@ -128,21 +131,18 @@ def __init__( :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1. :param mode: Mode of operation: "machine_learning" or "deep_learning". Default is "machine_learning". - :param sequence_length: (Deep Learning Mode Only) Length of the input sequences. Required if mode is "deep_learning". + :param sequence_length: (Deep Learning Mode Only) Length of the input sequences. Required if mode is + "deep_learning". :param target_col: Column representing the target variable (mandatory). :param drop_target: Whether to drop the original target column after shifting. 
Default is True. :param verbose: Whether to print detailed information about transformations. :raises ValueError: If the target column is not provided or if an invalid mode is selected. """ if mode not in [self.MODE_MACHINE_LEARNING, self.MODE_DEEP_LEARNING]: - raise ValueError( - f"`mode` must be '{self.MODE_MACHINE_LEARNING}' or '{self.MODE_DEEP_LEARNING}'." - ) + raise ValueError(f"`mode` must be '{self.MODE_MACHINE_LEARNING}' or '{self.MODE_DEEP_LEARNING}'.") if target_col is None: - raise ValueError( - "`target_col` must be explicitly provided for TemporalTargetShifter." - ) + raise ValueError("`target_col` must be explicitly provided for TemporalTargetShifter.") if n_lags <= 0: raise ValueError("`n_lags` must be greater than 0.") @@ -156,17 +156,13 @@ def __init__( self.backend: Optional[str] = None # Backend will be set during fit if self.mode == self.MODE_DEEP_LEARNING and not self.sequence_length: - raise ValueError( - "`sequence_length` must be provided when mode is 'deep_learning'." - ) + raise ValueError("`sequence_length` must be provided when mode is 'deep_learning'.") - def _infer_backend( - self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - ) -> str: + def _infer_backend(self, df: SupportedBackendDataFrame) -> str: """Infer the backend from the DataFrame type. :param df: The input DataFrame. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :return: The inferred backend ('pl', 'pd', or 'mpd'). :raises ValueError: If the DataFrame type is unsupported. """ @@ -179,26 +175,22 @@ def _infer_backend( else: raise ValueError(f"Unsupported DataFrame type: {type(df)}") - def _set_backend( - self, df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] - ) -> None: + def _set_backend(self, df: SupportedBackendDataFrame) -> None: """Set or infer the backend based on the DataFrame. :param df: The input DataFrame. - :type df: Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :raises ValueError: If the backend is not supported. """ if self.backend is None: self.backend = self._infer_backend(df) validate_backend(self.backend) - def _validate_data( - self, tf: Optional[Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]] - ) -> None: + def _validate_data(self, tf: SupportedBackendDataFrame) -> None: """Validate the TimeFrame or partitioned data for consistency. :param tf: The `TimeFrame` object or a DataFrame (`pandas`, `modin`, or `polars`) that contains the time series data. - :type tf: Optional[Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]] + :type tf: SupportedBackendDataFrame :raises ValueError: If the data is invalid or empty. 
""" if isinstance(tf, TimeFrame): @@ -207,7 +199,7 @@ def _validate_data( df = tf # Check if the DataFrame is empty based on the backend - if isinstance(df, pd.DataFrame) or isinstance(df, mpd.DataFrame): + if isinstance(df, (pd.DataFrame, mpd.DataFrame)): # Merge the `isinstance` calls for `pd` and `mpd` if df is None or df.empty: raise ValueError("Input DataFrame is empty.") elif isinstance(df, pl.DataFrame): @@ -231,29 +223,19 @@ def _shift_polars(self, df: pl.DataFrame, target_col: str) -> pl.DataFrame: if not isinstance(self.sequence_length, int): raise ValueError("`sequence_length` must be an integer.") shifted_columns = [ - df[target_col].shift(-i).alias(f"{target_col}_shift_{i}") - for i in range(self.sequence_length) + df[target_col].shift(-i).alias(f"{target_col}_shift_{i}") for i in range(self.sequence_length) ] df = df.with_columns(shifted_columns) df = df.with_columns( - pl.concat_list( - [ - pl.col(f"{target_col}_shift_{i}") - for i in range(self.sequence_length) - ] - ).alias(f"{target_col}_sequence") - ) - df = df.drop( - [f"{target_col}_shift_{i}" for i in range(self.sequence_length)] + pl.concat_list([pl.col(f"{target_col}_shift_{i}") for i in range(self.sequence_length)]).alias( + f"{target_col}_sequence" + ) ) + df = df.drop([f"{target_col}_shift_{i}" for i in range(self.sequence_length)]) df = df.drop_nulls() df = df.slice(0, len(df) - self.sequence_length + 1) else: - df = df.with_columns( - df[target_col] - .shift(-self.n_lags) - .alias(f"{target_col}_shift_{self.n_lags}") - ) + df = df.with_columns(df[target_col].shift(-self.n_lags).alias(f"{target_col}_shift_{self.n_lags}")) df = df.drop_nulls() if df.is_empty(): @@ -280,9 +262,7 @@ def _shift_pandas_modin( if self.mode == self.MODE_DEEP_LEARNING: if not isinstance(self.sequence_length, int): raise ValueError("`sequence_length` must be an integer.") - shifted_columns = [ - df[target_col].shift(-i) for i in range(self.sequence_length) - ] + shifted_columns = [df[target_col].shift(-i) for i in range(self.sequence_length)] df[f"{target_col}_sequence"] = list(zip(*shifted_columns)) df = df.dropna() df = df.iloc[: -self.sequence_length + 1] @@ -298,9 +278,7 @@ def _shift_pandas_modin( return df - def _transform_pandas_modin( - self, df: Union[pd.DataFrame, mpd.DataFrame] - ) -> Union[pd.DataFrame, mpd.DataFrame]: + def _transform_pandas_modin(self, df: Union[pd.DataFrame, mpd.DataFrame]) -> Union[pd.DataFrame, mpd.DataFrame]: """Handle shifting for Pandas or Modin backends. :param df: The input DataFrame (Pandas or Modin). @@ -360,13 +338,9 @@ def _print_dropped_rows(self, rows_before: int, rows_after: int) -> None: """ if self.verbose: rows_dropped = rows_before - rows_after - print( - f"Rows before shift: {rows_before}; Rows after shift: {rows_after}; Rows dropped: {rows_dropped}" - ) + print(f"Rows before shift: {rows_before}; Rows after shift: {rows_after}; Rows dropped: {rows_dropped}") - def fit( - self, tf: Optional[Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]] - ) -> "TemporalTargetShifter": + def fit(self, tf: SupportedBackendDataFrame) -> "TemporalTargetShifter": """Validate and prepare the target data for transformation based on the specified backend. The `fit` method initializes the backend and validates the input data, ensuring the target column is consistent with the input data. @@ -374,7 +348,7 @@ def fit( :param tf: The `TimeFrame` object, or a DataFrame (`pandas`, `modin`, or `polars`) that contains the time series data. 
The DataFrame must have a target column defined or the `target_col` attribute set during initialization. - :type tf: Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame], optional + :type tf: SupportedBackendDataFrame, optional :raises ValueError: If the target column is not provided, the data is invalid, or the backend is unsupported. :raises Warning: If the target column provided in `TemporalTargetShifter` differs from the one in the `TimeFrame`. :return: The fitted `TemporalTargetShifter` instance, ready for transforming the data. @@ -386,7 +360,6 @@ def fit( shifter = TemporalTargetShifter(n_lags=2, target_col="target") shifter.fit(time_frame) - """ self._validate_data(tf) @@ -409,9 +382,7 @@ def fit( return self - def transform( - self, tf: Optional[Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]] - ) -> Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]: + def transform(self, tf: SupportedBackendDataFrame) -> SupportedBackendDataFrame: """Transform the input time series data by shifting the target variable according to the specified number of lags. The `transform` method shifts the target variable in the input data according to the `n_lags` or `sequence_length` set during initialization. @@ -419,12 +390,12 @@ def transform( :param tf: The `TimeFrame` object or a DataFrame (Pandas, Modin, or Polars) that contains the time series data to be transformed. The data should contain a target column that will be shifted. - :type tf: Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame], optional + :type tf: SupportedBackendDataFrame, optional :raises ValueError: If the input data is invalid, unsupported, or lacks columns. :raises ValueError: If the backend is unsupported or data validation fails. :return: A transformed DataFrame or `TimeFrame` with the target variable shifted by the specified lags or sequence length. If a `TimeFrame` is provided, the returned object will be a `TimeFrame`. Otherwise, a DataFrame will be returned. - :rtype: Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame] + :rtype: SupportedBackendDataFrame Example Usage: -------------- @@ -432,7 +403,6 @@ def transform( shifter = TemporalTargetShifter(n_lags=2, target_col="target") transformed_data = shifter.transform(time_frame) - """ if isinstance(tf, TimeFrame): tf.sort_data() # Ensure the data is sorted before shifting @@ -474,9 +444,7 @@ def transform( return transformed_df - def fit_transform( - self, tf: Optional[Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]] - ) -> Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame]: + def fit_transform(self, tf: SupportedBackendDataFrame) -> SupportedBackendDataFrame: """Fit and transform the input data in a single step. This method combines the functionality of the `fit` and `transform` methods. It first validates and prepares the input data (fitting), @@ -484,11 +452,11 @@ def fit_transform( :param tf: The `TimeFrame` object or a DataFrame (`pandas`, `modin`, or `polars`) to be transformed. The data should contain a target column that will be shifted according to the `n_lags` or `sequence_length`. - :type tf: Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame], optional + :type tf: SupportedBackendDataFrame, optional :raises ValueError: If the input data is invalid or the backend is unsupported. :raises ValueError: If the target column is not set, or is incompatible with the data. :return: A transformed DataFrame or TimeFrame with the target variable shifted by the specified lags or sequence length. 
- :rtype: Union[TimeFrame, pd.DataFrame, mpd.DataFrame, pl.DataFrame] + :rtype: SupportedBackendDataFrame Example Usage: -------------- @@ -496,7 +464,6 @@ def fit_transform( shifter = TemporalTargetShifter(n_lags=2, target_col="target") shifted_data = shifter.fit_transform(time_frame) - """ self.fit(tf) transformed = self.transform(tf) diff --git a/src/temporalscope/metrics/masv.py b/src/temporalscope/metrics/masv.py index a700152..5656317 100644 --- a/src/temporalscope/metrics/masv.py +++ b/src/temporalscope/metrics/masv.py @@ -14,15 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - """Mean Absolute SHAP (MASV) analysis for temporal feature importance. -This module implements the Mean Absolute SHAP (MASV) analysis, which evaluates -temporal feature importance across different operational phases of a system. +This module implements the Mean Absolute SHAP (MASV) analysis, which evaluates temporal feature importance across +different operational phases of a system. -The MASV metric provides insights into how feature importance varies over time -or across different operational phases, helping to identify key factors -influencing system behavior at different stages. +The MASV metric provides insights into how feature importance varies over time or across different operational phases, +helping to identify key factors influencing system behavior at different stages. """ from collections.abc import Callable @@ -34,9 +32,7 @@ from temporalscope.partition.base import BaseTemporalPartitioner -def calculate_masv( - model: Callable, data: pd.DataFrame, partitioner: BaseTemporalPartitioner -) -> dict[str, list[float]]: +def calculate_masv(model: Callable, data: pd.DataFrame, partitioner: BaseTemporalPartitioner) -> dict[str, list[float]]: r"""Calculate Mean Absolute SHAP Values (MASV). Calculate MASV for temporal feature importance across partitions. @@ -85,9 +81,7 @@ def calculate_masv( # Iterate over each partition for partition_data in partitions.values(): # Extract the training data for the current partition - phase_data = partition_data[ - "train" - ] # Assuming we're calculating MASV on the 'train' partition + phase_data = partition_data["train"] # Assuming we're calculating MASV on the 'train' partition # Calculate SHAP values for the partition data shap_values = explainer(phase_data) diff --git a/src/temporalscope/partition/base_protocol.py b/src/temporalscope/partition/base_protocol.py index 0b69839..9bacefa 100644 --- a/src/temporalscope/partition/base_protocol.py +++ b/src/temporalscope/partition/base_protocol.py @@ -1,12 +1,12 @@ -""" TemporalScope/src/temporalscope/partition/base_protocol.py +"""TemporalScope/src/temporalscope/partition/base_protocol.py. -This module defines the TemporalPartitionerProtocol, a protocol for all +This module defines the TemporalPartitionerProtocol, a protocol for all temporal partitioning methods. Each partitioning method must implement the required methods to comply with this protocol. Core Functionality: ------------------- -1. fit: Must generate the partition indices (row ranges) for the +1. fit: Must generate the partition indices (row ranges) for the partitions ('train', 'test', 'validation', etc.) in a memory-efficient manner. Implementations should leverage lazy-loading techniques to ensure that large datasets are handled efficiently, minimizing memory usage. @@ -15,7 +15,7 @@ maintaining the efficiency gained from lazy-loading in the fit stage. 3. 
check_data: Optional method to perform data validation checks. -Each implementing class must provide its own logic for partitioning the data and +Each implementing class must provide its own logic for partitioning the data and any necessary validation, while adhering to the design principles of lazy-loading and memory efficiency. @@ -32,10 +32,12 @@ limitations under the License. """ -from typing import Protocol, Dict, Tuple, Union, Iterator +from typing import Dict, Iterator, Protocol, Tuple, Union + +import modin.pandas as mpd import pandas as pd import polars as pl -import modin.pandas as mpd + from temporalscope.core.temporal_data_loader import TimeFrame @@ -115,9 +117,7 @@ def transform( self, ) -> Union[ Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]], - Iterator[ - Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]] - ], + Iterator[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]]]], ]: """Return the data for each partition. @@ -161,8 +161,7 @@ def transform( def check_data(self) -> None: """Perform data validation checks. - Implementing classes must provide their own data validation logic, such as ensuring - sample size is sufficient, checking for window overlaps, or validating the - feature count. + Implementing classes must provide their own data validation logic, such as ensuring sample size is sufficient, + checking for window overlaps, or validating the feature count. """ pass diff --git a/src/temporalscope/partition/partition_validators.py b/src/temporalscope/partition/partition_validators.py index 0e6f910..b51d135 100644 --- a/src/temporalscope/partition/partition_validators.py +++ b/src/temporalscope/partition/partition_validators.py @@ -1,4 +1,4 @@ -""" TemporalScope/temporalscope/partition/partition_validators.py +"""TemporalScope/temporalscope/partition/partition_validators.py. This module provides functions to validate dataset partitions against a set of heuristics derived from key literature in the field. @@ -6,7 +6,7 @@ .. seealso:: 1. Shwartz-Ziv, R. and Armon, A., 2022. Tabular data: Deep learning is not all you need. Information Fusion, 81, pp.84-90. 2. Grinsztajn, L., Oyallon, E. and Varoquaux, G., 2022. Why do tree-based models still outperform deep learning on typical tabular data? - 3. Gorishniy, Y., Rubachev, I., Khrulkov, V. and Babenko, A., 2021. Revisiting deep learning models for tabular data. + 3. Gorishniy, Y., Rubachev, I., Khrulkov, V. and Babenko, A., 2021. Revisiting deep learning models for tabular data. TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,18 +21,21 @@ limitations under the License. """ -from typing import Union, TypeVar, Any, Dict, cast import warnings +from typing import Any, Dict, TypeVar, cast + +import modin.pandas as mpd import pandas as pd import polars as pl -import modin.pandas as mpd + from temporalscope.conf import validate_backend +from temporalscope.core.core_utils import SupportedBackendDataFrame PandasLike = TypeVar("PandasLike", pd.DataFrame, mpd.DataFrame) def check_sample_size( - df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], + df: SupportedBackendDataFrame, backend: str = "pl", min_samples: int = 3000, max_samples: int = 50000, @@ -45,7 +48,7 @@ def check_sample_size( warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. 
- :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str :param min_samples: Minimum number of samples required. @@ -82,7 +85,7 @@ def check_sample_size( def check_feature_count( - df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], + df: SupportedBackendDataFrame, backend: str = "pl", min_features: int = 4, max_features: int = 500, @@ -95,7 +98,7 @@ def check_feature_count( `enable_warnings` flag. :param df: The dataset to check. - :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str :param min_features: Minimum number of features required. @@ -132,7 +135,7 @@ def check_feature_count( def check_feature_to_sample_ratio( - df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], + df: SupportedBackendDataFrame, backend: str = "pl", max_ratio: float = 0.1, enable_warnings: bool = False, @@ -143,7 +146,7 @@ def check_feature_to_sample_ratio( which may increase the risk of overfitting. Warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. - :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str :param max_ratio: Maximum allowable feature-to-sample ratio. @@ -172,7 +175,7 @@ def check_feature_to_sample_ratio( def check_categorical_feature_cardinality( - df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], + df: SupportedBackendDataFrame, backend: str = "pl", max_unique_values: int = 20, enable_warnings: bool = False, @@ -184,7 +187,7 @@ def check_categorical_feature_cardinality( Warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. - :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str :param max_unique_values: Maximum number of unique values allowed for categorical features. @@ -213,12 +216,8 @@ def check_categorical_feature_cardinality( elif backend in ["pd", "mpd"]: # Explicitly cast to Pandas/Modin DataFrame - pandas_df = ( - cast(pd.DataFrame, df) if backend == "pd" else cast(mpd.DataFrame, df) - ) - categorical_columns = pandas_df.select_dtypes( - include=["category", "object"] - ).columns + pandas_df = cast(pd.DataFrame, df) if backend == "pd" else cast(mpd.DataFrame, df) + categorical_columns = pandas_df.select_dtypes(include=["category", "object"]).columns for col in categorical_columns: if pandas_df[col].nunique() > max_unique_values: if enable_warnings: @@ -232,7 +231,7 @@ def check_categorical_feature_cardinality( def check_numerical_feature_uniqueness( - df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], + df: SupportedBackendDataFrame, backend: str = "pl", min_unique_values: int = 10, enable_warnings: bool = False, @@ -244,7 +243,7 @@ def check_numerical_feature_uniqueness( Warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. - :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str :param min_unique_values: Minimum number of unique values required for numerical features. 
@@ -287,17 +286,19 @@ def check_numerical_feature_uniqueness( def check_binary_numerical_features( - df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], + df: SupportedBackendDataFrame, backend: str = "pl", enable_warnings: bool = False, ) -> bool: - """Check if any numerical features are binary and suggest converting them to categorical. + """Detect binary numerical features and suggest converting them to categorical. - Binary numerical features (i.e., features with only two unique values) are often better represented as categorical features. - This function detects such features and suggests conversion. Warnings can be triggered depending on the `enable_warnings` flag. + Binary numerical features (i.e., features with only two unique values) are + often better represented as categorical features. This function detects + such features and suggests conversion. Warnings can be triggered depending + on the `enable_warnings` flag. :param df: The dataset to check. - :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str :param enable_warnings: Flag to enable warnings, defaults to False. @@ -305,6 +306,8 @@ def check_binary_numerical_features( :return: True if no binary numerical features are found, otherwise False. :rtype: bool """ + BINARY_UNIQUE_VALUES = 2 # Constant for binary unique values + validate_backend(backend) if backend in ["pd", "mpd"]: @@ -312,11 +315,13 @@ def check_binary_numerical_features( if isinstance(pandas_df, (pd.DataFrame, mpd.DataFrame)): numerical_columns = pandas_df.select_dtypes(include=["number"]).columns for col in numerical_columns: - if pandas_df[col].nunique() == 2: + if pandas_df[col].nunique() == BINARY_UNIQUE_VALUES: if enable_warnings: warnings.warn( - f"Numerical feature '{col}' has only 2 unique values. " - "Binary numerical features should typically be converted to categorical for better model performance and interpretability." + f"Numerical feature '{col}' has only {BINARY_UNIQUE_VALUES} " + "unique values. Binary numerical features should typically " + "be converted to categorical for better model performance and " + "interpretability." ) return False @@ -325,11 +330,13 @@ def check_binary_numerical_features( if isinstance(polars_df, pl.DataFrame): for col in polars_df.columns: if polars_df[col].dtype in [pl.Int32, pl.Int64, pl.Float32, pl.Float64]: - if polars_df[col].n_unique() == 2: + if polars_df[col].n_unique() == BINARY_UNIQUE_VALUES: if enable_warnings: warnings.warn( - f"Numerical feature '{col}' has only 2 unique values. " - "Binary numerical features should typically be converted to categorical for better model performance and interpretability." + f"Numerical feature '{col}' has only {BINARY_UNIQUE_VALUES} " + "unique values. Binary numerical features should typically " + "be converted to categorical for better model performance and " + "interpretability." ) return False @@ -337,26 +344,29 @@ def check_binary_numerical_features( def check_class_balance( - df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], + df: SupportedBackendDataFrame, target_col: str, backend: str = "pl", enable_warnings: bool = False, + imbalance_threshold: float = 1.5, # Default threshold for class imbalance ) -> bool: """Check that classes in a classification dataset are balanced. This function checks the class distribution in the target column of a classification dataset. 
- If the ratio between the largest and smallest classes exceeds 1.5, the dataset is considered imbalanced. + If the ratio between the largest and smallest classes exceeds the defined threshold, the dataset is considered imbalanced. Warnings can be triggered depending on the `enable_warnings` flag. :param df: The dataset to check. - :type df: Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame] + :type df: SupportedBackendDataFrame :param target_col: The column containing the target labels. :type target_col: str :param backend: The backend used for processing ('pd', 'pl', 'mpd'). :type backend: str :param enable_warnings: Flag to enable warnings, defaults to False. :type enable_warnings: bool - :return: True if classes are balanced (ratio <= 1.5), otherwise False. + :param imbalance_threshold: The threshold for determining class imbalance, defaults to 1.5. + :type imbalance_threshold: float + :return: True if classes are balanced (ratio <= threshold), otherwise False. :rtype: bool :raises: ValueError if backend is not supported. """ @@ -366,9 +376,7 @@ def check_class_balance( if backend in ["pd", "mpd"]: # Explicitly cast to Pandas/Modin DataFrame - pandas_df = ( - cast(pd.DataFrame, df) if backend == "pd" else cast(mpd.DataFrame, df) - ) + pandas_df = cast(pd.DataFrame, df) if backend == "pd" else cast(mpd.DataFrame, df) value_counts = pandas_df[target_col].value_counts() class_counts = {k: int(v) for k, v in value_counts.items()} @@ -376,16 +384,14 @@ def check_class_balance( # Explicitly cast to Polars DataFrame polars_df = cast(pl.DataFrame, df) value_counts = polars_df[target_col].value_counts() - class_counts = { - str(row[target_col]): int(row["count"]) for row in value_counts.to_dicts() - } + class_counts = {str(row[target_col]): int(row["count"]) for row in value_counts.to_dicts()} if class_counts: count_values = list(class_counts.values()) max_count = max(count_values) min_count = min(count_values) - if max_count / min_count > 1.5: + if max_count / min_count > imbalance_threshold: if enable_warnings: warnings.warn( "Classes are imbalanced. Consider using techniques like class weighting, SMOTE, or resampling to address class imbalance." diff --git a/src/temporalscope/partition/sliding_window.py b/src/temporalscope/partition/sliding_window.py index 20660a1..00e5fe2 100644 --- a/src/temporalscope/partition/sliding_window.py +++ b/src/temporalscope/partition/sliding_window.py @@ -1,4 +1,4 @@ -""" TemporalScope/temporalscope/partitioning/sliding_window.py +"""TemporalScope/temporalscope/partitioning/sliding_window.py. This module defines the SlidingWindowPartitioner class, a specific implementation of the TemporalPartitionerProtocol for creating contiguous, non-overlapping partitions using a sliding window mechanism. @@ -6,7 +6,7 @@ Core Functionality: ------------------- The SlidingWindowPartitioner divides a dataset into non-overlapping partitions using a fixed window size and -optional stride. The stride determines how far to move between the starting points of consecutive partitions, +optional stride. The stride determines how far to move between the starting points of consecutive partitions, which can introduce gaps between them. Each partition can be further split into train, test, and validation sets. This class utilizes the generator pattern for memory efficiency, yielding partition indices and data slices one at a time. 
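The intended end-to-end flow, in rough form. The constructor keywords window_size and stride below are assumptions inferred from the attributes referenced in the hunks that follow, not confirmed signatures; the train/test percentages must sum to 1.0 within the 1e-6 tolerance enforced by _precompute_percentages:

    import pandas as pd
    from temporalscope.core.temporal_data_loader import TimeFrame
    from temporalscope.partition.sliding_window import SlidingWindowPartitioner

    df = pd.DataFrame({"time": pd.date_range("2023-01-01", periods=100), "target": range(100)})
    tf = TimeFrame(df, time_col="time", target_col="target", backend="pd")

    # Hypothetical argument names; only the train/test/val percentages appear verbatim in this patch.
    partitioner = SlidingWindowPartitioner(tf, window_size=20, stride=20, train_pct=0.7, test_pct=0.3)

    for indices in partitioner.fit():         # lazily yields (start, end) index tuples per partition
        print(indices)
    for slices in partitioner.transform():    # lazily yields the matching DataFrame slices
        for _, parts in slices.items():
            train_df, test_df = parts["train"], parts["test"]

Setting stride equal to window_size keeps the partitions contiguous and non-overlapping, matching the behaviour described in the module docstring above; a larger stride introduces gaps between partitions.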
@@ -28,27 +28,28 @@ """ import itertools -from typing import Dict, Tuple, Optional, Union, Iterator +from typing import Dict, Iterator, Optional, Tuple, Union + +import modin.pandas as mpd import pandas as pd import polars as pl -import modin.pandas as mpd + +from temporalscope.core.core_utils import ( + BACKEND_MODIN, + BACKEND_PANDAS, + BACKEND_POLARS, + SupportedBackendDataFrame, + validate_backend, +) from temporalscope.core.temporal_data_loader import TimeFrame from temporalscope.partition.base_protocol import TemporalPartitionerProtocol from temporalscope.partition.partition_validators import ( - check_sample_size, - check_feature_to_sample_ratio, check_class_balance, -) -from temporalscope.core.core_utils import ( - validate_backend, - BACKEND_POLARS, - BACKEND_PANDAS, - BACKEND_MODIN, - SupportedBackendDataFrame + check_feature_to_sample_ratio, + check_sample_size, ) - class SlidingWindowPartitioner(TemporalPartitionerProtocol): """Sliding Window Partitioner for dividing time series data into contiguous, non-overlapping partitions. @@ -239,9 +240,7 @@ def __init__( self.reverse = reverse self.truncate = truncate self.verbose = verbose - self.train_pct, self.test_pct, self.val_pct = self._precompute_percentages( - train_pct, test_pct, val_pct - ) + self.train_pct, self.test_pct, self.val_pct = self._precompute_percentages(train_pct, test_pct, val_pct) # Sort data by time column using TimeFrame method self.tf.sort_data(ascending=True) @@ -250,7 +249,11 @@ def __init__( self._transform_executed = False def _precompute_percentages( - self, train_pct: float, test_pct: Optional[float], val_pct: Optional[float] + self, + train_pct: float, + test_pct: Optional[float], + val_pct: Optional[float], + precision: float = 1e-6, # Default precision for floating-point comparisons ) -> Tuple[float, float, float]: """Precompute and validate train, test, and validation percentages. @@ -263,6 +266,8 @@ def _precompute_percentages( :type test_pct: Optional[float] :param val_pct: Optional. Percentage of data allocated for validation. :type val_pct: Optional[float] + :param precision: The tolerance level for floating-point imprecision, defaults to 1e-6. + :type precision: float :return: A tuple containing the validated percentages for training, testing, and validation. :rtype: Tuple[float, float, float] :raises ValueError: If the percentages do not sum to 1.0 or are not within the valid range (0 to 1). @@ -294,7 +299,7 @@ def _precompute_percentages( # Ensure they sum to 1.0 total_pct = train_pct + (test_pct or 0) + (val_pct or 0) - if not (abs(total_pct - 1.0) < 1e-6): # Allow for floating-point imprecision + if not (abs(total_pct - 1.0) < precision): # Use the precision parameter here raise ValueError("Train, test, and validation percentages must sum to 1.0.") # Ensure test_pct and val_pct are float types, not None @@ -309,8 +314,8 @@ def _pad_partition( ) -> SupportedBackendDataFrame: """Pad the partition to the required window size by repeating the last row. - This function ensures that the partition is padded to the full window size - by repeating the last row of the partition until the desired window size is achieved. + This function ensures that the partition is padded to the full window size by repeating the last row of the + partition until the desired window size is achieved. :param df: The DataFrame (Pandas, Modin, or Polars) to pad. 
:type df: Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame] @@ -349,9 +354,7 @@ def _pad_partition( pad_row = df.slice(end - 1, 1) if not reverse else df.slice(0, 1) # Repeat the selected row for the required number of times - pad_rows = pl.DataFrame( - [pad_row.to_dict(as_series=False)[0] for _ in range(num_to_pad)] - ) + pad_rows = pl.DataFrame([pad_row.to_dict(as_series=False)[0] for _ in range(num_to_pad)]) # Concatenate the original DataFrame with the padding if reverse: @@ -386,11 +389,7 @@ def _fit_pandas_modin( end = num_rows train_end = start + int(self.train_pct * (end - start)) - test_end = ( - train_end + int(self.test_pct * (end - start)) - if self.test_pct - else train_end - ) + test_end = train_end + int(self.test_pct * (end - start)) if self.test_pct else train_end validation_end = end if self.val_pct else test_end # Yield the partition indices @@ -399,16 +398,12 @@ def _fit_pandas_modin( "full": (start, end), "train": (start, train_end), "test": (train_end, test_end), - "validation": ( - (test_end, validation_end) if self.val_pct else (0, 0) - ), + "validation": ((test_end, validation_end) if self.val_pct else (0, 0)), } } partition_count += 1 - def _fit_polars( - self, df: pl.DataFrame - ) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: + def _fit_polars(self, df: pl.DataFrame) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: """Fit method specific to Polars backend. :param df: Input DataFrame. @@ -431,11 +426,7 @@ def _fit_polars( end = num_rows train_end = start + int(self.train_pct * (end - start)) - test_end = ( - train_end + int(self.test_pct * (end - start)) - if self.test_pct - else train_end - ) + test_end = train_end + int(self.test_pct * (end - start)) if self.test_pct else train_end validation_end = end if self.val_pct else test_end # Yield the partition indices @@ -444,9 +435,7 @@ def _fit_polars( "full": (start, end), "train": (start, train_end), "test": (train_end, test_end), - "validation": ( - (test_end, validation_end) if self.val_pct else (0, 0) - ), + "validation": ((test_end, validation_end) if self.val_pct else (0, 0)), } } partition_count += 1 @@ -456,7 +445,7 @@ def _transform_pandas_modin( ) -> Iterator[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame]]]]: """Transform method for Pandas/Modin backend. - This method transforms the partitioned dataset into slices, yielding the data slices corresponding to + This method transforms the partitioned dataset into slices, yielding the data slices corresponding to the partition indices generated by the `fit` method. It processes each partition and splits it into train, test, and optionally validation sets. @@ -494,15 +483,16 @@ def _transform_pandas_modin( } } - Notes: - ------ + Notes + ----- - Padding is applied when the size of a partition is smaller than the `window_size`, unless truncation is enabled. - Ensure that the input DataFrame is not empty to avoid runtime errors. Performance Considerations: --------------------------- - - For very large datasets, the padding process may increase memory usage. Consider using Modin when handling + - For very large datasets, the padding process may increase memory usage. Consider using Modin when handling large datasets to take advantage of distributed processing. 
+ """ partition_count = 1 @@ -519,11 +509,7 @@ def _transform_pandas_modin( } # If the partition size is smaller than the window size, pad it - if ( - partition_dict["full"][1] - partition_dict["full"][0] - < self.window_size - and not self.truncate - ): + if partition_dict["full"][1] - partition_dict["full"][0] < self.window_size and not self.truncate: partitioned_data[key]["full"] = self._pad_partition( partitioned_data[key]["full"], self.window_size, @@ -534,13 +520,11 @@ def _transform_pandas_modin( partition_count += 1 - def _transform_polars( - self, df: pl.DataFrame - ) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: + def _transform_polars(self, df: pl.DataFrame) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: """Transform method for Polars backend. This method generates partitioned data slices for the Polars backend, yielding the data slices corresponding - to the partition indices generated by the `fit` method. If the size of a partition is smaller than the + to the partition indices generated by the `fit` method. If the size of a partition is smaller than the specified `window_size`, padding is applied unless `truncate` is set to True. :param df: Input Polars DataFrame. @@ -574,15 +558,16 @@ def _transform_polars( } } - Notes: - ------ + Notes + ----- - Padding is applied when the size of a partition is smaller than the `window_size`, unless truncation is enabled. - Polars DataFrames offer better performance with large datasets, especially for complex operations. Performance Considerations: --------------------------- - - For very large datasets, Polars DataFrames are recommended due to their lower memory footprint and faster + - For very large datasets, Polars DataFrames are recommended due to their lower memory footprint and faster performance when compared to Pandas. Use Polars for more efficient partitioning and transformations. + """ partition_count = 1 @@ -601,11 +586,7 @@ def _transform_polars( end = num_rows train_end = start + int(self.train_pct * (end - start)) - test_end = ( - train_end + int(self.test_pct * (end - start)) - if self.test_pct - else train_end - ) + test_end = train_end + int(self.test_pct * (end - start)) if self.test_pct else train_end validation_end = end if self.val_pct else test_end # Yield the partitioned data slices @@ -676,7 +657,6 @@ def fit(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: .. seealso:: - :meth:`transform`: For generating the actual data slices corresponding to these indices. """ - df = self.tf.get_data() # Get the dataset from the TimeFrame # Call backend-specific partitioning method @@ -848,9 +828,7 @@ def check_data(self, partition_index: Optional[int] = None) -> None: max_samples=100000, enable_warnings=True, ) - check_feature_to_sample_ratio( - df_to_check, backend=self.tf.backend, max_ratio=0.2, enable_warnings=True - ) + check_feature_to_sample_ratio(df_to_check, backend=self.tf.backend, max_ratio=0.2, enable_warnings=True) if self.tf.target_col: check_class_balance( df_to_check, diff --git a/test/unit/test_core_temporal_data_loader.py b/test/unit/test_core_temporal_data_loader.py index 1d7f78b..633a9ec 100644 --- a/test/unit/test_core_temporal_data_loader.py +++ b/test/unit/test_core_temporal_data_loader.py @@ -1,4 +1,4 @@ -""" TemporalScope/test/unit/test_core_temporal_data_loader.py +"""TemporalScope/test/unit/test_core_temporal_data_loader.py TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
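The TimeFrame tests that follow exercise construction and mutation across the three backends; stripped of fixtures, the flow they cover is roughly this (a sketch with arbitrary values):

    import numpy as np
    import pandas as pd
    from temporalscope.core.temporal_data_loader import TimeFrame

    df = pd.DataFrame({"time": pd.date_range("2023-01-01", periods=100), "target": np.random.rand(100)})
    tf = TimeFrame(df, time_col="time", target_col="target", backend="pd")

    tf.sort_data(ascending=True)                          # order rows by the time column
    tf.update_data(df.head(50))                           # swap in a new DataFrame
    tf.update_target_col(pd.Series(np.random.rand(50)))   # must match the current row count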
@@ -9,23 +9,24 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ -import pytest +from datetime import date, timedelta +from typing import Dict, List, Union + +import modin.pandas as mpd import numpy as np -import polars as pl import pandas as pd -import modin.pandas as mpd -from temporalscope.core.temporal_data_loader import TimeFrame +import polars as pl +import pytest + from temporalscope.core.core_utils import ( - BACKEND_POLARS, - BACKEND_PANDAS, BACKEND_MODIN, + BACKEND_PANDAS, + BACKEND_POLARS, ) -from typing import Union, Dict, List -from datetime import date, timedelta +from temporalscope.core.temporal_data_loader import TimeFrame -def create_sample_data( - num_samples: int = 100, num_features: int = 3 -) -> Dict[str, Union[List[date], List[float]]]: + +def create_sample_data(num_samples: int = 100, num_features: int = 3) -> Dict[str, Union[List[date], List[float]]]: """Create a sample data dictionary for testing. :param num_samples: Number of samples to generate, defaults to 100 @@ -52,6 +53,7 @@ def create_sample_data( return data + @pytest.fixture(params=[BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) def sample_dataframe(request): """Fixture to create sample DataFrames for each backend. @@ -66,7 +68,7 @@ def sample_dataframe(request): if backend == BACKEND_POLARS: # Ensure 'time' column is properly typed - data['time'] = pl.Series(data['time']) + data["time"] = pl.Series(data["time"]) df = pl.DataFrame(data) elif backend == BACKEND_PANDAS: df = pd.DataFrame(data) @@ -76,6 +78,7 @@ def sample_dataframe(request): raise ValueError(f"Unsupported backend: {backend}") return df, backend + def test_timeframe_initialization(sample_dataframe): """Test the initialization of TimeFrame with various backends. @@ -89,6 +92,7 @@ def test_timeframe_initialization(sample_dataframe): assert tf.target_col == "target" assert len(tf.get_data()) == len(df) + def test_sort_data(sample_dataframe): """Test the sort_data method. @@ -109,6 +113,7 @@ def test_sort_data(sample_dataframe): times = sorted_df[tf.time_col].to_list() if backend == BACKEND_POLARS else sorted_df[tf.time_col].tolist() assert times == sorted(times) + def test_update_data(sample_dataframe): """Test the update_data method. @@ -119,7 +124,7 @@ def test_update_data(sample_dataframe): tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) new_data = create_sample_data(num_samples=50) if backend == BACKEND_POLARS: - new_data['time'] = pl.Series(new_data['time']) + new_data["time"] = pl.Series(new_data["time"]) new_df = pl.DataFrame(new_data) elif backend == BACKEND_PANDAS: new_df = pd.DataFrame(new_data) @@ -128,6 +133,7 @@ def test_update_data(sample_dataframe): tf.update_data(new_df) assert len(tf.get_data()) == 50 + def test_update_target_col(sample_dataframe): """Test the update_target_col method. 
@@ -144,9 +150,12 @@ def test_update_target_col(sample_dataframe): elif backend == BACKEND_MODIN: new_target_col = mpd.Series(new_target) tf.update_target_col(new_target_col) - updated_target = tf.get_data()[tf.target_col].to_numpy() if backend == BACKEND_POLARS else tf.get_data()[tf.target_col].values + updated_target = ( + tf.get_data()[tf.target_col].to_numpy() if backend == BACKEND_POLARS else tf.get_data()[tf.target_col].values + ) np.testing.assert_array_almost_equal(updated_target, new_target) + def test_missing_columns(sample_dataframe): """Test initialization with missing required columns. @@ -163,6 +172,7 @@ def test_missing_columns(sample_dataframe): TimeFrame(df, time_col="time", target_col="target", backend=backend) assert "Missing required columns" in str(excinfo.value) + def test_invalid_backend(sample_dataframe): """Test initialization with an invalid backend. @@ -175,6 +185,7 @@ def test_invalid_backend(sample_dataframe): TimeFrame(df, time_col="time", target_col="target", backend=invalid_backend) assert f"Unsupported backend '{invalid_backend}'" in str(excinfo.value) + def test_invalid_time_col_type(sample_dataframe): """Test initialization with invalid time_col type. @@ -186,6 +197,7 @@ def test_invalid_time_col_type(sample_dataframe): TimeFrame(df, time_col=123, target_col="target", backend=backend) assert "time_col must be a non-empty string." in str(excinfo.value) + def test_invalid_target_col_type(sample_dataframe): """Test initialization with invalid target_col type. @@ -197,12 +209,14 @@ def test_invalid_target_col_type(sample_dataframe): TimeFrame(df, time_col="time", target_col=None, backend=backend) assert "target_col must be a non-empty string." in str(excinfo.value) + def test_invalid_dataframe_type(): """Test initialization with an invalid DataFrame type.""" invalid_df = "This is not a DataFrame" with pytest.raises(TypeError): TimeFrame(invalid_df, time_col="time", target_col="target", backend=BACKEND_POLARS) + def test_sort_data_invalid_backend(): """Test initialization with an unsupported backend.""" data = create_sample_data() @@ -211,25 +225,26 @@ def test_sort_data_invalid_backend(): TimeFrame(df, time_col="time", target_col="target", backend="unsupported_backend") assert "Unsupported backend" in str(excinfo.value) -def test_update_target_col_invalid_length(sample_dataframe): - """Test update_target_col with mismatched length. - :param sample_dataframe: Fixture providing the DataFrame and backend. - :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ +def test_update_target_col_invalid_length(sample_dataframe): + """Test update_target_col with mismatched length.""" df, backend = sample_dataframe tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) - new_target = np.random.rand(len(df) - 1) + new_target = np.random.rand(len(df) - 1) # Mismatch length by 1 if backend == BACKEND_POLARS: new_target_col = pl.Series(new_target) elif backend == BACKEND_PANDAS: new_target_col = pd.Series(new_target) elif backend == BACKEND_MODIN: new_target_col = mpd.Series(new_target) + with pytest.raises(ValueError) as excinfo: tf.update_target_col(new_target_col) + assert "The new target column must have the same number of rows as the DataFrame." in str(excinfo.value) + + def test_update_target_col_invalid_type(sample_dataframe): """Test update_target_col with invalid Series type. 
@@ -243,12 +258,11 @@ def test_update_target_col_invalid_type(sample_dataframe): tf.update_target_col(invalid_series) assert "Expected a" in str(excinfo.value) - -@pytest.mark.parametrize("df_backend,expected_backend", [ - (BACKEND_POLARS, BACKEND_POLARS), - (BACKEND_PANDAS, BACKEND_PANDAS), - (BACKEND_MODIN, BACKEND_MODIN) -]) + +@pytest.mark.parametrize( + "df_backend,expected_backend", + [(BACKEND_POLARS, BACKEND_POLARS), (BACKEND_PANDAS, BACKEND_PANDAS), (BACKEND_MODIN, BACKEND_MODIN)], +) def test_infer_backend(sample_dataframe, df_backend, expected_backend): """Test that the backend is correctly inferred for Polars, Pandas, and Modin DataFrames.""" df, backend = sample_dataframe @@ -257,14 +271,15 @@ def test_infer_backend(sample_dataframe, df_backend, expected_backend): inferred_backend = tf._infer_backend(df) assert inferred_backend == expected_backend + def test_infer_backend_invalid(): """Test that a ValueError is raised for unsupported DataFrame types.""" invalid_df = "This is not a DataFrame" - + # Creating a valid TimeFrame object first to avoid column validation valid_df = pd.DataFrame({"time": [1, 2, 3], "target": [1, 2, 3]}) tf = TimeFrame(valid_df, time_col="time", target_col="target") # Placeholder - + # Now test the _infer_backend method directly on the invalid data with pytest.raises(ValueError) as excinfo: tf._infer_backend(invalid_df) diff --git a/test/unit/test_core_temporal_target_shifter.py b/test/unit/test_core_temporal_target_shifter.py index 45c1a33..e8d3239 100644 --- a/test/unit/test_core_temporal_target_shifter.py +++ b/test/unit/test_core_temporal_target_shifter.py @@ -1,4 +1,4 @@ -""" TemporalScope/test/unit/test_core_temporal_target_shifter.py +"""TemporalScope/test/unit/test_core_temporal_target_shifter.py This file contains unit tests for the TemporalTargetShifter class to ensure it behaves correctly across different backends (pandas, modin, polars), modes of operation (machine_learning, deep_learning), and various configurations. @@ -16,14 +16,16 @@ limitations under the License. 
""" -import pytest -import polars as pl -import pandas as pd import modin.pandas as mpd import numpy as np -from temporalscope.core.temporal_target_shifter import TemporalTargetShifter +import pandas as pd +import polars as pl +import pytest + +from temporalscope.core.core_utils import BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS from temporalscope.core.temporal_data_loader import TimeFrame -from temporalscope.core.core_utils import BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN +from temporalscope.core.temporal_target_shifter import TemporalTargetShifter + # Fixture to generate sample dataframes for different backends @pytest.fixture(params=[BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) @@ -44,16 +46,19 @@ def sample_dataframe(request): df = mpd.DataFrame(data) return df, backend, "target" + # Parametrized Test for Backend Inference, n_lags, and Modes -@pytest.mark.parametrize("n_lags, mode, sequence_length", [ - (1, TemporalTargetShifter.MODE_MACHINE_LEARNING, None), - (3, TemporalTargetShifter.MODE_MACHINE_LEARNING, None), - (1, TemporalTargetShifter.MODE_DEEP_LEARNING, 5) -]) +@pytest.mark.parametrize( + "n_lags, mode, sequence_length", + [ + (1, TemporalTargetShifter.MODE_MACHINE_LEARNING, None), + (3, TemporalTargetShifter.MODE_MACHINE_LEARNING, None), + (1, TemporalTargetShifter.MODE_DEEP_LEARNING, 5), + ], +) @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) # Parametrizing backends as well def test_backend_inference(backend, n_lags, mode, sequence_length): """Test backend inference and shifting functionality across all backends.""" - # Generate data for the current backend data = { "time": pd.date_range(start="2022-01-01", periods=100), @@ -68,7 +73,7 @@ def test_backend_inference(backend, n_lags, mode, sequence_length): df = pd.DataFrame(data) elif backend == BACKEND_MODIN: df = mpd.DataFrame(data) - + # Initialize shifter shifter = TemporalTargetShifter(n_lags=n_lags, mode=mode, sequence_length=sequence_length, target_col="target") @@ -80,26 +85,29 @@ def test_backend_inference(backend, n_lags, mode, sequence_length): transformed = shifter.transform(df) assert transformed is not None + # Parametrized test for invalid data and expected errors across backends -@pytest.mark.parametrize("invalid_data", [ - None, # Null input should raise an error - pd.DataFrame(), # Empty DataFrame should raise an error -]) +@pytest.mark.parametrize( + "invalid_data", + [ + None, # Null input should raise an error + pd.DataFrame(), # Empty DataFrame should raise an error + ], +) @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) def test_invalid_data_handling(backend, invalid_data): """Test invalid data handling for empty or None DataFrames across backends.""" - shifter = TemporalTargetShifter(n_lags=1, target_col="target") with pytest.raises(ValueError): shifter.fit(invalid_data) + # Parametrized test for TimeFrame inputs and transformation across all backends @pytest.mark.parametrize("n_lags", [1, 2]) @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) def test_time_frame_input(backend, n_lags): """Test TimeFrame input handling and transformation across all backends.""" - # Generate data for the current backend data = { "time": pd.date_range(start="2022-01-01", periods=100), @@ -117,18 +125,18 @@ def test_time_frame_input(backend, n_lags): tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) shifter = TemporalTargetShifter(n_lags=n_lags, target_col="target") - + # Test 
fitting and transforming TimeFrame shifter.fit(tf) transformed = shifter.transform(tf) assert transformed is not None + # Parametrized test for deep learning mode with different sequence lengths across all backends @pytest.mark.parametrize("sequence_length", [3, 5]) @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) def test_deep_learning_mode(backend, sequence_length): """Test deep learning mode sequence generation across all backends.""" - # Generate data for the current backend data = { "time": pd.date_range(start="2022-01-01", periods=100), @@ -152,11 +160,11 @@ def test_deep_learning_mode(backend, sequence_length): transformed = shifter.transform(df) assert transformed is not None + # Test verbose mode with stdout capture @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) def test_verbose_mode(backend, capfd): """Test verbose mode output and row dropping information.""" - # Generate data for the current backend data = { "time": pd.date_range(start="2022-01-01", periods=100), @@ -181,12 +189,12 @@ def test_verbose_mode(backend, capfd): captured = capfd.readouterr() assert "Rows before shift" in captured.out + # Parametrized test for fit_transform method for all backends @pytest.mark.parametrize("n_lags", [1, 2]) @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) def test_fit_transform(backend, n_lags): """Test fit_transform() method for all backends.""" - # Generate data for the current backend data = { "time": pd.date_range(start="2022-01-01", periods=100), @@ -206,4 +214,3 @@ def test_fit_transform(backend, n_lags): transformed = shifter.fit_transform(df) assert transformed is not None - diff --git a/test/unit/test_core_utils.py b/test/unit/test_core_utils.py index a538ee1..0a049d2 100644 --- a/test/unit/test_core_utils.py +++ b/test/unit/test_core_utils.py @@ -1,4 +1,4 @@ -""" TemporalScope/test/unit/test_core_utils.py +"""TemporalScope/test/unit/test_core_utils.py TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,24 +13,26 @@ limitations under the License. """ -from typing import Optional, Union, Tuple +import warnings +from typing import Optional, Tuple, Union +from unittest.mock import patch + +import modin.pandas as mpd import numpy as np import pandas as pd import polars as pl -import modin.pandas as mpd import pytest + from temporalscope.core.core_utils import ( + check_nans, + check_nulls, get_api_keys, get_default_backend_cfg, + print_divider, + validate_and_convert_input, validate_backend, validate_input, - validate_and_convert_input, - check_nulls, - check_nans, - print_divider ) -from unittest.mock import patch -import warnings warnings.filterwarnings("ignore", message=".*defaulting to pandas.*") @@ -38,6 +40,7 @@ MOCK_OPENAI_API_KEY = "mock_openai_key" MOCK_CLAUDE_API_KEY = "mock_claude_key" + # --- Data Generation Functions --- def create_sample_data(num_samples: int = 100, with_nulls=False, with_nans=False): """Create sample data with options for introducing nulls and NaNs.""" @@ -57,6 +60,7 @@ def create_sample_data(num_samples: int = 100, with_nulls=False, with_nans=False return data + # Unified fixture for data with nulls and NaNs @pytest.fixture def sample_df_with_conditions(): @@ -67,10 +71,9 @@ def sample_df_with_conditions(): :return: A function that generates a DataFrame and backend identifier based on the specified conditions. 
:rtype: Callable[[Optional[str], bool, bool], Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]] """ + def _create_sample_df( - backend: Optional[str] = None, - with_nulls: bool = False, - with_nans: bool = False + backend: Optional[str] = None, with_nulls: bool = False, with_nans: bool = False ) -> Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]: """Creates a sample DataFrame for the specified backend with optional nulls and NaNs. @@ -95,10 +98,13 @@ def _create_sample_df( return mpd.DataFrame(data), "mpd" else: raise ValueError(f"Unsupported backend '{backend}'") + return _create_sample_df + # --- Tests --- + def test_get_api_keys(): """Test that get_api_keys retrieves environment variables correctly.""" with patch.dict("os.environ", {"OPENAI_API_KEY": MOCK_OPENAI_API_KEY, "CLAUDE_API_KEY": MOCK_CLAUDE_API_KEY}): @@ -111,6 +117,7 @@ def test_get_api_keys(): assert api_keys["OPENAI_API_KEY"] is None assert api_keys["CLAUDE_API_KEY"] is None + @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) @pytest.mark.parametrize("with_nans", [True, False]) def test_check_nans(backend, sample_df_with_conditions, with_nans): @@ -120,23 +127,27 @@ def test_check_nans(backend, sample_df_with_conditions, with_nans): expected = with_nans # True if NaNs were introduced, else False assert result == expected, f"Expected {expected} but got {result} for backend {backend}" + def test_get_default_backend_cfg(): """Test that the default backend configuration is returned correctly.""" expected_cfg = {"BACKENDS": {"pl": "polars", "pd": "pandas", "mpd": "modin"}} result = get_default_backend_cfg() assert result == expected_cfg + @pytest.mark.parametrize("backend", ["pl", "pd", "mpd"]) def test_validate_backend_supported(backend): """Test that supported backends are validated successfully.""" validate_backend(backend) + @pytest.mark.parametrize("invalid_backend", ["tf", "spark", "unknown"]) def test_validate_backend_unsupported(invalid_backend): """Test that unsupported backends raise a ValueError.""" with pytest.raises(ValueError, match="Unsupported backend"): validate_backend(invalid_backend) + @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) @pytest.mark.parametrize("target_backend", ["pl", "pd", "mpd"]) def test_validate_and_convert_input(sample_df_with_conditions, backend, target_backend): @@ -151,6 +162,7 @@ def test_validate_and_convert_input(sample_df_with_conditions, backend, target_b elif target_backend == "mpd": assert isinstance(result, mpd.DataFrame), f"Expected Modin DataFrame but got {type(result)}" + @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) def test_validate_and_convert_input_invalid_type(backend): """Test that validate_and_convert_input raises TypeError when given an invalid DataFrame type.""" @@ -159,12 +171,14 @@ def test_validate_and_convert_input_invalid_type(backend): with pytest.raises(TypeError, match="Input DataFrame type"): validate_and_convert_input(invalid_df, backend) + def test_print_divider(capsys): """Test the print_divider function outputs the correct string.""" print_divider("-", 50) captured = capsys.readouterr() assert captured.out == "-" * 50 + "\n" + def test_check_nans_invalid_backend(sample_df_with_conditions): """Test that an unsupported backend raises a ValueError in check_nans.""" df, _ = sample_df_with_conditions(with_nans=True) @@ -172,16 +186,20 @@ def test_check_nans_invalid_backend(sample_df_with_conditions): check_nans(df, "invalid_backend") -@pytest.mark.parametrize("backend, expected_type", [ - ("pl", 
pl.DataFrame), - ("pd", pd.DataFrame), - ("mpd", mpd.DataFrame), -]) +@pytest.mark.parametrize( + "backend, expected_type", + [ + ("pl", pl.DataFrame), + ("pd", pd.DataFrame), + ("mpd", mpd.DataFrame), + ], +) def test_validate_input_correct_backend(sample_df_with_conditions, backend, expected_type): """Test that validate_input passes when the DataFrame matches the backend.""" df, _ = sample_df_with_conditions(backend=backend, with_nulls=False) validate_input(df, backend) + @pytest.mark.parametrize("df_backend", ["pd", "pl", "mpd"]) @pytest.mark.parametrize("validate_backend", ["pd", "pl", "mpd"]) def test_validate_input_mismatched_backend(sample_df_with_conditions, df_backend, validate_backend): @@ -196,6 +214,7 @@ def test_validate_input_mismatched_backend(sample_df_with_conditions, df_backend # Should pass when backends match validate_input(df, validate_backend) + @pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) @pytest.mark.parametrize("with_nulls", [True, False]) def test_check_nulls(backend, sample_df_with_conditions, with_nulls): @@ -205,9 +224,10 @@ def test_check_nulls(backend, sample_df_with_conditions, with_nulls): expected = with_nulls # True if nulls were introduced, else False assert result == expected, f"Expected {expected} but got {result} for backend {backend}" + # Test for invalid backend handling def test_check_nulls_invalid_backend(sample_df_with_conditions): """Test that check_nulls raises ValueError when given an unsupported backend.""" df, _ = sample_df_with_conditions(with_nulls=True) with pytest.raises(ValueError, match="Unsupported backend"): - check_nulls(df, "invalid_backend") \ No newline at end of file + check_nulls(df, "invalid_backend") diff --git a/test/unit/test_partion_data_checks.py b/test/unit/test_partion_data_checks.py index 1336ca4..fed87a6 100644 --- a/test/unit/test_partion_data_checks.py +++ b/test/unit/test_partion_data_checks.py @@ -1,4 +1,4 @@ -# """ +# """ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information diff --git a/tutorial_notebooks/introduction/0_load_data_timeframe.ipynb b/tutorial_notebooks/introduction/0_load_data_timeframe.ipynb index 39c8f4d..9028a7b 100644 --- a/tutorial_notebooks/introduction/0_load_data_timeframe.ipynb +++ b/tutorial_notebooks/introduction/0_load_data_timeframe.ipynb @@ -112,8 +112,8 @@ "import polars as pl\n", "from statsmodels.datasets import macrodata\n", "\n", - "from temporalscope.core.temporal_data_loader import TimeFrame as tf\n", "from temporalscope.core.core_utils import print_divider\n", + "from temporalscope.core.temporal_data_loader import TimeFrame as tf\n", "\n", "\n", "def load_macrodata(target_col: str = \"realgdp\"):\n", diff --git a/tutorial_notebooks/introduction/1_load_data_target_shifter.ipynb b/tutorial_notebooks/introduction/1_load_data_target_shifter.ipynb index 740f6cb..d1a2fd3 100644 --- a/tutorial_notebooks/introduction/1_load_data_target_shifter.ipynb +++ b/tutorial_notebooks/introduction/1_load_data_target_shifter.ipynb @@ -317,6 +317,7 @@ "source": [ "import pandas as pd\n", "from statsmodels.datasets import macrodata\n", + "\n", "from temporalscope.core.core_utils import print_divider\n", "\n", "# Constants for modes\n", @@ -434,9 +435,9 @@ ], "source": [ "import modin.pandas as mpd\n", - "from statsmodels.datasets import macrodata\n", + "\n", + "from temporalscope.core.core_utils import BACKEND_MODIN\n", "from temporalscope.core.temporal_data_loader import TimeFrame\n", - "from temporalscope.core.core_utils import print_divider, BACKEND_MODIN\n", "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", "\n", "# Constants for modes\n", @@ -1226,12 +1227,10 @@ ], "source": [ "import modin.pandas as mpd\n", - "import pandas as pd\n", "import polars as pl\n", - "from statsmodels.datasets import macrodata\n", "\n", + "from temporalscope.core.core_utils import BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS, print_divider\n", "from temporalscope.core.temporal_data_loader import TimeFrame as tf\n", - "from temporalscope.core.core_utils import print_divider, BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN\n", "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", "\n", "# Constants for modes\n", From 6e117cef8eecd9f35a615ae388618cbefec7b4d2 Mon Sep 17 00:00:00 2001 From: Philip Ndikum Date: Sat, 21 Sep 2024 02:52:12 +0000 Subject: [PATCH 4/7] ci(license-compliance): fix license compliance workflow to resolve branch issue --- .github/workflows/license-compliance.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/license-compliance.yml b/.github/workflows/license-compliance.yml index d9f4db6..f2b7c51 100644 --- a/.github/workflows/license-compliance.yml +++ b/.github/workflows/license-compliance.yml @@ -17,6 +17,9 @@ jobs: steps: - name: Checkout Code uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch full history to ensure we're on a branch + ref: ${{ github.head_ref }} # Ensure the branch is checked out - name: Fix License Header uses: apache/skywalking-eyes/header@v0.6.0 @@ -31,6 +34,7 @@ jobs: author_name: License Bot author_email: license_bot@github.com message: 'chore: automatic application of license header' + push: true # Ensure the changes are pushed back to the branch check_dependencies: runs-on: ubuntu-latest @@ -40,3 +44,4 @@ jobs: - name: Check Dependencies' License uses: apache/skywalking-eyes/dependency@v0.6.0 + From 84a1f40dc2ff005e27201661e030a1beeb362bc4 Mon Sep 17 
00:00:00 2001 From: Philip Ndikum Date: Sat, 21 Sep 2024 03:04:40 +0000 Subject: [PATCH 5/7] ci(reduced-coverage-thresholds-to-allow-faster-development-without-blocking-ci/cd-due-to-low-coverage.-set-project-threshold-to-50%-and-patch-threshold-to-10%,-making-it-highly-lenient-during-beta.): lower codecov thresholds for beta phase --- codecov.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/codecov.yml b/codecov.yml index 98673ac..110344d 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,9 +1,21 @@ +# codecov.yml + coverage: status: project: default: - target: auto - threshold: 5% + target: auto # Automatically adjusts based on the current project coverage + threshold: 80% # Allow up to an 80% drop in overall coverage during beta phase + # Note: This permits a large drop in overall project coverage during the fast-paced development phase. + patch: default: - informational: true + informational: true # Informational only; won't fail the pipeline + target: 5% # Set a minimal target of 5% coverage for new code patches + threshold: 90% # Allow up to a 90% drop on new code coverage (extremely lenient) + # Note: This ensures that even with low coverage, the pipeline won’t block PRs, but you still get coverage insights. + +parsers: + python: + include: + - "src/temporalscope/**" # Focus coverage checks only on main source files, excluding tests and docs for now From 1f92d21755cd2f72d1ca21fb330acb6ed6f1899d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 21 Sep 2024 03:05:44 +0000 Subject: [PATCH 6/7] chore(pre-commit): autofix run --- .github/workflows/license-compliance.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/license-compliance.yml b/.github/workflows/license-compliance.yml index f2b7c51..9964134 100644 --- a/.github/workflows/license-compliance.yml +++ b/.github/workflows/license-compliance.yml @@ -44,4 +44,3 @@ jobs: - name: Check Dependencies' License uses: apache/skywalking-eyes/dependency@v0.6.0 - From 54e3e7604fe87a1547949d295368fcb8d3a2aae4 Mon Sep 17 00:00:00 2001 From: License Bot Date: Sat, 21 Sep 2024 03:06:13 +0000 Subject: [PATCH 7/7] chore: automatic application of license header --- src/temporalscope/core/core_utils.py | 17 +++++++++++++++++ src/temporalscope/core/temporal_data_loader.py | 17 +++++++++++++++++ .../core/temporal_target_shifter.py | 17 +++++++++++++++++ .../modeling/temporal_model_trainer.py | 17 +++++++++++++++++ src/temporalscope/partition/base_protocol.py | 17 +++++++++++++++++ .../partition/partition_validators.py | 17 +++++++++++++++++ src/temporalscope/partition/sliding_window.py | 17 +++++++++++++++++ test/unit/test_core_temporal_data_loader.py | 17 +++++++++++++++++ test/unit/test_core_temporal_target_shifter.py | 17 +++++++++++++++++ test/unit/test_core_utils.py | 17 +++++++++++++++++ 10 files changed, 170 insertions(+) diff --git a/src/temporalscope/core/core_utils.py b/src/temporalscope/core/core_utils.py index e3f2f0a..0bb6d78 100644 --- a/src/temporalscope/core/core_utils.py +++ b/src/temporalscope/core/core_utils.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """TemporalScope/src/temporalscope/core/core_utils.py. This module provides utility functions that can be used throughout the TemporalScope package. It includes methods for diff --git a/src/temporalscope/core/temporal_data_loader.py b/src/temporalscope/core/temporal_data_loader.py index 785a223..82c2615 100644 --- a/src/temporalscope/core/temporal_data_loader.py +++ b/src/temporalscope/core/temporal_data_loader.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """TemporalScope/src/temporalscope/core/temporal_data_loader.py. This module provides a flexible data loader for time series forecasting, allowing users to define their own diff --git a/src/temporalscope/core/temporal_target_shifter.py b/src/temporalscope/core/temporal_target_shifter.py index ff9b762..bd72925 100644 --- a/src/temporalscope/core/temporal_target_shifter.py +++ b/src/temporalscope/core/temporal_target_shifter.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """TemporalScope/src/temporalscope/core/temporal_target_shifter.py. This module provides a transformer-like class to shift the target variable in time series data, either diff --git a/src/temporalscope/modeling/temporal_model_trainer.py b/src/temporalscope/modeling/temporal_model_trainer.py index 10efea8..31311cc 100644 --- a/src/temporalscope/modeling/temporal_model_trainer.py +++ b/src/temporalscope/modeling/temporal_model_trainer.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + # """Implements the `TemporalModelTrainer` for training on temporally partitioned data. # This module provides functionality to train machine learning models on data diff --git a/src/temporalscope/partition/base_protocol.py b/src/temporalscope/partition/base_protocol.py index 9bacefa..ead79cf 100644 --- a/src/temporalscope/partition/base_protocol.py +++ b/src/temporalscope/partition/base_protocol.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """TemporalScope/src/temporalscope/partition/base_protocol.py. This module defines the TemporalPartitionerProtocol, a protocol for all diff --git a/src/temporalscope/partition/partition_validators.py b/src/temporalscope/partition/partition_validators.py index b51d135..c553a4a 100644 --- a/src/temporalscope/partition/partition_validators.py +++ b/src/temporalscope/partition/partition_validators.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """TemporalScope/temporalscope/partition/partition_validators.py. 
This module provides functions to validate dataset partitions against diff --git a/src/temporalscope/partition/sliding_window.py b/src/temporalscope/partition/sliding_window.py index 00e5fe2..d8af38e 100644 --- a/src/temporalscope/partition/sliding_window.py +++ b/src/temporalscope/partition/sliding_window.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """TemporalScope/temporalscope/partitioning/sliding_window.py. This module defines the SlidingWindowPartitioner class, a specific implementation of the diff --git a/test/unit/test_core_temporal_data_loader.py b/test/unit/test_core_temporal_data_loader.py index 633a9ec..64e67f9 100644 --- a/test/unit/test_core_temporal_data_loader.py +++ b/test/unit/test_core_temporal_data_loader.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """TemporalScope/test/unit/test_core_temporal_data_loader.py TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/test/unit/test_core_temporal_target_shifter.py b/test/unit/test_core_temporal_target_shifter.py index e8d3239..97fb881 100644 --- a/test/unit/test_core_temporal_target_shifter.py +++ b/test/unit/test_core_temporal_target_shifter.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + """TemporalScope/test/unit/test_core_temporal_target_shifter.py This file contains unit tests for the TemporalTargetShifter class to ensure it behaves correctly across different diff --git a/test/unit/test_core_utils.py b/test/unit/test_core_utils.py index 0a049d2..75f6d55 100644 --- a/test/unit/test_core_utils.py +++ b/test/unit/test_core_utils.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """TemporalScope/test/unit/test_core_utils.py TemporalScope is Licensed under the Apache License, Version 2.0 (the "License");