From f45f67e9143d93e201dbcfc0fde4018004bd29e6 Mon Sep 17 00:00:00 2001 From: Philip Ndikum Date: Mon, 23 Sep 2024 15:23:12 +0000 Subject: [PATCH 1/6] Save my work before merging CI/CD changes --- src/temporalscope/core/core_utils.py | 12 +- .../core/temporal_data_loader.py | 18 +- .../core/temporal_target_shifter.py | 214 ++-- src/temporalscope/partition/base_protocol.py | 12 - .../partition/partition_validators.py | 12 - src/temporalscope/partition/sliding_window.py | 174 ++-- test/unit/test_core_temporal_data_loader.py | 10 +- .../unit/test_core_temporal_target_shifter.py | 139 ++- .../introduction/1_target_shifter.ipynb | 922 ++++++++++++++++++ ...r.ipynb => 2_partion_sliding_window.ipynb} | 0 10 files changed, 1209 insertions(+), 304 deletions(-) create mode 100644 tutorial_notebooks/introduction/1_target_shifter.ipynb rename tutorial_notebooks/introduction/{1_load_data_target_shifter.ipynb => 2_partion_sliding_window.ipynb} (100%) diff --git a/src/temporalscope/core/core_utils.py b/src/temporalscope/core/core_utils.py index 0bb6d78..bf485b6 100644 --- a/src/temporalscope/core/core_utils.py +++ b/src/temporalscope/core/core_utils.py @@ -19,16 +19,6 @@ This module provides utility functions that can be used throughout the TemporalScope package. It includes methods for printing dividers, checking for nulls and NaNs, and validating the backend. - -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in -compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. """ import os @@ -46,6 +36,8 @@ BACKEND_POLARS = "pl" BACKEND_PANDAS = "pd" BACKEND_MODIN = "mpd" +MODE_MACHINE_LEARNING = "machine_learning" +MODE_DEEP_LEARNING = "deep_learning" # Mapping of backend keys to their full names or module references BACKENDS = { diff --git a/src/temporalscope/core/temporal_data_loader.py b/src/temporalscope/core/temporal_data_loader.py index 82c2615..99c1212 100644 --- a/src/temporalscope/core/temporal_data_loader.py +++ b/src/temporalscope/core/temporal_data_loader.py @@ -28,18 +28,9 @@ 1. Van Ness, M., Shen, H., Wang, H., Jin, X., Maddix, D.C., & Gopalswamy, K. (2023). Cross-Frequency Time Series Meta-Forecasting. arXiv preprint arXiv:2302.02077. 2. Woo, G., Liu, C., Kumar, A., Xiong, C., Savarese, S., & Sahoo, D. (2024). Unified training of universal time series forecasting transformers. arXiv preprint arXiv:2402.02592. 3. Trirat, P., Shin, Y., Kang, J., Nam, Y., Na, J., Bae, M., Kim, J., Kim, B., & Lee, J.-G. (2024). Universal time-series representation learning: A survey. arXiv preprint arXiv:2401.03717. - -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software distributed under the License is distributed -on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License -for the specific language governing permissions and limitations under the License. 
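+
+Example Usage:
+--------------
+A minimal sketch of the intended entry point (the ``"pd"`` backend key mirrors the
+``BACKEND_PANDAS`` constant in ``core_utils``; treat the exact keyword surface as illustrative):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    from temporalscope.core.temporal_data_loader import TimeFrame
+
+    # Build a small time series frame; TimeFrame validates it and sorts by `time_col`.
+    df = pd.DataFrame({
+        "time": pd.date_range("2022-01-01", periods=10),
+        "target": range(10),
+    })
+    tf = TimeFrame(df, time_col="time", target_col="target", backend="pd")
+    sorted_df = tf.get_data()  # the validated, time-sorted DataFrame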
""" -from typing import Optional, Union +from typing import Optional, Union, TYPE_CHECKING import modin.pandas as mpd import pandas as pd @@ -56,6 +47,12 @@ validate_input, ) +# Use forward reference for TimeFrame +if TYPE_CHECKING: + from temporalscope.core.temporal_data_loader import TimeFrame + +# Define alias with forward reference +TimeFrameCompatibleData = Union['TimeFrame', SupportedBackendDataFrame] # Use string to refer to TimeFrame class TimeFrame: """Central class for the TemporalScope package. @@ -154,7 +151,6 @@ def __init__( # Convert, validate, and set up the DataFrame self.df = self._setup_timeframe(df) - @property def backend(self) -> str: """Return the backend used. diff --git a/src/temporalscope/core/temporal_target_shifter.py b/src/temporalscope/core/temporal_target_shifter.py index bd72925..527854f 100644 --- a/src/temporalscope/core/temporal_target_shifter.py +++ b/src/temporalscope/core/temporal_target_shifter.py @@ -26,18 +26,6 @@ 1. Torres, J.F., Hadjout, D., Sebaa, A., Martínez-Álvarez, F., & Troncoso, A. (2021). Deep learning for time series forecasting: a survey. Big Data, 9(1), 3-21. https://doi.org/10.1089/big.2020.0159 2. Lim, B., & Zohren, S. (2021). Time-series forecasting with deep learning: a survey. Philosophical Transactions of the Royal Society A, 379(2194), 20200209. https://doi.org/10.1098/rsta.2020.0209 3. Tang, Y., Song, Z., Zhu, Y., Yuan, H., Hou, M., Ji, J., Tang, C., & Li, J. (2022). A survey on machine learning models for financial time series forecasting. Neurocomputing, 512, 363-380. https://doi.org/10.1016/j.neucom.2022.09.078 - -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. """ import warnings @@ -51,10 +39,13 @@ BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS, + MODE_MACHINE_LEARNING, + MODE_DEEP_LEARNING, SupportedBackendDataFrame, validate_backend, ) from temporalscope.core.temporal_data_loader import TimeFrame +from temporalscope.core.temporal_data_loader import TimeFrameCompatibleData class TemporalTargetShifter: @@ -132,9 +123,6 @@ class TemporalTargetShifter: """ - MODE_MACHINE_LEARNING = "machine_learning" - MODE_DEEP_LEARNING = "deep_learning" - def __init__( self, n_lags: int = 1, @@ -154,26 +142,41 @@ def __init__( :param drop_target: Whether to drop the original target column after shifting. Default is True. :param verbose: Whether to print detailed information about transformations. :raises ValueError: If the target column is not provided or if an invalid mode is selected. + + Note: + The data_format is set to None during initialization and will be inferred in the fit() method based on + the type of input data (TimeFrame or SupportedBackendDataFrame). 
""" + # Validate the mode (should be machine learning or deep learning) if mode not in [self.MODE_MACHINE_LEARNING, self.MODE_DEEP_LEARNING]: raise ValueError(f"`mode` must be '{self.MODE_MACHINE_LEARNING}' or '{self.MODE_DEEP_LEARNING}'.") + # Ensure the target column is provided if target_col is None: raise ValueError("`target_col` must be explicitly provided for TemporalTargetShifter.") + # Validate n_lags (should be greater than 0) if n_lags <= 0: raise ValueError("`n_lags` must be greater than 0.") + # Handle deep learning mode, ensure sequence length is set + if mode == self.MODE_DEEP_LEARNING and sequence_length is None: + raise ValueError("`sequence_length` must be provided when mode is 'deep_learning'.") + + # Assign instance attributes self.n_lags = n_lags self.mode = mode self.sequence_length = sequence_length self.target_col = target_col self.drop_target = drop_target self.verbose = verbose - self.backend: Optional[str] = None # Backend will be set during fit - if self.mode == self.MODE_DEEP_LEARNING and not self.sequence_length: - raise ValueError("`sequence_length` must be provided when mode is 'deep_learning'.") + # The data format will be inferred later during fit() + self.data_format = None # Data format will be inferred during fit() + + # Print a verbose message if required + if verbose: + print(f"Initialized TemporalTargetShifter with mode={mode}, n_lags={n_lags}, target_col={target_col}") def _infer_backend(self, df: SupportedBackendDataFrame) -> str: """Infer the backend from the DataFrame type. @@ -184,11 +187,11 @@ def _infer_backend(self, df: SupportedBackendDataFrame) -> str: :raises ValueError: If the DataFrame type is unsupported. """ if isinstance(df, pl.DataFrame): - return BACKEND_POLARS + return "pl" elif isinstance(df, pd.DataFrame): - return BACKEND_PANDAS + return "pd" elif isinstance(df, mpd.DataFrame): - return BACKEND_MODIN + return "mpd" else: raise ValueError(f"Unsupported DataFrame type: {type(df)}") @@ -203,24 +206,26 @@ def _set_backend(self, df: SupportedBackendDataFrame) -> None: self.backend = self._infer_backend(df) validate_backend(self.backend) - def _validate_data(self, tf: SupportedBackendDataFrame) -> None: - """Validate the TimeFrame or partitioned data for consistency. + def _validate_data(self, tf: TimeFrameCompatibleData) -> None: + """Validate the TimeFrame or DataFrame input for consistency. + + This method ensures that the input data is valid and non-empty, regardless of whether it is a TimeFrame or a raw DataFrame. - :param tf: The `TimeFrame` object or a DataFrame (`pandas`, `modin`, or `polars`) that contains the time series data. - :type tf: SupportedBackendDataFrame - :raises ValueError: If the data is invalid or empty. + :param tf: The `TimeFrame` object or a raw DataFrame (Pandas, Modin, or Polars) to be validated. + :type tf: TimeFrameCompatibleData + :raises ValueError: If the input data is empty or invalid. 
""" if isinstance(tf, TimeFrame): df = tf.get_data() else: df = tf - # Check if the DataFrame is empty based on the backend - if isinstance(df, (pd.DataFrame, mpd.DataFrame)): # Merge the `isinstance` calls for `pd` and `mpd` + # Check if the DataFrame is empty + if isinstance(df, (pd.DataFrame, mpd.DataFrame)): if df is None or df.empty: raise ValueError("Input DataFrame is empty.") elif isinstance(df, pl.DataFrame): - if df is None or df.is_empty(): + if df.is_empty(): raise ValueError("Input DataFrame is empty.") else: raise ValueError("Unsupported DataFrame type.") @@ -357,16 +362,17 @@ def _print_dropped_rows(self, rows_before: int, rows_after: int) -> None: rows_dropped = rows_before - rows_after print(f"Rows before shift: {rows_before}; Rows after shift: {rows_after}; Rows dropped: {rows_dropped}") - def fit(self, tf: SupportedBackendDataFrame) -> "TemporalTargetShifter": - """Validate and prepare the target data for transformation based on the specified backend. + def fit(self, tf: TimeFrameCompatibleData) -> "TemporalTargetShifter": + """Validate and prepare the target data for transformation based on the inferred data format (backend). - The `fit` method initializes the backend and validates the input data, ensuring the target column is consistent with the input data. - It does not alter the data but sets up the necessary configuration for later transformations. + The `fit` method initializes the data format (whether it's a `TimeFrame` or a raw DataFrame) and validates the input data. + It ensures the target column is consistent with the input data and sets the backend (`data_format`), which will be used + in subsequent transformations. - :param tf: The `TimeFrame` object, or a DataFrame (`pandas`, `modin`, or `polars`) that contains the time series data. - The DataFrame must have a target column defined or the `target_col` attribute set during initialization. - :type tf: SupportedBackendDataFrame, optional - :raises ValueError: If the target column is not provided, the data is invalid, or the backend is unsupported. + :param tf: The `TimeFrame` object or a raw DataFrame (`pandas`, `modin`, or `polars`) that contains the time series data. + The data should contain a target column that will be shifted. + :type tf: TimeFrameCompatibleData + :raises ValueError: If the target column is not provided, the data is invalid, or the backend format is unsupported. :raises Warning: If the target column provided in `TemporalTargetShifter` differs from the one in the `TimeFrame`. :return: The fitted `TemporalTargetShifter` instance, ready for transforming the data. :rtype: TemporalTargetShifter @@ -375,76 +381,129 @@ def fit(self, tf: SupportedBackendDataFrame) -> "TemporalTargetShifter": -------------- .. 
code-block:: python + from temporalscope.core.temporal_target_shifter import TemporalTargetShifter + from temporalscope.core.temporal_data_loader import TimeFrame + import pandas as pd + import numpy as np + + # Create a sample Pandas DataFrame + data = { + 'time': pd.date_range(start='2022-01-01', periods=100), + 'target': np.random.rand(100), + 'feature_1': np.random.rand(100) + } + df = pd.DataFrame(data) + + # Create a TimeFrame object + tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") + + # Create a TemporalTargetShifter instance shifter = TemporalTargetShifter(n_lags=2, target_col="target") - shifter.fit(time_frame) + + # Fit the shifter to the TimeFrame + shifter.fit(tf) """ + # Validate the input data (whether it's TimeFrame or DataFrame) self._validate_data(tf) + # If input is a TimeFrame, set the backend using the @property method and manage the target column if isinstance(tf, TimeFrame): - # Set backend and handle target column for TimeFrame input - self.backend = tf.backend + self.data_format = tf.backend # Using the @property to access the backend if not self.target_col: - self.target_col = tf._target_col + self.target_col = tf._target_col # If target_col not set in the shifter, use TimeFrame's target_col elif self.target_col != tf._target_col: warnings.warn( - f"The `target_col` in TemporalTargetShifter ('{self.target_col}') " - f"differs from the TimeFrame's target_col ('{tf._target_col}').", + f"The `target_col` in TemporalTargetShifter ('{self.target_col}') differs from the TimeFrame's " + f"target_col ('{tf._target_col}').", UserWarning, ) + # If input is a raw DataFrame (pandas, modin, or polars), infer the backend elif tf is not None: - # Infer backend for non-TimeFrame input - self.backend = self._infer_backend(tf) + self.data_format = self._infer_backend(tf) else: raise ValueError("Input data is None.") + # Return the instance after fitting return self - def transform(self, tf: SupportedBackendDataFrame) -> SupportedBackendDataFrame: + def transform(self, tf: TimeFrameCompatibleData) -> SupportedBackendDataFrame: """Transform the input time series data by shifting the target variable according to the specified number of lags. - The `transform` method shifts the target variable in the input data according to the `n_lags` or `sequence_length` set during initialization. - This method works directly on either a `TimeFrame` or a raw DataFrame (Pandas, Modin, or Polars), applying the appropriate backend-specific transformation. + The `transform` method shifts the target variable in the input data according to the `n_lags` or `sequence_length` + set during initialization. This method works directly on either a `TimeFrame` or a raw DataFrame (Pandas, Modin, + or Polars), applying the appropriate backend-specific transformation. - :param tf: The `TimeFrame` object or a DataFrame (Pandas, Modin, or Polars) that contains the time series data to be transformed. - The data should contain a target column that will be shifted. - :type tf: SupportedBackendDataFrame, optional + :param tf: The `TimeFrame` object or a DataFrame (Pandas, Modin, or Polars) that contains the time series data + to be transformed. The data should contain a target column that will be shifted. + :type tf: TimeFrameCompatibleData :raises ValueError: If the input data is invalid, unsupported, or lacks columns. :raises ValueError: If the backend is unsupported or data validation fails. 
-        :return: A transformed DataFrame or `TimeFrame` with the target variable shifted by the specified lags or sequence length.
-                 If a `TimeFrame` is provided, the returned object will be a `TimeFrame`. Otherwise, a DataFrame will be returned.
+        :return: A transformed DataFrame or `TimeFrame` with the target variable shifted by the specified lags or sequence
+                 length. If a `TimeFrame` is provided, the returned object will be a `TimeFrame`. Otherwise, a DataFrame
+                 will be returned.
         :rtype: SupportedBackendDataFrame

         Example Usage:
         --------------
         .. code-block:: python

+            from temporalscope.core.temporal_target_shifter import TemporalTargetShifter
+            from temporalscope.core.temporal_data_loader import TimeFrame
+            import pandas as pd
+            import numpy as np
+
+            # Create a sample Pandas DataFrame
+            data = {
+                'time': pd.date_range(start='2022-01-01', periods=100),
+                'target': np.random.rand(100),
+                'feature_1': np.random.rand(100)
+            }
+            df = pd.DataFrame(data)
+
+            # Create a TimeFrame object
+            tf = TimeFrame(df, time_col="time", target_col="target", backend="pd")
+
+            # Initialize TemporalTargetShifter
             shifter = TemporalTargetShifter(n_lags=2, target_col="target")
-            transformed_data = shifter.transform(time_frame)
+
+            # Fit the shifter and transform the data
+            shifter.fit(tf)
+            transformed_data = shifter.transform(tf)
         """
+        # Handle TimeFrame input: sort data and retrieve the DataFrame
         if isinstance(tf, TimeFrame):
-            tf.sort_data()  # Ensure the data is sorted before shifting
+            tf.sort_data()  # Ensure data is sorted before shifting
             df = tf.get_data()
+
+            # If target_col isn't set in the shifter, retrieve it from TimeFrame
             if not self.target_col:
                 self.target_col = tf._target_col
-            self.backend = tf.backend
+
+            # Assign the backend from TimeFrame
+            self.data_format = tf.backend
+
+        # Handle raw DataFrame input
         elif tf is not None:
             df = tf
+
+            # Infer the target column from the input if not already set
             if not self.target_col:
                 if hasattr(df, "columns"):
                     self.target_col = df.columns[-1]
                 else:
                     raise ValueError("The input DataFrame does not have columns.")
+
+            # Set or infer the backend for the DataFrame
             self._set_backend(df)
         else:
             raise ValueError("Input data is None.")

-        # Delegate the transformation to backend-specific methods
-        if self.backend == BACKEND_PANDAS or self.backend == BACKEND_MODIN:
+        # Delegate transformation to backend-specific methods
+        if self.data_format == BACKEND_PANDAS or self.data_format == BACKEND_MODIN:
             transformed_df = self._transform_pandas_modin(df)
-        elif self.backend == BACKEND_POLARS:
+        elif self.data_format == BACKEND_POLARS:
             transformed_df = self._transform_polars(df)
         else:
-            raise ValueError(f"Unsupported backend: {self.backend}")
+            raise ValueError(f"Unsupported backend: {self.data_format}")

         # If the input was a TimeFrame, return a transformed TimeFrame
         if isinstance(tf, TimeFrame):
@@ -456,12 +515,12 @@
-                    if self.mode == self.MODE_MACHINE_LEARNING
+                    if self.mode == MODE_MACHINE_LEARNING
                     else f"{self.target_col}_sequence"
                 ),
-                backend=self.backend,
+                backend=self.data_format,
             )

         return transformed_df

-    def fit_transform(self, tf: SupportedBackendDataFrame) -> SupportedBackendDataFrame:
+    def fit_transform(self, tf: TimeFrameCompatibleData) -> SupportedBackendDataFrame:
         """Fit and transform the input data in a single step.

         This method combines the functionality of the `fit` and `transform` methods.
It first validates and prepares the input data (fitting),
         and then applies the transformation (shifting the target variable).

         :param tf: The `TimeFrame` object or a DataFrame (`pandas`, `modin`, or `polars`) to be transformed.
                    The data should contain a target column that will be shifted according to the `n_lags` or `sequence_length`.
-        :type tf: SupportedBackendDataFrame, optional
+        :type tf: TimeFrameCompatibleData
         :raises ValueError: If the input data is invalid or the backend is unsupported.
         :raises ValueError: If the target column is not set, or is incompatible with the data.
         :return: A transformed DataFrame or TimeFrame with the target variable shifted by the specified lags or sequence length.
         :rtype: SupportedBackendDataFrame
@@ -479,13 +538,34 @@
         --------------
         .. code-block:: python

+            from temporalscope.core.temporal_target_shifter import TemporalTargetShifter
+            from temporalscope.core.temporal_data_loader import TimeFrame
+            import pandas as pd
+            import numpy as np
+
+            # Create a sample Pandas DataFrame
+            data = {
+                'time': pd.date_range(start='2022-01-01', periods=100),
+                'target': np.random.rand(100),
+                'feature_1': np.random.rand(100)
+            }
+            df = pd.DataFrame(data)
+
+            # Create a TimeFrame object
+            tf = TimeFrame(df, time_col="time", target_col="target", backend="pd")
+
+            # Initialize TemporalTargetShifter
             shifter = TemporalTargetShifter(n_lags=2, target_col="target")
-            shifted_data = shifter.fit_transform(time_frame)
+
+            # Fit and transform in a single step
+            shifted_data = shifter.fit_transform(tf)
         """
+        # Fit the data (infers backend and validates input)
         self.fit(tf)
+
+        # Apply the transformation (delegates to backend-specific methods)
         transformed = self.transform(tf)

-        # Return TimeFrame if input was TimeFrame, otherwise return DataFrame
+        # If input was a TimeFrame, return the transformed TimeFrame
         if isinstance(tf, TimeFrame):
             return TimeFrame(
                 transformed,
@@ -495,6 +575,8 @@
-                    if self.mode == self.MODE_MACHINE_LEARNING
+                    if self.mode == MODE_MACHINE_LEARNING
                     else f"{self.target_col}_sequence"
                 ),
-                backend=self.backend,
+                backend=self.data_format,  # Ensure we use the inferred backend from fit()
             )
+
+        # If input was a raw DataFrame, return the transformed DataFrame
         return transformed
diff --git a/src/temporalscope/partition/base_protocol.py b/src/temporalscope/partition/base_protocol.py
index ead79cf..2017f72 100644
--- a/src/temporalscope/partition/base_protocol.py
+++ b/src/temporalscope/partition/base_protocol.py
@@ -35,18 +35,6 @@
 Each implementing class must provide its own logic for partitioning the data and
 any necessary validation, while adhering to the design principles of lazy-loading
 and memory efficiency.
-
-TemporalScope is Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
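+
+Example Usage:
+--------------
+A rough sketch of a conforming implementation. The ``fit``/``transform`` pair mirrors the
+concrete partitioners in this package; treat the exact protocol surface as illustrative:
+
+.. code-block:: python
+
+    class WholeDatasetPartitioner:
+        # Toy partitioner: a single partition covering the entire dataset.
+
+        def __init__(self, tf):
+            self.tf = tf
+
+        def fit(self):
+            num_rows = self.tf.get_data().shape[0]
+            yield {"partition_1": {"full": (0, num_rows)}}
+
+        def transform(self):
+            df = self.tf.get_data()
+            for partition in self.fit():
+                yield {
+                    key: {name: df.iloc[start:end] for name, (start, end) in parts.items()}
+                    for key, parts in partition.items()
+                }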
""" from typing import Dict, Iterator, Protocol, Tuple, Union diff --git a/src/temporalscope/partition/partition_validators.py b/src/temporalscope/partition/partition_validators.py index c553a4a..bc61e86 100644 --- a/src/temporalscope/partition/partition_validators.py +++ b/src/temporalscope/partition/partition_validators.py @@ -24,18 +24,6 @@ 1. Shwartz-Ziv, R. and Armon, A., 2022. Tabular data: Deep learning is not all you need. Information Fusion, 81, pp.84-90. 2. Grinsztajn, L., Oyallon, E. and Varoquaux, G., 2022. Why do tree-based models still outperform deep learning on typical tabular data? 3. Gorishniy, Y., Rubachev, I., Khrulkov, V. and Babenko, A., 2021. Revisiting deep learning models for tabular data. - -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. """ import warnings diff --git a/src/temporalscope/partition/sliding_window.py b/src/temporalscope/partition/sliding_window.py index d8af38e..e124795 100644 --- a/src/temporalscope/partition/sliding_window.py +++ b/src/temporalscope/partition/sliding_window.py @@ -30,18 +30,6 @@ The `SlidingWindowPartitioner` is intended for universal models, which assume flat partitioning across all entities. Users are responsible for preprocessing steps such as deduplication or transforming `time_col` to numerical features. - -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. """ import itertools @@ -229,13 +217,19 @@ def __init__( :param verbose: Enable verbose output. :raises ValueError: If input parameters are invalid. 
""" + # Validate the backend validate_backend(tf.backend) + + # Get the number of rows from the TimeFrame object num_rows = tf.get_data().shape[0] + + # Determine window size if not provided if window_size is None: if num_partitions is None or num_partitions <= 0: raise ValueError("`num_partitions` must be a positive integer.") window_size = num_rows // num_partitions + # Validate the window size and stride if window_size <= 0: raise ValueError("`window_size` must be a positive integer.") if stride <= 0: @@ -251,17 +245,21 @@ def __init__( if train_pct + (test_pct or 0) + (val_pct or 0) != 1.0: raise ValueError("Train, test, and validation percentages must sum to 1.0.") + # Assign attributes self.tf = tf self.window_size = window_size self.stride = stride self.reverse = reverse self.truncate = truncate self.verbose = verbose + + # Precompute percentages self.train_pct, self.test_pct, self.val_pct = self._precompute_percentages(train_pct, test_pct, val_pct) - # Sort data by time column using TimeFrame method + # Sort the data using TimeFrame's sort_data method self.tf.sort_data(ascending=True) + # Initialize internal state self._fit_executed = False self._transform_executed = False @@ -324,7 +322,6 @@ def _precompute_percentages( def _pad_partition( self, - df: Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame], window_size: int, end: int, reverse: bool, @@ -334,8 +331,6 @@ def _pad_partition( This function ensures that the partition is padded to the full window size by repeating the last row of the partition until the desired window size is achieved. - :param df: The DataFrame (Pandas, Modin, or Polars) to pad. - :type df: Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame] :param window_size: The target window size to pad the partition to. :type window_size: int :param end: The index indicating the end of the current partition. @@ -343,52 +338,34 @@ def _pad_partition( :param reverse: If True, the padding is added to the start; otherwise, it's added at the end. :type reverse: bool :return: A DataFrame padded to the specified window size. 
-        :rtype: Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame]
+        :rtype: SupportedBackendDataFrame
         """
         # Calculate how many rows to pad
         num_to_pad = window_size - df.shape[0]
         if num_to_pad <= 0:
             return df  # No need to pad

-        # Handle Pandas or Modin DataFrames
-        if isinstance(df, (pd.DataFrame, mpd.DataFrame)):
-            # Select the row to use for padding
-            pad_row = df.iloc[[end - 1]] if not reverse else df.iloc[[0]]
-
-            # Repeat the selected row for the required number of times
-            pad_rows = pd.concat([pad_row] * num_to_pad, ignore_index=True)
-
-            # Concatenate the original DataFrame with the padding
-            if reverse:
-                return pd.concat([pad_rows, df], ignore_index=True)
-            else:
-                return pd.concat([df, pad_rows], ignore_index=True)
-
-        # Handle Polars DataFrames
-        elif isinstance(df, pl.DataFrame):
-            # Select the row to use for padding
-            pad_row = df.slice(end - 1, 1) if not reverse else df.slice(0, 1)
-
-            # Repeat the selected row for the required number of times
-            pad_rows = pl.DataFrame([pad_row.to_dict(as_series=False)[0] for _ in range(num_to_pad)])
-
-            # Concatenate the original DataFrame with the padding
-            if reverse:
-                return pad_rows.vstack(df)
-            else:
-                return df.vstack(pad_rows)
+        # NOTE: `end` stays in the signature for the existing call sites, but the row to
+        # repeat is taken from the boundary of the passed partition slice itself, which
+        # is correct for every partition regardless of its absolute position.
+        if isinstance(df, pl.DataFrame):
+            pad_row = df.slice(df.height - 1, 1) if not reverse else df.slice(0, 1)
+            pad_rows = pl.concat([pad_row] * num_to_pad)
+            return pl.concat([pad_rows, df]) if reverse else pl.concat([df, pad_rows])
+
+        # Pandas/Modin path: repeat the boundary row and concatenate
+        pad_row = df.iloc[[-1]] if not reverse else df.iloc[[0]]
+        pad_rows = pd.concat([pad_row] * num_to_pad, ignore_index=True)
+        if reverse:
+            return pd.concat([pad_rows, df], ignore_index=True)
+        return pd.concat([df, pad_rows], ignore_index=True)

-        raise TypeError("Unsupported DataFrame type.")

+    def _fit_pandas_modin(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]:
+        """Fit method for partitioning using TimeFrame data.

-    def _fit_pandas_modin(
-        self, df: Union[pd.DataFrame, mpd.DataFrame]
-    ) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]:
-        """Fit method specific to Pandas or Modin backends.
+        This method partitions the dataset retrieved from TimeFrame, irrespective of backend.

-        :param df: Input DataFrame.
-        :return: Iterator yielding partition indices for Pandas/Modin.
+        :return: Iterator yielding partition indices.
         """
+        df = self.tf.get_data()  # Get the DataFrame from TimeFrame
         partition_count = 1
         num_rows = df.shape[0]

@@ -415,20 +392,22 @@
                 "full": (start, end),
                 "train": (start, train_end),
                 "test": (train_end, test_end),
-                "validation": ((test_end, validation_end) if self.val_pct else (0, 0)),
+                "validation": (test_end, validation_end) if self.val_pct else (0, 0),
             }
         }
         partition_count += 1

-    def _fit_polars(self, df: pl.DataFrame) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]:
-        """Fit method specific to Polars backend.
+    def _fit_polars(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]:
+        """Fit method for partitioning using TimeFrame data.

-        :param df: Input DataFrame.
-        :return: Iterator yielding partition indices for Polars.
+        This method partitions the dataset retrieved from TimeFrame, irrespective of backend.
+
+        :return: Iterator yielding partition indices.
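+
+        Example of a single yielded item (illustrative; the actual indices depend on
+        ``window_size``, ``stride``, and the configured split percentages):
+
+        .. code-block:: python
+
+            {
+                "partition_1": {
+                    "full": (0, 5),
+                    "train": (0, 3),
+                    "test": (3, 5),
+                    "validation": (0, 0),
+                }
+            }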
""" + df = self.tf.get_data() # Get the DataFrame from TimeFrame partition_count = 1 - num_rows = df.height + num_rows = df.shape[0] # Use shape[0] to be consistent with other backends like Pandas/Modin start_range = list(range(0, num_rows, self.stride)) if self.reverse: @@ -452,14 +431,12 @@ def _fit_polars(self, df: pl.DataFrame) -> Iterator[Dict[str, Dict[str, Tuple[in "full": (start, end), "train": (start, train_end), "test": (train_end, test_end), - "validation": ((test_end, validation_end) if self.val_pct else (0, 0)), + "validation": (test_end, validation_end) if self.val_pct else (0, 0), } } partition_count += 1 - def _transform_pandas_modin( - self, df: Union[pd.DataFrame, mpd.DataFrame] - ) -> Iterator[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame]]]]: + def _transform_pandas_modin(self) -> Iterator[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame]]]]: """Transform method for Pandas/Modin backend. This method transforms the partitioned dataset into slices, yielding the data slices corresponding to @@ -469,8 +446,6 @@ def _transform_pandas_modin( If a partition's size is smaller than the specified `window_size`, padding is applied to ensure uniform size across partitions, unless `truncate` is set to True. - :param df: Input DataFrame. This can be either Pandas or Modin DataFrame, depending on the backend. - :type df: Union[pd.DataFrame, mpd.DataFrame] :return: Iterator yielding partitioned DataFrame slices for Pandas/Modin backends. :rtype: Iterator[Dict[str, Dict[str, Union[pd.DataFrame, mpd.DataFrame]]]] @@ -482,7 +457,7 @@ def _transform_pandas_modin( tf=data_tf, window_size=5, stride=2, train_pct=0.7, test_pct=0.3 ) - for partition_data in partitioner._transform_pandas_modin(df): + for partition_data in partitioner._transform_pandas_modin(): print(partition_data) Output Format: @@ -509,26 +484,25 @@ def _transform_pandas_modin( --------------------------- - For very large datasets, the padding process may increase memory usage. Consider using Modin when handling large datasets to take advantage of distributed processing. - """ partition_count = 1 + df = self.tf.get_data() # Fetch the data from TimeFrame - for partition in self.fit(): + for partition in self.fit(): # Partition indices generated by fit() partitioned_data = {} - # Ensure partition is a dictionary + # Iterate through the partition and generate train, test, validation sets if isinstance(partition, dict): for key, partition_dict in partition.items(): partitioned_data[key] = { - part_name: df.iloc[start:end] + part_name: df.iloc[start:end] # Slice based on indices for part_name, (start, end) in partition_dict.items() if start is not None and end is not None } - # If the partition size is smaller than the window size, pad it + # Apply padding if partition size is smaller than window_size and truncate is False if partition_dict["full"][1] - partition_dict["full"][0] < self.window_size and not self.truncate: partitioned_data[key]["full"] = self._pad_partition( - partitioned_data[key]["full"], self.window_size, partition_dict["full"][1], self.reverse, @@ -537,15 +511,13 @@ def _transform_pandas_modin( partition_count += 1 - def _transform_polars(self, df: pl.DataFrame) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: + def _transform_polars(self) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: """Transform method for Polars backend. This method generates partitioned data slices for the Polars backend, yielding the data slices corresponding to the partition indices generated by the `fit` method. 
If the size of a partition is smaller than the specified `window_size`, padding is applied unless `truncate` is set to True.

-        :param df: Input Polars DataFrame.
-        :type df: pl.DataFrame
         :return: Iterator yielding partitioned DataFrame slices for Polars backend.
         :rtype: Iterator[Dict[str, Dict[str, pl.DataFrame]]]

         Example Usage:
         --------------
         .. code-block:: python

             partitioner = SlidingWindowPartitioner(
                 tf=data_tf, window_size=5, stride=2, train_pct=0.7, test_pct=0.3
             )

-            for partition_data in partitioner._transform_polars(df):
+            for partition_data in partitioner._transform_polars():
                 print(partition_data)

         Output Format:
@@ -587,47 +559,29 @@
         """
         partition_count = 1
+        df = self.tf.get_data()  # Fetch the data from TimeFrame

-        num_rows = df.height
-        start_range = list(range(0, num_rows, self.stride))
-
-        if self.reverse:
-            start_range.reverse()
-
-        for start in start_range:
-            end = start + self.window_size
-
-            if end > num_rows:
-                if self.truncate:
-                    break
-                end = num_rows
+        for partition in self.fit():  # Partition indices generated by fit()
+            partitioned_data = {}

-            train_end = start + int(self.train_pct * (end - start))
-            test_end = train_end + int(self.test_pct * (end - start)) if self.test_pct else train_end
-            validation_end = end if self.val_pct else test_end
+            # Iterate through the partition and generate train, test, validation sets
+            if isinstance(partition, dict):
+                for key, partition_dict in partition.items():
+                    partitioned_data[key] = {
+                        part_name: df.slice(start, end - start)  # Slice based on indices
+                        for part_name, (start, end) in partition_dict.items()
+                        if start is not None and end is not None
+                    }

-            # Yield the partitioned data slices
-            partitioned_data = {
-                part_name: df.slice(start, end - start)
-                for part_name, (start, end) in {
-                    "full": (start, end),
-                    "train": (start, train_end),
-                    "test": (train_end, test_end),
-                    "validation": (test_end, validation_end),
-                }.items()
-            }
+                    # Apply padding if partition size is smaller than window_size and truncate is False
+                    if partition_dict["full"][1] - partition_dict["full"][0] < self.window_size and not self.truncate:
+                        partitioned_data[key]["full"] = self._pad_partition(
+                            partitioned_data[key]["full"],
+                            self.window_size,
+                            partition_dict["full"][1],
+                            self.reverse,
+                        )
+                    yield partitioned_data

-            # If partition size is smaller than window size, pad it
-            if partitioned_data["full"].height < self.window_size and not self.truncate:
-                partitioned_data["full"] = self._pad_partition(
-                    partitioned_data["full"],
-                    self.window_size,
-                    partitioned_data["full"].height,
-                    self.reverse,
-                )
-
-            # Wrap the partitioned_data in a dictionary to match the expected return type
-            yield {f"partition_{partition_count}": partitioned_data}
             partition_count += 1

     def fit(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]:
diff --git a/test/unit/test_core_temporal_data_loader.py b/test/unit/test_core_temporal_data_loader.py
index 64e67f9..85588a1 100644
--- a/test/unit/test_core_temporal_data_loader.py
+++ b/test/unit/test_core_temporal_data_loader.py
@@ -15,16 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.

-"""TemporalScope/test/unit/test_core_temporal_data_loader.py
+# TemporalScope/test/unit/test_core_temporal_data_loader.py

-TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
- -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -""" from datetime import date, timedelta from typing import Dict, List, Union diff --git a/test/unit/test_core_temporal_target_shifter.py b/test/unit/test_core_temporal_target_shifter.py index 97fb881..d454ff3 100644 --- a/test/unit/test_core_temporal_target_shifter.py +++ b/test/unit/test_core_temporal_target_shifter.py @@ -15,23 +15,7 @@ # specific language governing permissions and limitations # under the License. -"""TemporalScope/test/unit/test_core_temporal_target_shifter.py - -This file contains unit tests for the TemporalTargetShifter class to ensure it behaves correctly across different -backends (pandas, modin, polars), modes of operation (machine_learning, deep_learning), and various configurations. - -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# TemporalScope/test/unit/test_core_temporal_target_shifter.py import modin.pandas as mpd import numpy as np @@ -39,44 +23,50 @@ import polars as pl import pytest -from temporalscope.core.core_utils import BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS +from temporalscope.core.core_utils import ( + BACKEND_MODIN, + BACKEND_PANDAS, + BACKEND_POLARS, + MODE_MACHINE_LEARNING, + MODE_DEEP_LEARNING, +) from temporalscope.core.temporal_data_loader import TimeFrame from temporalscope.core.temporal_target_shifter import TemporalTargetShifter -# Fixture to generate sample dataframes for different backends +# Fixture to generate sample dataframes for different data_formats @pytest.fixture(params=[BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) def sample_dataframe(request): - """Fixture to generate sample dataframes for different backends.""" + """Fixture to generate sample dataframes for different data_formats.""" data = { "time": pd.date_range(start="2022-01-01", periods=100), "target": np.random.rand(100), "feature_1": np.random.rand(100), "feature_2": np.random.rand(100), } - backend = request.param - if backend == BACKEND_POLARS: + data_format = request.param + if data_format == BACKEND_POLARS: df = pl.DataFrame(data) - elif backend == BACKEND_PANDAS: + elif data_format == BACKEND_PANDAS: df = pd.DataFrame(data) - elif backend == BACKEND_MODIN: + elif data_format == BACKEND_MODIN: df = mpd.DataFrame(data) - return df, backend, "target" + return df, data_format, "target" -# Parametrized Test for Backend Inference, n_lags, and Modes +# Parametrized Test for data_format Inference, n_lags, and Modes @pytest.mark.parametrize( "n_lags, mode, sequence_length", [ - (1, TemporalTargetShifter.MODE_MACHINE_LEARNING, None), - (3, TemporalTargetShifter.MODE_MACHINE_LEARNING, None), - (1, TemporalTargetShifter.MODE_DEEP_LEARNING, 5), + (1, MODE_MACHINE_LEARNING, 
None), + (3, MODE_MACHINE_LEARNING, None), + (1, MODE_DEEP_LEARNING, 5), ], ) -@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) # Parametrizing backends as well -def test_backend_inference(backend, n_lags, mode, sequence_length): - """Test backend inference and shifting functionality across all backends.""" - # Generate data for the current backend +@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) # Parametrizing data_formats as well +def test_data_format_inference(data_format, n_lags, mode, sequence_length): + """Test data_format inference and shifting functionality across all data_formats.""" + # Generate data for the current data_format data = { "time": pd.date_range(start="2022-01-01", periods=100), "target": np.random.rand(100), @@ -84,26 +74,26 @@ def test_backend_inference(backend, n_lags, mode, sequence_length): "feature_2": np.random.rand(100), } - if backend == BACKEND_POLARS: + if data_format == BACKEND_POLARS: df = pl.DataFrame(data) - elif backend == BACKEND_PANDAS: + elif data_format == BACKEND_PANDAS: df = pd.DataFrame(data) - elif backend == BACKEND_MODIN: + elif data_format == BACKEND_MODIN: df = mpd.DataFrame(data) # Initialize shifter shifter = TemporalTargetShifter(n_lags=n_lags, mode=mode, sequence_length=sequence_length, target_col="target") - # Test fitting the dataframe and checking the inferred backend + # Test fitting the dataframe and checking the inferred data_format shifter.fit(df) - assert shifter.backend == backend + assert shifter.data_format == data_format # Test transformation (ensure no crashes) transformed = shifter.transform(df) assert transformed is not None -# Parametrized test for invalid data and expected errors across backends +# Parametrized test for invalid data and expected errors across data_formats @pytest.mark.parametrize( "invalid_data", [ @@ -111,21 +101,21 @@ def test_backend_inference(backend, n_lags, mode, sequence_length): pd.DataFrame(), # Empty DataFrame should raise an error ], ) -@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -def test_invalid_data_handling(backend, invalid_data): - """Test invalid data handling for empty or None DataFrames across backends.""" +@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +def test_invalid_data_handling(data_format, invalid_data): + """Test invalid data handling for empty or None DataFrames across data_formats.""" shifter = TemporalTargetShifter(n_lags=1, target_col="target") with pytest.raises(ValueError): shifter.fit(invalid_data) -# Parametrized test for TimeFrame inputs and transformation across all backends +# Parametrized test for TimeFrame inputs and transformation across all data_formats @pytest.mark.parametrize("n_lags", [1, 2]) -@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -def test_time_frame_input(backend, n_lags): - """Test TimeFrame input handling and transformation across all backends.""" - # Generate data for the current backend +@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +def test_time_frame_input(data_format, n_lags): + """Test TimeFrame input handling and transformation across all data_formats.""" + # Generate data for the current data_format data = { "time": pd.date_range(start="2022-01-01", periods=100), "target": np.random.rand(100), @@ -133,14 +123,15 @@ def test_time_frame_input(backend, n_lags): "feature_2": np.random.rand(100), } - if 
backend == BACKEND_POLARS:
+    if data_format == BACKEND_POLARS:
         df = pl.DataFrame(data)
-    elif backend == BACKEND_PANDAS:
+    elif data_format == BACKEND_PANDAS:
         df = pd.DataFrame(data)
-    elif backend == BACKEND_MODIN:
+    elif data_format == BACKEND_MODIN:
         df = mpd.DataFrame(data)

-    tf = TimeFrame(df, time_col="time", target_col="target", backend=backend)
+    # TimeFrame still exposes a `backend` keyword; pass the parametrized data_format value to it
+    tf = TimeFrame(df, time_col="time", target_col="target", backend=data_format)
     shifter = TemporalTargetShifter(n_lags=n_lags, target_col="target")

     # Test fitting and transforming TimeFrame
@@ -149,12 +140,12 @@
     assert transformed is not None


-# Parametrized test for deep learning mode with different sequence lengths across all backends
+# Parametrized test for deep learning mode with different sequence lengths across all data_formats
 @pytest.mark.parametrize("sequence_length", [3, 5])
-@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN])
-def test_deep_learning_mode(backend, sequence_length):
-    """Test deep learning mode sequence generation across all backends."""
-    # Generate data for the current backend
+@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN])
+def test_deep_learning_mode(data_format, sequence_length):
+    """Test deep learning mode sequence generation across all data_formats."""
+    # Generate data for the current data_format
     data = {
         "time": pd.date_range(start="2022-01-01", periods=100),
         "target": np.random.rand(100),
@@ -162,15 +153,15 @@
         "feature_2": np.random.rand(100),
     }

-    if backend == BACKEND_POLARS:
+    if data_format == BACKEND_POLARS:
         df = pl.DataFrame(data)
-    elif backend == BACKEND_PANDAS:
+    elif data_format == BACKEND_PANDAS:
         df = pd.DataFrame(data)
-    elif backend == BACKEND_MODIN:
+    elif data_format == BACKEND_MODIN:
         df = mpd.DataFrame(data)

     shifter = TemporalTargetShifter(
-        n_lags=1, mode=TemporalTargetShifter.MODE_DEEP_LEARNING, sequence_length=sequence_length, target_col="target"
+        n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=sequence_length, target_col="target"
     )

     shifter.fit(df)
@@ -179,10 +170,10 @@


 # Test verbose mode with stdout capture
-@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN])
-def test_verbose_mode(backend, capfd):
+@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN])
+def test_verbose_mode(data_format, capfd):
     """Test verbose mode output and row dropping information."""
-    # Generate data for the current backend
+    # Generate data for the current data_format
     data = {
         "time": pd.date_range(start="2022-01-01", periods=100),
         "target": np.random.rand(100),
@@ -190,11 +181,11 @@
         "feature_2": np.random.rand(100),
     }

-    if backend == BACKEND_POLARS:
+    if data_format == BACKEND_POLARS:
         df = pl.DataFrame(data)
-    elif backend == BACKEND_PANDAS:
+    elif data_format == BACKEND_PANDAS:
         df = pd.DataFrame(data)
-    elif backend == BACKEND_MODIN:
+    elif data_format == BACKEND_MODIN:
         df = mpd.DataFrame(data)

     shifter = TemporalTargetShifter(n_lags=1, target_col="target", verbose=True)
@@ -207,12 +198,12 @@
     assert "Rows before shift" in captured.out


-# Parametrized test for fit_transform method for all backends
+# Parametrized test for fit_transform method for all data_formats
@pytest.mark.parametrize("n_lags", [1, 2]) -@pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -def test_fit_transform(backend, n_lags): - """Test fit_transform() method for all backends.""" - # Generate data for the current backend +@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +def test_fit_transform(data_format, n_lags): + """Test fit_transform() method for all data_formats.""" + # Generate data for the current data_format data = { "time": pd.date_range(start="2022-01-01", periods=100), "target": np.random.rand(100), @@ -220,11 +211,11 @@ def test_fit_transform(backend, n_lags): "feature_2": np.random.rand(100), } - if backend == BACKEND_POLARS: + if data_format == BACKEND_POLARS: df = pl.DataFrame(data) - elif backend == BACKEND_PANDAS: + elif data_format == BACKEND_PANDAS: df = pd.DataFrame(data) - elif backend == BACKEND_MODIN: + elif data_format == BACKEND_MODIN: df = mpd.DataFrame(data) shifter = TemporalTargetShifter(n_lags=n_lags, target_col="target") diff --git a/tutorial_notebooks/introduction/1_target_shifter.ipynb b/tutorial_notebooks/introduction/1_target_shifter.ipynb new file mode 100644 index 0000000..75dafa2 --- /dev/null +++ b/tutorial_notebooks/introduction/1_target_shifter.ipynb @@ -0,0 +1,922 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c8aefe6f-489e-42fe-9cb9-a20426652424", + "metadata": {}, + "source": [ + "# Temporal Scope Tutorial: Utilizing Target Shifter\n", + "\n", + "## Overview\n", + "\n", + "This tutorial demonstrates how to load macroeconomic data and apply the **TemporalTargetShifter** using the **Modin** backend. The tutorial shows how to shift the target variable in **machine learning** and **deep learning** modes for forecasting tasks. The tool supports flexible configurations for different forecasting needs.\n", + "\n", + "### Summary\n", + "\n", + "| **Step** | **Description** |\n", + "|-----------|---------------------------------------------------------------------------------|\n", + "| **1** | **Data Loading**: Load macroeconomic data and create a datetime column (`ds`). |\n", + "| **2** | **Modin Backend Initialization**: Initialize a `TimeFrame` for scalable data processing with Modin. |\n", + "| **3** | **Target Shifting (ML Mode)**: Shift the target variable (`realgdp`) for one-step-ahead forecasting in **machine learning mode**. |\n", + "| **4** | **Target Shifting (DL Mode)**: Shift the target variable for sequence-based forecasting in **deep learning mode**. |\n", + "\n", + "### Key Concepts\n", + "\n", + "- **One-step ahead forecasting**: Shifting the target variable to predict the next time step for machine learning models.\n", + "- **Sequence forecasting**: Generating sequences of target variables for deep learning models.\n", + "- **Modin Backend**: Scalable version of Pandas for large datasets.\n", + "- **TemporalTargetShifter**: A tool to shift target variables for forecasting tasks, supporting both machine learning and deep learning modes.\n", + "\n", + "### Steps\n", + "\n", + "1. **Load the macroeconomic dataset** using the `statsmodels` library.\n", + "2. **Initialize a TimeFrame** for the Modin backend.\n", + "3. **Apply the Target Shifter** in machine learning mode to shift the target variable by one step (for simple one-step-ahead forecasting).\n", + "4. 
**Apply the Target Shifter** in deep learning mode to create sequences for sequence-based forecasting tasks.\n" + ] + }, + { + "cell_type": "markdown", + "id": "b9b71cc0-f882-40b6-933d-d38cbe3a56cd", + "metadata": {}, + "source": [ + "# Part 1: Load Macro-Economic Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4b56016b-7609-4e26-bb0b-5d6e4f864c18", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "Loading the 'macrodata' dataset from statsmodels.\n", + "Using 'realgdp' as the target column for future prediction.\n", + "======================================================================\n" + ] + }, + { + "data": { + "text/html": [ + "
[HTML rendering omitted: the same 203-row x 13-column DataFrame appears as text/plain below.]\n",
" + ], + "text/plain": [ + " ds realgdp realcons realinv realgovt realdpi cpi \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.980 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.150 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.350 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.370 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.540 \n", + ".. ... ... ... ... ... ... ... \n", + "198 2008-07-01 13324.600 9267.7 1990.693 991.551 9838.3 216.889 \n", + "199 2008-10-01 13141.920 9195.3 1857.661 1007.273 9920.4 212.174 \n", + "200 2009-01-01 12925.410 9209.2 1558.494 996.287 9926.4 212.671 \n", + "201 2009-04-01 12901.504 9189.0 1456.678 1023.528 10077.5 214.469 \n", + "202 2009-07-01 12990.341 9256.0 1486.398 1044.088 10040.6 216.385 \n", + "\n", + " m1 tbilrate unemp pop infl realint \n", + "0 139.7 2.82 5.8 177.146 0.00 0.00 \n", + "1 141.7 3.08 5.1 177.830 2.34 0.74 \n", + "2 140.5 3.82 5.3 178.657 2.74 1.09 \n", + "3 140.0 4.33 5.6 179.386 0.27 4.06 \n", + "4 139.6 3.50 5.2 180.007 2.31 1.19 \n", + ".. ... ... ... ... ... ... \n", + "198 1474.7 1.17 6.0 305.270 -3.16 4.33 \n", + "199 1576.5 0.12 6.9 305.952 -8.79 8.91 \n", + "200 1592.8 0.22 8.1 306.547 0.94 -0.71 \n", + "201 1653.6 0.18 9.2 307.226 3.37 -3.19 \n", + "202 1673.9 0.12 9.6 308.013 3.56 -3.44 \n", + "\n", + "[203 rows x 13 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from statsmodels.datasets import macrodata\n", + "\n", + "from temporalscope.core.core_utils import print_divider\n", + "\n", + "# Constants for modes\n", + "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "\n", + "def load_macrodata(target_col: str = \"realgdp\"):\n", + " \"\"\"Preprocess the dataset with a combined column for time target.\n", + " \n", + " :param target_col: The column to be used as the target for prediction.\n", + " :type target_col: str, optional\n", + " :return: Preprocessed DataFrame with target column.\n", + " :rtype: pd.DataFrame\n", + " \"\"\"\n", + " print_divider()\n", + " print(\"Loading the 'macrodata' dataset from statsmodels.\")\n", + " print(f\"Using '{target_col}' as the target column for future prediction.\")\n", + " print_divider()\n", + "\n", + " # Load macrodata dataset\n", + " macro_df = macrodata.load_pandas().data.copy()\n", + "\n", + " # Create 'ds' column combining 'year' and 'quarter'\n", + " macro_df[\"ds\"] = pd.to_datetime(\n", + " macro_df[\"year\"].astype(int).astype(str) + \"-\" + ((macro_df[\"quarter\"] - 1) * 3 + 1).astype(int).astype(str) + \"-01\"\n", + " )\n", + "\n", + " # Drop the 'year' and 'quarter' columns\n", + " macro_df = macro_df.drop(columns=[\"year\", \"quarter\"])\n", + "\n", + " # Reorder columns to put 'ds' (datetime) first\n", + " cols = [\"ds\"] + [col for col in macro_df.columns if col != \"ds\"]\n", + " macro_df = macro_df[cols].copy()\n", + "\n", + " return macro_df, target_col\n", + "\n", + "\n", + "# Load the macrodata dataset and preprocess\n", + "macro_df, target_col = load_macrodata()\n", + "macro_df" + ] + }, + { + "cell_type": "markdown", + "id": "5bddbc46-e8cf-421c-8561-363aeef1143c", + "metadata": {}, + "source": [ + "## Part 2: Shifting for Machine Learning" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "051a47f4-b8dd-46e3-92c1-39b49ee04f51", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"======================================================================\n", + "Loading the 'macrodata' dataset from statsmodels.\n", + "Using 'realgdp' as the target column for future prediction.\n", + "======================================================================\n", + "======================================================================\n", + "Initializing TimeFrame for the Modin backend...\n", + "Original DataFrame:\n", + " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", + "\n", + " tbilrate unemp pop infl realint \n", + "0 2.82 5.8 177.146 0.00 0.00 \n", + "1 3.08 5.1 177.830 2.34 0.74 \n", + "2 3.82 5.3 178.657 2.74 1.09 \n", + "3 4.33 5.6 179.386 0.27 4.06 \n", + "4 3.50 5.2 180.007 2.31 1.19 \n", + "======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n", + "Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows before shift: 202; Rows after shift: 202; Rows dropped: 0\n" + ] + }, + { + "ename": "TypeError", + "evalue": "Input DataFrame type does not match the specified backend 'mpd'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 30\u001b[0m\n\u001b[1;32m 27\u001b[0m shifter \u001b[38;5;241m=\u001b[39m TemporalTargetShifter(n_lags\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, mode\u001b[38;5;241m=\u001b[39mMODE_MACHINE_LEARNING, target_col\u001b[38;5;241m=\u001b[39mtarget_col, verbose\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# Apply the shifter\u001b[39;00m\n\u001b[0;32m---> 30\u001b[0m shifted_df \u001b[38;5;241m=\u001b[39m \u001b[43mshifter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodin_tf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;66;03m# Print the shifted data\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShifted data:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/notebooks/TemporalScope/src/temporalscope/core/temporal_target_shifter.py:478\u001b[0m, in \u001b[0;36mTemporalTargetShifter.fit_transform\u001b[0;34m(self, tf)\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;66;03m# Return TimeFrame if input was TimeFrame, otherwise return DataFrame\u001b[39;00m\n\u001b[1;32m 477\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(tf, TimeFrame):\n\u001b[0;32m--> 478\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mTimeFrame\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 479\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mtransformed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 480\u001b[0m \u001b[43m \u001b[49m\u001b[43mtime_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtime_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 481\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 482\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_col\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m_shift_\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_lags\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 483\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mMODE_MACHINE_LEARNING\u001b[49m\n\u001b[1;32m 484\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_col\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m_sequence\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 485\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[43m \u001b[49m\u001b[43mbackend\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackend\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 487\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m transformed\n", + "File \u001b[0;32m/notebooks/TemporalScope/src/temporalscope/core/temporal_data_loader.py:147\u001b[0m, in \u001b[0;36mTimeFrame.__init__\u001b[0;34m(self, df, time_col, target_col, backend, sort)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sort \u001b[38;5;241m=\u001b[39m sort\n\u001b[1;32m 146\u001b[0m \u001b[38;5;66;03m# Convert, validate, and set up the DataFrame\u001b[39;00m\n\u001b[0;32m--> 147\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_setup_timeframe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/notebooks/TemporalScope/src/temporalscope/core/temporal_data_loader.py:252\u001b[0m, in \u001b[0;36mTimeFrame._setup_timeframe\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 240\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Sets up the TimeFrame object by converting, validating, and preparing data as required.\u001b[39;00m\n\u001b[1;32m 241\u001b[0m \n\u001b[1;32m 242\u001b[0m \u001b[38;5;124;03m:param df: The input DataFrame to be processed.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[38;5;124;03m:raises TypeError: If the 
DataFrame type does not match the backend.\u001b[39;00m\n\u001b[1;32m 250\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 251\u001b[0m \u001b[38;5;66;03m# Convert and validate the input DataFrame\u001b[39;00m\n\u001b[0;32m--> 252\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mvalidate_and_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_backend\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 254\u001b[0m \u001b[38;5;66;03m# Validate the presence of required columns\u001b[39;00m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_columns(df)\n", + "File \u001b[0;32m/notebooks/TemporalScope/src/temporalscope/core/core_utils.py:145\u001b[0m, in \u001b[0;36mvalidate_and_convert_input\u001b[0;34m(df, backend)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(df, dataframe_type):\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m conversion_func(df)\n\u001b[0;32m--> 145\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInput DataFrame type \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(df)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not match the specified backend \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbackend\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mTypeError\u001b[0m: Input DataFrame type does not match the specified backend 'mpd'" + ] + } + ], + "source": [ + "import modin.pandas as mpd\n", + "from temporalscope.core.core_utils import BACKEND_MODIN\n", + "from temporalscope.core.temporal_data_loader import TimeFrame\n", + "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", + "\n", + "# Constants for modes\n", + "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "\n", + "# Step 1: Load the macrodata dataset and preprocess\n", + "macro_df, target_col = load_macrodata()\n", + "\n", + "# Step 2: Initialize Modin TimeFrame for Modin backend\n", + "print_divider()\n", + "print(\"Initializing TimeFrame for the Modin backend...\")\n", + "macro_modin_df = mpd.DataFrame(macro_df)\n", + "modin_tf = TimeFrame(macro_modin_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_MODIN)\n", + "\n", + "# Step 3: Preview the original data\n", + "print(\"Original DataFrame:\")\n", + "print(modin_tf.get_data().head())\n", + "print_divider()\n", + "\n", + "# Step 4: Apply the TemporalTargetShifter in machine learning mode\n", + "print(f\"\\nApplying Target Shifter in {MODE_MACHINE_LEARNING} mode...\")\n", + "\n", + "# Setup the TemporalTargetShifter\n", + "shifter = TemporalTargetShifter(n_lags=1, mode=MODE_MACHINE_LEARNING, target_col=target_col, verbose=True)\n", + "\n", + "# Apply the shifter\n", + "shifted_df = shifter.fit_transform(modin_tf)\n", + "\n", + "# Print the shifted data\n", + "print(\"Shifted data:\")\n", + "print(shifted_df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ff95236-87eb-487e-9a65-fce69340d3f6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "shifted_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f4efe10f-e4ca-4b61-821d-87959557a51e", + "metadata": {}, + "source": [ + "## Part 2: 
Shifting for Deep Learning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c6ef6be-d13b-4576-bdef-fa4afbb687a5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Step 5: Apply the TemporalTargetShifter in deep learning mode\n", + "MODE_DEEP_LEARNING = \"deep_learning\"\n", + "\n", + "print(f\"\\nApplying Target Shifter in {MODE_DEEP_LEARNING} mode...\")\n", + "\n", + "# Setup the TemporalTargetShifter for deep learning mode with a sequence length\n", + "sequence_length = 3 # Length of sequence for deep learning\n", + "shifter_dl = TemporalTargetShifter(n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=sequence_length, verbose=True)\n", + "\n", + "# Apply the shifter\n", + "shifted_dl_df = shifter_dl.fit_transform(modin_tf)\n", + "\n", + "# Print the shifted data with sequences\n", + "print(\"Shifted data for deep learning mode (sequences):\")\n", + "print(shifted_dl_df.head())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "369d0213-0bca-4c05-af9e-42daa260b3fe", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "shifted_dl_df" + ] + }, + { + "cell_type": "markdown", + "id": "b0cbc6e3-a665-45f2-a9aa-60b9057d5540", + "metadata": {}, + "source": [ + "## Part 4: Shifting for all backends" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "170bad23-b236-4837-b042-7218622c4e62", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "Loading the 'macrodata' dataset from the open-license statsmodels package.\n", + "Using 'realgdp' as the target column for future prediction.\n", + "======================================================================\n", + "======================================================================\n", + "Demonstrating Target Shifter for backend: pd\n", + "Preview of the TimeFrame DataFrame:\n", + " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", + "\n", + " tbilrate unemp pop infl realint \n", + "0 2.82 5.8 177.146 0.00 0.00 \n", + "1 3.08 5.1 177.830 2.34 0.74 \n", + "2 3.82 5.3 178.657 2.74 1.09 \n", + "3 4.33 5.6 179.386 0.27 4.06 \n", + "4 3.50 5.2 180.007 2.31 1.19 \n", + "======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode...\n", + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + 
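The two shifter modes demonstrated in this notebook differ only in how the label is materialized: `machine_learning` mode shifts the raw target forward by `n_lags`, while `deep_learning` mode collects a window of `sequence_length` consecutive values. The following is a minimal pandas sketch of both behaviors; it is illustrative only and is not the shifter's actual implementation, which lives in `temporal_target_shifter.py` and handles all three backends.

.. code-block:: python

    import pandas as pd

    s = pd.Series([2710.349, 2778.801, 2775.488, 2785.204, 2847.699], name="realgdp")

    # machine_learning mode: the value n_lags steps ahead becomes the label;
    # trailing rows with no future value are dropped (hence 203 -> 202 rows above).
    n_lags = 1
    ml = pd.DataFrame({"realgdp": s, "realgdp_shift_1": s.shift(-n_lags)}).dropna()

    # deep_learning mode: each row carries a list of sequence_length consecutive
    # values starting at that row, matching the realgdp_sequence lists shown below.
    sequence_length = 3
    windows = [list(s.iloc[i : i + sequence_length]) for i in range(len(s) - sequence_length + 1)]

    print(ml)
    print(windows[:2])  # [[2710.349, 2778.801, 2775.488], [2778.801, 2775.488, 2785.204]]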
"\n", + "Applying Target Shifter in deep_learning mode...\n", + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \\\n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "\n", + " realgdp_sequence \n", + "0 [2710.349, 2778.801, 2775.488] \n", + "1 [2778.801, 2775.488, 2785.204] \n", + "2 [2775.488, 2785.204, 2847.699] \n", + "3 [2785.204, 2847.699, 2834.39] \n", + "4 [2847.699, 2834.39, 2839.022] \n", + "======================================================================\n", + "Demonstrating Target Shifter for backend: pl\n", + "Preview of the TimeFrame DataFrame:\n", + "shape: (5, 13)\n", + "┌─────────────────────┬──────────┬──────────┬─────────┬───┬───────┬─────────┬──────┬─────────┐\n", + "│ ds ┆ realgdp ┆ realcons ┆ realinv ┆ … ┆ unemp ┆ pop ┆ infl ┆ realint │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═════════════════════╪══════════╪══════════╪═════════╪═══╪═══════╪═════════╪══════╪═════════╡\n", + "│ 1959-01-01 00:00:00 ┆ 2710.349 ┆ 1707.4 ┆ 286.898 ┆ … ┆ 5.8 ┆ 177.146 ┆ 0.0 ┆ 0.0 │\n", + "│ 1959-04-01 00:00:00 ┆ 2778.801 ┆ 1733.7 ┆ 310.859 ┆ … ┆ 5.1 ┆ 177.83 ┆ 2.34 ┆ 0.74 │\n", + "│ 1959-07-01 00:00:00 ┆ 2775.488 ┆ 1751.8 ┆ 289.226 ┆ … ┆ 5.3 ┆ 178.657 ┆ 2.74 ┆ 1.09 │\n", + "│ 1959-10-01 00:00:00 ┆ 2785.204 ┆ 1753.7 ┆ 299.356 ┆ … ┆ 5.6 ┆ 179.386 ┆ 0.27 ┆ 4.06 │\n", + "│ 1960-01-01 00:00:00 ┆ 2847.699 ┆ 1770.5 ┆ 331.722 ┆ … ┆ 5.2 ┆ 180.007 ┆ 2.31 ┆ 1.19 │\n", + "└─────────────────────┴──────────┴──────────┴─────────┴───┴───────┴─────────┴──────┴─────────┘\n", + "======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode...\n", + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + "shape: (5, 13)\n", + "┌──────────────┬──────────┬─────────┬──────────┬───┬─────────┬──────┬─────────┬─────────────────┐\n", + "│ ds ┆ realcons ┆ realinv ┆ realgovt ┆ … ┆ pop ┆ infl ┆ realint ┆ realgdp_shift_1 │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════════╪══════════╪═════════╪══════════╪═══╪═════════╪══════╪═════════╪═════════════════╡\n", + "│ 1959-01-01 ┆ 1707.4 ┆ 286.898 ┆ 470.045 ┆ … ┆ 177.146 ┆ 0.0 ┆ 0.0 ┆ 2778.801 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1959-04-01 ┆ 1733.7 ┆ 310.859 ┆ 481.301 ┆ … ┆ 177.83 ┆ 2.34 ┆ 0.74 ┆ 2775.488 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1959-07-01 ┆ 1751.8 ┆ 289.226 ┆ 491.26 ┆ … ┆ 178.657 ┆ 2.74 ┆ 1.09 ┆ 2785.204 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1959-10-01 ┆ 1753.7 ┆ 299.356 ┆ 484.052 ┆ … ┆ 179.386 ┆ 0.27 ┆ 4.06 ┆ 2847.699 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1960-01-01 ┆ 1770.5 ┆ 331.722 ┆ 462.199 ┆ … ┆ 180.007 ┆ 2.31 ┆ 1.19 ┆ 2834.39 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + 
"└──────────────┴──────────┴─────────┴──────────┴───┴─────────┴──────┴─────────┴─────────────────┘\n", + "\n", + "Applying Target Shifter in deep_learning mode...\n", + "Rows before shift: 203; Rows after shift: 203; Rows dropped: 0\n", + "Shifted data:\n", + "shape: (5, 13)\n", + "┌──────────────┬──────────┬─────────┬──────────┬───┬─────────┬──────┬─────────┬────────────────────┐\n", + "│ ds ┆ realcons ┆ realinv ┆ realgovt ┆ … ┆ pop ┆ infl ┆ realint ┆ realgdp_sequence │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ list[f64] │\n", + "╞══════════════╪══════════╪═════════╪══════════╪═══╪═════════╪══════╪═════════╪════════════════════╡\n", + "│ 1959-01-01 ┆ 1707.4 ┆ 286.898 ┆ 470.045 ┆ … ┆ 177.146 ┆ 0.0 ┆ 0.0 ┆ [2710.349, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2778.801, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2775.488] │\n", + "│ 1959-04-01 ┆ 1733.7 ┆ 310.859 ┆ 481.301 ┆ … ┆ 177.83 ┆ 2.34 ┆ 0.74 ┆ [2778.801, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2775.488, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2785.204] │\n", + "│ 1959-07-01 ┆ 1751.8 ┆ 289.226 ┆ 491.26 ┆ … ┆ 178.657 ┆ 2.74 ┆ 1.09 ┆ [2775.488, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2785.204, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2847.699] │\n", + "│ 1959-10-01 ┆ 1753.7 ┆ 299.356 ┆ 484.052 ┆ … ┆ 179.386 ┆ 0.27 ┆ 4.06 ┆ [2785.204, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2847.699, 2834.39] │\n", + "│ 1960-01-01 ┆ 1770.5 ┆ 331.722 ┆ 462.199 ┆ … ┆ 180.007 ┆ 2.31 ┆ 1.19 ┆ [2847.699, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2834.39, 2839.022] │\n", + "└──────────────┴──────────┴─────────┴──────────┴───┴─────────┴──────┴─────────┴────────────────────┘\n", + "======================================================================\n", + "Demonstrating Target Shifter for backend: mpd\n", + "Preview of the TimeFrame DataFrame:\n", + " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", + "\n", + " tbilrate unemp pop infl realint \n", + "0 2.82 5.8 177.146 0.00 0.00 \n", + "1 3.08 5.1 177.830 2.34 0.74 \n", + "2 3.82 5.3 178.657 2.74 1.09 \n", + "3 4.33 5.6 179.386 0.27 4.06 \n", + "4 3.50 5.2 180.007 2.31 1.19 \n", + "======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 
0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "\n", + "Applying Target Shifter in deep_learning mode...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n", + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", + "Shifted data:\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \\\n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "\n", + " realgdp_sequence \n", + "0 [2710.349, 2778.801, 2775.488] \n", + "1 [2778.801, 2775.488, 2785.204] \n", + "2 [2775.488, 2785.204, 2847.699] \n", + "3 [2785.204, 2847.699, 2834.39] \n", + "4 [2847.699, 2834.39, 2839.022] \n" + ] + } + ], + "source": [ + "import modin.pandas as mpd\n", + "import polars as pl\n", + "\n", + "from temporalscope.core.core_utils import BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS, print_divider\n", + "from temporalscope.core.temporal_data_loader import TimeFrame as tf\n", + "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", + "\n", + "# Constants for modes\n", + "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "MODE_DEEP_LEARNING = \"deep_learning\"\n", + "\n", + "def load_macrodata(target_col: str = \"realgdp\"):\n", + " \"\"\"Preprocess the dataset with a combined column for time & shifted target.\n", + "\n", + " :param target_col: The column to be used as the target for prediction\n", + " :type target_col: str, optional\n", + " :default target_col: 'realgdp'\n", + "\n", + " :return: Preprocessed DataFrame with shifted target\n", + " :rtype: pd.DataFrame\n", + " \"\"\"\n", + " print_divider()\n", + " print(\"Loading the 'macrodata' dataset from the open-license statsmodels package.\")\n", + " print(f\"Using '{target_col}' as the target column for future prediction.\")\n", + " print_divider()\n", + "\n", + " # Load macrodata dataset\n", + " macro_df = macrodata.load_pandas().data.copy()\n", + "\n", + " # Create 'ds' column by combining 'year' and 'quarter'\n", + " macro_df[\"ds\"] = pd.to_datetime(\n", + " macro_df[\"year\"].astype(int).astype(str)\n", + " + \"-\"\n", + " + ((macro_df[\"quarter\"] - 1) * 3 + 1).astype(int).astype(str)\n", + " + \"-01\"\n", + " )\n", + "\n", + " # Drop the 'year' and 'quarter' columns\n", + " macro_df = macro_df.drop(columns=[\"year\", \"quarter\"])\n", + "\n", + " # Reorder columns to place 'ds' first\n", + " cols = [\"ds\"] + [col for col in macro_df.columns if col != \"ds\"]\n", + " macro_df = macro_df[cols].copy()\n", + "\n", + " return macro_df, target_col\n", + "\n", + "\n", + "def init_timeframes_for_backends(macro_df, target_col: str):\n", + " \"\"\"Initialize TimeFrame objects for all backends (Pandas, Polars, Modin) using constants.\n", + "\n", + " :param macro_df: 
Preprocessed macro dataset.\n", + " :type macro_df: pd.DataFrame\n", + " :param target_col: The target column for prediction.\n", + " :type target_col: str\n", + " :return: A dictionary containing TimeFrame objects for Pandas, Polars, and Modin.\n", + " :rtype: dict\n", + " \"\"\"\n", + " timeframes = {}\n", + "\n", + " # Pandas backend\n", + " macro_pandas_df = pd.DataFrame(macro_df)\n", + " timeframes[BACKEND_PANDAS] = tf(\n", + " macro_pandas_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_PANDAS\n", + " )\n", + "\n", + " # Polars backend\n", + " macro_polars_df = pl.DataFrame(macro_df)\n", + " timeframes[BACKEND_POLARS] = tf(\n", + " macro_polars_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_POLARS\n", + " )\n", + "\n", + " # Modin backend\n", + " macro_modin_df = mpd.DataFrame(macro_df)\n", + " timeframes[BACKEND_MODIN] = tf(\n", + " macro_modin_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_MODIN\n", + " )\n", + "\n", + " return timeframes\n", + "\n", + "\n", + "def apply_target_shifter(tf_obj, mode: str):\n", + " \"\"\"Apply the TemporalTargetShifter in the specified mode.\n", + "\n", + " :param tf_obj: TimeFrame object to apply the shifter to.\n", + " :param mode: Mode of operation (machine_learning or deep_learning).\n", + " \"\"\"\n", + " print(f\"\\nApplying Target Shifter in {mode} mode...\")\n", + "\n", + " # Setup the TemporalTargetShifter\n", + " if mode == MODE_MACHINE_LEARNING:\n", + " shifter = TemporalTargetShifter(n_lags=1, mode=MODE_MACHINE_LEARNING, verbose=True)\n", + " elif mode == MODE_DEEP_LEARNING:\n", + " # In deep learning mode, sequence_length must be provided\n", + " shifter = TemporalTargetShifter(n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=3, verbose=True)\n", + " else:\n", + " raise ValueError(f\"Invalid mode: {mode}\")\n", + "\n", + " # Apply the shifter\n", + " shifted_df = shifter.fit_transform(tf_obj)\n", + "\n", + " # Print the result (since it's already a DataFrame, no need for get_data())\n", + " print(\"Shifted data:\")\n", + " print(shifted_df.head())\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " # Load the macrodata dataset and preprocess\n", + " macro_df, target_col = load_macrodata()\n", + "\n", + " # Initialize TimeFrame objects for various backends using constants\n", + " timeframes = init_timeframes_for_backends(macro_df, target_col)\n", + "\n", + " # Apply and demonstrate shifting for all backends\n", + " for backend, tf_obj in timeframes.items():\n", + " print_divider()\n", + " print(f\"Demonstrating Target Shifter for backend: {backend}\")\n", + " print(\"Preview of the TimeFrame DataFrame:\")\n", + " print(tf_obj.get_data().head())\n", + " print_divider()\n", + "\n", + " # Apply target shifting in machine learning mode\n", + " apply_target_shifter(tf_obj, MODE_MACHINE_LEARNING)\n", + "\n", + " # Apply target shifting in deep learning mode\n", + " apply_target_shifter(tf_obj, MODE_DEEP_LEARNING)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7d45fd7-3773-4d8b-ba66-af01b4aa2dd3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TemporalScope", + "language": "python", + "name": "temporalscope-kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + 
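For reference, the backend fan-out in `init_timeframes_for_backends` reduces to converting one pandas frame into each backend's native type before wrapping it in a `TimeFrame`. A hedged, standalone sketch of just the conversion step (`pl.from_pandas` is the canonical pandas-to-Polars route and assumes `pyarrow` is installed; the cell above passes the pandas frame to `pl.DataFrame` directly, which Polars also accepts):

.. code-block:: python

    import modin.pandas as mpd
    import pandas as pd
    import polars as pl

    pdf = pd.DataFrame({
        "ds": pd.date_range("2021-01-01", periods=3, freq="D"),
        "realgdp": [2710.349, 2778.801, 2775.488],
    })

    frames = {
        "pd": pdf,                  # pandas: used as-is
        "pl": pl.from_pandas(pdf),  # pandas -> Polars conversion
        "mpd": mpd.DataFrame(pdf),  # Modin wraps the pandas frame
    }

    for backend, frame in frames.items():
        print(backend, type(frame), frame.shape)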
"nbformat_minor": 5 +} diff --git a/tutorial_notebooks/introduction/1_load_data_target_shifter.ipynb b/tutorial_notebooks/introduction/2_partion_sliding_window.ipynb similarity index 100% rename from tutorial_notebooks/introduction/1_load_data_target_shifter.ipynb rename to tutorial_notebooks/introduction/2_partion_sliding_window.ipynb From 3f44d52918b3fbd82a365fa2a2d1878ff22e5405 Mon Sep 17 00:00:00 2001 From: Philip Ndikum Date: Sat, 28 Sep 2024 22:12:25 +0000 Subject: [PATCH 2/6] feat(refactored-core-design-and-added-new-unit-tests-for-several-modules---refactored-design-of-core_utils,-temporal_data_loader,-and-partition-modules---added-test-cases-for-partition-padding,-exceptions,-and-validators---updated-sliding-window-and-target-shifter-modules---deleted-and-renamed-tutorial-notebooks-for-consistency---added-missing-datasets-and-related-unit-tests-(still-incomplete)): refactoring & design improvements and the addition of more unit tests --- src/temporalscope/core/core_utils.py | 6 +- src/temporalscope/core/exceptions.py | 140 ++ .../core/temporal_data_loader.py | 696 +++++-- .../core/temporal_target_shifter.py | 176 +- src/temporalscope/datasets/datasets.py | 236 +++ src/temporalscope/partition/padding.py | 744 +++++++ .../partition/partition_validators.py | 2 +- src/temporalscope/partition/sliding_window.py | 302 +-- test/unit/test_core_exceptions.py | 63 + test/unit/test_core_temporal_data_loader.py | 453 ++--- .../unit/test_core_temporal_target_shifter.py | 4 +- test/unit/test_datasets.py | 110 + test/unit/test_partion_data_checks.py | 336 ---- test/unit/test_partition_padding.py | 362 ++++ test/unit/test_partition_validators.py | 336 ++++ .../introduction/1_target_shifter.ipynb | 1764 +++++++++++++---- .../2_partion_sliding_window.ipynb | 1386 ------------- .../2_sliding_window_target_shifter.ipynb | 702 +++++++ 18 files changed, 5091 insertions(+), 2727 deletions(-) create mode 100644 src/temporalscope/core/exceptions.py create mode 100644 src/temporalscope/datasets/datasets.py create mode 100644 src/temporalscope/partition/padding.py create mode 100644 test/unit/test_core_exceptions.py create mode 100644 test/unit/test_datasets.py delete mode 100644 test/unit/test_partion_data_checks.py create mode 100644 test/unit/test_partition_padding.py create mode 100644 test/unit/test_partition_validators.py delete mode 100644 tutorial_notebooks/introduction/2_partion_sliding_window.ipynb create mode 100644 tutorial_notebooks/introduction/2_sliding_window_target_shifter.ipynb diff --git a/src/temporalscope/core/core_utils.py b/src/temporalscope/core/core_utils.py index bf485b6..329ac10 100644 --- a/src/temporalscope/core/core_utils.py +++ b/src/temporalscope/core/core_utils.py @@ -140,7 +140,11 @@ def validate_and_convert_input(df: SupportedBackendDataFrame, backend: str) -> S if backend not in backend_conversion_map: raise ValueError(f"Unsupported backend: {backend}") - for dataframe_type, conversion_func in backend_conversion_map[backend].items(): + conversion_map = backend_conversion_map[backend] + if not isinstance(conversion_map, dict): + raise TypeError(f"Conversion map for backend '{backend}' is not a dictionary") + + for dataframe_type, conversion_func in conversion_map.items(): if isinstance(df, dataframe_type): return conversion_func(df) diff --git a/src/temporalscope/core/exceptions.py b/src/temporalscope/core/exceptions.py new file mode 100644 index 0000000..ae4a58b --- /dev/null +++ b/src/temporalscope/core/exceptions.py @@ -0,0 +1,140 @@ +# Licensed to the Apache Software 
Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""TemporalScope/src/temporalscope/core/exceptions.py + +This module defines custom exceptions and warnings used throughout the TemporalScope package, +specifically for handling errors and edge cases in the TimeFrame class. These custom error +types and warnings are designed to provide clear and actionable feedback for developers +when issues are encountered during time-series forecasting workflows. + +Use Cases: +---------- +- **TimeColumnError**: Raised when there are validation issues with the `time_col` such as unsupported types. +- **MixedTypesWarning**: Raised when mixed numeric and timestamp types are detected in `time_col`. +- **MixedTimezonesWarning**: Raised when `time_col` contains a mixture of timezone-aware and naive timestamps. + +Classes: +-------- +- `TimeFrameError`: The base class for all custom exceptions in the TimeFrame module. +- `TimeColumnError`: Raised when the time column has invalid values or types. +- `MixedTypesWarning`: Warning issued when the `time_col` contains mixed numeric and timestamp-like types. +- `MixedTimezonesWarning`: Warning issued when the `time_col` contains a mix of timezone-aware and naive timestamps. + +Example Usage: +-------------- +.. code-block:: python + + from temporalscope.core.exceptions import ( + TimeColumnError, MixedTypesWarning, MixedTimezonesWarning + ) + + def validate_time_column(df): + if df['time'].dtype == object: + raise TimeColumnError("Invalid time column data type.") + elif contains_mixed_types(df['time']): + warnings.warn("Mixed numeric and timestamp types.", MixedTypesWarning) + +""" + + +class TimeFrameError(Exception): + """Base class for exceptions in the TimeFrame module. + + This exception serves as the foundation for all errors related to the + `TimeFrame` class. It should be subclassed to create more specific + exceptions for different error conditions. + """ + + pass + + +class TimeColumnError(TimeFrameError): + """ Exception raised for errors related to the `time_col`. + + This error is raised when the `time_col` in the TimeFrame is either + missing, contains unsupported types (non-numeric or non-timestamp), + or has invalid data like null values. + + Attributes: + message (str): Explanation of the error. + + Example Usage: + -------------- + .. code-block:: python + + if not pd.api.types.is_numeric_dtype(df[time_col]) and \ + not pd.api.types.is_datetime64_any_dtype(df[time_col]): + raise TimeColumnError("`time_col` must be numeric or timestamp-like.") + """ + + pass + + +class MixedTypesWarning(UserWarning): + """Warning raised when mixed numeric and timestamp-like types are detected in `time_col`. 
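Taken together, a caller would treat `TimeColumnError` as fatal and these warning classes as advisory. A small consumption sketch, assuming `TimeFrame` raises the error during construction and emits the warnings via `warnings.warn`, as the loader hunks further below show:

.. code-block:: python

    import warnings

    import pandas as pd

    from temporalscope.core.exceptions import MixedFrequencyWarning, TimeColumnError
    from temporalscope.core.temporal_data_loader import TimeFrame

    df = pd.DataFrame({
        "time": pd.date_range("2021-01-01", periods=5, freq="D"),
        "target": [5.0, 4.0, 3.0, 2.0, 1.0],
    })

    try:
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            tf = TimeFrame(df, time_col="time", target_col="target")  # validates on init
    except TimeColumnError as exc:
        raise SystemExit(f"Unusable time column: {exc}")

    if any(issubclass(w.category, MixedFrequencyWarning) for w in caught):
        print("`time_col` contains mixed timestamp frequencies.")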
+ + This warning is issued when the time column contains both numeric and + timestamp-like types, which could lead to unpredictable behavior in time + series processing workflows. + + Example Usage: + -------------- + .. code-block:: python + + if numeric_mask and timestamp_mask: + warnings.warn("`time_col` contains mixed numeric and timestamp-like types.", MixedTypesWarning) + """ + + pass + + +class MixedTimezonesWarning(UserWarning): + """Warning raised when mixed timezone-aware and naive timestamps are detected in `time_col`. + + This warning is issued when the time column contains a mix of timezone-aware + and timezone-naive timestamps, which could cause errors in models that + require consistent timestamp formats. + + Example Usage: + -------------- + .. code-block:: python + + if df[time_col].dt.tz is not None and df[time_col].dt.tz.hasnans: + warnings.warn("`time_col` contains mixed timezone-aware and naive timestamps.", MixedTimezonesWarning) + """ + + pass + + +class MixedFrequencyWarning(UserWarning): + """Warning raised when mixed timestamp frequencies are detected in `time_col`. + + This warning is issued when the time column contains timestamps of mixed frequencies + (e.g., daily, monthly, and yearly timestamps), which can lead to inconsistent behavior + in time series operations that assume uniform frequency. + + Example Usage: + -------------- + .. code-block:: python + + inferred_freq = pd.infer_freq(time_col.dropna()) + if inferred_freq is None: + warnings.warn("`time_col` contains mixed timestamp frequencies.", MixedFrequencyWarning) + """ + + pass diff --git a/src/temporalscope/core/temporal_data_loader.py b/src/temporalscope/core/temporal_data_loader.py index 99c1212..79650f5 100644 --- a/src/temporalscope/core/temporal_data_loader.py +++ b/src/temporalscope/core/temporal_data_loader.py @@ -20,22 +20,65 @@ This module provides a flexible data loader for time series forecasting, allowing users to define their own preprocessing, loss functions, and explainability workflows. The core assumption is that features are organized in a context window prior to the target column, making the system compatible with SHAP and other explainability methods. -Given the variance in pre-processing techniques, meta-learning & loss-functions TemporalScope explicitly does not +Given the variance in pre-processing techniques, meta-learning & loss-functions, TemporalScope explicitly does not impose constraints on the end-user in the engineering design. +Engineering Design +-------------------- +.. note:: + + TemporalScope is designed with several key assumptions to ensure performance, scalability, and flexibility + across a wide range of time series forecasting and XAI workflows. + + 1. **Preprocessed Data Assumption**: + TemporalScope assumes that the user provides clean, preprocessed data. This includes handling categorical + encoding, missing data imputation, and feature scaling prior to using TemporalScope's partitioning and explainability + methods. Similar assumptions are seen in popular packages such as TensorFlow and GluonTS, which expect the + user to manage data preprocessing outside of the core workflow. + 2. **Time Column Constraints**: + The `time_col` must be either a numeric index or a timestamp. TemporalScope relies on this temporal ordering for + key operations like sliding window partitioning and temporal explainability workflows (e.g., SHAP). 
Packages like + **Facebook Prophet** and **Darts** also require proper temporal ordering as a baseline assumption for modeling time + series data. + 3. **Numeric Features Requirement**: + Aside from the `time_col`, all other features in the dataset must be numeric. This ensures compatibility with machine + learning and deep learning models that require numeric inputs. As seen in frameworks like TensorFlow and + Prophet, users are expected to preprocess categorical features (e.g., one-hot encoding or embeddings) before + applying modeling or partitioning algorithms. + 4. **Modular Design for Explainability**: + TemporalScope assumes a modular, window-based design that is naturally compatible with model-agnostic explainability + methods like SHAP and LIME. Features are expected to be structured in a temporal context for efficient partitioning + and explainability. This mirrors the design of frameworks like Darts, which use similar assumptions for time + series forecasting and explainability workflows. + + By enforcing these constraints, TemporalScope focuses on its core purpose—time series partitioning, explainability, + and scalability—while leaving more general preprocessing tasks to the user. This follows industry standards seen in + popular time series libraries. + .. seealso:: 1. Van Ness, M., Shen, H., Wang, H., Jin, X., Maddix, D.C., & Gopalswamy, K. (2023). Cross-Frequency Time Series Meta-Forecasting. arXiv preprint arXiv:2302.02077. 2. Woo, G., Liu, C., Kumar, A., Xiong, C., Savarese, S., & Sahoo, D. (2024). Unified training of universal time series forecasting transformers. arXiv preprint arXiv:2402.02592. 3. Trirat, P., Shin, Y., Kang, J., Nam, Y., Na, J., Bae, M., Kim, J., Kim, B., & Lee, J.-G. (2024). Universal time-series representation learning: A survey. arXiv preprint arXiv:2401.03717. + 4. Xu, Q., Zhuo, X., Jiang, C. and Liu, Y., 2019. An artificial neural network for mixed frequency data. Expert Systems with Applications, 118, pp.127-139.4 + 5. Filho, L.L., de Oliveira Werneck, R., Castro, M., Ribeiro Mendes Júnior, P., Lustosa, A., Zampieri, M., Linares, O., Moura, R., Morais, E., Amaral, M. and Salavati, S., 2024. A multi-modal approach for mixed-frequency time series forecasting. Neural Computing and Applications, pp.1-25. """ -from typing import Optional, Union, TYPE_CHECKING +import warnings +from typing import Optional, Union +from datetime import datetime, timedelta, date import modin.pandas as mpd import pandas as pd import polars as pl +from temporalscope.core.exceptions import ( + TimeColumnError, + MixedTypesWarning, + MixedTimezonesWarning, + MixedFrequencyWarning, +) + from temporalscope.core.core_utils import ( BACKEND_MODIN, BACKEND_PANDAS, @@ -47,60 +90,56 @@ validate_input, ) -# Use forward reference for TimeFrame -if TYPE_CHECKING: - from temporalscope.core.temporal_data_loader import TimeFrame - # Define alias with forward reference -TimeFrameCompatibleData = Union['TimeFrame', SupportedBackendDataFrame] # Use string to refer to TimeFrame +TimeFrameCompatibleData = Union["TimeFrame", SupportedBackendDataFrame] + class TimeFrame: """Central class for the TemporalScope package. - Designed to manage time series data across various backends such as - Polars, Pandas, and Modin. This class enables modular and flexible workflows for machine learning, deep learning, - and time series explainability (XAI) methods like temporal SHAP. 
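Assumptions 1 and 3 above leave categorical encoding and imputation to the caller. A hedged sketch of the minimal preprocessing a frame would get before being handed to `TimeFrame` (column names are illustrative):

.. code-block:: python

    import pandas as pd

    raw = pd.DataFrame({
        "time": pd.date_range("2021-01-01", periods=4, freq="D"),
        "region": ["north", "south", "north", "east"],  # categorical: must be encoded
        "target": [1.0, 2.0, 3.0, 4.0],
    })

    # One-hot encode so that every column except `time` is numeric, as required.
    clean = pd.get_dummies(raw, columns=["region"], dtype=float)
    assert all(pd.api.types.is_numeric_dtype(clean[c]) for c in clean.columns if c != "time")
    print(clean.head())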
-
-    The `TimeFrame` class supports workflows where the target variable can be either 1D scalar data,
-    typical in classical machine learning, or 3D tensor data, more common in deep learning contexts.
-    It is an essential component for temporal data analysis, including but not limited to explainability pipelines
-    like Temporal SHAP and concept drift analysis.
-
-    Designed to be the core data handler in a variety of temporal analysis scenarios, the `TimeFrame` class
-    integrates seamlessly with other TemporalScope modules and can be extended for more advanced use cases.
-
-    Assumptions:
-    --------------
-    - This package does not impose constraints on grouping or handling duplicates.
-    - We assume users will build universal models and handle preprocessing (e.g., grouping, deduplication) with
-      TemporalScope modules or external methods.
-    - The only requirement is that features are arranged in a context window prior to the target column.
-
-    Example Usage:
-    --------------
-
+    The `TimeFrame` class is designed to handle time series data across various backends, including Polars, Pandas,
+    and Modin. It facilitates workflows for machine learning, deep learning, and explainability methods, while
+    abstracting away backend-specific implementation details.
+
+    This class automatically infers the appropriate backend, validates the data, and sorts it by time. It ensures
+    compatibility with temporal XAI techniques (SHAP, Boruta-SHAP, LIME, etc.), supporting larger data workflows
+    in production.
+
+    Backend Handling
+    ----------------
+    - If a `dataframe_backend` is explicitly provided, it takes precedence over backend inference.
+    - If no backend is specified, the class infers the backend from the DataFrame type, supporting Polars, Pandas,
+      and Modin.
+
+    Engineering Design Assumptions
+    ------------------------------
+    - Universal models: this class assumes the user has pre-processed their data for compatibility with deep
+      learning models. Across the TemporalScope utilities (e.g., target shifter, padding, partitioning algorithms),
+      preprocessing tasks such as categorical feature encoding are expected to be handled by the user or upstream
+      modules; the model therefore learns global weights and does not group by categorical variables.
+    - Mixed time frequencies supported: given the flexibility of deep learning models to handle various time
+      frequencies, this class allows `time_col` to contain mixed-frequency data, assuming the user manages any
+      necessary preprocessing or alignment outside of this class.
+    - The `time_col` should be either numeric or timestamp-like for proper temporal ordering. Any mixed or invalid
+      data types will raise validation errors.
+    - All non-time columns are expected to be numeric. Users are responsible for handling non-numeric features
+      (e.g., encoding categorical features).
+
+    Example Usage
+    -------------
+    ..
code-block:: python - # Example of creating a TimeFrame with a Polars DataFrame + import polars as pl data = pl.DataFrame({ 'time': pl.date_range(start='2021-01-01', periods=100, interval='1d'), 'value': range(100) }) tf = TimeFrame(data, time_col='time', target_col='value') - - # Accessing the data print(tf.get_data().head()) - # Example of creating a TimeFrame with a Modin DataFrame - import modin.pandas as mpd - df = mpd.DataFrame({ - 'time': pd.date_range(start='2021-01-01', periods=100, freq='D'), - 'value': range(100) - }) - tf = TimeFrame(df, time_col='time', target_col='value', backend=BACKEND_MODIN) - - # Accessing the data - print(tf.get_data().head()) + .. seealso:: + - `polars` documentation: https://pola-rs.github.io/polars/ + - `pandas` documentation: https://pandas.pydata.org/ + - `modin` documentation: https://modin.readthedocs.io/ """ def __init__( @@ -108,77 +147,126 @@ def __init__( df: SupportedBackendDataFrame, time_col: str, target_col: str, - backend: Optional[str] = None, + dataframe_backend: Optional[str] = None, sort: bool = True, - ): - """Initialize a TimeFrame object. + ) -> None: + """Initialize a TimeFrame object with required validations and backend handling. - :param df: The input DataFrame. + This constructor validates the provided DataFrame and performs checks on the required columns (`time_col`, + `target_col`). It also ensures compatibility between the DataFrame and the specified or inferred backend. + + :param df: The input DataFrame, which can be any supported backend (e.g., Polars, Pandas, Modin). :type df: SupportedBackendDataFrame - :param time_col: The name of the column representing time in the DataFrame. + :param time_col: The name of the column representing time. Should be numeric or timestamp-like for sorting. :type time_col: str - :param target_col: The name of the column representing the target variable in the DataFrame. + :param target_col: The column representing the target variable. Must be a valid column in the DataFrame. :type target_col: str - :param backend: The backend to use. If not provided, it will be inferred from the DataFrame type. - Supported backends are: - - `BACKEND_POLARS` ('pl') for Polars - - `BACKEND_PANDAS` ('pd') for Pandas - - `BACKEND_MODIN` ('mpd') for Modin - Default is to infer from the DataFrame. - :type backend: Optional[str] - :param sort: Optional. If True, sort the data by `time_col` in ascending order. Default is True. + :param dataframe_backend: The backend to use. If provided, the DataFrame will be converted to the appropriate backend. + If not provided, it will be inferred from the DataFrame type. + Supported backends are: + - `BACKEND_POLARS` ('pl') for Polars + - `BACKEND_PANDAS` ('pd') for Pandas + - `BACKEND_MODIN` ('mpd') for Modin + :type dataframe_backend: Optional[str] + :param sort: If True, the data will be sorted by `time_col` in ascending order. Default is True. :type sort: bool + :raises ValueError: - - If `time_col` or `target_col` is not a non-empty string. - - If required columns are missing in the DataFrame. - - If the inferred or specified backend is not supported. + - If `time_col` or `target_col` is not a valid non-empty string. + - If the input DataFrame is missing required columns or is empty. + - If the inferred or provided backend is unsupported. :raises TypeError: - If the DataFrame type does not match the specified backend. + + .. note:: + - The `time_col` must be numeric or timestamp-like to ensure proper temporal ordering. 
+ - Sorting is automatically performed by `time_col` unless disabled via `sort=False`. + - If `dataframe_backend` is provided, the DataFrame will be converted to the corresponding backend format. + - If `dataframe_backend` is not provided, it will be inferred based on the DataFrame type. + + Example Usage: + -------------- + .. code-block:: python + + import polars as pl + from temporalscope.core.temporal_data_loader import TimeFrame + + data = pl.DataFrame({ + 'time': pl.date_range(start='2021-01-01', periods=5, interval='1d'), + 'value': range(5) + }) + + tf = TimeFrame(data, time_col='time', target_col='value') + print(tf.get_data().head()) """ + + # Ensure time_col and target_col are valid strings if not isinstance(time_col, str) or not time_col: - raise ValueError("time_col must be a non-empty string.") + raise ValueError("`time_col` must be a non-empty string.") if not isinstance(target_col, str) or not target_col: - raise ValueError("target_col must be a non-empty string.") - - # Infer the backend if not explicitly provided - self._backend = backend or self._infer_backend(df) - validate_backend(self._backend) + raise ValueError("`target_col` must be a non-empty string.") - self._cfg = get_default_backend_cfg() + # Set class variables self._time_col = time_col self._target_col = target_col self._sort = sort - # Convert, validate, and set up the DataFrame - self.df = self._setup_timeframe(df) - @property - def backend(self) -> str: - """Return the backend used. + # Use the centralized setup method to handle backend inference, validation, and sorting + self._setup_timeframe(df, dataframe_backend) - :return: The backend identifier (e.g., 'pl', 'pd', 'mpd'). - :rtype: str - """ - return self._backend + @property + def dataframe_backend(self) -> str: + """Return the backend used.""" + return self._dataframe_backend @property def time_col(self) -> str: - """Return the column name representing time. - - :return: The name of the time column. - :rtype: str - """ + """Return the column name representing time.""" return self._time_col @property def target_col(self) -> str: - """Return the column name representing the target variable. + """Return the column name representing the target variable.""" + return self._target_col - :return: The name of the target column. - :rtype: str + @property + def df(self) -> SupportedBackendDataFrame: + """Return the DataFrame in its current state.""" + return self._df + + @df.setter + def df(self, dataframe: SupportedBackendDataFrame): + """Set the internal DataFrame.""" + self._df = dataframe + + def _setup_timeframe(self, df: SupportedBackendDataFrame, dataframe_backend: Optional[str]) -> None: + """Centralized method to set up the TimeFrame instance. + + This method handles backend inference, data validation, and sorting. + It ensures consistency between the initialization and update processes. + + :param df: The input DataFrame to be set up. + :type df: SupportedBackendDataFrame + :param dataframe_backend: The backend to use. If None, it will be inferred. + :type dataframe_backend: Optional[str] + + :raises ValueError: If required validations fail (e.g., missing columns, unsupported backend). 
""" - return self._target_col - def _infer_backend(self, df: SupportedBackendDataFrame) -> str: + # Infer backend if not provided + self._dataframe_backend = dataframe_backend or self._infer_dataframe_backend(df) + + # Set the DataFrame + self.df = validate_and_convert_input(df, self._dataframe_backend) + + # Validate data (e.g., columns, types) + self.validate_data() + + # Sort the data if necessary + if self._sort: + self.sort_data() + + def _infer_dataframe_backend(self, df: SupportedBackendDataFrame) -> str: """Infer the backend from the DataFrame type. :param df: The input DataFrame. @@ -196,136 +284,406 @@ def _infer_backend(self, df: SupportedBackendDataFrame) -> str: else: raise ValueError(f"Unsupported DataFrame type: {type(df)}") - def _validate_columns(self, df: SupportedBackendDataFrame) -> None: - """Validate the presence of required columns in the DataFrame. + def _validate_numeric_features(self) -> None: + """Validate that all features, except for the time column, are numeric. - :param df: The DataFrame to validate. - :type df: SupportedBackendDataFrame - :raises ValueError: If required columns are missing. + This method checks that all columns, other than the `time_col`, contain numeric data, which is a requirement + for machine learning and deep learning workflows. + + :raises ValueError: If any feature column is not numeric. """ - required_columns = [self._time_col, self._target_col] - missing_columns = [col for col in required_columns if col not in df.columns] - if missing_columns: - raise ValueError(f"Missing required columns: {', '.join(missing_columns)}") + df = self.get_data() + + # Backend-specific handling for numeric validation + if self.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: + non_numeric_columns = [ + col for col in df.columns if col != self.time_col and not pd.api.types.is_numeric_dtype(df[col]) + ] + elif self.dataframe_backend == BACKEND_POLARS: + non_numeric_columns = [col for col in df.columns if col != self.time_col and not df[col].dtype.is_numeric()] + else: + raise ValueError(f"Unsupported backend: {self.dataframe_backend}") + + if non_numeric_columns: + raise ValueError( + f"All features except `time_col` must be numeric. Found non-numeric columns: {non_numeric_columns}." + ) + + def _validate_time_column(self) -> None: + """Validate that the `time_col` in the DataFrame is either numeric or timestamp-like. + + This ensures the `time_col` can be used for temporal operations like sorting + or partitioning, which are essential for time-series forecasting. The `time_col` + must be numeric (e.g., integers) or timestamp-like (e.g., datetime). Mixed frequencies + (e.g., daily and monthly timestamps) are allowed, but mixed data types (e.g., numeric and + string) are not. String data types in `time_col` are not allowed across any backend. + + :raises TimeColumnError: If `time_col` is missing, contains unsupported types (non-numeric or non-timestamp), + or has missing values. + :warns MixedFrequencyWarning: If `time_col` contains mixed frequencies (e.g., daily and monthly timestamps). + :warns MixedTimezonesWarning: If `time_col` contains mixed timezone-aware and timezone-naive entries. + """ + df = self.get_data() - def _sort_data( - self, - df: SupportedBackendDataFrame, - ascending: bool = True, - ) -> SupportedBackendDataFrame: - """Internal method to sort the DataFrame based on the backend. 
+ # Ensure the time column exists + if self.time_col not in df.columns: + raise TimeColumnError(f"Missing required column: {self.time_col}") - :param df: The DataFrame to sort. - :type df: SupportedBackendDataFrame - :param ascending: If True, sort in ascending order; if False, sort in descending order. - :type ascending: bool - :return: The sorted DataFrame. - :rtype: SupportedBackendDataFrame - :raises TypeError: If the DataFrame type does not match the backend. - :raises ValueError: If the backend is unsupported. + # Time column could be Pandas/Modin Series or Polars Series + time_col = df[self.time_col] + + # Narrowing the type to ensure type checking with MyPy + if isinstance(time_col, (pd.Series, mpd.Series)): + # Check for missing values in time_col (specific to Pandas/Modin) + if time_col.isnull().any(): + raise TimeColumnError("Missing values found in `time_col`") + + # Validate if time_col is either numeric or timestamp-like + is_numeric_col = self._is_numeric(time_col) + is_timestamp_col = self._is_timestamp_like(time_col) + + if not is_numeric_col and not is_timestamp_col: + raise TimeColumnError(f"`time_col` must be either numeric or timestamp-like, got {time_col.dtype}") + + # Raise MixedFrequencyWarning if mixed frequencies are detected + if is_timestamp_col and self._has_mixed_frequencies(time_col): + warnings.warn("`time_col` contains mixed timestamp frequencies.", MixedFrequencyWarning) + + # Raise MixedTimezonesWarning if mixed timezones are detected + if is_timestamp_col and self._has_mixed_timezones(time_col): + warnings.warn( + "`time_col` contains mixed timezone-aware and timezone-naive entries.", MixedTimezonesWarning + ) + + elif isinstance(time_col, pl.Series): + # Check for missing values in Polars + if time_col.is_null().sum() > 0: + raise TimeColumnError("Missing values found in `time_col`") + + is_numeric_col = self._is_numeric(time_col) + is_timestamp_col = self._is_timestamp_like(time_col) + + if not is_numeric_col and not is_timestamp_col: + raise TimeColumnError(f"`time_col` must be either numeric or timestamp-like, got {time_col.dtype}") + + # Raise MixedFrequencyWarning if mixed frequencies are detected + if is_timestamp_col and self._has_mixed_frequencies(time_col): + warnings.warn("`time_col` contains mixed timestamp frequencies.", MixedFrequencyWarning) + + # Raise MixedTimezonesWarning if mixed timezones are detected + if is_timestamp_col and self._has_mixed_timezones(time_col): + warnings.warn( + "`time_col` contains mixed timezone-aware and timezone-naive entries.", MixedTimezonesWarning + ) + + def _is_timestamp_like(self, time_col: Union[pd.Series, mpd.Series, pl.Series]) -> bool: + """Check if a time column is timestamp-like based on the backend. + + :param time_col: The time column to check. + :return: True if the column is timestamp-like, False otherwise. """ - # Validate the DataFrame type - validate_input(df, self._backend) + if self.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: + return pd.api.types.is_datetime64_any_dtype(time_col) + elif self.dataframe_backend == BACKEND_POLARS: + return time_col.dtype == pl.Datetime + return False - sort_key = [self._time_col] + def _is_numeric(self, time_col: Union[pd.Series, mpd.Series, pl.Series]) -> bool: + """Check if a time column is numeric based on the backend. 
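From the caller's side, the contract of the validation helpers above is simple: numeric and datetime time columns pass, anything else is rejected. A sketch of the observable behavior (the exact error text is illustrative):

.. code-block:: python

    import pandas as pd

    from temporalscope.core.exceptions import TimeColumnError
    from temporalscope.core.temporal_data_loader import TimeFrame

    ok_numeric = pd.DataFrame({"time": [1, 2, 3], "target": [0.1, 0.2, 0.3]})
    ok_datetime = pd.DataFrame(
        {"time": pd.date_range("2021-01-01", periods=3, freq="D"), "target": [1.0, 2.0, 3.0]}
    )
    bad_strings = pd.DataFrame({"time": ["a", "b", "c"], "target": [1.0, 2.0, 3.0]})

    TimeFrame(ok_numeric, time_col="time", target_col="target")   # passes
    TimeFrame(ok_datetime, time_col="time", target_col="target")  # passes

    try:
        TimeFrame(bad_strings, time_col="time", target_col="target")
    except TimeColumnError as exc:
        print(exc)  # e.g. `time_col` must be either numeric or timestamp-like, got object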
-        # Mapping of backends to their sort functions
-        sort_functions = {
-            BACKEND_POLARS: lambda df: df.sort(by=sort_key, descending=not ascending),
-            BACKEND_PANDAS: lambda df: df.sort_values(by=sort_key, ascending=ascending),
-            BACKEND_MODIN: lambda df: df.sort_values(by=sort_key, ascending=ascending),
-        }
+        :param time_col: The time column to check.
+        :return: True if the column is numeric, False otherwise.
+        """
+        if self.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]:
+            return pd.api.types.is_numeric_dtype(time_col)
+        elif self.dataframe_backend == BACKEND_POLARS:
+            return time_col.dtype in [pl.Int32, pl.Int64, pl.Float32, pl.Float64]
+        return False
 
-        try:
-            return sort_functions[self._backend](df)
-        except KeyError:
-            raise ValueError(f"Unsupported backend: {self._backend}")
+    def _has_mixed_frequencies(self, time_col: Union[pd.Series, mpd.Series, pl.Series]) -> bool:
+        """Check for mixed frequencies in the time column.
 
-    def _setup_timeframe(self, df: SupportedBackendDataFrame) -> SupportedBackendDataFrame:
-        """Sets up the TimeFrame object by converting, validating, and preparing data as required.
+        :param time_col: The time column to check for mixed frequencies.
+        :return: True if mixed frequencies are detected, False otherwise.
+        """
+        if isinstance(time_col, (pd.Series, mpd.Series)):
+            non_null = time_col.dropna()
+            if len(non_null) < 3:
+                return False  # pd.infer_freq requires at least 3 values
+            return pd.infer_freq(non_null) is None
+        elif isinstance(time_col, pl.Series):
+            # `infer_freq` is a module-level Pandas helper, not a Series method,
+            # so convert the Polars Series to Pandas before calling it
+            non_null = time_col.drop_nulls().to_pandas()
+            if len(non_null) < 3:
+                return False
+            return pd.infer_freq(non_null) is None
+        return False
+
+    def _has_mixed_timezones(self, time_col: Union[pd.Series, mpd.Series, pl.Series]) -> bool:
+        """Check for mixed timezone-aware and naive timestamps.
+
+        :param time_col: The time column to check for mixed timezones.
+        :return: True if mixed timezone-aware and naive timestamps are detected, False otherwise.
+        """
+        if isinstance(time_col, (pd.Series, mpd.Series)):
+            # A uniformly tz-aware or tz-naive column keeps a datetime64 dtype;
+            # mixing aware and naive entries forces an `object` dtype, so in
+            # that case inspect each entry's tzinfo directly.
+            if pd.api.types.is_datetime64_any_dtype(time_col):
+                return False
+            tz_flags = {getattr(ts, "tzinfo", None) is not None for ts in time_col.dropna()}
+            return len(tz_flags) > 1
+        elif isinstance(time_col, pl.Series):
+            # A Polars Series has a single dtype, so timezone-aware and naive
+            # values cannot coexist within one column
+            return False
+        return False
 
-        :param df: The input DataFrame to be processed.
-        :type df: SupportedBackendDataFrame
-        :return: The processed DataFrame.
+    def get_data(self) -> SupportedBackendDataFrame:
+        """Return the DataFrame in its current state.
+
+        :return: The DataFrame managed by the TimeFrame instance.
         :rtype: SupportedBackendDataFrame
-        :raises ValueError:
-            - If required columns are missing.
-            - If the specified backend is not supported.
-        :raises TypeError: If the DataFrame type does not match the backend.
+
+        Example Usage:
+        --------------
+        .. code-block:: python
+
+            from temporalscope.core.temporal_data_loader import TimeFrame
+            import pandas as pd
+
+            # Create a Pandas DataFrame
+            data = {
+                'time': pd.date_range(start='2021-01-01', periods=5, freq='D'),
+                'target': range(5, 0, -1)
+            }
+            df = pd.DataFrame(data)
+
+            # Initialize a TimeFrame
+            tf = TimeFrame(df, time_col='time', target_col='target')
+
+            # Retrieve the DataFrame
+            data = tf.get_data()
+            print(data.head())
         """
-        # Convert and validate the input DataFrame
-        df = validate_and_convert_input(df, self._backend)
+        return self.df
 
-        # Validate the presence of required columns
-        self._validate_columns(df)
+    def validate_data(self) -> None:
+        """Run validation checks on the TimeFrame data to ensure it meets the required constraints.
- # Sort data if required - if self._sort: - df = self._sort_data(df) + This method runs all internal validation checks to ensure that: + - The `time_col` is numeric or timestamp-like. + - All features, except `time_col`, are numeric. + - There are no missing values in the `time_col` or `target_col`. + - It checks for mixed frequencies in the `time_col` and raises a warning if detected. + + :raises ValueError: If any of the validation checks fail. + + Example Usage: + -------------- + .. code-block:: python + + from temporalscope.core.temporal_data_loader import TimeFrame + import pandas as pd + + # Create a Pandas DataFrame + data = { + 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), + 'target': range(5, 0, -1) + } + df = pd.DataFrame(data) + + # Initialize a TimeFrame + tf = TimeFrame(df, time_col='time', target_col='target') + + # Validate the data + tf.validate_data() + + """ + # Centralized validation of time column + self._validate_time_column() + + # Ensure all non-time columns are numeric + self._validate_numeric_features() + + # Validate that there are no missing values in the time and target columns + self._validate_no_missing_values() + + # Check for mixed frequencies in the time column + self._check_mixed_frequencies() - return df + # Indicate successful validation + print("Data validation passed successfully.") def sort_data(self, ascending: bool = True) -> None: - """Public method to sort the DataFrame by the time column. + """Sort the DataFrame by the time column in place. :param ascending: If True, sort in ascending order; if False, sort in descending order. :type ascending: bool :raises TypeError: If the DataFrame type does not match the backend. - :raises ValueError: If the backend is unsupported. - """ - self.df = self._sort_data(self.df, ascending=ascending) + :raises ValueError: If the backend is unsupported or validation fails. - def get_data(self) -> SupportedBackendDataFrame: - """Return the DataFrame in its current state. + Example Usage: + -------------- + .. code-block:: python - :return: The DataFrame managed by the TimeFrame instance. 
- :rtype: SupportedBackendDataFrame + from temporalscope.core.temporal_data_loader import TimeFrame + import pandas as pd + + # Create a Pandas DataFrame + data = { + 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), + 'target': range(5, 0, -1) + } + df = pd.DataFrame(data) + + # Initialize a TimeFrame + tf = TimeFrame(df, time_col='time', target_col='target') + + # Sort the DataFrame in ascending order + tf.sort_data(ascending=True) + print(tf.df) """ - return self.df + # Validate the DataFrame before sorting + self.validate_data() + + sort_key = [self._time_col] + + # Mapping of backends to their sort functions, sorting in place + if self.dataframe_backend == BACKEND_POLARS: + if isinstance(self.df, pl.DataFrame): + self.df = self.df.sort(by=sort_key, descending=not ascending) + else: + raise TypeError(f"Expected Polars DataFrame but got {type(self.df)}") + elif self.dataframe_backend == BACKEND_PANDAS: + if isinstance(self.df, pd.DataFrame): + self.df.sort_values(by=sort_key, ascending=ascending, inplace=True) + else: + raise TypeError(f"Expected Pandas DataFrame but got {type(self.df)}") + elif self.dataframe_backend == BACKEND_MODIN: + if isinstance(self.df, mpd.DataFrame): + self.df.sort_values(by=sort_key, ascending=ascending, inplace=True) + else: + raise TypeError(f"Expected Modin DataFrame but got {type(self.df)}") + else: + raise ValueError(f"Unsupported dataframe backend {self._dataframe_backend}") - def update_data(self, new_df: SupportedBackendDataFrame) -> None: - """Updates the internal DataFrame with the provided new DataFrame. + def update_data( + self, new_df: SupportedBackendDataFrame, time_col: Optional[str] = None, target_col: Optional[str] = None + ) -> None: + """Updates the internal DataFrame with the provided new DataFrame and ensures backend consistency. :param new_df: The new DataFrame to replace the existing one. :type new_df: SupportedBackendDataFrame + :param time_col: The name of the column representing time. Should be numeric or timestamp-like for sorting. Optional. + :type time_col: Optional[str] + :param target_col: The column representing the target variable. Must be a valid column in the DataFrame. Optional. + :type target_col: Optional[str] :raises TypeError: If the new DataFrame type does not match the backend. - :raises ValueError: If required columns are missing in the new DataFrame. + :raises ValueError: If required columns are missing in the new DataFrame, or validation fails. + + Example Usage: + -------------- + .. 
code-block:: python
+
+            from temporalscope.core.temporal_data_loader import TimeFrame
+            import pandas as pd
+
+            # Create initial DataFrame
+            df = pd.DataFrame({
+                'time': pd.date_range(start='2021-01-01', periods=5, freq='D'),
+                'target': range(5, 0, -1)
+            })
+
+            # Initialize a TimeFrame
+            tf = TimeFrame(df, time_col='time', target_col='target')
+
+            # Create new DataFrame to update
+            new_df = pd.DataFrame({
+                'time': pd.date_range(start='2021-01-06', periods=5, freq='D'),
+                'target': range(1, 6)
+            })
+
+            # Update the DataFrame within TimeFrame
+            tf.update_data(new_df, time_col='time', target_col='target')
+            print(tf.get_data())
         """
-        # Validate and convert the new DataFrame
-        new_df = validate_and_convert_input(new_df, self._backend)
-        # Validate required columns
-        self._validate_columns(new_df)
-        self.df = new_df
 
-    def update_target_col(self, new_target_col: Union[pl.Series, pd.Series, mpd.Series]) -> None:
+
+        # Infer backend for the new DataFrame if needed
+        self._dataframe_backend = self._infer_dataframe_backend(new_df)
+
+        # Validate and update the time_col and target_col if provided
+        if time_col:
+            if time_col not in new_df.columns:
+                raise ValueError(f"`time_col` {time_col} not found in the new DataFrame.")
+            self._time_col = time_col
+
+        if target_col:
+            if target_col not in new_df.columns:
+                raise ValueError(f"`target_col` {target_col} not found in the new DataFrame.")
+            # Record the new target column name only; the new DataFrame is
+            # installed and validated by `_setup_timeframe` below. Calling
+            # `update_target_col` here would check the new column against the
+            # stale internal DataFrame and fail whenever the lengths differ.
+            self._target_col = target_col
+
+        # Use _setup_timeframe to centralize backend inference, validation, and sorting
+        self._setup_timeframe(new_df, self._dataframe_backend)
+
+    def update_target_col(self, new_target_col: Union[pl.Series, pd.Series, mpd.Series]) -> None:
         """Updates the target column in the internal DataFrame with the provided new target column.
 
         :param new_target_col: The new target column to replace the existing one.
         :type new_target_col: Union[pl.Series, pd.Series, mpd.Series]
         :raises TypeError: If the target column type does not match the backend.
-        :raises ValueError: If the length of the new target column does not match the DataFrame.
+        :raises ValueError: If the length of the new target column does not match the DataFrame, or validation fails.
+
+        Example Usage:
+        --------------
+        .. 
code-block:: python + + from temporalscope.core.temporal_data_loader import TimeFrame + import pandas as pd + + # Create a Pandas DataFrame + df = pd.DataFrame({ + 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), + 'target': range(5, 0, -1) + }) + + # Initialize a TimeFrame + tf = TimeFrame(df, time_col='time', target_col='target') + + # Update the target column with new values + new_target = pd.Series([1, 2, 3, 4, 5], name='target') + tf.update_target_col(new_target) + print(tf.get_data()) """ - # Validate the target column type - if self._backend == BACKEND_POLARS: + # Step 1: Validate the target column type + if self._dataframe_backend == BACKEND_POLARS: if not isinstance(new_target_col, pl.Series): raise TypeError("Expected a Polars Series for the Polars backend.") - elif self._backend == BACKEND_PANDAS: + elif self._dataframe_backend == BACKEND_PANDAS: if not isinstance(new_target_col, pd.Series): raise TypeError("Expected a Pandas Series for the Pandas backend.") - elif self._backend == BACKEND_MODIN: + elif self._dataframe_backend == BACKEND_MODIN: if not isinstance(new_target_col, mpd.Series): raise TypeError("Expected a Modin Series for the Modin backend.") else: - raise ValueError(f"Unsupported backend: {self._backend}") + raise ValueError(f"Unsupported dataframe_backend {self._dataframe_backend}") - # Check if the new target column length matches the DataFrame length + # Step 2: Check if the new target column length matches the DataFrame length if len(new_target_col) != len(self.df): raise ValueError("The new target column must have the same number of rows as the DataFrame.") + # Step 3: Validate the entire DataFrame before making changes + self.validate_data() + + # Step 4: If all validations pass, proceed with updating the target column + # Use a temporary copy of the DataFrame for update and commit only after all checks + temp_df = None # Declare once without type hints + + if self._dataframe_backend == BACKEND_POLARS: + temp_df = self.df.clone() # Polars DataFrame uses `clone()` + elif self._dataframe_backend == BACKEND_PANDAS and isinstance(self.df, pd.DataFrame): + temp_df = self.df.copy() # Pandas DataFrame uses `copy()` + elif self._dataframe_backend == BACKEND_MODIN and isinstance(self.df, mpd.DataFrame): + temp_df = self.df.copy() # Modin DataFrame uses `copy()` + else: + raise ValueError(f"Unsupported dataframe_backend {self._dataframe_backend}") + # Update the target column based on the backend - if self._backend == BACKEND_POLARS: - self.df = self.df.with_columns([new_target_col.alias(self._target_col)]) - elif self._backend == BACKEND_PANDAS: - self.df[self._target_col] = new_target_col.to_numpy() # Convert to NumPy for Pandas - elif self._backend == BACKEND_MODIN: - self.df[self._target_col] = new_target_col.to_numpy() # Use .to_numpy() for Modin + if self._dataframe_backend == BACKEND_POLARS: + temp_df = temp_df.with_columns([new_target_col.alias(self._target_col)]) + elif self._dataframe_backend == BACKEND_PANDAS: + temp_df[self._target_col] = new_target_col.to_numpy() # Convert to NumPy for Pandas + elif self._dataframe_backend == BACKEND_MODIN: + temp_df[self._target_col] = new_target_col.to_numpy() # Use .to_numpy() for Modin + + # Step 5: Commit the changes by updating the internal DataFrame + self.df = temp_df diff --git a/src/temporalscope/core/temporal_target_shifter.py b/src/temporalscope/core/temporal_target_shifter.py index 527854f..ff3a0fe 100644 --- a/src/temporalscope/core/temporal_target_shifter.py +++ 
b/src/temporalscope/core/temporal_target_shifter.py @@ -29,7 +29,7 @@ """ import warnings -from typing import Optional, Union +from typing import Optional, Union, cast import modin.pandas as mpd import pandas as pd @@ -51,21 +51,43 @@ class TemporalTargetShifter: """A class for shifting the target variable in time series data for machine learning or deep learning. - This class works with the `TimeFrame` and partitioned datasets (e.g., from `SlidingWindowPartitioner`) - to shift the target variable by a specified number of lags (time steps). It supports multiple backends - (Polars, Pandas, Modin) and can generate output suitable for both machine learning models (scalar) - and deep learning models (sequence). + This class works with `TimeFrame` objects or raw DataFrame types (Pandas, Modin, Polars) to shift the target variable + by a specified number of lags (time steps). It supports multiple backends and can generate output suitable for + machine learning models (scalar) or deep learning models (sequences). + + Design: + ------- + The `TemporalTargetShifter` follows a strategy pattern, where the data format (backend) is either inferred from the + input or set explicitly. This enables flexible support for different DataFrame libraries. The class ensures that + input type consistency is maintained, and it returns the same data type that is provided. For instance, if the input + is a `TimeFrame`, the output will be a `TimeFrame`. If a raw DataFrame is provided, the output will be a raw + DataFrame of the same type. Assumptions: ------------ - 1. The class applies time shifting globally, without grouping by entities (e.g., tickers or SKUs). - Users should handle any entity-specific grouping outside of this class. - 2. The time shifting is applied to the target column, which may have varying data structures - depending on the backend (Polars, Pandas, Modin). + 1. Time shifting is applied globally, meaning the data is not grouped by entities (e.g., tickers or SKUs). Users + should handle such grouping outside of this class. + 2. The time shifting is applied to a target column, which may have varying data structures depending on the backend + (Polars, Pandas, Modin). + + :param target_col: The column representing the target variable (mandatory). + :type target_col: str + :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1. + :type n_lags: int + :param mode: Mode of operation: "machine_learning" for scalar or "deep_learning" for sequences. + Default is "machine_learning". + :type mode: str + :param sequence_length: (Deep Learning Mode Only) The length of the input sequences. Required if mode is "deep_learning". + :type sequence_length: Optional[int] + :param drop_target: Whether to drop the original target column after shifting. Default is True. + :type drop_target: bool + :param verbose: If True, prints information about the number of dropped rows during transformation. + :type verbose: bool + :raises ValueError: If the backend is unsupported or if validation checks fail. Examples -------- - **Using `TimeFrame`:** + **Using TimeFrame:** .. code-block:: python @@ -84,10 +106,10 @@ class TemporalTargetShifter: tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") # Apply target shifting - shifter = TemporalTargetShifter(n_lags=1, target_col="target") + shifter = TemporalTargetShifter(target_col="target", n_lags=1) shifted_df = shifter.fit_transform(tf) - **Using `SlidingWindowPartitioner`:** + **Using SlidingWindowPartitioner:** .. 
code-block:: python @@ -102,43 +124,27 @@ class TemporalTargetShifter: partitioner = SlidingWindowPartitioner(tf=tf, window_size=10, stride=1) # Apply TemporalTargetShifter on each partition - shifter = TemporalTargetShifter(n_lags=1, target_col="target") + shifter = TemporalTargetShifter(target_col="target", n_lags=1) for partition in partitioner.fit_transform(): shifted_partition = shifter.fit_transform(partition) - - :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1. - :type n_lags: int - :param mode: Mode of operation: "machine_learning" for scalar or "deep_learning" for sequences. - Default is "machine_learning". - :type mode: str - :param sequence_length: (Deep Learning Mode Only) The length of the input sequences. Required if mode is "deep_learning". - :type sequence_length: Optional[int] - :param target_col: The column representing the target variable (mandatory). - :type target_col: str - :param drop_target: Whether to drop the original target column after shifting. Default is True. - :type drop_target: bool - :param verbose: If True, prints information about the number of dropped rows during transformation. - :type verbose: bool - :raises ValueError: If the backend is unsupported or if validation checks fail. - """ def __init__( self, + target_col: Optional[str] = None, n_lags: int = 1, mode: str = MODE_MACHINE_LEARNING, sequence_length: Optional[int] = None, - target_col: Optional[str] = None, drop_target: bool = True, verbose: bool = False, ): """Initialize the TemporalTargetShifter. + :param target_col: Column representing the target variable (mandatory). :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1. :param mode: Mode of operation: "machine_learning" or "deep_learning". Default is "machine_learning". :param sequence_length: (Deep Learning Mode Only) Length of the input sequences. Required if mode is "deep_learning". - :param target_col: Column representing the target variable (mandatory). :param drop_target: Whether to drop the original target column after shifting. Default is True. :param verbose: Whether to print detailed information about transformations. :raises ValueError: If the target column is not provided or if an invalid mode is selected. @@ -148,8 +154,8 @@ def __init__( the type of input data (TimeFrame or SupportedBackendDataFrame). 
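+
+        Example Usage:
+        --------------
+        A minimal sketch of typical construction (values are illustrative):
+
+        .. code-block:: python
+
+            # Machine-learning mode: scalar target shifted by `n_lags`
+            shifter = TemporalTargetShifter(target_col="target", n_lags=2)
+
+            # Deep-learning mode additionally requires `sequence_length`
+            dl_shifter = TemporalTargetShifter(
+                target_col="target",
+                mode="deep_learning",
+                sequence_length=30,
+            )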
""" # Validate the mode (should be machine learning or deep learning) - if mode not in [self.MODE_MACHINE_LEARNING, self.MODE_DEEP_LEARNING]: - raise ValueError(f"`mode` must be '{self.MODE_MACHINE_LEARNING}' or '{self.MODE_DEEP_LEARNING}'.") + if mode not in [MODE_MACHINE_LEARNING, MODE_DEEP_LEARNING]: + raise ValueError(f"`mode` must be '{MODE_MACHINE_LEARNING}' or '{MODE_DEEP_LEARNING}'.") # Ensure the target column is provided if target_col is None: @@ -160,51 +166,57 @@ def __init__( raise ValueError("`n_lags` must be greater than 0.") # Handle deep learning mode, ensure sequence length is set - if mode == self.MODE_DEEP_LEARNING and sequence_length is None: + if mode == MODE_DEEP_LEARNING and sequence_length is None: raise ValueError("`sequence_length` must be provided when mode is 'deep_learning'.") # Assign instance attributes + self.target_col = target_col self.n_lags = n_lags self.mode = mode self.sequence_length = sequence_length - self.target_col = target_col self.drop_target = drop_target self.verbose = verbose # The data format will be inferred later during fit() - self.data_format = None # Data format will be inferred during fit() + self.data_format: Optional[str] = None # Print a verbose message if required if verbose: - print(f"Initialized TemporalTargetShifter with mode={mode}, n_lags={n_lags}, target_col={target_col}") + print(f"Initialized TemporalTargetShifter with target_col={target_col}, mode={mode}, n_lags={n_lags}") - def _infer_backend(self, df: SupportedBackendDataFrame) -> str: + def _infer_data_format(self, df: SupportedBackendDataFrame) -> str: """Infer the backend from the DataFrame type. :param df: The input DataFrame. :type df: SupportedBackendDataFrame - :return: The inferred backend ('pl', 'pd', or 'mpd'). + :return: The inferred backend ('BACKEND_POLARS', 'BACKEND_PANDAS', or 'BACKEND_MODIN'). :raises ValueError: If the DataFrame type is unsupported. """ if isinstance(df, pl.DataFrame): - return "pl" + return BACKEND_POLARS elif isinstance(df, pd.DataFrame): - return "pd" + return BACKEND_PANDAS elif isinstance(df, mpd.DataFrame): - return "mpd" + return BACKEND_MODIN else: raise ValueError(f"Unsupported DataFrame type: {type(df)}") - def _set_backend(self, df: SupportedBackendDataFrame) -> None: - """Set or infer the backend based on the DataFrame. + def _set_or_infer_data_format(self, tf: TimeFrameCompatibleData) -> None: + """Set or infer the data format based on the input type. - :param df: The input DataFrame. - :type df: SupportedBackendDataFrame - :raises ValueError: If the backend is not supported. + This method checks if the input is a TimeFrame and uses its data format. + If the input is a raw DataFrame (Pandas, Modin, or Polars), it infers the data format. """ - if self.backend is None: - self.backend = self._infer_backend(df) - validate_backend(self.backend) + if isinstance(tf, TimeFrame): + self.data_format = tf.dataframe_backend + else: + # Infer the data format using the existing _infer_data_format method + self.data_format = self._infer_data_format(tf) + + if self.data_format is None: + raise ValueError("Data format could not be inferred or is not set.") + + validate_backend(self.data_format) def _validate_data(self, tf: TimeFrameCompatibleData) -> None: """Validate the TimeFrame or DataFrame input for consistency. @@ -241,7 +253,7 @@ def _shift_polars(self, df: pl.DataFrame, target_col: str) -> pl.DataFrame: :rtype: pl.DataFrame :raises ValueError: If `sequence_length` or `n_lags` are not properly set. 
""" - if self.mode == self.MODE_DEEP_LEARNING: + if self.mode == MODE_DEEP_LEARNING: if not isinstance(self.sequence_length, int): raise ValueError("`sequence_length` must be an integer.") shifted_columns = [ @@ -281,7 +293,7 @@ def _shift_pandas_modin( :rtype: Union[pd.DataFrame, mpd.DataFrame] :raises ValueError: If `sequence_length` or `n_lags` are not properly set. """ - if self.mode == self.MODE_DEEP_LEARNING: + if self.mode == MODE_DEEP_LEARNING: if not isinstance(self.sequence_length, int): raise ValueError("`sequence_length` must be an integer.") shifted_columns = [df[target_col].shift(-i) for i in range(self.sequence_length)] @@ -408,7 +420,7 @@ def fit(self, tf: TimeFrameCompatibleData) -> "TemporalTargetShifter": # If input is a TimeFrame, set the backend using the @property method and manage the target column if isinstance(tf, TimeFrame): - self.data_format = tf.backend # Using the @property to access the backend + self.data_format = tf.dataframe_backend # Using the @property to access the backend if not self.target_col: self.target_col = tf._target_col # If target_col not set in the shifter, use TimeFrame's target_col elif self.target_col != tf._target_col: @@ -419,29 +431,33 @@ def fit(self, tf: TimeFrameCompatibleData) -> "TemporalTargetShifter": ) # If input is a raw DataFrame (pandas, modin, or polars), infer the backend elif tf is not None: - self.data_format = self._infer_backend(tf) + self.data_format = self._infer_data_format(tf) else: raise ValueError("Input data is None.") # Return the instance after fitting return self - def transform(self, tf: TimeFrameCompatibleData) -> SupportedBackendDataFrame: + def transform(self, tf: TimeFrameCompatibleData) -> TimeFrameCompatibleData: """Transform the input time series data by shifting the target variable according to the specified number of lags. The `transform` method shifts the target variable in the input data according to the `n_lags` or `sequence_length` set during initialization. This method works directly on either a `TimeFrame` or a raw DataFrame (Pandas, Modin, or Polars), applying the appropriate backend-specific transformation. + Design: + ------- + The method returns the same type as the input: If a `TimeFrame` object is passed in, a `TimeFrame` object is returned. + If a raw DataFrame (Pandas, Modin, or Polars) is passed in, the same type of DataFrame is returned. This ensures that + the transformation remains consistent in pipeline workflows where the type of data object is important. + :param tf: The `TimeFrame` object or a DataFrame (Pandas, Modin, or Polars) that contains the time series data to be transformed. The data should contain a target column that will be shifted. :type tf: TimeFrameCompatibleData :raises ValueError: If the input data is invalid, unsupported, or lacks columns. :raises ValueError: If the backend is unsupported or data validation fails. - :return: A transformed DataFrame or `TimeFrame` with the target variable shifted by the specified lags or sequence - length. If a `TimeFrame` is provided, the returned object will be a `TimeFrame`. Otherwise, a DataFrame - will be returned. - :rtype: SupportedBackendDataFrame + :return: A transformed `TimeFrame` if the input was a `TimeFrame`, otherwise a DataFrame of the same type as the input. 
+ :rtype: TimeFrameCompatibleData Example Usage: -------------- @@ -468,6 +484,7 @@ def transform(self, tf: TimeFrameCompatibleData) -> SupportedBackendDataFrame: # Fit the shifter and transform the data shifter.fit(tf) transformed_data = shifter.transform(tf) + """ # Handle TimeFrame input: sort data and retrieve the DataFrame if isinstance(tf, TimeFrame): @@ -479,7 +496,7 @@ def transform(self, tf: TimeFrameCompatibleData) -> SupportedBackendDataFrame: self.target_col = tf._target_col # Assign the backend from TimeFrame - self.data_format = tf.backend + self.data_format = tf.dataframe_backend # Handle raw DataFrame input elif tf is not None: @@ -493,7 +510,7 @@ def transform(self, tf: TimeFrameCompatibleData) -> SupportedBackendDataFrame: raise ValueError("The input DataFrame does not have columns.") # Set or infer the backend for the DataFrame - self._set_backend(df) + self._set_or_infer_data_format(df) else: raise ValueError("Input data is None.") @@ -512,27 +529,33 @@ def transform(self, tf: TimeFrameCompatibleData) -> SupportedBackendDataFrame: time_col=tf.time_col, target_col=( f"{self.target_col}_shift_{self.n_lags}" - if self.mode == self.MODE_MACHINE_LEARNING + if self.mode == MODE_MACHINE_LEARNING else f"{self.target_col}_sequence" ), - backend=self.data_format, + dataframe_backend=self.data_format, ) return transformed_df - def fit_transform(self, tf: TimeFrameCompatibleData) -> SupportedBackendDataFrame: + def fit_transform(self, tf: TimeFrameCompatibleData) -> TimeFrameCompatibleData: """Fit and transform the input data in a single step. - This method combines the functionality of the `fit` and `transform` methods. It first validates and prepares the input data (fitting), - then applies the target variable shifting (transformation) based on the `n_lags` or `sequence_length` specified during initialization. + This method combines the functionality of the `fit` and `transform` methods. It first validates and prepares the input + data (fitting), then applies the target variable shifting (transformation) based on the `n_lags` or `sequence_length` + specified during initialization. + + Design: + ------- + The output type mirrors the input type. If a `TimeFrame` is provided, a `TimeFrame` is returned. If a raw DataFrame + (Pandas, Modin, or Polars) is provided, the output will be a DataFrame of the same type. This ensures that the + transformation remains consistent with the input, making it easier to work with in pipeline workflows. :param tf: The `TimeFrame` object or a DataFrame (`pandas`, `modin`, or `polars`) to be transformed. - The data should contain a target column that will be shifted according to the `n_lags` or `sequence_length`. :type tf: TimeFrameCompatibleData :raises ValueError: If the input data is invalid or the backend is unsupported. :raises ValueError: If the target column is not set, or is incompatible with the data. - :return: A transformed DataFrame or TimeFrame with the target variable shifted by the specified lags or sequence length. - :rtype: SupportedBackendDataFrame + :return: A transformed `TimeFrame` if the input was a `TimeFrame`, otherwise a DataFrame of the same type as the input. 
+        :rtype: TimeFrameCompatibleData
 
         Example Usage:
         --------------
@@ -551,7 +574,7 @@ def fit_transform(self, tf: TimeFrameCompatibleData) -> SupportedBackendDataFram
             df = pd.DataFrame(data)
 
             # Create a TimeFrame object
-            tf = TimeFrame(df, time_col="time", target_col="target", backend="pd")
+            tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend="pd")
 
             # Initialize TemporalTargetShifter
             shifter = TemporalTargetShifter(n_lags=2, target_col="target")
@@ -565,18 +588,19 @@ def fit_transform(self, tf: TimeFrameCompatibleData) -> SupportedBackendDataFram
         # Apply the transformation (delegates to backend-specific methods)
         transformed = self.transform(tf)
 
-        # If input was a TimeFrame, return the transformed TimeFrame
+        # If the input was a TimeFrame, return a new TimeFrame with the transformed DataFrame
         if isinstance(tf, TimeFrame):
+            tf_casted = cast(TimeFrame, tf)
+            # `transform` returns a TimeFrame for TimeFrame input, so unwrap the
+            # underlying DataFrame rather than nesting one TimeFrame in another
+            transformed_df = transformed.get_data() if isinstance(transformed, TimeFrame) else transformed
             return TimeFrame(
-                transformed,
-                time_col=tf.time_col,
+                transformed_df,
+                time_col=tf_casted.time_col,
                 target_col=(
                     f"{self.target_col}_shift_{self.n_lags}"
-                    if self.mode == self.MODE_MACHINE_LEARNING
+                    if self.mode == MODE_MACHINE_LEARNING
                     else f"{self.target_col}_sequence"
                 ),
-                backend=self.data_format,  # Ensure we use the inferred backend from fit()
+                dataframe_backend=tf_casted.dataframe_backend,  # Ensure we use the original backend from the input
             )
 
-        # If input was a raw DataFrame, return the transformed DataFrame
+        # Otherwise, return the transformed raw DataFrame
         return transformed
diff --git a/src/temporalscope/datasets/datasets.py b/src/temporalscope/datasets/datasets.py
new file mode 100644
index 0000000..9ec527c
--- /dev/null
+++ b/src/temporalscope/datasets/datasets.py
@@ -0,0 +1,236 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""TemporalScope/src/temporalscope/datasets/datasets.py
+
+Utility class for loading datasets and initializing TimeFrame objects with multi-backend support.
+Supports Pandas, Modin, and Polars as backends for time series forecasting and analysis.
+
+This class is intended to be used for tutorials and examples that involve open-source datasets
+licensed under Apache, MIT, or similar valid open-source licenses. It simplifies dataset loading
+and preprocessing while providing compatibility with multiple DataFrame backends, including Pandas,
+Modin, and Polars. The class can be easily extended to include additional datasets in the future.
+
+Example:
+---------
+.. 
code-block:: python + + from temporalscope.datasets.datasets import DatasetLoader + + # Initialize the dataset loader with the 'macrodata' dataset + dataset_loader = DatasetLoader(dataset_name="macrodata") + + # Load and initialize TimeFrames for the specified backends (Pandas, Modin, Polars) + timeframes = dataset_loader.load_and_init_timeframes() + + # Access the Modin TimeFrame and perform operations + modin_tf = timeframes["modin"] + print(modin_tf.get_data().head()) + + # Access metadata of the Modin TimeFrame object + print(modin_tf.__dict__) + +""" + +import pandas as pd +import modin.pandas as mpd +import polars as pl +from statsmodels.datasets import macrodata +from typing import Tuple, Dict, Callable, Union +from temporalscope.core.temporal_data_loader import TimeFrame +from temporalscope.core.core_utils import ( + BACKEND_PANDAS, + BACKEND_MODIN, + BACKEND_POLARS, + SupportedBackendDataFrame, + print_divider +) + +def _load_macrodata() -> Tuple[pd.DataFrame, str]: + """ Load and preprocess the macrodata dataset. + + Combines the 'year' and 'quarter' columns to create a datetime 'ds' column. + The dataset is then returned with the 'realgdp' column as the default target. + + :return: A tuple containing the preprocessed DataFrame and the default target column 'realgdp'. + :rtype: Tuple[pd.DataFrame, str] + """ + dataset_df = macrodata.load_pandas().data.copy() + + # Ensure 'year' and 'quarter' are integers + dataset_df["year"] = dataset_df["year"].astype(int) + dataset_df["quarter"] = dataset_df["quarter"].astype(int) + + # Combine 'year' and 'quarter' to create a datetime 'ds' column + dataset_df["ds"] = pd.to_datetime( + dataset_df["year"].astype(str) + "-" + + ((dataset_df["quarter"] - 1) * 3 + 1).astype(str) + "-01" + ) + + # Drop the 'year' and 'quarter' columns + dataset_df = dataset_df.drop(columns=["year", "quarter"]) # Remove redundant columns + + # Return the dataset and the default target column + return dataset_df, "realgdp" + + +# Map of available datasets to their respective loader functions +AVAILABLE_DATASETS = { + "macrodata": _load_macrodata, + # Future datasets can be added here with their corresponding loading functions +} + +SupportedBackends = Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame] + + +class DatasetLoader: + """ A utility class for loading datasets and initializing TimeFrame objects for multiple backends. + + This class supports datasets that are licensed under valid open-source licenses (such as Apache and MIT). + It simplifies loading and preprocessing of datasets and enables compatibility with Pandas, Modin, and Polars + DataFrame backends. Designed for tutorials and practical examples, this class is ideal for educational purposes + and demonstration of time series forecasting workflows. + + Attributes: + ------------ + dataset_name : str + The name of the dataset to be loaded. It must be available in the `AVAILABLE_DATASETS` dictionary. + + Methods: + --------- + load_and_init_timeframes: + Load the specified dataset and initialize TimeFrame objects for multiple backends. + + Example: + --------- + .. 
code-block:: python + + # Initialize the loader with the 'macrodata' dataset + dataset_loader = DatasetLoader(dataset_name="macrodata") + + # Load and initialize TimeFrames for Pandas, Modin, and Polars + timeframes = dataset_loader.load_and_init_timeframes() + + # Access the Modin TimeFrame object + modin_tf = timeframes["modin"] + print(modin_tf.get_data().head()) + + """ + + def __init__(self, dataset_name: str = "macrodata") -> None: + """ + Initialize DatasetLoader with a specified dataset. + + :param dataset_name: The name of the dataset to load. Must be available in AVAILABLE_DATASETS. + :raises ValueError: If the specified dataset is not available. + """ + if dataset_name not in AVAILABLE_DATASETS: + raise ValueError( + f"Dataset '{dataset_name}' is not supported. Available datasets: {list(AVAILABLE_DATASETS.keys())}" + ) + self.dataset_name = dataset_name + + def _load_dataset_and_target(self) -> Tuple[pd.DataFrame, str]: + """ + Internal method to load the dataset and its associated target column. + + :return: A tuple containing the preprocessed DataFrame and the associated target column name. + :rtype: Tuple[pd.DataFrame, str] + """ + print_divider() + print(f"Loading the '{self.dataset_name}' dataset.") + print_divider() + + # Fetch the dataset loader function and load the dataset with its target column + loader_func: Callable[[], Tuple[pd.DataFrame, str]] = AVAILABLE_DATASETS[self.dataset_name] + dataset_df, target_col = loader_func() + + print(f"Loaded DataFrame shape: {dataset_df.shape}") + print(f"Target column: {target_col}") + print_divider() + + return dataset_df, target_col + + def init_timeframes_for_backends( + self, df: pd.DataFrame, target_col: str, backends: Tuple[str, ...] = (BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS) + ) -> Dict[str, TimeFrame]: + """ Initialize TimeFrame objects for the specified backends using the provided DataFrame. + + :param df: The preprocessed DataFrame to initialize TimeFrames with. + :param target_col: The target column to use for TimeFrame initialization. + :param backends: A tuple of supported backends to initialize. Defaults to Pandas, Modin, and Polars. + :return: A dictionary containing TimeFrame objects for each requested backend. + :rtype: Dict[str, TimeFrame] + :raises ValueError: If an unsupported backend is specified. + + Example: + --------- + .. code-block:: python + + from temporalscope.datasets.datasets import DatasetLoader + + dataset_loader = DatasetLoader(dataset_name="macrodata") + timeframes = dataset_loader.init_timeframes_for_backends(df, "realgdp") + + """ + print_divider() + print("Initializing TimeFrame objects for specified backends.") + print_divider() + + timeframes: Dict[str, TimeFrame] = {} + + # Loop through the specified backends and create TimeFrame objects + for backend in backends: + if backend == BACKEND_PANDAS: + timeframes[BACKEND_PANDAS] = TimeFrame( + df=pd.DataFrame(df), time_col="ds", target_col=target_col, dataframe_backend=BACKEND_PANDAS + ) + elif backend == BACKEND_MODIN: + timeframes[BACKEND_MODIN] = TimeFrame( + df=mpd.DataFrame(df), time_col="ds", target_col=target_col, dataframe_backend=BACKEND_MODIN + ) + elif backend == BACKEND_POLARS: + timeframes[BACKEND_POLARS] = TimeFrame( + df=pl.DataFrame(df), time_col="ds", target_col=target_col, dataframe_backend=BACKEND_POLARS + ) + else: + raise ValueError(f"Unsupported backend: {backend}") + + return timeframes + + def load_and_init_timeframes( + self, backends: Tuple[str, ...] 
= (BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS) + ) -> Dict[str, TimeFrame]: + """ Load the dataset and initialize TimeFrames for the specified backends. + + :param backends: A tuple of supported backends to initialize. Defaults to Pandas, Modin, and Polars. + :return: A dictionary containing TimeFrame objects for each backend. + :rtype: Dict[str, TimeFrame] + + Example: + --------- + .. code-block:: python + + dataset_loader = DatasetLoader(dataset_name="macrodata") + timeframes = dataset_loader.load_and_init_timeframes() + + """ + # Load and preprocess the dataset, including determining the target column + df, target_col = self._load_dataset_and_target() + + # Initialize TimeFrames for the specified backends + return self.init_timeframes_for_backends(df, target_col, backends) diff --git a/src/temporalscope/partition/padding.py b/src/temporalscope/partition/padding.py new file mode 100644 index 0000000..be0b943 --- /dev/null +++ b/src/temporalscope/partition/padding.py @@ -0,0 +1,744 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""TemporalScope/src/temporalscope/partition/padding.py + +This module provides utility functions for padding and partitioning time-series DataFrames. +It supports multiple backends such as Pandas, Modin, and Polars, ensuring that data is sorted +by a `time_col` (if provided) before applying transformations such as padding. + +The design of this module aligns with TensorFlow, PyTorch, and Darts for universal ML & DL time-series workflows. +It integrates with the central `TimeFrame` concept in TemporalScope, ensuring compatibility with temporal XAI +and partitioning workflows. + +Core Functionality: +------------------- +Each padding function ensures the DataFrame is sorted based on a `time_col` (if provided) before applying +the selected padding scheme. + +Padding is designed to: +1. Expand Data: When datasets have insufficient data points (e.g., missing timestamps), padding fills in the gaps. +2. Fix Data Shapes: Many ML/DL architectures require fixed input shapes, and padding ensures uniformity across batches or partitions. +3. Maintain Backend Consistency: Padding respects the original backend of the DataFrame (Pandas, Modin, or Polars). +4. Preserve Precision Consistency: Padding operations ensure that data types (e.g., `Float32`, `Int64`) are retained, avoiding unnecessary conversions and ensuring precision consistency throughout the pipeline. + +Design Constraint: +------------------ +For categorical columns, users **must** handle encoding (e.g., label encoding, one-hot encoding) before using any +partitioning or padding utilities. This module focuses only on numerical and time columns. The only special handling +occurs for the `time_col` (if specified), which can be a timestamp or a numeric column. 
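+
+For instance, a minimal sketch (hypothetical column names) of label-encoding a
+categorical column before handing the frame to any padding utility:
+
+.. code-block:: python
+
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({"city": ["NY", "LA", "NY"], "time": pd.date_range("20210101", periods=3)})
+    >>> df["city"] = df["city"].astype("category").cat.codes  # encode before padding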
+
+Examples:
+---------
+    .. code-block:: python
+
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "time": pd.date_range("20210101", periods=3)})
+        >>> padded_df = zero_pad(df, target_len=5, time_col="time")
+        >>> print(padded_df)
+           a  b       time
+        0  1  4 2021-01-01
+        1  2  5 2021-01-02
+        2  3  6 2021-01-03
+        3  0  0        NaT
+        4  0  0        NaT
+
+        >>> import polars as pl
+        >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        >>> padded_df = zero_pad(df, target_len=5, padding="post")
+        >>> print(padded_df)
+
+.. seealso::
+    1. Dwarampudi, M. and Reddy, N.V., 2019. Effects of padding on LSTMs and CNNs. arXiv preprint arXiv:1903.07288.
+    2. Lafabregue, B., Weber, J., Gançarski, P. and Forestier, G., 2022. End-to-end deep representation learning for time series clustering: a comparative study. Data Mining and Knowledge Discovery, 36(1), pp.29-81.
+"""
+
+import warnings
+from typing import Union, Optional, cast
+import pandas as pd
+import modin.pandas as mpd
+import polars as pl
+from temporalscope.core.core_utils import SupportedBackendDataFrame
+
+# Define numeric types for each backend
+PANDAS_NUMERIC_TYPES = ["number"]
+MODIN_NUMERIC_TYPES = ["number"]  # Same as Pandas since Modin mimics Pandas' behavior
+POLARS_NUMERIC_TYPES = [pl.Float64, pl.Float32, pl.Int64, pl.Int32, pl.Int16, pl.Int8]
+
+# Constants for time padding strategies
+TIME_PAD_STRATEGIES = {
+    "nat": pd.NaT,  # Use NaT for missing time values
+    "fill_forward": "fill_forward",  # Forward-fill the missing time values
+}
+
+# List of padding schemes
+PAD_SCHEMES = ["zero", "forward_fill", "backward_fill", "mean_fill"]
+
+
+def validate_dataframe(df: SupportedBackendDataFrame) -> None:
+    """Validate the type and emptiness of the DataFrame.
+
+    This function raises exceptions if the DataFrame is not one of the supported
+    backends (Pandas, Modin, Polars) or if it is empty.
+
+    :param df: The DataFrame (Pandas, Modin, or Polars) to validate.
+    :raises TypeError: If the DataFrame type is unsupported.
+    :raises ValueError: If the DataFrame is empty.
+    """
+    if isinstance(df, (pd.DataFrame, mpd.DataFrame)):
+        if df.empty:
+            raise ValueError("Cannot operate on an empty DataFrame.")
+    elif isinstance(df, pl.DataFrame):
+        if df.is_empty():
+            raise ValueError("Cannot operate on an empty DataFrame.")
+    else:
+        raise TypeError(f"Unsupported DataFrame type: {type(df)}")
+
+
+def sort_dataframe(df: SupportedBackendDataFrame, time_col: str, ascending: bool = True) -> SupportedBackendDataFrame:
+    """Sort the DataFrame by a time column.
+
+    :param df: The DataFrame to sort (supports Pandas, Modin, or Polars).
+    :param time_col: The column name to sort by (for time-based sorting).
+    :param ascending: Whether to sort in ascending or descending order.
+    :return: The sorted DataFrame.
+
+    :raises ValueError: If the DataFrame is empty or `time_col` is missing from the DataFrame.
+    :raises TypeError: If the DataFrame type is unsupported.
+    :warning: If `time_col` is not numeric or datetime. 
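+
+    Examples
+    --------
+    .. code-block:: python
+
+        import pandas as pd
+
+        # A minimal sketch: sort a small frame by its datetime column
+        df = pd.DataFrame({"a": [2, 1], "time": pd.to_datetime(["2021-01-02", "2021-01-01"])})
+        print(sort_dataframe(df, time_col="time"))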
+ """ + validate_dataframe(df) + + # Ensure time_col is provided and exists in the DataFrame + if not time_col or time_col not in df.columns: + raise ValueError(f"Time column '{time_col}' must be provided and exist in the DataFrame.") + + # Issue a warning if time_col is not recognized as numeric or datetime + if isinstance(df, (pd.DataFrame, mpd.DataFrame)): + if not pd.api.types.is_numeric_dtype(df[time_col]) and not pd.api.types.is_datetime64_any_dtype(df[time_col]): + warnings.warn(f"Column '{time_col}' is neither numeric nor datetime. Ensure it is processed properly.") + # Sort using Pandas/Modin + return df.sort_values(by=time_col, ascending=ascending) + + elif isinstance(df, pl.DataFrame): + # Polars-specific types: handle proper type-checking without over-constraining + valid_dtypes = [pl.Int64, pl.Int32, pl.Float64, pl.Float32, pl.Datetime] + if df[time_col].dtype not in valid_dtypes: + warnings.warn(f"Column '{time_col}' in Polars DataFrame is neither numeric nor datetime.") + # Sort using Polars + return df.sort(by=time_col, descending=not ascending) + + # Shouldn't be reachable, but for safety + raise TypeError(f"Unsupported DataFrame type: {type(df)}") + + +def ensure_type_consistency( + df: SupportedBackendDataFrame, pad_df: SupportedBackendDataFrame +) -> SupportedBackendDataFrame: + """Ensure the column types of `pad_df` match the column types of `df`. + + This is crucial when padding time-series data to ensure type consistency across numeric + columns, especially in ML/DL workflows where precision must be maintained. Without ensuring + consistency, you risk precision loss or unintended data type conversions (e.g., `float32` to `float64`) + when padding data, which could affect downstream neural networks or XAI models like SHAP. + + :param df: The original DataFrame (Pandas, Modin, or Polars). + :param pad_df: The DataFrame to pad with. + :return: `pad_df` with columns cast to match the types of `df`. + + Examples + -------- + .. code-block:: python + + import pandas as pd + from temporalscope.partition.padding import ensure_type_consistency + + # Original DataFrame + df = pd.DataFrame({ + "a": pd.Series([1.0, 2.0], dtype="float32"), + "b": pd.Series([3, 4], dtype="int64") + }) + + # Padded DataFrame + pad_df = pd.DataFrame({ + "a": [0.0, 0.0], + "b": [0, 0] + }) + + # Ensure type consistency between df and pad_df + pad_df = ensure_type_consistency(df, pad_df) + print(pad_df.dtypes) + + .. note:: + - This function is especially useful when working with frameworks like TensorFlow or PyTorch, + where maintaining precision (e.g., `float32` vs. `float64`) is essential to avoid issues like + gradient explosion or vanishing during training. + - We convert Modin DataFrames to Pandas temporarily to ensure type consistency because Modin’s internal + `astype()` can sometimes cause issues when working with mixed data types or `bool` columns. After + consistency is ensured, we convert the DataFrame back to Modin to maintain backend consistency. 
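+        - For Polars, casting is applied column-by-column via
+          ``pad_df.with_columns(pad_df[col].cast(df[col].dtype))``, so dtypes such as
+          ``Float32`` survive the padding step without an intermediate conversion.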
+ """ + + # If df is a Modin DataFrame, convert to Pandas if possible + is_modin_df = False + if isinstance(df, mpd.DataFrame): + is_modin_df = True + if hasattr(df, "_to_pandas"): + df = df._to_pandas() # Convert to Pandas DataFrame + if hasattr(pad_df, "_to_pandas"): + pad_df = pad_df._to_pandas() # Same for pad_df + + # Handle Pandas DataFrame casting + if isinstance(df, pd.DataFrame): + for col in df.columns: + if col in pad_df.columns: + if df[col].dtype == "bool": + # Convert boolean columns to int64 for consistency + pad_df[col] = pad_df[col].astype("int64") + else: + # Cast column to the original dtype + pad_df[col] = pad_df[col].astype(df[col].dtype) + + # Ensure conversion back to Modin happens if pad_df was converted to Pandas + if is_modin_df and isinstance(pad_df, pd.DataFrame): + pad_df = mpd.DataFrame(pad_df) # Convert back to Modin + + return pad_df + + # Handle Polars DataFrame casting + elif isinstance(df, pl.DataFrame): + for col in df.columns: + if col in pad_df.columns: + pad_df = pad_df.with_columns(pad_df[col].cast(df[col].dtype)) + return pad_df + + # If the DataFrame type is unsupported, raise an error + else: + raise TypeError(f"Unsupported DataFrame type: {type(df)}") + + +def zero_pad( + df: SupportedBackendDataFrame, + target_len: int, + time_col: Optional[str] = None, + padding: str = "post", + ascending: bool = True, + pad_value: Union[int, float] = 0, +) -> SupportedBackendDataFrame: + """Apply padding by adding rows filled with a specified value (default is zero). + + This function only handles numeric columns. If `time_col` is provided, the DataFrame will + be sorted by that column before applying the padding scheme. + + :param df: The DataFrame (Pandas, Modin, or Polars) to pad. + :param target_len: The target number of rows after padding. + :param time_col: Optional. The time column to sort by before padding. + :param padding: Whether to pad 'pre' (before) or 'post' (after). + :param ascending: Whether to sort the data in ascending or descending order. + :param pad_value: The value to use for padding numeric columns (default is 0). + :return: A DataFrame padded with rows filled with the specified value for numeric columns. + + :raises ValueError: If target_len is less than the current DataFrame length. + :raises ValueError: If the DataFrame is empty. + :raises TypeError: If an unsupported DataFrame type is provided. + """ + validate_dataframe(df) + + # Ensure the target length is greater than the current DataFrame length + if target_len < len(df): + raise ValueError("target_len must be greater than the current DataFrame length.") + + # Sort the DataFrame if time_col is provided + df = sort_dataframe(df, time_col, ascending) if time_col else df + num_to_pad = target_len - len(df) + + # Create the padding DataFrame + if isinstance(df, (pd.DataFrame, mpd.DataFrame)): + # Handle numeric columns: Fill with pad_value + numeric_cols = df.select_dtypes(include=PANDAS_NUMERIC_TYPES).columns + pad_df = pd.DataFrame(pad_value, index=range(num_to_pad), columns=numeric_cols) + + # Check for non-numeric columns (excluding time_col) and raise warnings + non_numeric_cols = df.select_dtypes(exclude=PANDAS_NUMERIC_TYPES).columns.difference([time_col]) + if not non_numeric_cols.empty: + warnings.warn(f"Non-numeric columns found: {non_numeric_cols}. 
Padding them with NA (null).") + + # Add missing columns to pad_df + missing_cols = set(df.columns) - set(pad_df.columns) + for col in missing_cols: + if col in non_numeric_cols: + pad_df[col] = pd.NA # Fill non-numeric columns with NA (null) + elif col == time_col: + pad_df[col] = pd.NaT # Fill time column with NaT for datetime consistency + else: + pad_df[col] = pad_value + + # Ensure type consistency + pad_df = ensure_type_consistency(df, pad_df) + + elif isinstance(df, pl.DataFrame): + # Handle numeric columns: Fill with pad_value + numeric_cols = df.select(pl.col(POLARS_NUMERIC_TYPES)).columns + pad_df = pl.DataFrame({col: [pad_value] * num_to_pad for col in numeric_cols}) + + # Check for non-numeric columns (excluding time_col) and raise warnings + non_numeric_cols = [col for col in df.columns if col not in numeric_cols and col != time_col] + if non_numeric_cols: + warnings.warn(f"Non-numeric columns found: {non_numeric_cols}. Padding them with None (null).") + + # Add missing columns to pad_df + missing_cols = {col for col in df.columns if col not in pad_df.columns} + for col in missing_cols: + if col in non_numeric_cols: + pad_df = pad_df.with_columns( + pl.lit(None).cast(df[col].dtype).alias(col) + ) # Fill non-numeric columns with None + elif col == time_col: + pad_df = pad_df.with_columns( + pl.lit(None).cast(pl.Datetime).alias(col) + ) # Ensure time column is datetime + else: + pad_df = pad_df.with_columns(pl.lit(pad_value).alias(col)) + + # Ensure type consistency + pad_df = ensure_type_consistency(df, pad_df) + + # Concatenate padding DataFrame + if padding == "post": + if isinstance(df, pd.DataFrame): + return pd.concat([df, pad_df], ignore_index=True) + elif isinstance(df, mpd.DataFrame): + return mpd.concat([df, pad_df], ignore_index=True) + elif isinstance(df, pl.DataFrame): + return df.vstack(pad_df) + elif padding == "pre": + if isinstance(df, pd.DataFrame): + return pd.concat([pad_df, df], ignore_index=True) + elif isinstance(df, mpd.DataFrame): + return mpd.concat([pad_df, df], ignore_index=True) + elif isinstance(df, pl.DataFrame): + return pad_df.vstack(df) + else: + raise ValueError(f"Invalid padding option: {padding}. Use 'pre' or 'post'.") + + return df + + +def forward_fill_pad( + df: SupportedBackendDataFrame, + target_len: int, + end: int, + reverse: bool, + padding: str = "post", + time_col: Optional[str] = None, + ascending: bool = True, +) -> SupportedBackendDataFrame: + """Apply forward-fill padding by repeating the last or first row. Data will be sorted by `time_col` if provided. + + :param df: The DataFrame (Pandas, Modin, or Polars) to pad. + :param target_len: The target number of rows after padding. + :param end: The index indicating the last valid row for padding. + :param reverse: If True, fill from the start of the DataFrame. + :param padding: Whether to pad 'pre' (before) or 'post' (after). Must be one of ['pre', 'post']. + :param time_col: Optional. The time column to sort by before padding. + :param ascending: Whether to sort the data in ascending or descending order. + :return: A DataFrame padded by forward fill. + + :raises ValueError: If target_len is less than the current DataFrame length. + :raises ValueError: If the DataFrame is empty. + :raises ValueError: If `padding` is not one of ['pre', 'post']. + :raises TypeError: If an unsupported DataFrame type is provided. + :raises ValueError: If non-time columns are not numeric. + + Examples + -------- + .. 
code-block:: python
+
+        import pandas as pd
+        from temporalscope.partition.padding import forward_fill_pad
+
+        df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "time": pd.date_range("2021-01-01", periods=2)})
+        padded_df = forward_fill_pad(df, target_len=5, end=len(df), reverse=False, time_col="time")
+        print(padded_df)
+
+    .. note::
+        Forward-fill padding is useful in scenarios where missing data is best approximated by the last known
+        valid value, such as financial data or sensor readings in IoT applications.
+    """
+
+    # Validate the padding option
+    if padding not in ["pre", "post"]:
+        raise ValueError(f"Invalid padding option: {padding}. Use 'pre' or 'post'.")
+
+    validate_dataframe(df)
+
+    # Ensure the target length is greater than the current DataFrame length
+    if target_len < len(df):
+        raise ValueError("target_len must be greater than the current DataFrame length.")
+
+    # Sort the DataFrame by the time column if provided (includes warning for non-numeric/datetime time_col)
+    df = sort_dataframe(df, time_col, ascending) if time_col else df
+
+    # Validate that all non-time columns are numeric, raise warning if not
+    if isinstance(df, (pd.DataFrame, mpd.DataFrame)):
+        non_numeric_cols = df.select_dtypes(exclude=PANDAS_NUMERIC_TYPES).columns.difference([time_col])
+        if not non_numeric_cols.empty:
+            warnings.warn(f"Non-numeric columns found: {non_numeric_cols}. Only numeric columns will be padded.")
+    elif isinstance(df, pl.DataFrame):
+        # Compare column dtypes (not column names) against the numeric dtype whitelist
+        non_numeric_cols = [col for col in df.columns if df[col].dtype not in POLARS_NUMERIC_TYPES and col != time_col]
+        if non_numeric_cols:
+            warnings.warn(f"Non-numeric columns found: {non_numeric_cols}. Only numeric columns will be padded.")
+
+    # Calculate how many rows to pad
+    num_to_pad = target_len - len(df)
+
+    # Create the padding DataFrame by repeating the last or first row
+    if isinstance(df, pd.DataFrame):
+        pad_row = df.iloc[[end - 1]] if not reverse else df.iloc[[0]]
+        pad_df = pd.concat([pad_row] * num_to_pad, ignore_index=True)
+    elif isinstance(df, mpd.DataFrame):
+        pad_row = df.iloc[[end - 1]] if not reverse else df.iloc[[0]]
+        pad_df = mpd.concat([pad_row] * num_to_pad, ignore_index=True)  # Use Modin's concat
+    elif isinstance(df, pl.DataFrame):
+        pad_row = df.slice(end - 1, 1) if not reverse else df.slice(0, 1)
+        pad_df = pl.concat([pad_row] * num_to_pad)
+
+    # Ensure type consistency after padding
+    pad_df = ensure_type_consistency(df, pad_df)
+
+    # Append or prepend the padding DataFrame to the original DataFrame
+    if padding == "post":
+        if isinstance(df, pd.DataFrame):
+            return pd.concat([df, pad_df], ignore_index=True)
+        elif isinstance(df, mpd.DataFrame):
+            return mpd.concat([df, pad_df], ignore_index=True)  # Use Modin's concat
+        elif isinstance(df, pl.DataFrame):
+            return df.vstack(pad_df)
+    elif padding == "pre":
+        if isinstance(df, pd.DataFrame):
+            return pd.concat([pad_df, df], ignore_index=True)
+        elif isinstance(df, mpd.DataFrame):
+            return mpd.concat([pad_df, df], ignore_index=True)  # Use Modin's concat
+        elif isinstance(df, pl.DataFrame):
+            return pad_df.vstack(df)
+
+    return df
+
+
+def backward_fill_pad(
+    df: SupportedBackendDataFrame,
+    target_len: int,
+    end: int,
+    reverse: bool,
+    padding: str = "post",
+    time_col: Optional[str] = None,
+    ascending: bool = True,
+) -> SupportedBackendDataFrame:
+    """Apply backward-fill padding by repeating the first or last row.
+
+    Data will be sorted by `time_col` if provided.
+
+    :param df: The DataFrame (Pandas, Modin, or Polars) to pad. 
+    :param target_len: The target number of rows after padding.
+    :param end: The index indicating the last valid row for padding.
+    :param reverse: If True, fill from the start of the DataFrame.
+    :param padding: Whether to pad 'pre' (before) or 'post' (after).
+    :param time_col: Optional. The time column to sort by before padding.
+    :param ascending: Whether to sort the data in ascending or descending order.
+    :return: A DataFrame padded by backward fill.
+
+    :raises ValueError: If target_len is less than the current DataFrame length.
+    :raises ValueError: If the DataFrame is empty.
+    :raises ValueError: If `padding` is not one of ['pre', 'post'].
+    :raises TypeError: If an unsupported DataFrame type is provided.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        import pandas as pd
+        from temporalscope.partition.padding import backward_fill_pad
+
+        df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "time": pd.date_range("2021-01-01", periods=2)})
+        padded_df = backward_fill_pad(df, target_len=5, end=len(df), reverse=False, time_col="time")
+        print(padded_df)
+
+    .. note::
+        Backward-fill padding repeats the first valid observation (or the last, when `reverse=True`) to
+        fill the padded region. It is useful when future values are unknown and the earliest known value
+        is a reasonable stand-in, for example in predictive-modeling pipelines that need fixed-length inputs.
+    """
+    validate_dataframe(df)
+
+    # Ensure the target length is greater than the current DataFrame length
+    if target_len < len(df):
+        raise ValueError("target_len must be greater than the current DataFrame length.")
+
+    # Sort the DataFrame by the time column if provided (includes warning for non-numeric/datetime time_col)
+    df = sort_dataframe(df, time_col, ascending) if time_col else df
+
+    # Validate that all non-time columns are numeric, warn if not
+    if isinstance(df, (pd.DataFrame, mpd.DataFrame)):
+        non_numeric_cols = df.select_dtypes(exclude=PANDAS_NUMERIC_TYPES).columns.difference([time_col])
+        if not non_numeric_cols.empty:
+            warnings.warn(f"Non-numeric columns found: {non_numeric_cols}. Only numeric columns will be padded.")
+    elif isinstance(df, pl.DataFrame):
+        # Compare column dtypes (not column names) against the numeric Polars dtypes
+        non_numeric_cols = [col for col in df.columns if df[col].dtype not in POLARS_NUMERIC_TYPES and col != time_col]
+        if non_numeric_cols:
+            warnings.warn(f"Non-numeric columns found: {non_numeric_cols}.
Only numeric columns will be padded.") + + # Calculate how many rows to pad + num_to_pad = target_len - len(df) + + # Create the padding DataFrame by repeating the first or last row + if isinstance(df, pd.DataFrame): + pad_row = df.iloc[[0]] if not reverse else df.iloc[[end - 1]] + pad_df = pd.concat([pad_row] * num_to_pad, ignore_index=True) + elif isinstance(df, mpd.DataFrame): + pad_row = df.iloc[[0]] if not reverse else df.iloc[[end - 1]] + pad_df = mpd.concat([pad_row] * num_to_pad, ignore_index=True) # Use Modin's concat + elif isinstance(df, pl.DataFrame): + pad_row = df.slice(0, 1) if not reverse else df.slice(end - 1, 1) + pad_df = pl.concat([pad_row] * num_to_pad) + + # Ensure type consistency after padding + pad_df = ensure_type_consistency(df, pad_df) + + # Append or prepend the padding DataFrame to the original DataFrame + if padding == "post": + if isinstance(df, pd.DataFrame): + return pd.concat([df, pad_df], ignore_index=True) + elif isinstance(df, mpd.DataFrame): + return mpd.concat([df, pad_df], ignore_index=True) # Use Modin's concat + elif isinstance(df, pl.DataFrame): + return df.vstack(pad_df) + elif padding == "pre": + if isinstance(df, pd.DataFrame): + return pd.concat([pad_df, df], ignore_index=True) + elif isinstance(df, mpd.DataFrame): + return mpd.concat([pad_df, df], ignore_index=True) # Use Modin's concat + elif isinstance(df, pl.DataFrame): + return pad_df.vstack(df) + else: + raise ValueError(f"Invalid padding option: {padding}. Use 'pre' or 'post'.") + + # This line ensures that MyPy sees a return in all cases, although it's unreachable. + assert False, "This should never be reached" + + +def mean_fill_pad( + df: SupportedBackendDataFrame, + target_len: int, + end: int, + reverse: bool, + padding: str = "post", + time_col: Optional[str] = None, + ascending: bool = True, +) -> SupportedBackendDataFrame: + """Apply mean-fill padding by filling numeric columns with their mean values. + + Data will be sorted by `time_col` if provided. + + :param df: The DataFrame (Pandas, Modin, or Polars) to pad. + :param target_len: The target number of rows after padding. + :param end: The index indicating the last valid row for padding. + :param reverse: If True, fill from the start of the DataFrame. + :param padding: Whether to pad 'pre' (before) or 'post' (after). + :param time_col: Optional. The time column to sort by before padding. + :param ascending: Whether to sort the data in ascending or descending order. + :return: A DataFrame padded by mean fill. + + :raises ValueError: If target_len is less than the current DataFrame length. + :raises ValueError: If the DataFrame is empty. + :raises TypeError: If an unsupported DataFrame type is provided. + + Examples + -------- + .. code-block:: python + + import pandas as pd + from temporalscope.partition.padding import mean_fill_pad + + df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "time": pd.date_range("2021-01-01", periods=2)}) + padded_df = mean_fill_pad(df, target_len=5, end=len(df), reverse=False, time_col="time") + print(padded_df) + + .. note:: + Mean-fill padding is useful when you want to fill gaps in the data with the mean of the numeric columns. + It is commonly used in time-series forecasting and analytics when you want to smooth over missing values. 
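+
+    A minimal sketch of the ``padding="pre"`` variant (values are illustrative and
+    reuse the frame from the example above): the per-column means are prepended
+    instead of appended.
+
+    .. code-block:: python
+
+        # With a=[1, 2] and b=[3, 4], the mean rows hold 1.5 and 3.5.
+        padded_pre = mean_fill_pad(df, target_len=4, end=len(df), reverse=False, padding="pre", time_col="time")
+        # The first two rows now contain the column means; the original rows follow.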
+    """
+    validate_dataframe(df)
+
+    # Ensure the target length is greater than the current DataFrame length
+    if target_len < len(df):
+        raise ValueError("target_len must be greater than the current DataFrame length.")
+
+    # Sort the DataFrame by the time column if provided (includes warning for non-numeric/datetime time_col)
+    df = sort_dataframe(df, time_col, ascending) if time_col else df
+
+    # Validate that all non-time columns are numeric, warn if not
+    if isinstance(df, (pd.DataFrame, mpd.DataFrame)):
+        non_numeric_cols = df.select_dtypes(exclude=PANDAS_NUMERIC_TYPES).columns.difference([time_col])
+        if not non_numeric_cols.empty:
+            warnings.warn(f"Non-numeric columns found: {non_numeric_cols}. Only numeric columns will be padded.")
+    elif isinstance(df, pl.DataFrame):
+        # Compare column dtypes (not column names) against the numeric Polars dtypes
+        non_numeric_cols = [col for col in df.columns if df[col].dtype not in POLARS_NUMERIC_TYPES and col != time_col]
+        if non_numeric_cols:
+            warnings.warn(f"Non-numeric columns found: {non_numeric_cols}. Only numeric columns will be padded.")
+
+    num_to_pad = target_len - len(df)
+
+    # Handle Pandas and Modin DataFrames
+    if isinstance(df, (pd.DataFrame, mpd.DataFrame)):
+        numeric_cols = df.select_dtypes(include=PANDAS_NUMERIC_TYPES).columns
+        mean_values = df[numeric_cols].mean()
+        pad_df = pd.DataFrame([mean_values] * num_to_pad, columns=numeric_cols)
+
+        # Handle non-numeric columns (nearest row padding)
+        non_numeric_cols = df.select_dtypes(exclude=PANDAS_NUMERIC_TYPES).columns
+        if not non_numeric_cols.empty:
+            nearest_row = df.iloc[[end - 1]] if not reverse else df.iloc[[0]]
+            for col in non_numeric_cols:
+                pad_df[col] = nearest_row[col].values[0]
+
+        # Ensure column types match
+        pad_df = ensure_type_consistency(df, pad_df)
+
+        # Concatenate the DataFrame
+        if isinstance(df, mpd.DataFrame):
+            return (
+                mpd.concat([df, mpd.DataFrame(pad_df)], ignore_index=True)
+                if padding == "post"
+                else mpd.concat([mpd.DataFrame(pad_df), df], ignore_index=True)
+            )
+        else:
+            return (
+                pd.concat([df, pad_df], ignore_index=True)
+                if padding == "post"
+                else pd.concat([pad_df, df], ignore_index=True)
+            )
+
+    # Handle Polars DataFrame
+    elif isinstance(df, pl.DataFrame):
+        numeric_cols = df.select(pl.col(POLARS_NUMERIC_TYPES)).columns
+        mean_values = {col: df[col].mean() for col in numeric_cols}
+        pad_df = pl.DataFrame({col: [mean_values[col]] * num_to_pad for col in numeric_cols})
+
+        # Handle non-numeric columns (nearest row padding)
+        non_numeric_cols = [col for col in df.columns if col not in numeric_cols]
+        if non_numeric_cols:
+            nearest_row = df.slice(end - 1, 1) if not reverse else df.slice(0, 1)
+            for col in non_numeric_cols:
+                pad_df = pad_df.with_columns(pl.lit(nearest_row[col][0]).alias(col))
+
+        # Ensure column types match
+        pad_df = ensure_type_consistency(df, pad_df)
+
+        # Ensure complete padding
+        if len(pad_df) != num_to_pad:
+            raise ValueError(f"Padding mismatch: expected {num_to_pad}, but got {len(pad_df)}")
+
+        # Return padded Polars DataFrame
+        return df.vstack(pad_df) if padding == "post" else pad_df.vstack(df)
+
+    else:
+        raise TypeError(f"Unsupported DataFrame type: {type(df)}")
+
+    # This assertion satisfies MyPy's requirement that every branch returns, but should not actually be reachable.
+    assert False, "This should never be reached"
+
+
+def pad_dataframe(
+    df: SupportedBackendDataFrame,
+    target_len: int,
+    mode: str,
+    padding: str = "post",
+    time_col: Optional[str] = None,
+    ascending: bool = True,
+    pad_value: Union[int, float, None] = None,
+    end: Optional[int] = None,
+    reverse: bool = False,
+) -> SupportedBackendDataFrame:
+    """Apply a padding scheme to a DataFrame, ensuring it's sorted by `time_col` if provided.
+
+    :param df: The DataFrame (Pandas, Modin, or Polars) to pad.
+    :param target_len: Target number of rows after padding.
+    :param mode: Padding mode to use. Options are: "zero", "forward_fill", "backward_fill", "mean_fill".
+    :param padding: Direction to apply padding ('pre' or 'post'). Default is 'post'.
+    :param time_col: Optional column name to sort by (for time-based sorting).
+    :param ascending: Whether to sort data in ascending order (default is True).
+    :param pad_value: Custom value to use for padding in the "zero" mode. Default is None. Ignored for other modes.
+    :param end: The index indicating the last valid row for padding. Required for forward_fill, backward_fill, and mean_fill modes.
+    :param reverse: If True, fill from the start of the DataFrame. Required for forward_fill, backward_fill, and mean_fill modes.
+    :return: The padded DataFrame.
+
+    :raises ValueError: If mode is unknown, if `target_len` is less than the current DataFrame length, or if DataFrame is empty.
+    :raises TypeError: If an unsupported DataFrame type is provided.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "time": pd.date_range("20210101", periods=2)})
+        >>> pad_dataframe(df, target_len=4, mode="zero", pad_value=0, time_col="time")
+           a  b       time
+        0  1  3 2021-01-01
+        1  2  4 2021-01-02
+        2  0  0        NaT
+        3  0  0        NaT
+
+        >>> pad_dataframe(df, target_len=4, mode="mean_fill", time_col="time", end=len(df))
+             a    b       time
+        0  1.0  3.0 2021-01-01
+        1  2.0  4.0 2021-01-02
+        2  1.5  3.5 2021-01-02
+        3  1.5  3.5 2021-01-02
+    """
+    validate_dataframe(df)
+
+    if target_len < len(df):
+        raise ValueError("target_len must be greater than the current DataFrame length.")
+
+    # Ensure the mode is valid
+    if mode not in PAD_SCHEMES:
+        raise ValueError(f"Unknown padding mode: {mode}.
Available modes: {', '.join(PAD_SCHEMES)}") + + # Sort the DataFrame by the time column if provided (includes warning for non-numeric/datetime time_col) + df = sort_dataframe(df, time_col, ascending) if time_col else df + + # Handle zero padding with a default pad_value of 0 + if mode == "zero": + if pad_value is None: + pad_value = 0 # Default pad_value for zero padding + return zero_pad(df, target_len, time_col=time_col, padding=padding, pad_value=pad_value) + + # Ensure `end` is not None for other modes + if end is None: + raise ValueError(f"`end` parameter is required for {mode} mode.") + + # Dynamically select and call the padding function + if mode == "forward_fill": + return forward_fill_pad( + df, target_len, end=end, reverse=reverse, time_col=time_col, padding=padding, ascending=ascending + ) + elif mode == "backward_fill": + return backward_fill_pad( + df, target_len, end=end, reverse=reverse, time_col=time_col, padding=padding, ascending=ascending + ) + elif mode == "mean_fill": + return mean_fill_pad( + df, target_len, end=end, reverse=reverse, time_col=time_col, padding=padding, ascending=ascending + ) + + # This should never be reached, but included as a safety net + raise ValueError(f"Invalid padding mode: {mode}") diff --git a/src/temporalscope/partition/partition_validators.py b/src/temporalscope/partition/partition_validators.py index bc61e86..fca36de 100644 --- a/src/temporalscope/partition/partition_validators.py +++ b/src/temporalscope/partition/partition_validators.py @@ -33,7 +33,7 @@ import pandas as pd import polars as pl -from temporalscope.conf import validate_backend +from temporalscope.core.core_utils import validate_backend from temporalscope.core.core_utils import SupportedBackendDataFrame PandasLike = TypeVar("PandasLike", pd.DataFrame, mpd.DataFrame) diff --git a/src/temporalscope/partition/sliding_window.py b/src/temporalscope/partition/sliding_window.py index e124795..d7c58ed 100644 --- a/src/temporalscope/partition/sliding_window.py +++ b/src/temporalscope/partition/sliding_window.py @@ -30,6 +30,18 @@ The `SlidingWindowPartitioner` is intended for universal models, which assume flat partitioning across all entities. Users are responsible for preprocessing steps such as deduplication or transforming `time_col` to numerical features. + +.. seealso:: + + 1. Gu, X., See, K.W., Wang, Y., Zhao, L. and Pu, W., 2021. The sliding window and SHAP theory—an improved system with a long short-term memory network model for state of charge prediction in electric vehicle application. Energies, 14(12), p.3692. + 2. Pham, N.T., Nguyen, S.D., Nguyen, V.S.T., Pham, B.N.H. and Dang, D.N.M., 2023. Speech emotion recognition using overlapping sliding window and Shapley additive explainable deep neural network. Journal of Information and Telecommunication, 7(3), pp.317-335. + 3. Van Zyl, C., Ye, X. and Naidoo, R., 2024. Harnessing eXplainable artificial intelligence for feature selection in time series energy forecasting: A comparative analysis of Grad-CAM and SHAP. Applied Energy, 353, p.122079. + 4. Bi, Y., Xiang, D., Ge, Z., Li, F., Jia, C. and Song, J., 2020. An interpretable prediction model for identifying N7-methylguanosine sites based on XGBoost and SHAP. Molecular Therapy-Nucleic Acids, 22, pp.362-372. + 5. Zimmermann, B. and Boussard, M., 2022, May. Improving drift detection by monitoring shapley loss values. In International Conference on Pattern Recognition and Artificial Intelligence (pp. 455-466). Cham: Springer International Publishing. + 6. Li, B., Balestra, C. 
and Müller, E., 2022. Enabling the visualization of distributional shift using shapley values. In NeurIPS 2022 Workshop on Distribution Shifts: Connecting Methods and Applications.
+    7. Seiffer, C., Ziekow, H., Schreier, U. and Gerling, A., 2021. Detection of concept drift in manufacturing data with SHAP values to improve error prediction. DATA ANALYTICS, pp.3-7.
+    8. Haug, J., Braun, A., Zürn, S. and Kasneci, G., 2022, October. Change detection for local explainability in evolving data streams. In Proceedings of the 31st ACM International Conference on Information & Knowledge Management (pp. 706-716).
+    9. Zhao, D. and Koh, Y.S., 2020. Feature drift detection in evolving data streams. In Database and Expert Systems Applications: 31st International Conference, DEXA 2020, Bratislava, Slovakia, September 14–17, 2020, Proceedings, Part II 31 (pp. 335-349). Springer International Publishing.
 """
 
 import itertools
@@ -54,6 +66,12 @@
     check_sample_size,
 )
 
+from temporalscope.partition.padding import PAD_SCHEMES
+from temporalscope.partition.padding import zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad
+
+# Precision constant for floating-point comparisons
+PRECISION = 1e-6
+
 
 class SlidingWindowPartitioner(TemporalPartitionerProtocol):
     """Sliding Window Partitioner for dividing time series data into contiguous, non-overlapping partitions.
@@ -83,6 +101,41 @@ class SlidingWindowPartitioner(TemporalPartitionerProtocol):
     The class uses a generator pattern for `fit` and `transform` methods to yield partition indices and data
     slices one at a time, promoting memory efficiency and lazy loading.
 
+    :param tf: The TimeFrame object containing the data to be partitioned.
+    :type tf: TimeFrame
+    :param num_partitions: The desired number of partitions to create. If `window_size` is specified, this is ignored.
+    :type num_partitions: Optional[int]
+    :param window_size: The size of each partition (number of rows). If specified, it takes precedence over `num_partitions`.
+    :type window_size: Optional[int]
+    :param stride: The number of rows to skip between the start points of consecutive partitions.
+        A stride larger than the window size creates gaps, while a stride equal to the window size results in no gaps.
+    :type stride: int
+    :param reverse: Whether the sliding window should move in reverse (from the end to the start of the dataset).
+        If set to True, the window slides in reverse; if False (default), it slides forward.
+    :type reverse: bool
+    :param truncate: Whether to truncate the last partition if its size is smaller than the window size.
+        Note: For deep learning models, truncation can lead to varying input sizes and should be avoided.
+    :type truncate: bool
+    :param train_pct: Percentage of data allocated for training within each partition. Must be provided.
+    :type train_pct: float
+    :param test_pct: Percentage of data allocated for testing within each partition. Optional.
+    :type test_pct: Optional[float]
+    :param val_pct: Optional percentage of data allocated for validation within each partition. If provided, the sum of `train_pct`,
+        `test_pct`, and `val_pct` must equal 1.0.
+    :type val_pct: Optional[float]
+    :param enable_warnings: Enable warnings for uneven partition sizes.
+    :type enable_warnings: bool
+    :param verbose: If set to True, print partitioning details.
+    :type verbose: bool
+
+    :raises ValueError:
+        - If neither `window_size` nor `num_partitions` is provided or valid.
+        - If `stride` is not a positive integer.
+ - If `train_pct`, `test_pct`, or `val_pct` are not within the range [0, 1]. + - If `train_pct`, `test_pct`, and `val_pct` do not sum to 1.0. + - If the dataset cannot be sorted or retrieved properly from the TimeFrame. + - If any required data is missing or invalid during the partitioning process. + Example Usage: -------------- .. code-block:: python @@ -151,43 +204,10 @@ class SlidingWindowPartitioner(TemporalPartitionerProtocol): 2021-01-04 3 The sliding window moves across the entire dataset, maintaining the temporal order within each partition. - - :param tf: The TimeFrame object containing the data to be partitioned. - :type tf: TimeFrame - :param num_partitions: The desired number of partitions to create. If `window_size` is specified, this is ignored. - :type num_partitions: Optional[int] - :param window_size: The size of each partition (number of rows). If specified, it takes precedence over `num_partitions`. - :type window_size: Optional[int] - :param stride: The number of rows to skip between the start points of consecutive partitions. - A stride larger than the window size creates gaps, while a stride equal to the window size results in no gaps. - :type stride: int - :param reverse: Whether the sliding window should move in reverse (from the end to the start of the dataset). - If set to True, the window slides in reverse; if False (default), it slides forward. - :type reverse: bool - :param truncate: Whether to truncate the last partition if its size is smaller than the window size. - Note: For deep learning models, truncation can lead to varying input sizes and should be avoided. - :type truncate: bool - :param train_pct: Percentage of data allocated for training within each partition. Must be provided. - :type train_pct: float - :param test_pct: Percentage of data allocated for testing within each partition. Optional. - :type test_pct: Optional[float] - :param val_pct: Optional percentage of data allocated for validation within each partition. If provided, the sum of `train_pct`, - `test_pct`, and `val_pct` must equal 1.0. - :type val_pct: Optional[float] - :param enable_warnings: Enable warnings for uneven partition sizes. - :type enable_warnings: bool - :param verbose: If set to True, print partitioning details. - :type verbose: bool - - :raises ValueError: - - If neither `window_size` nor `num_partitions` is provided or valid. - - If `stride` is not a positive integer. - - If `train_pct`, `test_pct`, or `val_pct` are not within the range [0, 1]. - - If `train_pct`, `test_pct`, and `val_pct` do not sum to 1.0. - - If the dataset cannot be sorted or retrieved properly from the TimeFrame. - - If any required data is missing or invalid during the partitioning process. """ + DEFAULT_PAD_SCHEME = "forward_fill" # Define the default padding scheme + def __init__( self, tf: TimeFrame, @@ -197,14 +217,15 @@ def __init__( reverse: bool = False, truncate: bool = True, train_pct: float = 0.7, - test_pct: Optional[float] = 0.2, + test_pct: Optional[float] = 0.3, val_pct: Optional[float] = None, enable_warnings: bool = False, verbose: bool = False, + pad_scheme: str = DEFAULT_PAD_SCHEME, ): """Initialize the SlidingWindowPartitioner with the given parameters. - :param tf: TimeFrame object to partition. + :param tf: TimeFrame object to partition. All columns except `time_col` must be numeric. :param num_partitions: Number of partitions to create (ignored if `window_size` is provided). :param window_size: Size of each partition. :param stride: Number of rows to skip between partitions. 
@@ -215,10 +236,23 @@ def __init__(
         :param val_pct: Percentage of data allocated for validation.
         :param enable_warnings: Enable warnings for uneven partition sizes.
         :param verbose: Enable verbose output.
-        :raises ValueError: If input parameters are invalid.
+        :param pad_scheme: The padding scheme to use for filling partitions. Defaults to 'forward_fill'.
+        :raises ValueError: If input parameters are invalid or columns (except `time_col`) are not numeric.
         """
-        # Validate the backend
-        validate_backend(tf.backend)
+
+        # Validate the backend and pad scheme
+        validate_backend(tf.dataframe_backend)
+        if pad_scheme not in PAD_SCHEMES:
+            raise ValueError(f"Invalid pad_scheme: {pad_scheme}. Supported schemes: {PAD_SCHEMES}")
+
+        # Check if all columns except `time_col` are numeric. Polars dtypes are not
+        # understood by pandas' dtype introspection, so the check is backend-aware.
+        non_time_cols = [col for col in tf.get_data().columns if col != tf.time_col]
+        if tf.dataframe_backend == BACKEND_POLARS:
+            non_numeric_cols = [col for col in non_time_cols if not tf.get_data()[col].dtype.is_numeric()]
+        else:
+            non_numeric_cols = [col for col in non_time_cols if not pd.api.types.is_numeric_dtype(tf.get_data()[col])]
+
+        if non_numeric_cols:
+            raise ValueError(
+                f"All columns except `time_col` must be numeric. Non-numeric columns found: {non_numeric_cols}"
+            )
 
         # Get the number of rows from the TimeFrame object
         num_rows = tf.get_data().shape[0]
@@ -252,6 +286,7 @@ def __init__(
         self.reverse = reverse
         self.truncate = truncate
         self.verbose = verbose
+        self.pad_scheme = pad_scheme  # Assign the chosen padding scheme
 
         # Precompute percentages
         self.train_pct, self.test_pct, self.val_pct = self._precompute_percentages(train_pct, test_pct, val_pct)
@@ -268,12 +303,12 @@
         train_pct: float,
         test_pct: Optional[float],
         val_pct: Optional[float],
-        precision: float = 1e-6,  # Default precision for floating-point comparisons
+        precision: float = PRECISION,  # Now using the precision constant
    ) -> Tuple[float, float, float]:
        """Precompute and validate train, test, and validation percentages.

        This function ensures that the sum of train, test, and validation percentages equals 1.0.
-        If `test_pct` is not provided, it will be set to the remaining percentage after the train percentage.
+        If test_pct is not provided, it will be set to the remaining percentage after the train percentage.

        :param train_pct: Percentage of data allocated for training.
:type train_pct: float @@ -289,75 +324,35 @@ def _precompute_percentages( """ # Validate the train percentage if not (0 <= train_pct <= 1): - raise ValueError("`train_pct` must be between 0 and 1.") + raise ValueError("train_pct must be between 0 and 1.") - # Ensure test_pct and val_pct are set correctly + # Handle test_pct and val_pct cases explicitly if test_pct is None and val_pct is None: test_pct = 1.0 - train_pct val_pct = 0.0 elif test_pct is not None and val_pct is None: if not (0 <= test_pct <= 1): - raise ValueError("`test_pct` must be between 0 and 1.") + raise ValueError("test_pct must be between 0 and 1.") val_pct = 1.0 - train_pct - test_pct elif test_pct is None and val_pct is not None: if not (0 <= val_pct <= 1): - raise ValueError("`val_pct` must be between 0 and 1.") + raise ValueError("val_pct must be between 0 and 1.") test_pct = 1.0 - train_pct - val_pct else: - # Both test_pct and val_pct are provided, ensure they are valid before comparison - if test_pct is None or val_pct is None: - raise ValueError("`test_pct` and `val_pct` cannot be None.") + # Both test_pct and val_pct are provided, ensure they are valid if not (0 <= test_pct <= 1): - raise ValueError("`test_pct` must be between 0 and 1.") + raise ValueError("test_pct must be between 0 and 1.") if not (0 <= val_pct <= 1): - raise ValueError("`val_pct` must be between 0 and 1.") + raise ValueError("val_pct must be between 0 and 1.") - # Ensure they sum to 1.0 + # Ensure they sum to 1.0, handling floating-point imprecision with precision constant total_pct = train_pct + (test_pct or 0) + (val_pct or 0) - if not (abs(total_pct - 1.0) < precision): # Use the precision parameter here + if not (abs(total_pct - 1.0) < precision): # Compare with the precision constant raise ValueError("Train, test, and validation percentages must sum to 1.0.") # Ensure test_pct and val_pct are float types, not None return train_pct, float(test_pct), float(val_pct) - def _pad_partition( - self, - window_size: int, - end: int, - reverse: bool, - ) -> SupportedBackendDataFrame: - """Pad the partition to the required window size by repeating the last row. - - This function ensures that the partition is padded to the full window size by repeating the last row of the - partition until the desired window size is achieved. - - :param window_size: The target window size to pad the partition to. - :type window_size: int - :param end: The index indicating the end of the current partition. - :type end: int - :param reverse: If True, the padding is added to the start; otherwise, it's added at the end. - :type reverse: bool - :return: A DataFrame padded to the specified window size. - :rtype: SupportedBackendDataFrame - """ - df = self.tf.get_data() # Get the DataFrame from TimeFrame - num_to_pad = window_size - df.shape[0] - - if num_to_pad <= 0: - return df # No need to pad - - # Select the row to use for padding (whether reverse or not) - pad_row = df.iloc[[end - 1]] if not reverse else df.iloc[[0]] - - # Create padding by repeating the selected row for the required number of times - pad_rows = pd.concat([pad_row] * num_to_pad, ignore_index=True) - - # Concatenate the padding with the original DataFrame - if reverse: - return pd.concat([pad_rows, df], ignore_index=True) - else: - return pd.concat([df, pad_rows], ignore_index=True) - def _fit_pandas_modin(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: """Fit method for partitioning using TimeFrame data. 
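+
+        A minimal sketch of the yield structure (the partition label, index values,
+        and `partitioner` instance below are illustrative, not actual output):
+
+        .. code-block:: python
+
+            # Each yielded item maps a partition label to (start, end) index ranges.
+            for partition_indices in partitioner.fit():
+                print(partition_indices)
+                # e.g. {"partition_1": {"full": (0, 5), "train": (0, 3), "test": (3, 5)}}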
@@ -443,7 +438,8 @@ def _transform_pandas_modin(self) -> Iterator[Dict[str, Dict[str, Union[pd.DataF the partition indices generated by the `fit` method. It processes each partition and splits it into train, test, and optionally validation sets. - If a partition's size is smaller than the specified `window_size`, padding is applied to ensure + If a partition's size is smaller than the specified `window_size`, padding is applied using the selected + padding scheme (`zero_pad`, `forward_fill_pad`, `backward_fill_pad`, or `mean_fill_pad`) to ensure uniform size across partitions, unless `truncate` is set to True. :return: Iterator yielding partitioned DataFrame slices for Pandas/Modin backends. @@ -477,13 +473,12 @@ def _transform_pandas_modin(self) -> Iterator[Dict[str, Dict[str, Union[pd.DataF Notes ----- - - Padding is applied when the size of a partition is smaller than the `window_size`, unless truncation is enabled. - - Ensure that the input DataFrame is not empty to avoid runtime errors. - - Performance Considerations: - --------------------------- - - For very large datasets, the padding process may increase memory usage. Consider using Modin when handling - large datasets to take advantage of distributed processing. + .. note:: + - Padding is applied when the size of a partition is smaller than the `window_size`, unless truncation is enabled. + - The padding scheme is determined by the `pad_scheme` parameter in the constructor (e.g., 'zero', 'forward_fill'). + - Ensure that the input DataFrame is not empty to avoid runtime errors. + - For very large datasets, the padding process may increase memory usage. Consider using Modin when handling + large datasets to take advantage of distributed processing. """ partition_count = 1 df = self.tf.get_data() # Fetch the data from TimeFrame @@ -500,15 +495,36 @@ def _transform_pandas_modin(self) -> Iterator[Dict[str, Dict[str, Union[pd.DataF if start is not None and end is not None } - # Apply padding if partition size is smaller than window_size and truncate is False + # Check if padding is needed (partition size is smaller than window_size and truncate is False) if partition_dict["full"][1] - partition_dict["full"][0] < self.window_size and not self.truncate: - partitioned_data[key]["full"] = self._pad_partition( - self.window_size, - partition_dict["full"][1], - self.reverse, - ) - yield partitioned_data - + # Apply the chosen padding scheme + if self.pad_scheme == "zero": + partitioned_data[key]["full"] = zero_pad( + partitioned_data[key]["full"], target_len=self.window_size + ) + elif self.pad_scheme == "forward_fill": + partitioned_data[key]["full"] = forward_fill_pad( + partitioned_data[key]["full"], + target_len=self.window_size, + end=len(partitioned_data[key]["full"]), + reverse=False, + ) + elif self.pad_scheme == "backward_fill": + partitioned_data[key]["full"] = backward_fill_pad( + partitioned_data[key]["full"], + target_len=self.window_size, + end=len(partitioned_data[key]["full"]), + reverse=False, + ) + elif self.pad_scheme == "mean_fill": + partitioned_data[key]["full"] = mean_fill_pad( + partitioned_data[key]["full"], + target_len=self.window_size, + end=len(partitioned_data[key]["full"]), + reverse=False, + ) + + yield partitioned_data partition_count += 1 def _transform_polars(self) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: @@ -516,7 +532,8 @@ def _transform_polars(self) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: This method generates partitioned data slices for the Polars backend, yielding the data slices 
corresponding to the partition indices generated by the `fit` method. If the size of a partition is smaller than the - specified `window_size`, padding is applied unless `truncate` is set to True. + specified `window_size`, padding is applied using the selected padding scheme (`zero_pad`, `forward_fill_pad`, + `backward_fill_pad`, or `mean_fill_pad`), unless `truncate` is set to True. :return: Iterator yielding partitioned DataFrame slices for Polars backend. :rtype: Iterator[Dict[str, Dict[str, pl.DataFrame]]] @@ -549,14 +566,12 @@ def _transform_polars(self) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: Notes ----- - - Padding is applied when the size of a partition is smaller than the `window_size`, unless truncation is enabled. - - Polars DataFrames offer better performance with large datasets, especially for complex operations. - - Performance Considerations: - --------------------------- - - For very large datasets, Polars DataFrames are recommended due to their lower memory footprint and faster - performance when compared to Pandas. Use Polars for more efficient partitioning and transformations. - + .. note:: + - Padding is applied when the size of a partition is smaller than the `window_size`, unless truncation is enabled. + - The padding scheme is determined by the `pad_scheme` parameter in the constructor (e.g., 'zero', 'forward_fill'). + - Polars DataFrames offer better performance with large datasets, especially for complex operations. + - For very large datasets, Polars DataFrames are recommended due to their lower memory footprint and faster + performance when compared to Pandas. Use Polars for more efficient partitioning and transformations. """ partition_count = 1 df = self.tf.get_data() # Fetch the data from TimeFrame @@ -575,13 +590,34 @@ def _transform_polars(self) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: # Apply padding if partition size is smaller than window_size and truncate is False if partition_dict["full"][1] - partition_dict["full"][0] < self.window_size and not self.truncate: - partitioned_data[key]["full"] = self._pad_partition( - self.window_size, - partition_dict["full"][1], - self.reverse, - ) - yield partitioned_data - + # Apply the chosen padding scheme for Polars DataFrame + if self.pad_scheme == "zero": + partitioned_data[key]["full"] = zero_pad( + partitioned_data[key]["full"], target_len=self.window_size + ) + elif self.pad_scheme == "forward_fill": + partitioned_data[key]["full"] = forward_fill_pad( + partitioned_data[key]["full"], + target_len=self.window_size, + end=len(partitioned_data[key]["full"]), + reverse=False, + ) + elif self.pad_scheme == "backward_fill": + partitioned_data[key]["full"] = backward_fill_pad( + partitioned_data[key]["full"], + target_len=self.window_size, + end=len(partitioned_data[key]["full"]), + reverse=False, + ) + elif self.pad_scheme == "mean_fill": + partitioned_data[key]["full"] = mean_fill_pad( + partitioned_data[key]["full"], + target_len=self.window_size, + end=len(partitioned_data[key]["full"]), + reverse=False, + ) + + yield partitioned_data partition_count += 1 def fit(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: @@ -631,12 +667,12 @@ def fit(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: df = self.tf.get_data() # Get the dataset from the TimeFrame # Call backend-specific partitioning method - if self.tf.backend in [BACKEND_PANDAS, BACKEND_MODIN]: + if self.tf.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: return self._fit_pandas_modin(df) - elif self.tf.backend == 
BACKEND_POLARS: + elif self.tf.dataframe_backend == BACKEND_POLARS: return self._fit_polars(df) else: - raise ValueError(f"Unsupported backend: {self.tf.backend}") + raise ValueError(f"Unsupported backend: {self.tf.dataframe_backend}") def transform(self) -> Iterator[Dict[str, Dict[str, SupportedBackendDataFrame]]]: """Generate partitioned data slices for the dataset. @@ -685,12 +721,12 @@ def transform(self) -> Iterator[Dict[str, Dict[str, SupportedBackendDataFrame]]] df = self.tf.get_data() # Get the dataset from the TimeFrame # Call backend-specific transformation method - if self.tf.backend in [BACKEND_PANDAS, BACKEND_MODIN]: + if self.tf.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: return self._transform_pandas_modin(df) - elif self.tf.backend == BACKEND_POLARS: + elif self.tf.dataframe_backend == BACKEND_POLARS: return self._transform_polars(df) else: - raise ValueError(f"Unsupported backend: {self.tf.backend}") + raise ValueError(f"Unsupported backend: {self.tf.dataframe_backend}") def fit_transform(self) -> Iterator[Dict[str, Dict[str, SupportedBackendDataFrame]]]: """Fit and transform the dataset in a single step. @@ -794,17 +830,19 @@ def check_data(self, partition_index: Optional[int] = None) -> None: # Perform sample size, feature ratio, and class balance checks check_sample_size( df_to_check, - backend=self.tf.backend, + backend=self.tf.dataframe_backend, min_samples=min_samples, max_samples=100000, enable_warnings=True, ) - check_feature_to_sample_ratio(df_to_check, backend=self.tf.backend, max_ratio=0.2, enable_warnings=True) + check_feature_to_sample_ratio( + df_to_check, backend=self.tf.dataframe_backend, max_ratio=0.2, enable_warnings=True + ) if self.tf.target_col: check_class_balance( df_to_check, target_col=self.tf.target_col, - backend=self.tf.backend, + backend=self.tf.dataframe_backend, enable_warnings=True, ) diff --git a/test/unit/test_core_exceptions.py b/test/unit/test_core_exceptions.py new file mode 100644 index 0000000..42c1346 --- /dev/null +++ b/test/unit/test_core_exceptions.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" TemporalScope/test/unit/test_core_exceptions.py + +This module contains unit tests for the custom exceptions and warnings defined +in the TemporalScope package. These tests ensure that the exceptions are +raised correctly and the warnings are issued in the appropriate scenarios. 
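+
+A minimal sketch of the assertion pattern these tests rely on (the warning class
+and message below are generic placeholders, not TemporalScope APIs):
+
+.. code-block:: python
+
+    import warnings
+
+    import pytest
+
+    with pytest.warns(UserWarning, match="some message"):
+        warnings.warn("some message", UserWarning)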
+""" + +import pytest +import warnings + +from temporalscope.core.exceptions import ( + TimeFrameError, + TimeColumnError, + MixedTypesWarning, + MixedTimezonesWarning, + MixedFrequencyWarning +) + +def test_time_frame_error_inheritance(): + """Test that TimeFrameError is the base class for other exceptions.""" + with pytest.raises(TimeFrameError): + raise TimeFrameError("Base error for the TimeFrame module") + + +def test_time_column_error(): + """Test that TimeColumnError is raised for time column validation errors.""" + with pytest.raises(TimeColumnError): + raise TimeColumnError("Error with the time column") + + +def test_mixed_types_warning(): + """Test that MixedTypesWarning is issued when mixed types are detected.""" + with pytest.warns(MixedTypesWarning, match="Mixed numeric and timestamp-like types"): + warnings.warn("Mixed numeric and timestamp-like types", MixedTypesWarning) + + +def test_mixed_timezones_warning(): + """Test that MixedTimezonesWarning is issued for mixed timezone-aware and naive timestamps.""" + with pytest.warns(MixedTimezonesWarning, match="Mixed timezone-aware and naive timestamps"): + warnings.warn("Mixed timezone-aware and naive timestamps", MixedTimezonesWarning) + + +def test_mixed_frequency_warning(): + """Test that MixedFrequencyWarning is issued when mixed timestamp frequencies are detected.""" + with pytest.warns(MixedFrequencyWarning, match="Mixed timestamp frequencies"): + warnings.warn("Mixed timestamp frequencies", MixedFrequencyWarning) diff --git a/test/unit/test_core_temporal_data_loader.py b/test/unit/test_core_temporal_data_loader.py index 85588a1..679f1fd 100644 --- a/test/unit/test_core_temporal_data_loader.py +++ b/test/unit/test_core_temporal_data_loader.py @@ -17,16 +17,22 @@ # TemporalScope/test/unit/test_core_temporal_data_loader.py - -from datetime import date, timedelta -from typing import Dict, List, Union - +import warnings +from typing import Dict, List, Union, Optional +from datetime import datetime, timedelta, date, timezone import modin.pandas as mpd import numpy as np + import pandas as pd import polars as pl import pytest + +from temporalscope.core.exceptions import ( + TimeColumnError, MixedTypesWarning, MixedTimezonesWarning, MixedFrequencyWarning +) + + from temporalscope.core.core_utils import ( BACKEND_MODIN, BACKEND_PANDAS, @@ -35,261 +41,226 @@ from temporalscope.core.temporal_data_loader import TimeFrame -def create_sample_data(num_samples: int = 100, num_features: int = 3) -> Dict[str, Union[List[date], List[float]]]: - """Create a sample data dictionary for testing. 
 
-    :param num_samples: Number of samples to generate, defaults to 100
-    :type num_samples: int, optional
-    :param num_features: Number of feature columns to generate, defaults to 3
-    :type num_features: int, optional
-    :return: A dictionary containing generated data with keys 'time', 'feature_1', ..., 'feature_n', and 'target'
-    :rtype: Dict[str, Union[List[date], List[float]]]
+def create_sample_data(
+    num_samples: int = 100,
+    num_features: int = 3,
+    empty: bool = False,
+    missing_values: bool = False,
+    mixed_types: bool = False,
+    drop_columns: Optional[List[str]] = None,
+    non_numeric_time: bool = False,
+    empty_time: bool = False,
+    mixed_numeric_and_timestamp: bool = False,
+    date_like_string: bool = False,
+    object_type_time_col: bool = False,
+    mixed_timezones: bool = False,
+    mixed_frequencies: bool = False,
+    polars_specific: bool = False,
+) -> Dict[str, Union[List[datetime], List[float], List[Optional[float]]]]:
+    """Create a sample dataset for scalable unit testing, supporting various edge cases.
+
+    This function generates sample time-series data for different unit testing scenarios,
+    including empty datasets, datasets with mixed data types, missing values, or different
+    types of time columns. It is designed to be flexible, providing various ways to test
+    data validation for time-series models.
+
+    :param num_samples: Number of samples to generate.
+    :param num_features: Number of feature columns to generate.
+    :param empty: If True, generates an empty dataset.
+    :param missing_values: If True, introduces missing values into the dataset.
+    :param mixed_types: If True, mixes numeric and string data types in feature columns.
+    :param drop_columns: List of columns to drop from the dataset.
+    :param non_numeric_time: If True, replaces the `time_col` with non-numeric values.
+    :param empty_time: If True, fills the `time_col` with empty values.
+    :param mixed_numeric_and_timestamp: If True, mixes numeric and timestamp values in `time_col`.
+    :param date_like_string: If True, fills the `time_col` with date-like string values.
+    :param object_type_time_col: If True, inserts arrays or complex objects into the `time_col`.
+    :param mixed_timezones: If True, mixes timestamps with and without timezone information in `time_col`.
+    :param mixed_frequencies: If True, mixes timestamp frequencies (e.g., daily and weekly) in `time_col`.
+    :param polars_specific: If True, handles edge cases specific to Polars.
+    :return: A dictionary containing generated data with keys 'time', 'feature_1', ..., 'feature_n', and 'target'.
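+
+    A minimal usage sketch (the flags shown are a subset of those documented above):
+
+    .. code-block:: python
+
+        # Default: 100 rows with a datetime "time" column, numeric features, and "target".
+        data = create_sample_data()
+
+        # Edge case: alternate numeric and timestamp values in the time column.
+        data = create_sample_data(num_samples=10, mixed_numeric_and_timestamp=True)
+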
""" - start_date = date(2021, 1, 1) - data = { - "time": [start_date + timedelta(days=i) for i in range(num_samples)], - } + + if empty: + return {"time": [], "target": []} + + start_date = datetime(2021, 1, 1) + + if empty_time: + data = {"time": [None for _ in range(num_samples)]} + elif non_numeric_time: + data = {"time": ["invalid_time" for _ in range(num_samples)]} + elif mixed_numeric_and_timestamp: + if polars_specific: + data = {"time": [str(start_date + timedelta(days=i)) if i % 2 == 0 else float(i) for i in range(num_samples)]} + else: + data = {"time": [start_date + timedelta(days=i) if i % 2 == 0 else float(i) for i in range(num_samples)]} + elif date_like_string: + data = {"time": [f"2021-01-{i+1:02d}" for i in range(num_samples)]} + elif object_type_time_col: + data = {"time": [[start_date + timedelta(days=i)] for i in range(num_samples)]} + elif mixed_timezones: + data = {"time": [(start_date + timedelta(days=i)).replace(tzinfo=timezone.utc if i % 2 == 0 else None) + for i in range(num_samples)]} + else: + data = {"time": [start_date + timedelta(days=i) for i in range(num_samples)]} - # Generate feature columns for i in range(1, num_features + 1): - data[f"feature_{i}"] = np.random.rand(num_samples).tolist() + if mixed_types: + data[f"feature_{i}"] = [f"str_{i}" if j % 2 == 0 else j for j in range(num_samples)] + else: + data[f"feature_{i}"] = np.random.rand(num_samples).tolist() + + if missing_values: + for i in range(num_samples): + if i % 10 == 0: + for j in range(1, num_features + 1): + data[f"feature_{j}"][i] = None - # Generate target column (e.g., sum of features plus noise) data["target"] = [ - sum(data[f"feature_{j}"][i] for j in range(1, num_features + 1)) + np.random.normal(0, 0.1) + sum(data[f"feature_{j}"][i] for j in range(1, num_features + 1) if isinstance(data[f"feature_{j}"][i], float)) + + np.random.normal(0, 0.1) for i in range(num_samples) ] + if drop_columns: + data = pd.DataFrame(data).drop(columns=drop_columns).to_dict(orient='list') + return data -@pytest.fixture(params=[BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -def sample_dataframe(request): - """Fixture to create sample DataFrames for each backend. - :param request: Pytest fixture request object containing the backend parameter. - :type request: _pytest.fixtures.SubRequest - :return: A tuple of the DataFrame and the backend identifier. 
- :rtype: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ - data = create_sample_data() - backend = request.param +@pytest.mark.parametrize( + "backend, case_type, expected_error, expected_warning, match_message", + [ + (BACKEND_POLARS, "missing_time_col", TimeColumnError, None, r"Missing required column: time"), + (BACKEND_PANDAS, "missing_time_col", TimeColumnError, None, r"Missing required column: time"), + (BACKEND_MODIN, "missing_time_col", TimeColumnError, None, r"Missing required column: time"), + (BACKEND_POLARS, "non_numeric_time_col", TimeColumnError, None, r"`time_col` must be numeric or timestamp-like"), + (BACKEND_PANDAS, "non_numeric_time_col", TimeColumnError, None, r"`time_col` must be numeric or timestamp-like"), + (BACKEND_MODIN, "non_numeric_time_col", TimeColumnError, None, r"`time_col` must be numeric or timestamp-like"), + (BACKEND_PANDAS, "empty_time_col", TimeColumnError, None, r"Missing values found in `time_col`"), + (BACKEND_POLARS, "mixed_frequencies", None, MixedFrequencyWarning, r"mixed timestamp frequencies"), + (BACKEND_PANDAS, "mixed_frequencies", None, MixedFrequencyWarning, r"mixed timestamp frequencies"), + (BACKEND_POLARS, "mixed_timezones", None, MixedTimezonesWarning, r"mixed timezone-aware and naive timestamps"), + (BACKEND_PANDAS, "mixed_timezones", None, MixedTimezonesWarning, r"mixed timezone-aware and naive timestamps"), + (BACKEND_POLARS, "date_like_string", TimeColumnError, None, r"`time_col` must be numeric or timestamp-like"), + (BACKEND_PANDAS, "date_like_string", TimeColumnError, None, r"`time_col` must be numeric or timestamp-like"), + ] +) +def test_validation_edge_cases(backend, case_type, expected_error, expected_warning, match_message): + """Test validation logic under different edge cases and backends.""" + + polars_specific = backend == BACKEND_POLARS + + if case_type == "missing_time_col": + data = create_sample_data(drop_columns=["time"], polars_specific=polars_specific) + elif case_type == "non_numeric_time_col": + data = create_sample_data(non_numeric_time=True, polars_specific=polars_specific) + elif case_type == "empty_time_col": + data = create_sample_data(empty_time=True, polars_specific=polars_specific) + elif case_type == "mixed_frequencies": + data = create_sample_data(mixed_frequencies=True, polars_specific=polars_specific) + elif case_type == "date_like_string": + data = create_sample_data(date_like_string=True, polars_specific=polars_specific) + elif case_type == "mixed_timezones": + data = create_sample_data(mixed_timezones=True, polars_specific=polars_specific) if backend == BACKEND_POLARS: - # Ensure 'time' column is properly typed - data["time"] = pl.Series(data["time"]) - df = pl.DataFrame(data) + df = pl.DataFrame(data, strict=False) # Allow mixed types for Polars elif backend == BACKEND_PANDAS: df = pd.DataFrame(data) elif backend == BACKEND_MODIN: df = mpd.DataFrame(data) - else: - raise ValueError(f"Unsupported backend: {backend}") - return df, backend - - -def test_timeframe_initialization(sample_dataframe): - """Test the initialization of TimeFrame with various backends. - - :param sample_dataframe: Fixture providing the DataFrame and backend. 
- :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ - df, backend = sample_dataframe - tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) - assert tf.backend == backend - assert tf.time_col == "time" - assert tf.target_col == "target" - assert len(tf.get_data()) == len(df) - - -def test_sort_data(sample_dataframe): - """Test the sort_data method. - - :param sample_dataframe: Fixture providing the DataFrame and backend. - :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ - df, backend = sample_dataframe - tf = TimeFrame(df, time_col="time", target_col="target", backend=backend, sort=False) - # Shuffle the data - if backend == BACKEND_POLARS: - shuffled_df = tf.get_data().sample(fraction=1.0) - else: - shuffled_df = tf.get_data().sample(frac=1).reset_index(drop=True) - tf.update_data(shuffled_df) - tf.sort_data(ascending=True) - sorted_df = tf.get_data() - # Verify that data is sorted - times = sorted_df[tf.time_col].to_list() if backend == BACKEND_POLARS else sorted_df[tf.time_col].tolist() - assert times == sorted(times) - - -def test_update_data(sample_dataframe): - """Test the update_data method. - - :param sample_dataframe: Fixture providing the DataFrame and backend. - :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ - df, backend = sample_dataframe - tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) - new_data = create_sample_data(num_samples=50) - if backend == BACKEND_POLARS: - new_data["time"] = pl.Series(new_data["time"]) - new_df = pl.DataFrame(new_data) - elif backend == BACKEND_PANDAS: - new_df = pd.DataFrame(new_data) - elif backend == BACKEND_MODIN: - new_df = mpd.DataFrame(new_data) - tf.update_data(new_df) - assert len(tf.get_data()) == 50 - - -def test_update_target_col(sample_dataframe): - """Test the update_target_col method. - - :param sample_dataframe: Fixture providing the DataFrame and backend. - :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ - df, backend = sample_dataframe - tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) - new_target = np.random.rand(len(df)) - if backend == BACKEND_POLARS: - new_target_col = pl.Series(new_target) - elif backend == BACKEND_PANDAS: - new_target_col = pd.Series(new_target) - elif backend == BACKEND_MODIN: - new_target_col = mpd.Series(new_target) - tf.update_target_col(new_target_col) - updated_target = ( - tf.get_data()[tf.target_col].to_numpy() if backend == BACKEND_POLARS else tf.get_data()[tf.target_col].values - ) - np.testing.assert_array_almost_equal(updated_target, new_target) - - -def test_missing_columns(sample_dataframe): - """Test initialization with missing required columns. - - :param sample_dataframe: Fixture providing the DataFrame and backend. - :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ - df, backend = sample_dataframe - # Remove the target column - if backend == BACKEND_POLARS: - df = df.drop(["target"]) - else: - df = df.drop(columns=["target"]) - with pytest.raises(ValueError) as excinfo: - TimeFrame(df, time_col="time", target_col="target", backend=backend) - assert "Missing required columns" in str(excinfo.value) - - -def test_invalid_backend(sample_dataframe): - """Test initialization with an invalid backend. - - :param sample_dataframe: Fixture providing the DataFrame. 
- :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ - df, _ = sample_dataframe - invalid_backend = "invalid_backend" - with pytest.raises(ValueError) as excinfo: - TimeFrame(df, time_col="time", target_col="target", backend=invalid_backend) - assert f"Unsupported backend '{invalid_backend}'" in str(excinfo.value) - - -def test_invalid_time_col_type(sample_dataframe): - """Test initialization with invalid time_col type. - - :param sample_dataframe: Fixture providing the DataFrame and backend. - :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ - df, backend = sample_dataframe - with pytest.raises(ValueError) as excinfo: - TimeFrame(df, time_col=123, target_col="target", backend=backend) - assert "time_col must be a non-empty string." in str(excinfo.value) - - -def test_invalid_target_col_type(sample_dataframe): - """Test initialization with invalid target_col type. - :param sample_dataframe: Fixture providing the DataFrame and backend. - :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ - df, backend = sample_dataframe - with pytest.raises(ValueError) as excinfo: - TimeFrame(df, time_col="time", target_col=None, backend=backend) - assert "target_col must be a non-empty string." in str(excinfo.value) - - -def test_invalid_dataframe_type(): - """Test initialization with an invalid DataFrame type.""" - invalid_df = "This is not a DataFrame" - with pytest.raises(TypeError): - TimeFrame(invalid_df, time_col="time", target_col="target", backend=BACKEND_POLARS) - - -def test_sort_data_invalid_backend(): - """Test initialization with an unsupported backend.""" - data = create_sample_data() - df = pd.DataFrame(data) - with pytest.raises(ValueError) as excinfo: - TimeFrame(df, time_col="time", target_col="target", backend="unsupported_backend") - assert "Unsupported backend" in str(excinfo.value) - - -def test_update_target_col_invalid_length(sample_dataframe): - """Test update_target_col with mismatched length.""" - df, backend = sample_dataframe - tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) - new_target = np.random.rand(len(df) - 1) # Mismatch length by 1 - if backend == BACKEND_POLARS: - new_target_col = pl.Series(new_target) - elif backend == BACKEND_PANDAS: - new_target_col = pd.Series(new_target) - elif backend == BACKEND_MODIN: - new_target_col = mpd.Series(new_target) + if expected_error: + with pytest.raises(expected_error, match=match_message): + TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) + elif expected_warning: + with pytest.warns(expected_warning, match=match_message if match_message else None): + TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) + + + + +# @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +# def test_sort_data(backend): +# """Test sorting method for various backends.""" +# data = create_sample_data(num_samples=100) +# if backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend, sort=False) +# # Shuffle and sort +# if backend == BACKEND_POLARS: +# shuffled_df = tf.get_data().sample(fraction=1.0) +# else: +# shuffled_df = tf.get_data().sample(frac=1).reset_index(drop=True) +# tf.update_data(shuffled_df) +# 
tf.sort_data(ascending=True) +# sorted_df = tf.get_data() + +# # Verify sorting +# times = sorted_df[tf.time_col].to_list() if backend == BACKEND_POLARS else sorted_df[tf.time_col].tolist() +# assert times == sorted(times) + + +# @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +# def test_update_target_col_invalid_length(backend): +# """Test updating target column with mismatched length.""" +# data = create_sample_data(num_samples=100) +# if backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# new_target = pl.Series(np.random.rand(99)) # One less than expected +# elif backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# new_target = pd.Series(np.random.rand(99)) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) +# new_target = mpd.Series(np.random.rand(99)) + +# tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) +# with pytest.raises(ValueError): +# tf.update_target_col(new_target) + + +# @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +# def test_missing_columns(backend): +# """Test initialization with missing required columns.""" +# data = create_sample_data(num_samples=100) +# if backend == BACKEND_POLARS: +# df = pl.DataFrame(data).drop(["target"]) +# elif backend == BACKEND_PANDAS: +# df = pd.DataFrame(data).drop(columns=["target"]) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data).drop(columns=["target"]) + +# with pytest.raises(ValueError): +# TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) + + +# @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +# def test_invalid_backend_initialization(backend): +# """Test invalid backend during initialization.""" +# data = create_sample_data(num_samples=100) +# if backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# invalid_backend = "invalid_backend" +# with pytest.raises(ValueError): +# TimeFrame(df, time_col="time", target_col="target", dataframe_backend=invalid_backend) - with pytest.raises(ValueError) as excinfo: - tf.update_target_col(new_target_col) - - assert "The new target column must have the same number of rows as the DataFrame." in str(excinfo.value) - - - -def test_update_target_col_invalid_type(sample_dataframe): - """Test update_target_col with invalid Series type. - - :param sample_dataframe: Fixture providing the DataFrame and backend. 
- :type sample_dataframe: Tuple[Union[pl.DataFrame, pd.DataFrame, mpd.DataFrame], str] - """ - df, backend = sample_dataframe - tf = TimeFrame(df, time_col="time", target_col="target", backend=backend) - invalid_series = "This is not a Series" - with pytest.raises(TypeError) as excinfo: - tf.update_target_col(invalid_series) - assert "Expected a" in str(excinfo.value) - - -@pytest.mark.parametrize( - "df_backend,expected_backend", - [(BACKEND_POLARS, BACKEND_POLARS), (BACKEND_PANDAS, BACKEND_PANDAS), (BACKEND_MODIN, BACKEND_MODIN)], -) -def test_infer_backend(sample_dataframe, df_backend, expected_backend): - """Test that the backend is correctly inferred for Polars, Pandas, and Modin DataFrames.""" - df, backend = sample_dataframe - if backend == df_backend: - tf = TimeFrame(df, time_col="time", target_col="target") - inferred_backend = tf._infer_backend(df) - assert inferred_backend == expected_backend - - -def test_infer_backend_invalid(): - """Test that a ValueError is raised for unsupported DataFrame types.""" - invalid_df = "This is not a DataFrame" - - # Creating a valid TimeFrame object first to avoid column validation - valid_df = pd.DataFrame({"time": [1, 2, 3], "target": [1, 2, 3]}) - tf = TimeFrame(valid_df, time_col="time", target_col="target") # Placeholder - - # Now test the _infer_backend method directly on the invalid data - with pytest.raises(ValueError) as excinfo: - tf._infer_backend(invalid_df) - assert "Unsupported DataFrame type" in str(excinfo.value) diff --git a/test/unit/test_core_temporal_target_shifter.py b/test/unit/test_core_temporal_target_shifter.py index d454ff3..427d383 100644 --- a/test/unit/test_core_temporal_target_shifter.py +++ b/test/unit/test_core_temporal_target_shifter.py @@ -130,8 +130,8 @@ def test_time_frame_input(data_format, n_lags): elif data_format == BACKEND_MODIN: df = mpd.DataFrame(data) - # Ensure TimeFrame uses data_format instead of backend - tf = TimeFrame(df, time_col="time", target_col="target", data_format=data_format) + # Ensure TimeFrame uses dataframe_backend + tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend=data_format) shifter = TemporalTargetShifter(n_lags=n_lags, target_col="target") # Test fitting and transforming TimeFrame diff --git a/test/unit/test_datasets.py b/test/unit/test_datasets.py new file mode 100644 index 0000000..c2fbea9 --- /dev/null +++ b/test/unit/test_datasets.py @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
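+
+# Usage sketch (illustrative; the DatasetLoader API and names below are taken
+# from the assertions in this module, not from separate documentation):
+#
+#     loader = DatasetLoader(dataset_name="macrodata")
+#     df, target_col = loader._load_dataset_and_target()   # pandas frame, "realgdp"
+#     timeframes = loader.load_and_init_timeframes()       # one TimeFrame per backend
+#     pandas_tf = timeframes[BACKEND_PANDAS]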
+ +import pytest +from temporalscope.datasets.datasets import DatasetLoader +from temporalscope.core.temporal_data_loader import TimeFrame +from temporalscope.core.core_utils import BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS +import pandas as pd +import modin.pandas as mpd +import polars as pl + +@pytest.fixture +def dataset_loader(): + """Fixture to create a DatasetLoader instance for the macrodata dataset.""" + return DatasetLoader(dataset_name="macrodata") + + +def test_load_dataset_and_target(dataset_loader): + """Test loading the dataset and its target column.""" + df, target_col = dataset_loader._load_dataset_and_target() + assert isinstance(df, pd.DataFrame) + assert target_col == "realgdp" + assert "ds" in df.columns + assert len(df) > 0 # Ensure the dataset is not empty + + +def test_init_timeframes_for_backends(dataset_loader): + """Test initializing TimeFrame objects for multiple backends.""" + df, target_col = dataset_loader._load_dataset_and_target() + + timeframes = dataset_loader.init_timeframes_for_backends(df, target_col) + + # Check if the returned TimeFrame objects for each backend are valid + assert isinstance(timeframes[BACKEND_PANDAS], TimeFrame) + assert isinstance(timeframes[BACKEND_MODIN], TimeFrame) + assert isinstance(timeframes[BACKEND_POLARS], TimeFrame) + + # Ensure correct data in each backend + assert timeframes[BACKEND_PANDAS].dataframe_backend == BACKEND_PANDAS + assert timeframes[BACKEND_MODIN].dataframe_backend == BACKEND_MODIN + assert timeframes[BACKEND_POLARS].dataframe_backend == BACKEND_POLARS + + +def test_load_and_init_timeframes(dataset_loader): + """Test loading dataset and initializing TimeFrames for all backends.""" + timeframes = dataset_loader.load_and_init_timeframes() + + # Check if the returned TimeFrame objects for each backend are valid + assert isinstance(timeframes[BACKEND_PANDAS], TimeFrame) + assert isinstance(timeframes[BACKEND_MODIN], TimeFrame) + assert isinstance(timeframes[BACKEND_POLARS], TimeFrame) + + +def test_invalid_backend_raises_error(dataset_loader): + """Test that initializing with an invalid backend raises a ValueError.""" + df, target_col = dataset_loader._load_dataset_and_target() + + with pytest.raises(ValueError, match="Unsupported backend"): + dataset_loader.init_timeframes_for_backends(df, target_col, backends=("invalid_backend",)) + + +def test_invalid_dataset_name(): + """Test that initializing DatasetLoader with an invalid dataset name raises a ValueError.""" + with pytest.raises(ValueError, match="Dataset 'invalid' is not supported"): + DatasetLoader(dataset_name="invalid") + + +def test_init_timeframes_with_custom_backend(dataset_loader): + """Test initializing TimeFrames with a custom selection of backends.""" + df, target_col = dataset_loader._load_dataset_and_target() + timeframes = dataset_loader.init_timeframes_for_backends(df, target_col, backends=(BACKEND_PANDAS,)) + + # Ensure only the requested backend is initialized + assert BACKEND_PANDAS in timeframes + assert BACKEND_MODIN not in timeframes + assert BACKEND_POLARS not in timeframes + + +def test_load_dataset_internal_call(mocker): + """Test the internal call to _load_dataset_and_target and check the dataset loader function.""" + mocker.patch("temporalscope.datasets.datasets._load_macrodata", return_value=(pd.DataFrame(), "realgdp")) + dataset_loader = DatasetLoader(dataset_name="macrodata") + + df, target_col = dataset_loader._load_dataset_and_target() + + assert target_col == "realgdp" + assert isinstance(df, pd.DataFrame) + + +def 
test_load_dataset_and_verify_time_column(dataset_loader): + """Test to ensure that the 'ds' column is created and of type datetime.""" + df, target_col = dataset_loader._load_dataset_and_target() + + # Ensure 'ds' column exists and is of datetime type + assert "ds" in df.columns + assert pd.api.types.is_datetime64_any_dtype(df["ds"]) diff --git a/test/unit/test_partion_data_checks.py b/test/unit/test_partion_data_checks.py deleted file mode 100644 index fed87a6..0000000 --- a/test/unit/test_partion_data_checks.py +++ /dev/null @@ -1,336 +0,0 @@ -# """ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# """ -# import modin.pandas as mpd -# import pandas as pd -# import polars as pl -# import pytest - -# from temporalscope.partition.data_checks import ( -# check_binary_numerical_features, -# check_categorical_feature_cardinality, -# check_class_balance, -# check_feature_count, -# check_feature_to_sample_ratio, -# check_numerical_feature_uniqueness, -# check_sample_size, -# ) - - -# @pytest.mark.parametrize( -# "dataframe,backend,min_samples,max_samples,expected_result", -# [ -# (pd.DataFrame({"feature1": range(100)}), "pd", 3000, 50000, False), -# ( -# pl.DataFrame({"feature1": pl.Series(range(100))}), -# "pl", -# 3000, -# 50000, -# False, -# ), -# ( -# mpd.DataFrame({"feature1": range(100000)}), -# "mpd", -# 3000, -# 50000, -# False, -# ), -# ], -# ) -# def test_check_sample_size( -# dataframe, backend, min_samples, max_samples, expected_result -# ): -# """Test sample size check for various dataframes and backends.""" -# assert ( -# check_sample_size( -# dataframe, -# backend=backend, -# min_samples=min_samples, -# max_samples=max_samples, -# ) -# == expected_result -# ) - - -# @pytest.mark.parametrize( -# "dataframe,backend,min_features,expected_result", -# [ -# # Pandas DataFrame -# ( -# pd.DataFrame({"feature1": range(100)}), -# "pd", -# 4, -# False, -# ), # Too few features - Pandas -# # Polars DataFrame -# ( -# pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), -# "pl", -# 4, -# True, -# ), # Enough features - Polars -# # Modin DataFrame -# ( -# mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), -# "mpd", -# 4, -# True, -# ), # Enough features - Modin -# ], -# ) -# def test_check_feature_count(dataframe, backend, min_features, expected_result): -# """Tests check_feature_count for various dataframes and backends.""" -# assert ( -# check_feature_count(dataframe, backend=backend, min_features=min_features) -# == expected_result -# ) - - -# @pytest.mark.parametrize( -# "dataframe,backend,max_ratio,expected_result", -# [ -# ( -# pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), -# "pl", -# 0.1, -# True, -# ), -# ( -# mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), -# "mpd", -# 
0.1, -# True, -# ), -# ( -# pd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), -# "pd", -# 0.1, -# True, -# ), -# ], -# ) -# def test_check_feature_to_sample_ratio(dataframe, backend, max_ratio, expected_result): -# """Tests check_feature_to_sample_ratio for various dataframes and backends.""" -# assert ( -# check_feature_to_sample_ratio(dataframe, backend=backend, max_ratio=max_ratio) -# == expected_result -# ) - - -# @pytest.mark.parametrize( -# "dataframe,backend,max_unique_values,expected_result", -# [ -# # Pandas DataFrames -# ( -# pd.DataFrame({"category1": [str(i) for i in range(25)]}), -# "pd", -# 20, -# False, -# ), # Too many unique values - Pandas -# ( -# pd.DataFrame({"category1": ["A", "B", "C"] * 100}), -# "pd", -# 20, -# True, -# ), # Normal unique values - Pandas -# # Polars DataFrames -# ( -# pl.DataFrame({"category1": pl.Series([str(i) for i in range(25)])}), -# "pl", -# 20, -# False, -# ), # Too many unique values - Polars -# ( -# pl.DataFrame({"category1": pl.Series(["A", "B", "C"] * 100)}), -# "pl", -# 20, -# True, -# ), # Normal unique values - Polars -# # Modin DataFrames -# ( -# mpd.DataFrame({"category1": [str(i) for i in range(25)]}), -# "mpd", -# 20, -# False, -# ), # Too many unique values - Modin -# ( -# mpd.DataFrame({"category1": ["A", "B", "C"] * 100}), -# "mpd", -# 20, -# True, -# ), # Normal unique values - Modin -# ], -# ) -# def test_check_categorical_feature_cardinality( -# dataframe, backend, max_unique_values, expected_result -# ): -# """Tests check_categorical_feature_cardinality for various dataframe backends.""" -# assert ( -# check_categorical_feature_cardinality( -# dataframe, backend=backend, max_unique_values=max_unique_values -# ) -# == expected_result -# ) - - -# @pytest.mark.parametrize( -# "dataframe,backend,min_unique_values,expected_result", -# [ -# # Pandas DataFrame -# ( -# pd.DataFrame({"feature1": range(100)}), -# "pd", -# 10, -# True, -# ), # Enough unique values - Pandas -# # Polars DataFrame -# ( -# pl.DataFrame({"feature1": pl.Series(range(100))}), -# "pl", -# 10, -# True, -# ), # Enough unique values - Polars -# # Modin DataFrame -# ( -# mpd.DataFrame({"feature1": [1, 1, 1, 2, 2, 2, 3, 3]}), -# "mpd", -# 10, -# False, -# ), # Too few unique values - Modin -# ( -# mpd.DataFrame({"feature1": range(100)}), -# "mpd", -# 10, -# True, -# ), # Enough unique values - Modin -# ], -# ) -# def test_check_numerical_feature_uniqueness( -# dataframe, backend, min_unique_values, expected_result -# ): -# """Tests check_numerical_feature_uniqueness for various dataframes and backends.""" -# assert ( -# check_numerical_feature_uniqueness( -# dataframe, backend=backend, min_unique_values=min_unique_values -# ) -# == expected_result -# ) - - -# @pytest.mark.parametrize( -# "dataframe,backend,expected_result", -# [ -# # Pandas DataFrame -# ( -# pd.DataFrame({"binary_feature": [0, 1] * 50}), -# "pd", -# False, -# ), # Binary numerical feature - Pandas -# ( -# pd.DataFrame({"feature1": range(100)}), -# "pd", -# True, -# ), # No binary feature - Pandas -# # Polars DataFrame -# ( -# pl.DataFrame({"binary_feature": pl.Series([0, 1] * 50)}), -# "pl", -# False, -# ), # Binary numerical feature - Polars -# ( -# pl.DataFrame({"feature1": pl.Series(range(100))}), -# "pl", -# True, -# ), # No binary feature - Polars -# # Modin DataFrame -# ( -# mpd.DataFrame({"binary_feature": [0, 1] * 50}), -# "mpd", -# False, -# ), # Binary numerical feature - Modin -# ( -# mpd.DataFrame({"feature1": range(100)}), -# "mpd", -# True, -# ), # No binary feature - 
Modin -# ], -# ) -# def test_check_binary_numerical_features(dataframe, backend, expected_result): -# """Tests check_binary_numerical_features for various dataframes and backends.""" -# assert ( -# check_binary_numerical_features(dataframe, backend=backend) == expected_result -# ) - - -# @pytest.mark.parametrize( -# "dataframe,target_col,backend,expected_result", -# [ -# ( -# pd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), -# "target", -# "pd", -# False, -# ), -# ( -# pd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), -# "target", -# "pd", -# True, -# ), -# ( -# pl.DataFrame( -# { -# "feature1": pl.Series(range(100)), -# "target": pl.Series([1] * 90 + [0] * 10), -# } -# ), -# "target", -# "pl", -# False, -# ), -# ( -# pl.DataFrame( -# { -# "feature1": pl.Series(range(100)), -# "target": pl.Series([0, 1] * 50), -# } -# ), -# "target", -# "pl", -# True, -# ), -# ( -# mpd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), -# "target", -# "mpd", -# False, -# ), -# ( -# mpd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), -# "target", -# "mpd", -# True, -# ), -# ], -# ) -# def test_check_class_balance(dataframe, target_col, backend, expected_result): -# """Tests check_class_balance for various dataframes and backends.""" -# result = check_class_balance(dataframe, target_col=target_col, backend=backend) -# assert ( -# result == expected_result -# ), f"Expected {expected_result}, but got {result} for backend {backend}" diff --git a/test/unit/test_partition_padding.py b/test/unit/test_partition_padding.py new file mode 100644 index 0000000..31f0d35 --- /dev/null +++ b/test/unit/test_partition_padding.py @@ -0,0 +1,362 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
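+
+# Usage sketch (illustrative; the padding signatures are inferred from the
+# call sites in this module):
+#
+#     df = generate_test_data(BACKEND_PANDAS)                    # 5 rows
+#     padded = zero_pad(df, target_len=7, time_col="ds", pad_value=0)
+#     assert len(padded) == 7                                    # zero-filled rows appended
+#
+# The fill-based variants take a position and direction instead of a pad value,
+# e.g. forward_fill_pad(df, target_len=7, end=5, reverse=False, time_col="ds").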
+ +# TemporalScope/test/unit/test_partition_padding.py + + +import pytest +import numpy as np +import pandas as pd +import modin.pandas as mpd +import polars as pl +from temporalscope.partition.padding import ( + zero_pad, + forward_fill_pad, + backward_fill_pad, + mean_fill_pad, + pad_dataframe, + sort_dataframe, + ensure_type_consistency +) +from temporalscope.core.core_utils import ( + BACKEND_MODIN, + BACKEND_PANDAS, + BACKEND_POLARS, +) + +from temporalscope.core.core_utils import SupportedBackendDataFrame + +np.random.seed(42) # Set a seed for reproducibility + + + +def generate_test_data(backend, num_samples=5): + """Generate test data with consistent column names across all backends.""" + start_date = pd.to_datetime("2021-01-01") + data = { + "feature_1": range(1, num_samples + 1), + "feature_2": range(num_samples, 0, -1), + "target": [i * 10 for i in range(1, num_samples + 1)], + "ds": pd.date_range(start_date, periods=num_samples) # Ensure 'ds' is a date column + } + + if backend == BACKEND_PANDAS: + df = pd.DataFrame(data) + df['ds'] = df['ds'].astype('datetime64[ns]') # Ensure ds is in datetime64[ns] + return df + + elif backend == BACKEND_MODIN: + df = mpd.DataFrame(data) + df['ds'] = df['ds'].astype('datetime64[ns]') # Modin relies on Pandas dtype system + return df + + elif backend == BACKEND_POLARS: + df = pl.DataFrame({ + "feature_1": data["feature_1"], + "feature_2": data["feature_2"], + "target": data["target"], + "ds": [d for d in data["ds"]] # Keep `ds` as a date column + }) + return df.with_columns(pl.col("ds").cast(pl.Datetime)) # Cast ds to Polars datetime + + else: + raise ValueError(f"Unsupported backend: {backend}") + + +@pytest.fixture +def test_data(): + return { + BACKEND_PANDAS: generate_test_data(BACKEND_PANDAS), + BACKEND_MODIN: generate_test_data(BACKEND_MODIN), + BACKEND_POLARS: generate_test_data(BACKEND_POLARS), + } + + +# Utility function to generate empty DataFrame +def get_empty_dataframe(backend): + if backend == BACKEND_PANDAS: + return pd.DataFrame() + elif backend == BACKEND_MODIN: + return mpd.DataFrame() + elif backend == BACKEND_POLARS: + return pl.DataFrame() + else: + raise ValueError(f"Unsupported backend: {backend}") + +def generate_mixed_data(num_samples: int = 5) -> pd.DataFrame: + """Generates a DataFrame with mixed data types (numeric, categorical, datetime). + + This can be used for parametrized tests to check how functions handle different + column types. + + :param num_samples: Number of rows to generate in the DataFrame. + :return: A DataFrame with mixed data types. 
+    """
+    start_date = pd.to_datetime("2021-01-01")
+    data = {
+        "numeric_col": range(1, num_samples + 1),
+        "category_col": ["A", "B", "C", "D", "E"][:num_samples],
+        "datetime_col": pd.date_range(start_date, periods=num_samples),
+        "mixed_col": ["A", 1, pd.NaT, None, 5][:num_samples],  # Deliberately mixed types
+    }
+    return pd.DataFrame(data)
+
+
+def check_monotonicity(df: SupportedBackendDataFrame, time_col: str, ascending: bool = True) -> bool:
+    """Return True if `time_col` is strictly monotonic in the requested direction.
+
+    Works across Pandas, Modin, and Polars by inspecting the row-to-row
+    differences of the column; datetime columns are handled via timedeltas.
+    """
+    if isinstance(df, pl.DataFrame):
+        # Polars: compute diffs, drop the leading null, then compare against zero
+        diffs = df.select(pl.col(time_col).diff()).select(pl.col(time_col).drop_nulls())
+        if ascending:
+            return diffs.select(pl.col(time_col).gt(pl.lit(0))).to_series().all()
+        else:
+            return diffs.select(pl.col(time_col).lt(pl.lit(0))).to_series().all()
+    else:
+        # Pandas and Modin share the same API; drop the leading NaN produced by diff()
+        diffs = df[time_col].diff().dropna()
+        if pd.api.types.is_timedelta64_dtype(diffs):
+            # Datetime columns diff into timedeltas, so compare against a zero timedelta
+            zero_timedelta = pd.Timedelta(0)
+            if ascending:
+                return diffs.gt(zero_timedelta).all()
+            else:
+                return diffs.lt(zero_timedelta).all()
+        else:
+            if ascending:
+                return diffs.gt(0).all()
+            else:
+                return diffs.lt(0).all()
+
+
+# Parametrize tests for ascending and descending order
+@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS])
+@pytest.mark.parametrize("ascending", [True, False])
+def test_sort_dataframe(test_data, backend, ascending):
+    df = test_data[backend]
+    sorted_df = sort_dataframe(df, time_col="ds", ascending=ascending)
+
+    # Check sorting for each backend
+    assert check_monotonicity(sorted_df, "ds", ascending=ascending)
+
+
+# Test for invalid time column in sort_dataframe
+@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS])
+def test_sort_dataframe_invalid_time_col(test_data, backend):
+    df = test_data[backend]
+    with pytest.raises(ValueError):
+        sort_dataframe(df, time_col="invalid_col")
+
+
+# Test sorting for empty DataFrame
+@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS])
+def test_sort_dataframe_empty_dataframe(backend):
+    empty_df = get_empty_dataframe(backend)
+    with pytest.raises(ValueError):
+        sort_dataframe(empty_df, time_col="ds")
+
+
+# Test raising TypeError for unsupported input type
+def test_sort_dataframe_unsupported_type():
+    with pytest.raises(TypeError, match="Unsupported DataFrame type"):
+        sort_dataframe([], time_col="ds")  # A list is an unsupported type
+
+
+# Test warning when `time_col` is neither numeric nor datetime
+@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN])
+def test_sort_dataframe_warning(test_data, backend):
+    df = test_data[backend]
+    df["non_time_col"] = ["a", "b", "c", "d", "e"]
+
+    # Ensure a warning is raised when time_col is non-numeric and non-datetime
+    with pytest.warns(UserWarning, match="is neither numeric nor datetime"):
+        sort_dataframe(df, time_col="non_time_col", ascending=True)
+
+    # Continue with checking valid sorting after the warning
+    sorted_df = sort_dataframe(df, time_col="ds", ascending=True)
+    assert check_monotonicity(sorted_df, "ds", ascending=True)
+
+
+# Padding function tests with Modin and Polars compatibility
+@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS])
+@pytest.mark.parametrize("padding_func", [zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad])
+def test_padding_functions(test_data, backend, padding_func):
+    df = test_data[backend]
+
+    if padding_func == zero_pad:
+ padded_df = padding_func(df, target_len=7, time_col="ds", pad_value=0) + else: + padded_df = padding_func(df, target_len=7, end=5, reverse=False, time_col="ds") + + assert len(padded_df) == 7 + + +# Ensure the 'ds' column is used consistently across backends in pad_dataframe +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +@pytest.mark.parametrize("mode", ["zero", "forward_fill", "backward_fill", "mean_fill"]) +def test_pad_dataframe(test_data, backend, mode): + df = test_data[backend] + + if mode == "zero": + padded_df = pad_dataframe(df, target_len=7, mode=mode, pad_value=0, time_col="ds") + else: + padded_df = pad_dataframe(df, target_len=7, mode=mode, end=5, reverse=False, time_col="ds") + + assert len(padded_df) == 7 + + +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +def test_empty_dataframe(backend): + if backend == BACKEND_PANDAS: + df = pd.DataFrame() + elif backend == BACKEND_MODIN: + df = mpd.DataFrame() + elif backend == BACKEND_POLARS: + df = pl.DataFrame() + + with pytest.raises(ValueError): + zero_pad(df, target_len=5, time_col="ds") + + +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +def test_invalid_time_col(test_data, backend): + df = test_data[backend] + + with pytest.raises(ValueError): + zero_pad(df, target_len=7, time_col="invalid_col") + + +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +def test_target_len_less_than_current_len(test_data, backend): + df = test_data[backend] + + with pytest.raises(ValueError): + zero_pad(df, target_len=3, time_col="ds") + + +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +def test_sort_dataframe_edge_cases(test_data, backend): + df = test_data[backend] + + # Add non-numeric, non-datetime column to test sorting warnings + if backend == BACKEND_POLARS: + df = df.with_columns(pl.Series("non_numeric", ["a", "b", "c", "d", "e"])) + else: + df["non_numeric"] = ["a", "b", "c", "d", "e"] + + # Ensure warning is raised when time_col is non-numeric and non-datetime + with pytest.warns(UserWarning, match="is neither numeric nor datetime"): + sort_dataframe(df, time_col="non_numeric", ascending=True) + + # Continue with existing tests + sorted_df = sort_dataframe(df, time_col="ds", ascending=True) + if backend == BACKEND_POLARS: + assert sorted_df["ds"].is_sorted() + else: + assert sorted_df["ds"].is_monotonic_increasing + + +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +@pytest.mark.parametrize("padding_func", [zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad]) +def test_padding_functions_with_warnings(test_data, backend, padding_func): + df = test_data[backend] + + # Add non-numeric columns + if backend == BACKEND_POLARS: + df = df.with_columns(pl.Series("non_numeric", ["a", "b", "c", "d", "e"])) + pad_df = pad_dataframe(df, target_len=7, mode="zero", time_col="ds") # Add mode here + pad_df = pad_df.with_columns(pl.lit(None).alias("non_numeric")) # Ensure "non_numeric" exists in pad_df + else: + df["non_numeric"] = ["a", "b", "c", "d", "e"] + + if padding_func == zero_pad: + with pytest.warns(UserWarning, match="Non-numeric columns found"): + padded_df = padding_func(df, target_len=7, time_col="ds", pad_value=0) + else: + padded_df = padding_func(df, target_len=7, end=5, reverse=False, time_col="ds") + + assert len(padded_df) == 7 + + +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, 
BACKEND_POLARS]) +@pytest.mark.parametrize("mode", ["zero", "forward_fill", "backward_fill", "mean_fill"]) +def test_pad_dataframe_type_consistency(test_data, backend, mode): + df = test_data[backend] + + # Add non-numeric column + if backend == BACKEND_POLARS: + df = df.with_columns(pl.Series("non_numeric", ["x", "y", "z", "w", "v"])) + else: + df["non_numeric"] = ["x", "y", "z", "w", "v"] + + if mode == "zero": + with pytest.warns(UserWarning, match="Non-numeric columns found"): + padded_df = pad_dataframe(df, target_len=7, mode=mode, pad_value=0, time_col="ds") + else: + with pytest.warns(UserWarning, match="Non-numeric columns found"): + padded_df = pad_dataframe(df, target_len=7, mode=mode, end=5, reverse=False, time_col="ds") + + assert len(padded_df) == 7 + + # Ensure types are consistent + assert padded_df["feature_1"].dtype == df["feature_1"].dtype + assert padded_df["feature_2"].dtype == df["feature_2"].dtype + +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN]) +def test_pad_dataframe_boolean_to_int64(test_data, backend): + """Test that boolean columns in the DataFrame are correctly cast to int64.""" + df = test_data[backend] + + # Add a boolean column to the DataFrame + if backend == BACKEND_PANDAS: + df["bool_col"] = [True, False, True, False, True] + elif backend == BACKEND_MODIN: + df["bool_col"] = mpd.Series([True, False, True, False, True]) + + # Create a padding DataFrame with the same columns + pad_df = pd.DataFrame({ + "bool_col": [False, False] # Padding with False values (should become 0) + }) + + # Ensure type consistency (bool -> int64) + consistent_df = ensure_type_consistency(df, pad_df) + + # Check that the boolean column is converted to int64 + assert consistent_df["bool_col"].dtype == "int64" + assert (consistent_df["bool_col"] == 0).all() # All padded values should be 0 + + +@pytest.mark.parametrize("backend", [BACKEND_MODIN]) +def test_pad_dataframe_conversion_to_modin(test_data, backend): + """Test that pad_df is correctly converted back to Modin after type consistency check.""" + df = test_data[backend] + + # Create a padding DataFrame with mismatched types + pad_df = pd.DataFrame({ + "feature_1": [0.0, 0.0], + "feature_2": [0, 0], + "target": [0, 0], + "ds": [pd.Timestamp("1970-01-01"), pd.Timestamp("1970-01-01")] + }) + + # Ensure type consistency (pad_df starts as Pandas DataFrame) + consistent_df = ensure_type_consistency(df, pad_df) + + # Ensure pad_df is converted back to Modin if df was Modin + assert isinstance(consistent_df, mpd.DataFrame), "pad_df should be converted back to Modin" diff --git a/test/unit/test_partition_validators.py b/test/unit/test_partition_validators.py new file mode 100644 index 0000000..b0653a1 --- /dev/null +++ b/test/unit/test_partition_validators.py @@ -0,0 +1,336 @@ +# """ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# """ +import modin.pandas as mpd +import pandas as pd +import polars as pl +import pytest + +from temporalscope.partition.partition_validators import ( + check_binary_numerical_features, + check_categorical_feature_cardinality, + check_class_balance, + check_feature_count, + check_feature_to_sample_ratio, + check_numerical_feature_uniqueness, + check_sample_size, +) + + +@pytest.mark.parametrize( + "dataframe,backend,min_samples,max_samples,expected_result", + [ + (pd.DataFrame({"feature1": range(100)}), "pd", 3000, 50000, False), + ( + pl.DataFrame({"feature1": pl.Series(range(100))}), + "pl", + 3000, + 50000, + False, + ), + ( + mpd.DataFrame({"feature1": range(100000)}), + "mpd", + 3000, + 50000, + False, + ), + ], +) +def test_check_sample_size( + dataframe, backend, min_samples, max_samples, expected_result +): + """Test sample size check for various dataframes and backends.""" + assert ( + check_sample_size( + dataframe, + backend=backend, + min_samples=min_samples, + max_samples=max_samples, + ) + == expected_result + ) + + +@pytest.mark.parametrize( + "dataframe,backend,min_features,expected_result", + [ + # Pandas DataFrame + ( + pd.DataFrame({"feature1": range(100)}), + "pd", + 4, + False, + ), # Too few features - Pandas + # Polars DataFrame + ( + pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), + "pl", + 4, + True, + ), # Enough features - Polars + # Modin DataFrame + ( + mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), + "mpd", + 4, + True, + ), # Enough features - Modin + ], +) +def test_check_feature_count(dataframe, backend, min_features, expected_result): + """Tests check_feature_count for various dataframes and backends.""" + assert ( + check_feature_count(dataframe, backend=backend, min_features=min_features) + == expected_result + ) + + +@pytest.mark.parametrize( + "dataframe,backend,max_ratio,expected_result", + [ + ( + pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), + "pl", + 0.1, + True, + ), + ( + mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), + "mpd", + 0.1, + True, + ), + ( + pd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), + "pd", + 0.1, + True, + ), + ], +) +def test_check_feature_to_sample_ratio(dataframe, backend, max_ratio, expected_result): + """Tests check_feature_to_sample_ratio for various dataframes and backends.""" + assert ( + check_feature_to_sample_ratio(dataframe, backend=backend, max_ratio=max_ratio) + == expected_result + ) + + +@pytest.mark.parametrize( + "dataframe,backend,max_unique_values,expected_result", + [ + # Pandas DataFrames + ( + pd.DataFrame({"category1": [str(i) for i in range(25)]}), + "pd", + 20, + False, + ), # Too many unique values - Pandas + ( + pd.DataFrame({"category1": ["A", "B", "C"] * 100}), + "pd", + 20, + True, + ), # Normal unique values - Pandas + # Polars DataFrames + ( + pl.DataFrame({"category1": pl.Series([str(i) for i in range(25)])}), + "pl", + 20, + False, + ), # Too many unique values - Polars + ( + pl.DataFrame({"category1": pl.Series(["A", "B", "C"] * 100)}), + "pl", + 20, + True, + ), # Normal unique values - Polars + # Modin DataFrames + ( + mpd.DataFrame({"category1": [str(i) for i in range(25)]}), + "mpd", + 20, + False, + ), # Too many unique values - Modin + ( + mpd.DataFrame({"category1": ["A", "B", "C"] * 100}), + "mpd", + 20, + True, + ), # Normal unique values - Modin + ], +) +def 
test_check_categorical_feature_cardinality( + dataframe, backend, max_unique_values, expected_result +): + """Tests check_categorical_feature_cardinality for various dataframe backends.""" + assert ( + check_categorical_feature_cardinality( + dataframe, backend=backend, max_unique_values=max_unique_values + ) + == expected_result + ) + + +@pytest.mark.parametrize( + "dataframe,backend,min_unique_values,expected_result", + [ + # Pandas DataFrame + ( + pd.DataFrame({"feature1": range(100)}), + "pd", + 10, + True, + ), # Enough unique values - Pandas + # Polars DataFrame + ( + pl.DataFrame({"feature1": pl.Series(range(100))}), + "pl", + 10, + True, + ), # Enough unique values - Polars + # Modin DataFrame + ( + mpd.DataFrame({"feature1": [1, 1, 1, 2, 2, 2, 3, 3]}), + "mpd", + 10, + False, + ), # Too few unique values - Modin + ( + mpd.DataFrame({"feature1": range(100)}), + "mpd", + 10, + True, + ), # Enough unique values - Modin + ], +) +def test_check_numerical_feature_uniqueness( + dataframe, backend, min_unique_values, expected_result +): + """Tests check_numerical_feature_uniqueness for various dataframes and backends.""" + assert ( + check_numerical_feature_uniqueness( + dataframe, backend=backend, min_unique_values=min_unique_values + ) + == expected_result + ) + + +@pytest.mark.parametrize( + "dataframe,backend,expected_result", + [ + # Pandas DataFrame + ( + pd.DataFrame({"binary_feature": [0, 1] * 50}), + "pd", + False, + ), # Binary numerical feature - Pandas + ( + pd.DataFrame({"feature1": range(100)}), + "pd", + True, + ), # No binary feature - Pandas + # Polars DataFrame + ( + pl.DataFrame({"binary_feature": pl.Series([0, 1] * 50)}), + "pl", + False, + ), # Binary numerical feature - Polars + ( + pl.DataFrame({"feature1": pl.Series(range(100))}), + "pl", + True, + ), # No binary feature - Polars + # Modin DataFrame + ( + mpd.DataFrame({"binary_feature": [0, 1] * 50}), + "mpd", + False, + ), # Binary numerical feature - Modin + ( + mpd.DataFrame({"feature1": range(100)}), + "mpd", + True, + ), # No binary feature - Modin + ], +) +def test_check_binary_numerical_features(dataframe, backend, expected_result): + """Tests check_binary_numerical_features for various dataframes and backends.""" + assert ( + check_binary_numerical_features(dataframe, backend=backend) == expected_result + ) + + +@pytest.mark.parametrize( + "dataframe,target_col,backend,expected_result", + [ + ( + pd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), + "target", + "pd", + False, + ), + ( + pd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), + "target", + "pd", + True, + ), + ( + pl.DataFrame( + { + "feature1": pl.Series(range(100)), + "target": pl.Series([1] * 90 + [0] * 10), + } + ), + "target", + "pl", + False, + ), + ( + pl.DataFrame( + { + "feature1": pl.Series(range(100)), + "target": pl.Series([0, 1] * 50), + } + ), + "target", + "pl", + True, + ), + ( + mpd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), + "target", + "mpd", + False, + ), + ( + mpd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), + "target", + "mpd", + True, + ), + ], +) +def test_check_class_balance(dataframe, target_col, backend, expected_result): + """Tests check_class_balance for various dataframes and backends.""" + result = check_class_balance(dataframe, target_col=target_col, backend=backend) + assert ( + result == expected_result + ), f"Expected {expected_result}, but got {result} for backend {backend}" diff --git 
a/tutorial_notebooks/introduction/1_target_shifter.ipynb b/tutorial_notebooks/introduction/1_target_shifter.ipynb index 75dafa2..719d1ad 100644 --- a/tutorial_notebooks/introduction/1_target_shifter.ipynb +++ b/tutorial_notebooks/introduction/1_target_shifter.ipynb @@ -369,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "051a47f4-b8dd-46e3-92c1-39b49ee04f51", "metadata": { "tags": [] @@ -384,7 +384,20 @@ "Using 'realgdp' as the target column for future prediction.\n", "======================================================================\n", "======================================================================\n", - "Initializing TimeFrame for the Modin backend...\n", + "Initializing TimeFrame for the Modin backend...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-09-24 00:35:50,498\tINFO worker.py:1786 -- Started a local Ray instance.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Original DataFrame:\n", " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", @@ -401,7 +414,8 @@ "4 3.50 5.2 180.007 2.31 1.19 \n", "======================================================================\n", "\n", - "Applying Target Shifter in machine_learning mode...\n" + "Applying Target Shifter in machine_learning mode...\n", + "Initialized TemporalTargetShifter with target_col=realgdp, mode=machine_learning, n_lags=1\n" ] }, { @@ -416,30 +430,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "Rows before shift: 202; Rows after shift: 202; Rows dropped: 0\n" - ] - }, - { - "ename": "TypeError", - "evalue": "Input DataFrame type does not match the specified backend 'mpd'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 30\u001b[0m\n\u001b[1;32m 27\u001b[0m shifter \u001b[38;5;241m=\u001b[39m TemporalTargetShifter(n_lags\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, mode\u001b[38;5;241m=\u001b[39mMODE_MACHINE_LEARNING, target_col\u001b[38;5;241m=\u001b[39mtarget_col, verbose\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# Apply the shifter\u001b[39;00m\n\u001b[0;32m---> 30\u001b[0m shifted_df \u001b[38;5;241m=\u001b[39m \u001b[43mshifter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodin_tf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;66;03m# Print the shifted data\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShifted data:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m/notebooks/TemporalScope/src/temporalscope/core/temporal_target_shifter.py:478\u001b[0m, in \u001b[0;36mTemporalTargetShifter.fit_transform\u001b[0;34m(self, tf)\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;66;03m# Return TimeFrame if input was TimeFrame, otherwise return DataFrame\u001b[39;00m\n\u001b[1;32m 477\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(tf, TimeFrame):\n\u001b[0;32m--> 478\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mTimeFrame\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 479\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mtransformed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 480\u001b[0m \u001b[43m \u001b[49m\u001b[43mtime_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtime_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 481\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 482\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_col\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m_shift_\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_lags\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 483\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mMODE_MACHINE_LEARNING\u001b[49m\n\u001b[1;32m 484\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_col\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m_sequence\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 485\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[43m \u001b[49m\u001b[43mbackend\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackend\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 487\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 488\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m transformed\n", - "File \u001b[0;32m/notebooks/TemporalScope/src/temporalscope/core/temporal_data_loader.py:147\u001b[0m, in \u001b[0;36mTimeFrame.__init__\u001b[0;34m(self, df, time_col, target_col, backend, sort)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sort \u001b[38;5;241m=\u001b[39m sort\n\u001b[1;32m 146\u001b[0m \u001b[38;5;66;03m# Convert, validate, and set up the DataFrame\u001b[39;00m\n\u001b[0;32m--> 147\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_setup_timeframe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/notebooks/TemporalScope/src/temporalscope/core/temporal_data_loader.py:252\u001b[0m, in \u001b[0;36mTimeFrame._setup_timeframe\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 240\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Sets up the TimeFrame object by converting, validating, and preparing data as required.\u001b[39;00m\n\u001b[1;32m 241\u001b[0m \n\u001b[1;32m 242\u001b[0m \u001b[38;5;124;03m:param df: The input DataFrame to be processed.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[38;5;124;03m:raises TypeError: If the 
DataFrame type does not match the backend.\u001b[39;00m\n\u001b[1;32m 250\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 251\u001b[0m \u001b[38;5;66;03m# Convert and validate the input DataFrame\u001b[39;00m\n\u001b[0;32m--> 252\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mvalidate_and_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_backend\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 254\u001b[0m \u001b[38;5;66;03m# Validate the presence of required columns\u001b[39;00m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_columns(df)\n", - "File \u001b[0;32m/notebooks/TemporalScope/src/temporalscope/core/core_utils.py:145\u001b[0m, in \u001b[0;36mvalidate_and_convert_input\u001b[0;34m(df, backend)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(df, dataframe_type):\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m conversion_func(df)\n\u001b[0;32m--> 145\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInput DataFrame type \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(df)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not match the specified backend \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbackend\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mTypeError\u001b[0m: Input DataFrame type does not match the specified backend 'mpd'" + "Rows before shift: 202; Rows after shift: 202; Rows dropped: 0\n", + "======================================================================\n", + "Shifted data:\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n" ] } ], "source": [ "import modin.pandas as mpd\n", + "\n", "from temporalscope.core.core_utils import BACKEND_MODIN\n", "from temporalscope.core.temporal_data_loader import TimeFrame\n", "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", + "from temporalscope.core.core_utils import print_divider\n", "\n", "# Constants for modes\n", "MODE_MACHINE_LEARNING = \"machine_learning\"\n", @@ -451,7 +467,7 @@ "print_divider()\n", "print(\"Initializing TimeFrame for the Modin backend...\")\n", "macro_modin_df = mpd.DataFrame(macro_df)\n", - "modin_tf = TimeFrame(macro_modin_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_MODIN)\n", + "modin_tf = TimeFrame(macro_modin_df, time_col=\"ds\", target_col=target_col, dataframe_backend=BACKEND_MODIN)\n", "\n", "# Step 3: Preview the original data\n", "print(\"Original DataFrame:\")\n", @@ -462,24 +478,168 @@ "print(f\"\\nApplying Target Shifter in {MODE_MACHINE_LEARNING} 
mode...\")\n",
     "\n",
     "# Setup the TemporalTargetShifter\n",
-    "shifter = TemporalTargetShifter(n_lags=1, mode=MODE_MACHINE_LEARNING, target_col=target_col, verbose=True)\n",
+    "shifter = TemporalTargetShifter(target_col=target_col, n_lags=1, mode=MODE_MACHINE_LEARNING, verbose=True)\n",
     "\n",
     "# Apply the shifter\n",
-    "shifted_df = shifter.fit_transform(modin_tf)\n",
+    "shifted_tf = shifter.fit_transform(modin_tf)\n",
+    "shifted_df = shifted_tf.get_data()\n",
     "\n",
     "# Print the shifted data\n",
+    "print_divider()\n",
     "print(\"Shifted data:\")\n",
-    "print(shifted_df.head())"
+    "print(shifted_df.head())\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "5ff95236-87eb-487e-9a65-fce69340d3f6",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div><!-- HTML preview of shifted_df.head(); identical to the text/plain table below --></div>\n",
+       "
" + ], + "text/plain": [ + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "shifted_df.head()" ] @@ -494,104 +654,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "9c6ef6be-d13b-4576-bdef-fa4afbb687a5", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "# Step 5: Apply the TemporalTargetShifter in deep learning mode\n", - "MODE_DEEP_LEARNING = \"deep_learning\"\n", - "\n", - "print(f\"\\nApplying Target Shifter in {MODE_DEEP_LEARNING} mode...\")\n", - "\n", - "# Setup the TemporalTargetShifter for deep learning mode with a sequence length\n", - "sequence_length = 3 # Length of sequence for deep learning\n", - "shifter_dl = TemporalTargetShifter(n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=sequence_length, verbose=True)\n", - "\n", - "# Apply the shifter\n", - "shifted_dl_df = shifter_dl.fit_transform(modin_tf)\n", - "\n", - "# Print the shifted data with sequences\n", - "print(\"Shifted data for deep learning mode (sequences):\")\n", - "print(shifted_dl_df.head())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "369d0213-0bca-4c05-af9e-42daa260b3fe", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "shifted_dl_df" - ] - }, - { - "cell_type": "markdown", - "id": "b0cbc6e3-a665-45f2-a9aa-60b9057d5540", - "metadata": {}, - "source": [ - "## Part 4: Shifting for all backends" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "170bad23-b236-4837-b042-7218622c4e62", - "metadata": { - "tags": [] - }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "======================================================================\n", - "Loading the 'macrodata' dataset from the open-license statsmodels package.\n", - "Using 'realgdp' as the target column for future prediction.\n", - "======================================================================\n", - "======================================================================\n", - "Demonstrating Target Shifter for backend: pd\n", - "Preview of the TimeFrame DataFrame:\n", - " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", - "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", - "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", - "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", - "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", - "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", "\n", - " tbilrate unemp pop infl realint \n", - "0 2.82 5.8 177.146 0.00 0.00 \n", - "1 3.08 5.1 177.830 2.34 0.74 \n", - "2 3.82 5.3 178.657 2.74 1.09 \n", - "3 4.33 5.6 179.386 0.27 4.06 \n", - "4 3.50 5.2 180.007 2.31 1.19 \n", + "Applying Target Shifter in deep_learning mode...\n", + "Initialized TemporalTargetShifter with 
target_col=realgdp, mode=deep_learning, n_lags=1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n", + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows before shift: 200; Rows after shift: 200; Rows dropped: 0\n", "======================================================================\n", + "Shifted data for deep learning mode (sequences):\n", "\n", - "Applying Target Shifter in machine_learning mode...\n", - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", - " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", - "\n", - " unemp pop infl realint realgdp_shift_1 \n", - "0 5.8 177.146 0.00 0.00 2778.801 \n", - "1 5.1 177.830 2.34 0.74 2775.488 \n", - "2 5.3 178.657 2.74 1.09 2785.204 \n", - "3 5.6 179.386 0.27 4.06 2847.699 \n", - "4 5.2 180.007 2.31 1.19 2834.390 \n", - "\n", - "Applying Target Shifter in deep_learning mode...\n", - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", @@ -607,292 +700,1197 @@ "4 5.2 180.007 2.31 1.19 2834.390 \n", "\n", " realgdp_sequence \n", - "0 [2710.349, 2778.801, 2775.488] \n", - "1 [2778.801, 2775.488, 2785.204] \n", - "2 [2775.488, 2785.204, 2847.699] \n", - "3 [2785.204, 2847.699, 2834.39] \n", - "4 [2847.699, 2834.39, 2839.022] \n", - "======================================================================\n", - "Demonstrating Target Shifter for backend: pl\n", - "Preview of the TimeFrame DataFrame:\n", - "shape: (5, 13)\n", - "┌─────────────────────┬──────────┬──────────┬─────────┬───┬───────┬─────────┬──────┬─────────┐\n", - "│ ds ┆ realgdp ┆ realcons ┆ realinv ┆ … ┆ unemp ┆ pop ┆ infl ┆ realint │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞═════════════════════╪══════════╪══════════╪═════════╪═══╪═══════╪═════════╪══════╪═════════╡\n", - "│ 1959-01-01 00:00:00 ┆ 2710.349 ┆ 1707.4 ┆ 286.898 ┆ … ┆ 5.8 ┆ 177.146 ┆ 0.0 ┆ 0.0 │\n", - "│ 1959-04-01 00:00:00 ┆ 2778.801 ┆ 1733.7 ┆ 310.859 ┆ … ┆ 5.1 ┆ 177.83 ┆ 2.34 ┆ 0.74 │\n", - "│ 1959-07-01 00:00:00 ┆ 2775.488 ┆ 1751.8 ┆ 289.226 ┆ … ┆ 5.3 ┆ 178.657 ┆ 2.74 ┆ 1.09 │\n", - "│ 1959-10-01 00:00:00 ┆ 2785.204 ┆ 1753.7 ┆ 299.356 ┆ … ┆ 5.6 ┆ 179.386 ┆ 0.27 ┆ 4.06 │\n", - "│ 1960-01-01 00:00:00 ┆ 2847.699 ┆ 1770.5 ┆ 331.722 ┆ … ┆ 5.2 ┆ 180.007 ┆ 2.31 ┆ 1.19 │\n", - "└─────────────────────┴──────────┴──────────┴─────────┴───┴───────┴─────────┴──────┴─────────┘\n", - "======================================================================\n", - "\n", - "Applying Target Shifter in machine_learning mode...\n", - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", - "shape: (5, 13)\n", - 
"┌──────────────┬──────────┬─────────┬──────────┬───┬─────────┬──────┬─────────┬─────────────────┐\n", - "│ ds ┆ realcons ┆ realinv ┆ realgovt ┆ … ┆ pop ┆ infl ┆ realint ┆ realgdp_shift_1 │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞══════════════╪══════════╪═════════╪══════════╪═══╪═════════╪══════╪═════════╪═════════════════╡\n", - "│ 1959-01-01 ┆ 1707.4 ┆ 286.898 ┆ 470.045 ┆ … ┆ 177.146 ┆ 0.0 ┆ 0.0 ┆ 2778.801 │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1959-04-01 ┆ 1733.7 ┆ 310.859 ┆ 481.301 ┆ … ┆ 177.83 ┆ 2.34 ┆ 0.74 ┆ 2775.488 │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1959-07-01 ┆ 1751.8 ┆ 289.226 ┆ 491.26 ┆ … ┆ 178.657 ┆ 2.74 ┆ 1.09 ┆ 2785.204 │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1959-10-01 ┆ 1753.7 ┆ 299.356 ┆ 484.052 ┆ … ┆ 179.386 ┆ 0.27 ┆ 4.06 ┆ 2847.699 │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1960-01-01 ┆ 1770.5 ┆ 331.722 ┆ 462.199 ┆ … ┆ 180.007 ┆ 2.31 ┆ 1.19 ┆ 2834.39 │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└──────────────┴──────────┴─────────┴──────────┴───┴─────────┴──────┴─────────┴─────────────────┘\n", - "\n", - "Applying Target Shifter in deep_learning mode...\n", - "Rows before shift: 203; Rows after shift: 203; Rows dropped: 0\n", - "Shifted data:\n", - "shape: (5, 13)\n", - "┌──────────────┬──────────┬─────────┬──────────┬───┬─────────┬──────┬─────────┬────────────────────┐\n", - "│ ds ┆ realcons ┆ realinv ┆ realgovt ┆ … ┆ pop ┆ infl ┆ realint ┆ realgdp_sequence │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ list[f64] │\n", - "╞══════════════╪══════════╪═════════╪══════════╪═══╪═════════╪══════╪═════════╪════════════════════╡\n", - "│ 1959-01-01 ┆ 1707.4 ┆ 286.898 ┆ 470.045 ┆ … ┆ 177.146 ┆ 0.0 ┆ 0.0 ┆ [2710.349, │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2778.801, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2775.488] │\n", - "│ 1959-04-01 ┆ 1733.7 ┆ 310.859 ┆ 481.301 ┆ … ┆ 177.83 ┆ 2.34 ┆ 0.74 ┆ [2778.801, │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2775.488, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2785.204] │\n", - "│ 1959-07-01 ┆ 1751.8 ┆ 289.226 ┆ 491.26 ┆ … ┆ 178.657 ┆ 2.74 ┆ 1.09 ┆ [2775.488, │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2785.204, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2847.699] │\n", - "│ 1959-10-01 ┆ 1753.7 ┆ 299.356 ┆ 484.052 ┆ … ┆ 179.386 ┆ 0.27 ┆ 4.06 ┆ [2785.204, │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2847.699, 2834.39] │\n", - "│ 1960-01-01 ┆ 1770.5 ┆ 331.722 ┆ 462.199 ┆ … ┆ 180.007 ┆ 2.31 ┆ 1.19 ┆ [2847.699, │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2834.39, 2839.022] │\n", - "└──────────────┴──────────┴─────────┴──────────┴───┴─────────┴──────┴─────────┴────────────────────┘\n", - "======================================================================\n", - "Demonstrating Target Shifter for backend: mpd\n", - "Preview of the TimeFrame DataFrame:\n", - " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", - "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", - "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", - "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", - "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", - "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", - "\n", - " tbilrate unemp pop infl realint \n", - "0 2.82 5.8 177.146 0.00 0.00 \n", - "1 3.08 5.1 177.830 2.34 0.74 \n", - "2 3.82 5.3 178.657 2.74 1.09 \n", - "3 4.33 5.6 179.386 0.27 4.06 \n", - "4 3.50 5.2 180.007 2.31 1.19 \n", - 
"======================================================================\n", - "\n", - "Applying Target Shifter in machine_learning mode...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", - " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", - "\n", - " unemp pop infl realint realgdp_shift_1 \n", - "0 5.8 177.146 0.00 0.00 2778.801 \n", - "1 5.1 177.830 2.34 0.74 2775.488 \n", - "2 5.3 178.657 2.74 1.09 2785.204 \n", - "3 5.6 179.386 0.27 4.06 2847.699 \n", - "4 5.2 180.007 2.31 1.19 2834.390 \n", - "\n", - "Applying Target Shifter in deep_learning mode...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n", - "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", - " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", - "\n", - " unemp pop infl realint realgdp_shift_1 \\\n", - "0 5.8 177.146 0.00 0.00 2778.801 \n", - "1 5.1 177.830 2.34 0.74 2775.488 \n", - "2 5.3 178.657 2.74 1.09 2785.204 \n", - "3 5.6 179.386 0.27 4.06 2847.699 \n", - "4 5.2 180.007 2.31 1.19 2834.390 \n", - "\n", - " realgdp_sequence \n", - "0 [2710.349, 2778.801, 2775.488] \n", - "1 [2778.801, 2775.488, 2785.204] \n", - "2 [2775.488, 2785.204, 2847.699] \n", - "3 [2785.204, 2847.699, 2834.39] \n", - "4 [2847.699, 2834.39, 2839.022] \n" + "0 (2710.349, 2778.801, 2775.488) \n", + "1 (2778.801, 2775.488, 2785.204) \n", + "2 (2775.488, 2785.204, 2847.699) \n", + "3 (2785.204, 2847.699, 2834.39) \n", + "4 (2847.699, 2834.39, 2839.022) \n" ] } ], "source": [ - "import modin.pandas as mpd\n", - "import polars as pl\n", - "\n", - "from temporalscope.core.core_utils import BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS, print_divider\n", - "from temporalscope.core.temporal_data_loader import TimeFrame as tf\n", - "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", - "\n", - "# Constants for modes\n", - "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "# Step 5: Apply the TemporalTargetShifter in deep learning mode\n", "MODE_DEEP_LEARNING = \"deep_learning\"\n", "\n", - "def load_macrodata(target_col: str = \"realgdp\"):\n", - " \"\"\"Preprocess the dataset with a combined column for time & shifted target.\n", - "\n", - " :param target_col: The column to be used as the target for prediction\n", - " :type 
target_col: str, optional\n", - " :default target_col: 'realgdp'\n", - "\n", - " :return: Preprocessed DataFrame with shifted target\n", - " :rtype: pd.DataFrame\n", - " \"\"\"\n", - " print_divider()\n", - " print(\"Loading the 'macrodata' dataset from the open-license statsmodels package.\")\n", - " print(f\"Using '{target_col}' as the target column for future prediction.\")\n", - " print_divider()\n", - "\n", - " # Load macrodata dataset\n", - " macro_df = macrodata.load_pandas().data.copy()\n", - "\n", - " # Create 'ds' column by combining 'year' and 'quarter'\n", - " macro_df[\"ds\"] = pd.to_datetime(\n", - " macro_df[\"year\"].astype(int).astype(str)\n", - " + \"-\"\n", - " + ((macro_df[\"quarter\"] - 1) * 3 + 1).astype(int).astype(str)\n", - " + \"-01\"\n", - " )\n", - "\n", - " # Drop the 'year' and 'quarter' columns\n", - " macro_df = macro_df.drop(columns=[\"year\", \"quarter\"])\n", - "\n", - " # Reorder columns to place 'ds' first\n", - " cols = [\"ds\"] + [col for col in macro_df.columns if col != \"ds\"]\n", - " macro_df = macro_df[cols].copy()\n", - "\n", - " return macro_df, target_col\n", - "\n", - "\n", - "def init_timeframes_for_backends(macro_df, target_col: str):\n", - " \"\"\"Initialize TimeFrame objects for all backends (Pandas, Polars, Modin) using constants.\n", - "\n", - " :param macro_df: Preprocessed macro dataset.\n", - " :type macro_df: pd.DataFrame\n", - " :param target_col: The target column for prediction.\n", - " :type target_col: str\n", - " :return: A dictionary containing TimeFrame objects for Pandas, Polars, and Modin.\n", - " :rtype: dict\n", - " \"\"\"\n", - " timeframes = {}\n", - "\n", - " # Pandas backend\n", - " macro_pandas_df = pd.DataFrame(macro_df)\n", - " timeframes[BACKEND_PANDAS] = tf(\n", - " macro_pandas_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_PANDAS\n", - " )\n", - "\n", - " # Polars backend\n", - " macro_polars_df = pl.DataFrame(macro_df)\n", - " timeframes[BACKEND_POLARS] = tf(\n", - " macro_polars_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_POLARS\n", - " )\n", - "\n", - " # Modin backend\n", - " macro_modin_df = mpd.DataFrame(macro_df)\n", - " timeframes[BACKEND_MODIN] = tf(\n", - " macro_modin_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_MODIN\n", - " )\n", - "\n", - " return timeframes\n", - "\n", - "\n", - "def apply_target_shifter(tf_obj, mode: str):\n", - " \"\"\"Apply the TemporalTargetShifter in the specified mode.\n", - "\n", - " :param tf_obj: TimeFrame object to apply the shifter to.\n", - " :param mode: Mode of operation (machine_learning or deep_learning).\n", - " \"\"\"\n", - " print(f\"\\nApplying Target Shifter in {mode} mode...\")\n", - "\n", - " # Setup the TemporalTargetShifter\n", - " if mode == MODE_MACHINE_LEARNING:\n", - " shifter = TemporalTargetShifter(n_lags=1, mode=MODE_MACHINE_LEARNING, verbose=True)\n", - " elif mode == MODE_DEEP_LEARNING:\n", - " # In deep learning mode, sequence_length must be provided\n", - " shifter = TemporalTargetShifter(n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=3, verbose=True)\n", - " else:\n", - " raise ValueError(f\"Invalid mode: {mode}\")\n", - "\n", - " # Apply the shifter\n", - " shifted_df = shifter.fit_transform(tf_obj)\n", - "\n", - " # Print the result (since it's already a DataFrame, no need for get_data())\n", - " print(\"Shifted data:\")\n", - " print(shifted_df.head())\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " # Load the macrodata dataset and preprocess\n", - " macro_df, target_col = 
load_macrodata()\n", + "print(f\"\\nApplying Target Shifter in {MODE_DEEP_LEARNING} mode...\")\n", "\n", - " # Initialize TimeFrame objects for various backends using constants\n", - " timeframes = init_timeframes_for_backends(macro_df, target_col)\n", + "# Set up the TemporalTargetShifter for deep learning mode with a sequence length\n", + "sequence_length = 3 # Length of sequence for deep learning\n", + "shifter_dl = TemporalTargetShifter(target_col=target_col, n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=sequence_length, verbose=True)\n", "\n", - " # Apply and demonstrate shifting for all backends\n", - " for backend, tf_obj in timeframes.items():\n", - " print_divider()\n", - " print(f\"Demonstrating Target Shifter for backend: {backend}\")\n", - " print(\"Preview of the TimeFrame DataFrame:\")\n", - " print(tf_obj.get_data().head())\n", - " print_divider()\n", + "# Apply the shifter\n", + "shifted_dl_tf = shifter_dl.fit_transform(modin_tf)\n", "\n", - " # Apply target shifting in machine learning mode\n", - " apply_target_shifter(tf_obj, MODE_MACHINE_LEARNING)\n", + "# Since the input was a TimeFrame, fit_transform returns a TimeFrame, so extract the underlying DataFrame\n", + "shifted_dl_df = shifted_dl_tf.get_data()\n", "\n", - " # Apply target shifting in deep learning mode\n", - " apply_target_shifter(tf_obj, MODE_DEEP_LEARNING)\n" + "# Print the shifted data with sequences\n", + "print_divider()\n", + "print(\"Shifted data for deep learning mode (sequences):\\n\")\n", + "print(shifted_dl_df.head())\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "369d0213-0bca-4c05-af9e-42daa260b3fe", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
[styled HTML rendering omitted; duplicate of the text/plain table below (200 rows x 14 columns)]\n", + "
" + ], + "text/plain": [ + " ds realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.980 139.7 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.150 141.7 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.350 140.5 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.370 140.0 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.540 139.6 \n", + ".. ... ... ... ... ... ... ... \n", + "195 2007-10-01 9363.6 2123.426 925.110 9886.2 212.495 1377.4 \n", + "196 2008-01-01 9349.6 2082.886 943.372 9826.8 213.997 1384.0 \n", + "197 2008-04-01 9351.0 2026.518 961.280 10059.0 218.610 1409.3 \n", + "198 2008-07-01 9267.7 1990.693 991.551 9838.3 216.889 1474.7 \n", + "199 2008-10-01 9195.3 1857.661 1007.273 9920.4 212.174 1576.5 \n", + "\n", + " tbilrate unemp pop infl realint realgdp_shift_1 \\\n", + "0 2.82 5.8 177.146 0.00 0.00 2778.801 \n", + "1 3.08 5.1 177.830 2.34 0.74 2775.488 \n", + "2 3.82 5.3 178.657 2.74 1.09 2785.204 \n", + "3 4.33 5.6 179.386 0.27 4.06 2847.699 \n", + "4 3.50 5.2 180.007 2.31 1.19 2834.390 \n", + ".. ... ... ... ... ... ... \n", + "195 3.01 4.8 303.204 6.38 -3.37 13366.865 \n", + "196 1.56 4.9 303.803 2.82 -1.26 13415.266 \n", + "197 1.74 5.4 304.483 8.53 -6.79 13324.600 \n", + "198 1.17 6.0 305.270 -3.16 4.33 13141.920 \n", + "199 0.12 6.9 305.952 -8.79 8.91 12925.410 \n", + "\n", + " realgdp_sequence \n", + "0 (2710.349, 2778.801, 2775.488) \n", + "1 (2778.801, 2775.488, 2785.204) \n", + "2 (2775.488, 2785.204, 2847.699) \n", + "3 (2785.204, 2847.699, 2834.39) \n", + "4 (2847.699, 2834.39, 2839.022) \n", + ".. ... \n", + "195 (13391.249, 13366.865, 13415.266) \n", + "196 (13366.865, 13415.266, 13324.6) \n", + "197 (13415.266, 13324.6, 13141.92) \n", + "198 (13324.6, 13141.92, 12925.41) \n", + "199 (13141.92, 12925.41, 12901.504) \n", + "\n", + "[200 rows x 14 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "shifted_dl_df" + ] + }, + { + "cell_type": "markdown", + "id": "b0cbc6e3-a665-45f2-a9aa-60b9057d5540", + "metadata": {}, + "source": [ + "## Part 4: Shifting for all backends" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "170bad23-b236-4837-b042-7218622c4e62", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "Loading the 'macrodata' dataset from statsmodels.\n", + "Using 'realgdp' as the target column for future prediction.\n", + "======================================================================\n", + "======================================================================\n", + "Initializing TimeFrame objects for Pandas, Polars, and Modin...\n", + "======================================================================\n", + "Demonstrating Target Shifter for backend: pd\n", + "Preview of the TimeFrame DataFrame:\n", + " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", + "\n", + " tbilrate unemp pop infl realint \n", + "0 2.82 5.8 177.146 0.00 0.00 \n", + "1 3.08 5.1 177.830 2.34 0.74 \n", + "2 3.82 5.3 178.657 
2.74 1.09 \n", + "3 4.33 5.6 179.386 0.27 4.06 \n", + "4 3.50 5.2 180.007 2.31 1.19 \n", + "======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode for backend: pd...\n", + "Initialized TemporalTargetShifter with target_col=realgdp, mode=machine_learning, n_lags=1\n", + "Rows before shift: 202; Rows after shift: 202; Rows dropped: 0\n", + "Shifted data (Machine Learning Mode):\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "======================================================================\n", + "\n", + "Applying Target Shifter in deep_learning mode for backend: pd...\n", + "Initialized TemporalTargetShifter with target_col=realgdp, mode=deep_learning, n_lags=1\n", + "Rows before shift: 200; Rows after shift: 200; Rows dropped: 0\n", + "Shifted data (Deep Learning Mode):\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \\\n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "\n", + " realgdp_sequence \n", + "0 (2710.349, 2778.801, 2775.488) \n", + "1 (2778.801, 2775.488, 2785.204) \n", + "2 (2775.488, 2785.204, 2847.699) \n", + "3 (2785.204, 2847.699, 2834.39) \n", + "4 (2847.699, 2834.39, 2839.022) \n", + "======================================================================\n", + "Demonstrating Target Shifter for backend: pl\n", + "Preview of the TimeFrame DataFrame:\n", + "shape: (5, 13)\n", + "┌─────────────────────┬──────────┬──────────┬─────────┬───┬───────┬─────────┬──────┬─────────┐\n", + "│ ds ┆ realgdp ┆ realcons ┆ realinv ┆ … ┆ unemp ┆ pop ┆ infl ┆ realint │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═════════════════════╪══════════╪══════════╪═════════╪═══╪═══════╪═════════╪══════╪═════════╡\n", + "│ 1959-01-01 00:00:00 ┆ 2710.349 ┆ 1707.4 ┆ 286.898 ┆ … ┆ 5.8 ┆ 177.146 ┆ 0.0 ┆ 0.0 │\n", + "│ 1959-04-01 00:00:00 ┆ 2778.801 ┆ 1733.7 ┆ 310.859 ┆ … ┆ 5.1 ┆ 177.83 ┆ 2.34 ┆ 0.74 │\n", + "│ 1959-07-01 00:00:00 ┆ 2775.488 ┆ 1751.8 ┆ 289.226 ┆ … ┆ 5.3 ┆ 178.657 ┆ 2.74 ┆ 1.09 │\n", + "│ 1959-10-01 00:00:00 ┆ 2785.204 ┆ 1753.7 ┆ 299.356 ┆ … ┆ 5.6 ┆ 179.386 ┆ 0.27 ┆ 4.06 │\n", + "│ 1960-01-01 00:00:00 ┆ 2847.699 ┆ 1770.5 ┆ 331.722 ┆ … ┆ 5.2 ┆ 180.007 ┆ 2.31 ┆ 1.19 │\n", + 
"└─────────────────────┴──────────┴──────────┴─────────┴───┴───────┴─────────┴──────┴─────────┘\n", + "======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode for backend: pl...\n", + "Initialized TemporalTargetShifter with target_col=realgdp, mode=machine_learning, n_lags=1\n", + "Rows before shift: 202; Rows after shift: 202; Rows dropped: 0\n", + "Shifted data (Machine Learning Mode):\n", + "shape: (5, 13)\n", + "┌──────────────┬──────────┬─────────┬──────────┬───┬─────────┬──────┬─────────┬─────────────────┐\n", + "│ ds ┆ realcons ┆ realinv ┆ realgovt ┆ … ┆ pop ┆ infl ┆ realint ┆ realgdp_shift_1 │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════════╪══════════╪═════════╪══════════╪═══╪═════════╪══════╪═════════╪═════════════════╡\n", + "│ 1959-01-01 ┆ 1707.4 ┆ 286.898 ┆ 470.045 ┆ … ┆ 177.146 ┆ 0.0 ┆ 0.0 ┆ 2778.801 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1959-04-01 ┆ 1733.7 ┆ 310.859 ┆ 481.301 ┆ … ┆ 177.83 ┆ 2.34 ┆ 0.74 ┆ 2775.488 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1959-07-01 ┆ 1751.8 ┆ 289.226 ┆ 491.26 ┆ … ┆ 178.657 ┆ 2.74 ┆ 1.09 ┆ 2785.204 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1959-10-01 ┆ 1753.7 ┆ 299.356 ┆ 484.052 ┆ … ┆ 179.386 ┆ 0.27 ┆ 4.06 ┆ 2847.699 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 1960-01-01 ┆ 1770.5 ┆ 331.722 ┆ 462.199 ┆ … ┆ 180.007 ┆ 2.31 ┆ 1.19 ┆ 2834.39 │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└──────────────┴──────────┴─────────┴──────────┴───┴─────────┴──────┴─────────┴─────────────────┘\n", + "======================================================================\n", + "\n", + "Applying Target Shifter in deep_learning mode for backend: pl...\n", + "Initialized TemporalTargetShifter with target_col=realgdp, mode=deep_learning, n_lags=1\n", + "Rows before shift: 201; Rows after shift: 201; Rows dropped: 0\n", + "Shifted data (Deep Learning Mode):\n", + "shape: (5, 13)\n", + "┌──────────────┬──────────┬─────────┬──────────┬───┬─────────┬──────┬─────────┬────────────────────┐\n", + "│ ds ┆ realcons ┆ realinv ┆ realgovt ┆ … ┆ pop ┆ infl ┆ realint ┆ realgdp_sequence │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ list[f64] │\n", + "╞══════════════╪══════════╪═════════╪══════════╪═══╪═════════╪══════╪═════════╪════════════════════╡\n", + "│ 1959-01-01 ┆ 1707.4 ┆ 286.898 ┆ 470.045 ┆ … ┆ 177.146 ┆ 0.0 ┆ 0.0 ┆ [2710.349, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2778.801, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2775.488] │\n", + "│ 1959-04-01 ┆ 1733.7 ┆ 310.859 ┆ 481.301 ┆ … ┆ 177.83 ┆ 2.34 ┆ 0.74 ┆ [2778.801, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2775.488, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2785.204] │\n", + "│ 1959-07-01 ┆ 1751.8 ┆ 289.226 ┆ 491.26 ┆ … ┆ 178.657 ┆ 2.74 ┆ 1.09 ┆ [2775.488, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2785.204, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2847.699] │\n", + "│ 1959-10-01 ┆ 1753.7 ┆ 299.356 ┆ 484.052 ┆ … ┆ 179.386 ┆ 0.27 ┆ 4.06 ┆ [2785.204, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2847.699, 2834.39] │\n", + "│ 1960-01-01 ┆ 1770.5 ┆ 331.722 ┆ 462.199 ┆ … ┆ 180.007 ┆ 2.31 ┆ 1.19 ┆ [2847.699, │\n", + "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2834.39, 2839.022] │\n", + "└──────────────┴──────────┴─────────┴──────────┴───┴─────────┴──────┴─────────┴────────────────────┘\n", + "======================================================================\n", + "Demonstrating Target Shifter for backend: mpd\n", + "Preview of the 
TimeFrame DataFrame:\n", + " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", + "\n", + " tbilrate unemp pop infl realint \n", + "0 2.82 5.8 177.146 0.00 0.00 \n", + "1 3.08 5.1 177.830 2.34 0.74 \n", + "2 3.82 5.3 178.657 2.74 1.09 \n", + "3 4.33 5.6 179.386 0.27 4.06 \n", + "4 3.50 5.2 180.007 2.31 1.19 \n", + "======================================================================\n", + "\n", + "Applying Target Shifter in machine_learning mode for backend: mpd...\n", + "Initialized TemporalTargetShifter with target_col=realgdp, mode=machine_learning, n_lags=1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows before shift: 202; Rows after shift: 202; Rows dropped: 0\n", + "Shifted data (Machine Learning Mode):\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "======================================================================\n", + "\n", + "Applying Target Shifter in deep_learning mode for backend: mpd...\n", + "Initialized TemporalTargetShifter with target_col=realgdp, mode=deep_learning, n_lags=1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n", + "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows before shift: 200; Rows after shift: 200; Rows dropped: 0\n", + "Shifted data (Deep Learning Mode):\n", + " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", + "\n", + " unemp pop infl realint realgdp_shift_1 \\\n", + "0 5.8 177.146 0.00 0.00 2778.801 \n", + "1 5.1 177.830 2.34 0.74 2775.488 \n", + "2 5.3 178.657 2.74 1.09 2785.204 \n", + "3 5.6 179.386 0.27 4.06 2847.699 \n", + "4 5.2 180.007 2.31 1.19 2834.390 \n", + "\n", + " realgdp_sequence \n", + "0 (2710.349, 2778.801, 2775.488) \n", + "1 (2778.801, 2775.488, 2785.204) \n", + "2 (2775.488, 2785.204, 2847.699) \n", + "3 (2785.204, 2847.699, 2834.39) \n", + "4 (2847.699, 
2834.39, 2839.022) \n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import modin.pandas as mpd\n", + "import polars as pl\n", + "\n", + "from temporalscope.core.core_utils import BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS, print_divider\n", + "from temporalscope.core.temporal_data_loader import TimeFrame\n", + "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", + "\n", + "# Constants for modes\n", + "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "MODE_DEEP_LEARNING = \"deep_learning\"\n", + "\n", + "# Load the macrodata dataset and preprocess\n", + "macro_df, target_col = load_macrodata()\n", + "\n", + "# Initialize TimeFrame objects for various backends\n", + "print_divider()\n", + "print(\"Initializing TimeFrame objects for Pandas, Polars, and Modin...\")\n", + "\n", + "# Pandas backend\n", + "macro_pandas_df = pd.DataFrame(macro_df)\n", + "pandas_tf = TimeFrame(\n", + " df=macro_pandas_df, target_col=target_col, time_col=\"ds\", dataframe_backend=BACKEND_PANDAS\n", + ")\n", + "\n", + "# Polars backend\n", + "macro_polars_df = pl.DataFrame(macro_df)\n", + "polars_tf = TimeFrame(\n", + " df=macro_polars_df, target_col=target_col, time_col=\"ds\", dataframe_backend=BACKEND_POLARS\n", + ")\n", + "\n", + "# Modin backend\n", + "macro_modin_df = mpd.DataFrame(macro_df)\n", + "modin_tf = TimeFrame(\n", + " df=macro_modin_df, target_col=target_col, time_col=\"ds\", dataframe_backend=BACKEND_MODIN\n", + ")\n", + "\n", + "# Dictionary of timeframes by backend\n", + "timeframes = {\n", + " BACKEND_PANDAS: pandas_tf,\n", + " BACKEND_POLARS: polars_tf,\n", + " BACKEND_MODIN: modin_tf\n", + "}\n", + "\n", + "# Iterate through each backend and apply the TemporalTargetShifter\n", + "for backend, tf_obj in timeframes.items():\n", + " print_divider()\n", + " print(f\"Demonstrating Target Shifter for backend: {backend}\")\n", + " print(\"Preview of the TimeFrame DataFrame:\")\n", + " print(tf_obj.get_data().head())\n", + " print_divider()\n", + "\n", + " # Apply the TemporalTargetShifter in machine learning mode\n", + " print(f\"\\nApplying Target Shifter in {MODE_MACHINE_LEARNING} mode for backend: {backend}...\")\n", + " shifter_ml = TemporalTargetShifter(target_col=tf_obj.target_col, n_lags=1, mode=MODE_MACHINE_LEARNING, verbose=True)\n", + " shifted_ml_df = shifter_ml.fit_transform(tf_obj).get_data()\n", + " print(\"Shifted data (Machine Learning Mode):\")\n", + " print(shifted_ml_df.head())\n", + "\n", + " print_divider()\n", + "\n", + " # Apply the TemporalTargetShifter in deep learning mode\n", + " print(f\"\\nApplying Target Shifter in {MODE_DEEP_LEARNING} mode for backend: {backend}...\")\n", + " shifter_dl = TemporalTargetShifter(target_col=tf_obj.target_col, n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=3, verbose=True)\n", + " shifted_dl_df = shifter_dl.fit_transform(tf_obj).get_data()\n", + " print(\"Shifted data (Deep Learning Mode):\")\n", + " print(shifted_dl_df.head())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c7d45fd7-3773-4d8b-ba66-af01b4aa2dd3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[styled HTML rendering omitted; duplicate of the text/plain table below (202 rows x 13 columns)]\n", + "
" + ], + "text/plain": [ + " ds realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.980 139.7 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.150 141.7 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.350 140.5 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.370 140.0 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.540 139.6 \n", + ".. ... ... ... ... ... ... ... \n", + "197 2008-04-01 9351.0 2026.518 961.280 10059.0 218.610 1409.3 \n", + "198 2008-07-01 9267.7 1990.693 991.551 9838.3 216.889 1474.7 \n", + "199 2008-10-01 9195.3 1857.661 1007.273 9920.4 212.174 1576.5 \n", + "200 2009-01-01 9209.2 1558.494 996.287 9926.4 212.671 1592.8 \n", + "201 2009-04-01 9189.0 1456.678 1023.528 10077.5 214.469 1653.6 \n", + "\n", + " tbilrate unemp pop infl realint realgdp_shift_1 \n", + "0 2.82 5.8 177.146 0.00 0.00 2778.801 \n", + "1 3.08 5.1 177.830 2.34 0.74 2775.488 \n", + "2 3.82 5.3 178.657 2.74 1.09 2785.204 \n", + "3 4.33 5.6 179.386 0.27 4.06 2847.699 \n", + "4 3.50 5.2 180.007 2.31 1.19 2834.390 \n", + ".. ... ... ... ... ... ... \n", + "197 1.74 5.4 304.483 8.53 -6.79 13324.600 \n", + "198 1.17 6.0 305.270 -3.16 4.33 13141.920 \n", + "199 0.12 6.9 305.952 -8.79 8.91 12925.410 \n", + "200 0.22 8.1 306.547 0.94 -0.71 12901.504 \n", + "201 0.18 9.2 307.226 3.37 -3.19 12990.341 \n", + "\n", + "[202 rows x 13 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "shifted_ml_df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "797f3a7a-3db8-45ec-99a8-a9463168e40f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[styled HTML rendering omitted; duplicate of the text/plain table below (200 rows x 14 columns)]\n", + "
" + ], + "text/plain": [ + " ds realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.980 139.7 \n", + "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.150 141.7 \n", + "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.350 140.5 \n", + "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.370 140.0 \n", + "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.540 139.6 \n", + ".. ... ... ... ... ... ... ... \n", + "195 2007-10-01 9363.6 2123.426 925.110 9886.2 212.495 1377.4 \n", + "196 2008-01-01 9349.6 2082.886 943.372 9826.8 213.997 1384.0 \n", + "197 2008-04-01 9351.0 2026.518 961.280 10059.0 218.610 1409.3 \n", + "198 2008-07-01 9267.7 1990.693 991.551 9838.3 216.889 1474.7 \n", + "199 2008-10-01 9195.3 1857.661 1007.273 9920.4 212.174 1576.5 \n", + "\n", + " tbilrate unemp pop infl realint realgdp_shift_1 \\\n", + "0 2.82 5.8 177.146 0.00 0.00 2778.801 \n", + "1 3.08 5.1 177.830 2.34 0.74 2775.488 \n", + "2 3.82 5.3 178.657 2.74 1.09 2785.204 \n", + "3 4.33 5.6 179.386 0.27 4.06 2847.699 \n", + "4 3.50 5.2 180.007 2.31 1.19 2834.390 \n", + ".. ... ... ... ... ... ... \n", + "195 3.01 4.8 303.204 6.38 -3.37 13366.865 \n", + "196 1.56 4.9 303.803 2.82 -1.26 13415.266 \n", + "197 1.74 5.4 304.483 8.53 -6.79 13324.600 \n", + "198 1.17 6.0 305.270 -3.16 4.33 13141.920 \n", + "199 0.12 6.9 305.952 -8.79 8.91 12925.410 \n", + "\n", + " realgdp_sequence \n", + "0 (2710.349, 2778.801, 2775.488) \n", + "1 (2778.801, 2775.488, 2785.204) \n", + "2 (2775.488, 2785.204, 2847.699) \n", + "3 (2785.204, 2847.699, 2834.39) \n", + "4 (2847.699, 2834.39, 2839.022) \n", + ".. ... \n", + "195 (13391.249, 13366.865, 13415.266) \n", + "196 (13366.865, 13415.266, 13324.6) \n", + "197 (13415.266, 13324.6, 13141.92) \n", + "198 (13324.6, 13141.92, 12925.41) \n", + "199 (13141.92, 12925.41, 12901.504) \n", + "\n", + "[200 rows x 14 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "shifted_dl_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a379b18-ed50-4b7b-baeb-5ecc43b017db", "metadata": {}, "outputs": [], "source": [] diff --git a/tutorial_notebooks/introduction/2_partion_sliding_window.ipynb b/tutorial_notebooks/introduction/2_partion_sliding_window.ipynb deleted file mode 100644 index d1a2fd3..0000000 --- a/tutorial_notebooks/introduction/2_partion_sliding_window.ipynb +++ /dev/null @@ -1,1386 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "c8aefe6f-489e-42fe-9cb9-a20426652424", - "metadata": {}, - "source": [ - "# Temporal Scope Tutorial: Utilizing Target Shifter\n", - "\n", - "## Overview\n", - "\n", - "This tutorial demonstrates how to load macroeconomic data and apply the **TemporalTargetShifter** using the **Modin** backend. The tutorial shows how to shift the target variable in **machine learning** and **deep learning** modes for forecasting tasks. The tool supports flexible configurations for different forecasting needs.\n", - "\n", - "### Summary\n", - "\n", - "| **Step** | **Description** |\n", - "|-----------|---------------------------------------------------------------------------------|\n", - "| **1** | **Data Loading**: Load macroeconomic data and create a datetime column (`ds`). |\n", - "| **2** | **Modin Backend Initialization**: Initialize a `TimeFrame` for scalable data processing with Modin. |\n", - "| **3** | **Target Shifting (ML Mode)**: Shift the target variable (`realgdp`) for one-step-ahead forecasting in **machine learning mode**. 
|\n", - "| **4** | **Target Shifting (DL Mode)**: Shift the target variable for sequence-based forecasting in **deep learning mode**. |\n", - "\n", - "### Key Concepts\n", - "\n", - "- **One-step ahead forecasting**: Shifting the target variable to predict the next time step for machine learning models.\n", - "- **Sequence forecasting**: Generating sequences of target variables for deep learning models.\n", - "- **Modin Backend**: Scalable version of Pandas for large datasets.\n", - "- **TemporalTargetShifter**: A tool to shift target variables for forecasting tasks, supporting both machine learning and deep learning modes.\n", - "\n", - "### Steps\n", - "\n", - "1. **Load the macroeconomic dataset** using the `statsmodels` library.\n", - "2. **Initialize a TimeFrame** for the Modin backend.\n", - "3. **Apply the Target Shifter** in machine learning mode to shift the target variable by one step (for simple one-step-ahead forecasting).\n", - "4. **Apply the Target Shifter** in deep learning mode to create sequences for sequence-based forecasting tasks.\n" - ] - }, - { - "cell_type": "markdown", - "id": "b9b71cc0-f882-40b6-933d-d38cbe3a56cd", - "metadata": {}, - "source": [ - "# Part 1: Load Macro-Economic Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4b56016b-7609-4e26-bb0b-5d6e4f864c18", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "======================================================================\n", - "Loading the 'macrodata' dataset from statsmodels.\n", - "Using 'realgdp' as the target column for future prediction.\n", - "======================================================================\n" - ] - }, - { - "data": { - "text/html": [ - "
[styled HTML rendering omitted; duplicate of the text/plain table below (203 rows × 13 columns)]\n", - "
" - ], - "text/plain": [ - " ds realgdp realcons realinv realgovt realdpi cpi \\\n", - "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.980 \n", - "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.150 \n", - "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.350 \n", - "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.370 \n", - "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.540 \n", - ".. ... ... ... ... ... ... ... \n", - "198 2008-07-01 13324.600 9267.7 1990.693 991.551 9838.3 216.889 \n", - "199 2008-10-01 13141.920 9195.3 1857.661 1007.273 9920.4 212.174 \n", - "200 2009-01-01 12925.410 9209.2 1558.494 996.287 9926.4 212.671 \n", - "201 2009-04-01 12901.504 9189.0 1456.678 1023.528 10077.5 214.469 \n", - "202 2009-07-01 12990.341 9256.0 1486.398 1044.088 10040.6 216.385 \n", - "\n", - " m1 tbilrate unemp pop infl realint \n", - "0 139.7 2.82 5.8 177.146 0.00 0.00 \n", - "1 141.7 3.08 5.1 177.830 2.34 0.74 \n", - "2 140.5 3.82 5.3 178.657 2.74 1.09 \n", - "3 140.0 4.33 5.6 179.386 0.27 4.06 \n", - "4 139.6 3.50 5.2 180.007 2.31 1.19 \n", - ".. ... ... ... ... ... ... \n", - "198 1474.7 1.17 6.0 305.270 -3.16 4.33 \n", - "199 1576.5 0.12 6.9 305.952 -8.79 8.91 \n", - "200 1592.8 0.22 8.1 306.547 0.94 -0.71 \n", - "201 1653.6 0.18 9.2 307.226 3.37 -3.19 \n", - "202 1673.9 0.12 9.6 308.013 3.56 -3.44 \n", - "\n", - "[203 rows x 13 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "from statsmodels.datasets import macrodata\n", - "\n", - "from temporalscope.core.core_utils import print_divider\n", - "\n", - "# Constants for modes\n", - "MODE_MACHINE_LEARNING = \"machine_learning\"\n", - "\n", - "def load_macrodata(target_col: str = \"realgdp\"):\n", - " \"\"\"Preprocess the dataset with a combined column for time target.\n", - " \n", - " :param target_col: The column to be used as the target for prediction.\n", - " :type target_col: str, optional\n", - " :return: Preprocessed DataFrame with target column.\n", - " :rtype: pd.DataFrame\n", - " \"\"\"\n", - " print_divider()\n", - " print(\"Loading the 'macrodata' dataset from statsmodels.\")\n", - " print(f\"Using '{target_col}' as the target column for future prediction.\")\n", - " print_divider()\n", - "\n", - " # Load macrodata dataset\n", - " macro_df = macrodata.load_pandas().data.copy()\n", - "\n", - " # Create 'ds' column combining 'year' and 'quarter'\n", - " macro_df[\"ds\"] = pd.to_datetime(\n", - " macro_df[\"year\"].astype(int).astype(str) + \"-\" + ((macro_df[\"quarter\"] - 1) * 3 + 1).astype(int).astype(str) + \"-01\"\n", - " )\n", - "\n", - " # Drop the 'year' and 'quarter' columns\n", - " macro_df = macro_df.drop(columns=[\"year\", \"quarter\"])\n", - "\n", - " # Reorder columns to put 'ds' (datetime) first\n", - " cols = [\"ds\"] + [col for col in macro_df.columns if col != \"ds\"]\n", - " macro_df = macro_df[cols].copy()\n", - "\n", - " return macro_df, target_col\n", - "\n", - "\n", - "# Load the macrodata dataset and preprocess\n", - "macro_df, target_col = load_macrodata()\n", - "macro_df" - ] - }, - { - "cell_type": "markdown", - "id": "5bddbc46-e8cf-421c-8561-363aeef1143c", - "metadata": {}, - "source": [ - "## Part 2: Shifting for Machine Learning" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "051a47f4-b8dd-46e3-92c1-39b49ee04f51", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"======================================================================\n", - "Loading the 'macrodata' dataset from statsmodels.\n", - "Using 'realgdp' as the target column for future prediction.\n", - "======================================================================\n", - "======================================================================\n", - "Initializing TimeFrame for the Modin backend...\n", - "Original DataFrame:\n", - " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", - "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", - "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", - "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", - "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", - "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", - "\n", - " tbilrate unemp pop infl realint \n", - "0 2.82 5.8 177.146 0.00 0.00 \n", - "1 3.08 5.1 177.830 2.34 0.74 \n", - "2 3.82 5.3 178.657 2.74 1.09 \n", - "3 4.33 5.6 179.386 0.27 4.06 \n", - "4 3.50 5.2 180.007 2.31 1.19 \n", - "======================================================================\n", - "\n", - "Applying Target Shifter in machine_learning mode...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", - " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", - "\n", - " unemp pop infl realint realgdp_shift_1 \n", - "0 5.8 177.146 0.00 0.00 2778.801 \n", - "1 5.1 177.830 2.34 0.74 2775.488 \n", - "2 5.3 178.657 2.74 1.09 2785.204 \n", - "3 5.6 179.386 0.27 4.06 2847.699 \n", - "4 5.2 180.007 2.31 1.19 2834.390 \n" - ] - } - ], - "source": [ - "import modin.pandas as mpd\n", - "\n", - "from temporalscope.core.core_utils import BACKEND_MODIN\n", - "from temporalscope.core.temporal_data_loader import TimeFrame\n", - "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", - "\n", - "# Constants for modes\n", - "MODE_MACHINE_LEARNING = \"machine_learning\"\n", - "\n", - "# Step 1: Load the macrodata dataset and preprocess\n", - "macro_df, target_col = load_macrodata()\n", - "\n", - "# Step 2: Initialize Modin TimeFrame for Modin backend\n", - "print_divider()\n", - "print(\"Initializing TimeFrame for the Modin backend...\")\n", - "macro_modin_df = mpd.DataFrame(macro_df)\n", - "modin_tf = TimeFrame(macro_modin_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_MODIN)\n", - "\n", - "# Step 3: Preview the original data\n", - "print(\"Original DataFrame:\")\n", - "print(modin_tf.get_data().head())\n", - "print_divider()\n", - "\n", - "# Step 4: Apply the TemporalTargetShifter in machine learning mode\n", - "print(f\"\\nApplying Target Shifter in {MODE_MACHINE_LEARNING} mode...\")\n", - "\n", - "# Setup the TemporalTargetShifter\n", - "shifter = TemporalTargetShifter(n_lags=1, mode=MODE_MACHINE_LEARNING, verbose=True)\n", - "\n", - "# Apply the shifter\n", - "shifted_df = 
shifter.fit_transform(modin_tf)\n", - "\n", - "# Print the shifted data\n", - "print(\"Shifted data:\")\n", - "print(shifted_df.head())\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "5ff95236-87eb-487e-9a65-fce69340d3f6", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
[styled HTML rendering omitted; duplicate of the text/plain head() preview below]\n", - "
" - ], - "text/plain": [ - " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", - "\n", - " unemp pop infl realint realgdp_shift_1 \n", - "0 5.8 177.146 0.00 0.00 2778.801 \n", - "1 5.1 177.830 2.34 0.74 2775.488 \n", - "2 5.3 178.657 2.74 1.09 2785.204 \n", - "3 5.6 179.386 0.27 4.06 2847.699 \n", - "4 5.2 180.007 2.31 1.19 2834.390 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "shifted_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "f4efe10f-e4ca-4b61-821d-87959557a51e", - "metadata": {}, - "source": [ - "## Part 2: Shifting for Deep Learning" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "9c6ef6be-d13b-4576-bdef-fa4afbb687a5", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Applying Target Shifter in deep_learning mode...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n", - "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data for deep learning mode (sequences):\n", - " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", - "\n", - " unemp pop infl realint realgdp_shift_1 \\\n", - "0 5.8 177.146 0.00 0.00 2778.801 \n", - "1 5.1 177.830 2.34 0.74 2775.488 \n", - "2 5.3 178.657 2.74 1.09 2785.204 \n", - "3 5.6 179.386 0.27 4.06 2847.699 \n", - "4 5.2 180.007 2.31 1.19 2834.390 \n", - "\n", - " realgdp_sequence \n", - "0 [2710.349, 2778.801, 2775.488] \n", - "1 [2778.801, 2775.488, 2785.204] \n", - "2 [2775.488, 2785.204, 2847.699] \n", - "3 [2785.204, 2847.699, 2834.39] \n", - "4 [2847.699, 2834.39, 2839.022] \n" - ] - } - ], - "source": [ - "# Step 5: Apply the TemporalTargetShifter in deep learning mode\n", - "MODE_DEEP_LEARNING = \"deep_learning\"\n", - "\n", - "print(f\"\\nApplying Target Shifter in {MODE_DEEP_LEARNING} mode...\")\n", - "\n", - "# Setup the TemporalTargetShifter for deep learning mode with a sequence length\n", - "sequence_length = 3 # Length of sequence for deep learning\n", - "shifter_dl = TemporalTargetShifter(n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=sequence_length, verbose=True)\n", - "\n", - "# Apply the shifter\n", - "shifted_dl_df = shifter_dl.fit_transform(modin_tf)\n", - "\n", - "# Print the shifted data with sequences\n", - "print(\"Shifted data for deep learning mode (sequences):\")\n", - "print(shifted_dl_df.head())\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "369d0213-0bca-4c05-af9e-42daa260b3fe", - "metadata": { - "tags": [] - }, - "outputs": [ 
- { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dsrealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealintrealgdp_shift_1realgdp_sequence
01959-01-011707.4286.898470.0451886.928.980139.72.825.8177.1460.000.002778.801[2710.349, 2778.801, 2775.488]
11959-04-011733.7310.859481.3011919.729.150141.73.085.1177.8302.340.742775.488[2778.801, 2775.488, 2785.204]
21959-07-011751.8289.226491.2601916.429.350140.53.825.3178.6572.741.092785.204[2775.488, 2785.204, 2847.699]
31959-10-011753.7299.356484.0521931.329.370140.04.335.6179.3860.274.062847.699[2785.204, 2847.699, 2834.39]
41960-01-011770.5331.722462.1991955.529.540139.63.505.2180.0072.311.192834.390[2847.699, 2834.39, 2839.022]
.............................................
1972008-04-019351.02026.518961.28010059.0218.6101409.31.745.4304.4838.53-6.7913324.600[13415.266, 13324.6, 13141.92]
1982008-07-019267.71990.693991.5519838.3216.8891474.71.176.0305.270-3.164.3313141.920[13324.6, 13141.92, 12925.41]
1992008-10-019195.31857.6611007.2739920.4212.1741576.50.126.9305.952-8.798.9112925.410[13141.92, 12925.41, 12901.504]
2002009-01-019209.21558.494996.2879926.4212.6711592.80.228.1306.5470.94-0.7112901.504[12925.41, 12901.504, 12990.341]
2012009-04-019189.01456.6781023.52810077.5214.4691653.60.189.2307.2263.37-3.1912990.341[12901.504, 12990.341, nan]
\n", - "

202 rows x 14 columns

\n", - "
" - ], - "text/plain": [ - " ds realcons realinv realgovt realdpi cpi m1 \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.980 139.7 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.150 141.7 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.350 140.5 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.370 140.0 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.540 139.6 \n", - ".. ... ... ... ... ... ... ... \n", - "197 2008-04-01 9351.0 2026.518 961.280 10059.0 218.610 1409.3 \n", - "198 2008-07-01 9267.7 1990.693 991.551 9838.3 216.889 1474.7 \n", - "199 2008-10-01 9195.3 1857.661 1007.273 9920.4 212.174 1576.5 \n", - "200 2009-01-01 9209.2 1558.494 996.287 9926.4 212.671 1592.8 \n", - "201 2009-04-01 9189.0 1456.678 1023.528 10077.5 214.469 1653.6 \n", - "\n", - " tbilrate unemp pop infl realint realgdp_shift_1 \\\n", - "0 2.82 5.8 177.146 0.00 0.00 2778.801 \n", - "1 3.08 5.1 177.830 2.34 0.74 2775.488 \n", - "2 3.82 5.3 178.657 2.74 1.09 2785.204 \n", - "3 4.33 5.6 179.386 0.27 4.06 2847.699 \n", - "4 3.50 5.2 180.007 2.31 1.19 2834.390 \n", - ".. ... ... ... ... ... ... \n", - "197 1.74 5.4 304.483 8.53 -6.79 13324.600 \n", - "198 1.17 6.0 305.270 -3.16 4.33 13141.920 \n", - "199 0.12 6.9 305.952 -8.79 8.91 12925.410 \n", - "200 0.22 8.1 306.547 0.94 -0.71 12901.504 \n", - "201 0.18 9.2 307.226 3.37 -3.19 12990.341 \n", - "\n", - " realgdp_sequence \n", - "0 [2710.349, 2778.801, 2775.488] \n", - "1 [2778.801, 2775.488, 2785.204] \n", - "2 [2775.488, 2785.204, 2847.699] \n", - "3 [2785.204, 2847.699, 2834.39] \n", - "4 [2847.699, 2834.39, 2839.022] \n", - ".. ... \n", - "197 [13415.266, 13324.6, 13141.92] \n", - "198 [13324.6, 13141.92, 12925.41] \n", - "199 [13141.92, 12925.41, 12901.504] \n", - "200 [12925.41, 12901.504, 12990.341] \n", - "201 [12901.504, 12990.341, nan] \n", - "\n", - "[202 rows x 14 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "shifted_dl_df" - ] - }, - { - "cell_type": "markdown", - "id": "b0cbc6e3-a665-45f2-a9aa-60b9057d5540", - "metadata": {}, - "source": [ - "## Part 4: Shifting for all backends" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "170bad23-b236-4837-b042-7218622c4e62", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "======================================================================\n", - "Loading the 'macrodata' dataset from the open-license statsmodels package.\n", - "Using 'realgdp' as the target column for future prediction.\n", - "======================================================================\n", - "======================================================================\n", - "Demonstrating Target Shifter for backend: pd\n", - "Preview of the TimeFrame DataFrame:\n", - " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", - "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", - "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", - "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", - "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", - "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", - "\n", - " tbilrate unemp pop infl realint \n", - "0 2.82 5.8 177.146 0.00 0.00 \n", - "1 3.08 5.1 177.830 2.34 0.74 \n", - "2 3.82 5.3 178.657 2.74 1.09 \n", - "3 4.33 5.6 179.386 0.27 4.06 \n", - "4 3.50 5.2 180.007 2.31 1.19 \n", - 
"======================================================================\n", - "\n", - "Applying Target Shifter in machine_learning mode...\n", - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", - " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", - "\n", - " unemp pop infl realint realgdp_shift_1 \n", - "0 5.8 177.146 0.00 0.00 2778.801 \n", - "1 5.1 177.830 2.34 0.74 2775.488 \n", - "2 5.3 178.657 2.74 1.09 2785.204 \n", - "3 5.6 179.386 0.27 4.06 2847.699 \n", - "4 5.2 180.007 2.31 1.19 2834.390 \n", - "\n", - "Applying Target Shifter in deep_learning mode...\n", - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", - " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", - "\n", - " unemp pop infl realint realgdp_shift_1 \\\n", - "0 5.8 177.146 0.00 0.00 2778.801 \n", - "1 5.1 177.830 2.34 0.74 2775.488 \n", - "2 5.3 178.657 2.74 1.09 2785.204 \n", - "3 5.6 179.386 0.27 4.06 2847.699 \n", - "4 5.2 180.007 2.31 1.19 2834.390 \n", - "\n", - " realgdp_sequence \n", - "0 [2710.349, 2778.801, 2775.488] \n", - "1 [2778.801, 2775.488, 2785.204] \n", - "2 [2775.488, 2785.204, 2847.699] \n", - "3 [2785.204, 2847.699, 2834.39] \n", - "4 [2847.699, 2834.39, 2839.022] \n", - "======================================================================\n", - "Demonstrating Target Shifter for backend: pl\n", - "Preview of the TimeFrame DataFrame:\n", - "shape: (5, 13)\n", - "┌─────────────────────┬──────────┬──────────┬─────────┬───┬───────┬─────────┬──────┬─────────┐\n", - "│ ds ┆ realgdp ┆ realcons ┆ realinv ┆ … ┆ unemp ┆ pop ┆ infl ┆ realint │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞═════════════════════╪══════════╪══════════╪═════════╪═══╪═══════╪═════════╪══════╪═════════╡\n", - "│ 1959-01-01 00:00:00 ┆ 2710.349 ┆ 1707.4 ┆ 286.898 ┆ … ┆ 5.8 ┆ 177.146 ┆ 0.0 ┆ 0.0 │\n", - "│ 1959-04-01 00:00:00 ┆ 2778.801 ┆ 1733.7 ┆ 310.859 ┆ … ┆ 5.1 ┆ 177.83 ┆ 2.34 ┆ 0.74 │\n", - "│ 1959-07-01 00:00:00 ┆ 2775.488 ┆ 1751.8 ┆ 289.226 ┆ … ┆ 5.3 ┆ 178.657 ┆ 2.74 ┆ 1.09 │\n", - "│ 1959-10-01 00:00:00 ┆ 2785.204 ┆ 1753.7 ┆ 299.356 ┆ … ┆ 5.6 ┆ 179.386 ┆ 0.27 ┆ 4.06 │\n", - "│ 1960-01-01 00:00:00 ┆ 2847.699 ┆ 1770.5 ┆ 331.722 ┆ … ┆ 5.2 ┆ 180.007 ┆ 2.31 ┆ 1.19 │\n", - "└─────────────────────┴──────────┴──────────┴─────────┴───┴───────┴─────────┴──────┴─────────┘\n", - "======================================================================\n", - "\n", - "Applying Target Shifter in machine_learning mode...\n", - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", - "shape: (5, 13)\n", - "┌──────────────┬──────────┬─────────┬──────────┬───┬─────────┬──────┬─────────┬─────────────────┐\n", - "│ ds ┆ realcons ┆ realinv ┆ realgovt ┆ … ┆ pop ┆ 
infl ┆ realint ┆ realgdp_shift_1 │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞══════════════╪══════════╪═════════╪══════════╪═══╪═════════╪══════╪═════════╪═════════════════╡\n", - "│ 1959-01-01 ┆ 1707.4 ┆ 286.898 ┆ 470.045 ┆ … ┆ 177.146 ┆ 0.0 ┆ 0.0 ┆ 2778.801 │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1959-04-01 ┆ 1733.7 ┆ 310.859 ┆ 481.301 ┆ … ┆ 177.83 ┆ 2.34 ┆ 0.74 ┆ 2775.488 │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1959-07-01 ┆ 1751.8 ┆ 289.226 ┆ 491.26 ┆ … ┆ 178.657 ┆ 2.74 ┆ 1.09 ┆ 2785.204 │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1959-10-01 ┆ 1753.7 ┆ 299.356 ┆ 484.052 ┆ … ┆ 179.386 ┆ 0.27 ┆ 4.06 ┆ 2847.699 │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1960-01-01 ┆ 1770.5 ┆ 331.722 ┆ 462.199 ┆ … ┆ 180.007 ┆ 2.31 ┆ 1.19 ┆ 2834.39 │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└──────────────┴──────────┴─────────┴──────────┴───┴─────────┴──────┴─────────┴─────────────────┘\n", - "\n", - "Applying Target Shifter in deep_learning mode...\n", - "Rows before shift: 203; Rows after shift: 203; Rows dropped: 0\n", - "Shifted data:\n", - "shape: (5, 13)\n", - "┌──────────────┬──────────┬─────────┬──────────┬───┬─────────┬──────┬─────────┬────────────────────┐\n", - "│ ds ┆ realcons ┆ realinv ┆ realgovt ┆ … ┆ pop ┆ infl ┆ realint ┆ realgdp_sequence │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ datetime[ns] ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ list[f64] │\n", - "╞══════════════╪══════════╪═════════╪══════════╪═══╪═════════╪══════╪═════════╪════════════════════╡\n", - "│ 1959-01-01 ┆ 1707.4 ┆ 286.898 ┆ 470.045 ┆ … ┆ 177.146 ┆ 0.0 ┆ 0.0 ┆ [2710.349, │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2778.801, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2775.488] │\n", - "│ 1959-04-01 ┆ 1733.7 ┆ 310.859 ┆ 481.301 ┆ … ┆ 177.83 ┆ 2.34 ┆ 0.74 ┆ [2778.801, │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2775.488, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2785.204] │\n", - "│ 1959-07-01 ┆ 1751.8 ┆ 289.226 ┆ 491.26 ┆ … ┆ 178.657 ┆ 2.74 ┆ 1.09 ┆ [2775.488, │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2785.204, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2847.699] │\n", - "│ 1959-10-01 ┆ 1753.7 ┆ 299.356 ┆ 484.052 ┆ … ┆ 179.386 ┆ 0.27 ┆ 4.06 ┆ [2785.204, │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2847.699, 2834.39] │\n", - "│ 1960-01-01 ┆ 1770.5 ┆ 331.722 ┆ 462.199 ┆ … ┆ 180.007 ┆ 2.31 ┆ 1.19 ┆ [2847.699, │\n", - "│ 00:00:00 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2834.39, 2839.022] │\n", - "└──────────────┴──────────┴─────────┴──────────┴───┴─────────┴──────┴─────────┴────────────────────┘\n", - "======================================================================\n", - "Demonstrating Target Shifter for backend: mpd\n", - "Preview of the TimeFrame DataFrame:\n", - " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", - "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", - "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", - "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", - "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", - "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", - "\n", - " tbilrate unemp pop infl realint \n", - "0 2.82 5.8 177.146 0.00 0.00 \n", - "1 3.08 5.1 177.830 2.34 0.74 \n", - "2 3.82 5.3 178.657 2.74 1.09 \n", - "3 4.33 5.6 179.386 0.27 4.06 \n", - "4 3.50 5.2 180.007 2.31 1.19 \n", - "======================================================================\n", - "\n", - "Applying Target Shifter in machine_learning mode...\n" - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", - " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", - "\n", - " unemp pop infl realint realgdp_shift_1 \n", - "0 5.8 177.146 0.00 0.00 2778.801 \n", - "1 5.1 177.830 2.34 0.74 2775.488 \n", - "2 5.3 178.657 2.74 1.09 2785.204 \n", - "3 5.6 179.386 0.27 4.06 2847.699 \n", - "4 5.2 180.007 2.31 1.19 2834.390 \n", - "\n", - "Applying Target Shifter in deep_learning mode...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n", - "UserWarning: is not currently supported by PandasOnRay, defaulting to pandas implementation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rows before shift: 203; Rows after shift: 202; Rows dropped: 1\n", - "Shifted data:\n", - " ds realcons realinv realgovt realdpi cpi m1 tbilrate \\\n", - "0 1959-01-01 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 \n", - "1 1959-04-01 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 \n", - "2 1959-07-01 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 \n", - "3 1959-10-01 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 \n", - "4 1960-01-01 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 \n", - "\n", - " unemp pop infl realint realgdp_shift_1 \\\n", - "0 5.8 177.146 0.00 0.00 2778.801 \n", - "1 5.1 177.830 2.34 0.74 2775.488 \n", - "2 5.3 178.657 2.74 1.09 2785.204 \n", - "3 5.6 179.386 0.27 4.06 2847.699 \n", - "4 5.2 180.007 2.31 1.19 2834.390 \n", - "\n", - " realgdp_sequence \n", - "0 [2710.349, 2778.801, 2775.488] \n", - "1 [2778.801, 2775.488, 2785.204] \n", - "2 [2775.488, 2785.204, 2847.699] \n", - "3 [2785.204, 2847.699, 2834.39] \n", - "4 [2847.699, 2834.39, 2839.022] \n" - ] - } - ], - "source": [ - "import modin.pandas as mpd\n", - "import polars as pl\n", - "\n", - "from temporalscope.core.core_utils import BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS, print_divider\n", - "from temporalscope.core.temporal_data_loader import TimeFrame as tf\n", - "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", - "\n", - "# Constants for modes\n", - "MODE_MACHINE_LEARNING = \"machine_learning\"\n", - "MODE_DEEP_LEARNING = \"deep_learning\"\n", - "\n", - "def load_macrodata(target_col: str = \"realgdp\"):\n", - " \"\"\"Preprocess the dataset with a combined column for time & shifted target.\n", - "\n", - " :param target_col: The column to be used as the target for prediction\n", - " :type target_col: str, optional\n", - " :default target_col: 'realgdp'\n", - "\n", - " :return: Preprocessed DataFrame with shifted target\n", - " :rtype: pd.DataFrame\n", - " \"\"\"\n", - " print_divider()\n", - " print(\"Loading the 'macrodata' dataset from the open-license statsmodels package.\")\n", - " print(f\"Using '{target_col}' as the target column for future prediction.\")\n", - " print_divider()\n", - "\n", - " # Load macrodata 
dataset\n", - " macro_df = macrodata.load_pandas().data.copy()\n", - "\n", - " # Create 'ds' column by combining 'year' and 'quarter'\n", - " macro_df[\"ds\"] = pd.to_datetime(\n", - " macro_df[\"year\"].astype(int).astype(str)\n", - " + \"-\"\n", - " + ((macro_df[\"quarter\"] - 1) * 3 + 1).astype(int).astype(str)\n", - " + \"-01\"\n", - " )\n", - "\n", - " # Drop the 'year' and 'quarter' columns\n", - " macro_df = macro_df.drop(columns=[\"year\", \"quarter\"])\n", - "\n", - " # Reorder columns to place 'ds' first\n", - " cols = [\"ds\"] + [col for col in macro_df.columns if col != \"ds\"]\n", - " macro_df = macro_df[cols].copy()\n", - "\n", - " return macro_df, target_col\n", - "\n", - "\n", - "def init_timeframes_for_backends(macro_df, target_col: str):\n", - " \"\"\"Initialize TimeFrame objects for all backends (Pandas, Polars, Modin) using constants.\n", - "\n", - " :param macro_df: Preprocessed macro dataset.\n", - " :type macro_df: pd.DataFrame\n", - " :param target_col: The target column for prediction.\n", - " :type target_col: str\n", - " :return: A dictionary containing TimeFrame objects for Pandas, Polars, and Modin.\n", - " :rtype: dict\n", - " \"\"\"\n", - " timeframes = {}\n", - "\n", - " # Pandas backend\n", - " macro_pandas_df = pd.DataFrame(macro_df)\n", - " timeframes[BACKEND_PANDAS] = tf(\n", - " macro_pandas_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_PANDAS\n", - " )\n", - "\n", - " # Polars backend\n", - " macro_polars_df = pl.DataFrame(macro_df)\n", - " timeframes[BACKEND_POLARS] = tf(\n", - " macro_polars_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_POLARS\n", - " )\n", - "\n", - " # Modin backend\n", - " macro_modin_df = mpd.DataFrame(macro_df)\n", - " timeframes[BACKEND_MODIN] = tf(\n", - " macro_modin_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_MODIN\n", - " )\n", - "\n", - " return timeframes\n", - "\n", - "\n", - "def apply_target_shifter(tf_obj, mode: str):\n", - " \"\"\"Apply the TemporalTargetShifter in the specified mode.\n", - "\n", - " :param tf_obj: TimeFrame object to apply the shifter to.\n", - " :param mode: Mode of operation (machine_learning or deep_learning).\n", - " \"\"\"\n", - " print(f\"\\nApplying Target Shifter in {mode} mode...\")\n", - "\n", - " # Setup the TemporalTargetShifter\n", - " if mode == MODE_MACHINE_LEARNING:\n", - " shifter = TemporalTargetShifter(n_lags=1, mode=MODE_MACHINE_LEARNING, verbose=True)\n", - " elif mode == MODE_DEEP_LEARNING:\n", - " # In deep learning mode, sequence_length must be provided\n", - " shifter = TemporalTargetShifter(n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=3, verbose=True)\n", - " else:\n", - " raise ValueError(f\"Invalid mode: {mode}\")\n", - "\n", - " # Apply the shifter\n", - " shifted_df = shifter.fit_transform(tf_obj)\n", - "\n", - " # Print the result (since it's already a DataFrame, no need for get_data())\n", - " print(\"Shifted data:\")\n", - " print(shifted_df.head())\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " # Load the macrodata dataset and preprocess\n", - " macro_df, target_col = load_macrodata()\n", - "\n", - " # Initialize TimeFrame objects for various backends using constants\n", - " timeframes = init_timeframes_for_backends(macro_df, target_col)\n", - "\n", - " # Apply and demonstrate shifting for all backends\n", - " for backend, tf_obj in timeframes.items():\n", - " print_divider()\n", - " print(f\"Demonstrating Target Shifter for backend: {backend}\")\n", - " print(\"Preview of the TimeFrame DataFrame:\")\n", 
- " print(tf_obj.get_data().head())\n", - " print_divider()\n", - "\n", - " # Apply target shifting in machine learning mode\n", - " apply_target_shifter(tf_obj, MODE_MACHINE_LEARNING)\n", - "\n", - " # Apply target shifting in deep learning mode\n", - " apply_target_shifter(tf_obj, MODE_DEEP_LEARNING)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7d45fd7-3773-4d8b-ba66-af01b4aa2dd3", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "TemporalScope", - "language": "python", - "name": "temporalscope-kernel" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorial_notebooks/introduction/2_sliding_window_target_shifter.ipynb b/tutorial_notebooks/introduction/2_sliding_window_target_shifter.ipynb new file mode 100644 index 0000000..15f42bc --- /dev/null +++ b/tutorial_notebooks/introduction/2_sliding_window_target_shifter.ipynb @@ -0,0 +1,702 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c8aefe6f-489e-42fe-9cb9-a20426652424", + "metadata": {}, + "source": [ + "# Temporal Scope Tutorial: Utilizing Target Shifter\n", + "\n", + "## Overview\n", + "\n", + "This tutorial demonstrates how to load macroeconomic data and apply the **TemporalTargetShifter** using the **Modin** backend. The tutorial shows how to shift the target variable in **machine learning** and **deep learning** modes for forecasting tasks. The tool supports flexible configurations for different forecasting needs.\n", + "\n", + "### Summary\n", + "\n", + "| **Step** | **Description** |\n", + "|-----------|---------------------------------------------------------------------------------|\n", + "| **1** | **Data Loading**: Load macroeconomic data and create a datetime column (`ds`). |\n", + "| **2** | **Modin Backend Initialization**: Initialize a `TimeFrame` for scalable data processing with Modin. |\n", + "| **3** | **Target Shifting (ML Mode)**: Shift the target variable (`realgdp`) for one-step-ahead forecasting in **machine learning mode**. |\n", + "| **4** | **Target Shifting (DL Mode)**: Shift the target variable for sequence-based forecasting in **deep learning mode**. |\n", + "\n", + "### Key Concepts\n", + "\n", + "- **One-step ahead forecasting**: Shifting the target variable to predict the next time step for machine learning models.\n", + "- **Sequence forecasting**: Generating sequences of target variables for deep learning models.\n", + "- **Modin Backend**: Scalable version of Pandas for large datasets.\n", + "- **TemporalTargetShifter**: A tool to shift target variables for forecasting tasks, supporting both machine learning and deep learning modes.\n", + "\n", + "### Steps\n", + "\n", + "1. **Load the macroeconomic dataset** using the `statsmodels` library.\n", + "2. **Initialize a TimeFrame** for the Modin backend.\n", + "3. **Apply the Target Shifter** in machine learning mode to shift the target variable by one step (for simple one-step-ahead forecasting).\n", + "4. 
**Apply the Target Shifter** in deep learning mode to create sequences for sequence-based forecasting tasks.\n" + ] + }, + { + "cell_type": "markdown", + "id": "b9b71cc0-f882-40b6-933d-d38cbe3a56cd", + "metadata": {}, + "source": [ + "# Part 1: Load Macro-Economic Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4b56016b-7609-4e26-bb0b-5d6e4f864c18", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "Loading the 'macrodata' dataset from statsmodels.\n", + "Using 'realgdp' as the target column for future prediction.\n", + "======================================================================\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dsrealgdprealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealint
01959-01-012710.3491707.4286.898470.0451886.928.980139.72.825.8177.1460.000.00
11959-04-012778.8011733.7310.859481.3011919.729.150141.73.085.1177.8302.340.74
21959-07-012775.4881751.8289.226491.2601916.429.350140.53.825.3178.6572.741.09
31959-10-012785.2041753.7299.356484.0521931.329.370140.04.335.6179.3860.274.06
41960-01-012847.6991770.5331.722462.1991955.529.540139.63.505.2180.0072.311.19
..........................................
1982008-07-0113324.6009267.71990.693991.5519838.3216.8891474.71.176.0305.270-3.164.33
1992008-10-0113141.9209195.31857.6611007.2739920.4212.1741576.50.126.9305.952-8.798.91
2002009-01-0112925.4109209.21558.494996.2879926.4212.6711592.80.228.1306.5470.94-0.71
2012009-04-0112901.5049189.01456.6781023.52810077.5214.4691653.60.189.2307.2263.37-3.19
2022009-07-0112990.3419256.01486.3981044.08810040.6216.3851673.90.129.6308.0133.56-3.44
\n", + "

203 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " ds realgdp realcons realinv realgovt realdpi cpi \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.980 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.150 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.350 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.370 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.540 \n", + ".. ... ... ... ... ... ... ... \n", + "198 2008-07-01 13324.600 9267.7 1990.693 991.551 9838.3 216.889 \n", + "199 2008-10-01 13141.920 9195.3 1857.661 1007.273 9920.4 212.174 \n", + "200 2009-01-01 12925.410 9209.2 1558.494 996.287 9926.4 212.671 \n", + "201 2009-04-01 12901.504 9189.0 1456.678 1023.528 10077.5 214.469 \n", + "202 2009-07-01 12990.341 9256.0 1486.398 1044.088 10040.6 216.385 \n", + "\n", + " m1 tbilrate unemp pop infl realint \n", + "0 139.7 2.82 5.8 177.146 0.00 0.00 \n", + "1 141.7 3.08 5.1 177.830 2.34 0.74 \n", + "2 140.5 3.82 5.3 178.657 2.74 1.09 \n", + "3 140.0 4.33 5.6 179.386 0.27 4.06 \n", + "4 139.6 3.50 5.2 180.007 2.31 1.19 \n", + ".. ... ... ... ... ... ... \n", + "198 1474.7 1.17 6.0 305.270 -3.16 4.33 \n", + "199 1576.5 0.12 6.9 305.952 -8.79 8.91 \n", + "200 1592.8 0.22 8.1 306.547 0.94 -0.71 \n", + "201 1653.6 0.18 9.2 307.226 3.37 -3.19 \n", + "202 1673.9 0.12 9.6 308.013 3.56 -3.44 \n", + "\n", + "[203 rows x 13 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from statsmodels.datasets import macrodata\n", + "\n", + "from temporalscope.core.core_utils import print_divider\n", + "\n", + "# Constants for modes\n", + "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "\n", + "def load_macrodata(target_col: str = \"realgdp\"):\n", + " \"\"\"Preprocess the dataset with a combined column for time target.\n", + " \n", + " :param target_col: The column to be used as the target for prediction.\n", + " :type target_col: str, optional\n", + " :return: Preprocessed DataFrame with target column.\n", + " :rtype: pd.DataFrame\n", + " \"\"\"\n", + " print_divider()\n", + " print(\"Loading the 'macrodata' dataset from statsmodels.\")\n", + " print(f\"Using '{target_col}' as the target column for future prediction.\")\n", + " print_divider()\n", + "\n", + " # Load macrodata dataset\n", + " macro_df = macrodata.load_pandas().data.copy()\n", + "\n", + " # Create 'ds' column combining 'year' and 'quarter'\n", + " macro_df[\"ds\"] = pd.to_datetime(\n", + " macro_df[\"year\"].astype(int).astype(str) + \"-\" + ((macro_df[\"quarter\"] - 1) * 3 + 1).astype(int).astype(str) + \"-01\"\n", + " )\n", + "\n", + " # Drop the 'year' and 'quarter' columns\n", + " macro_df = macro_df.drop(columns=[\"year\", \"quarter\"])\n", + "\n", + " # Reorder columns to put 'ds' (datetime) first\n", + " cols = [\"ds\"] + [col for col in macro_df.columns if col != \"ds\"]\n", + " macro_df = macro_df[cols].copy()\n", + "\n", + " return macro_df, target_col\n", + "\n", + "\n", + "# Load the macrodata dataset and preprocess\n", + "macro_df, target_col = load_macrodata()\n", + "macro_df" + ] + }, + { + "cell_type": "markdown", + "id": "5bddbc46-e8cf-421c-8561-363aeef1143c", + "metadata": {}, + "source": [ + "## Part 2: Shifting for Machine Learning" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "051a47f4-b8dd-46e3-92c1-39b49ee04f51", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"======================================================================\n", + "Loading the 'macrodata' dataset from statsmodels.\n", + "Using 'realgdp' as the target column for future prediction.\n", + "======================================================================\n", + "======================================================================\n", + "Initializing TimeFrame for the Modin backend...\n", + "Original DataFrame:\n", + " ds realgdp realcons realinv realgovt realdpi cpi m1 \\\n", + "0 1959-01-01 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 \n", + "1 1959-04-01 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 \n", + "2 1959-07-01 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 \n", + "3 1959-10-01 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 \n", + "4 1960-01-01 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 \n", + "\n", + " tbilrate unemp pop infl realint \n", + "0 2.82 5.8 177.146 0.00 0.00 \n", + "1 3.08 5.1 177.830 2.34 0.74 \n", + "2 3.82 5.3 178.657 2.74 1.09 \n", + "3 4.33 5.6 179.386 0.27 4.06 \n", + "4 3.50 5.2 180.007 2.31 1.19 \n", + "======================================================================\n" + ] + }, + { + "ename": "TypeError", + "evalue": "SlidingWindowPartitioner._fit_pandas_modin() takes 1 positional argument but 2 were given", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 28\u001b[0m\n\u001b[1;32m 25\u001b[0m partitioner \u001b[38;5;241m=\u001b[39m SlidingWindowPartitioner(tf\u001b[38;5;241m=\u001b[39mmodin_tf, num_partitions\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m, train_pct\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.7\u001b[39m, test_pct\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.3\u001b[39m) \u001b[38;5;66;03m# Set train/test split\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Step 5: Get the partitions\u001b[39;00m\n\u001b[0;32m---> 28\u001b[0m partitions \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mpartitioner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 30\u001b[0m print_divider()\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTotal partitions created: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(partitions)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/notebooks/TemporalScope/src/temporalscope/partition/sliding_window.py:738\u001b[0m, in \u001b[0;36mSlidingWindowPartitioner.fit_transform\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 694\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfit_transform\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterator[Dict[\u001b[38;5;28mstr\u001b[39m, Dict[\u001b[38;5;28mstr\u001b[39m, SupportedBackendDataFrame]]]:\n\u001b[1;32m 695\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Fit and transform the dataset in a single step.\u001b[39;00m\n\u001b[1;32m 696\u001b[0m \n\u001b[1;32m 697\u001b[0m \u001b[38;5;124;03m This method combines the functionality of the `fit` and `transform` methods. 
It first generates partition indices\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 736\u001b[0m \u001b[38;5;124;03m - :meth:`transform`: For generating the actual partitioned slices.\u001b[39;00m\n\u001b[1;32m 737\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 738\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m partition_indices \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 739\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransform()\n", + "File \u001b[0;32m/notebooks/TemporalScope/src/temporalscope/partition/sliding_window.py:634\u001b[0m, in \u001b[0;36mSlidingWindowPartitioner.fit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 632\u001b[0m \u001b[38;5;66;03m# Call backend-specific partitioning method\u001b[39;00m\n\u001b[1;32m 633\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtf\u001b[38;5;241m.\u001b[39mdataframe_backend \u001b[38;5;129;01min\u001b[39;00m [BACKEND_PANDAS, BACKEND_MODIN]:\n\u001b[0;32m--> 634\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_pandas_modin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 635\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtf\u001b[38;5;241m.\u001b[39mdataframe_backend \u001b[38;5;241m==\u001b[39m BACKEND_POLARS:\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_polars(df)\n", + "\u001b[0;31mTypeError\u001b[0m: SlidingWindowPartitioner._fit_pandas_modin() takes 1 positional argument but 2 were given" + ] + } + ], + "source": [ + "import modin.pandas as mpd\n", + "from temporalscope.partition.sliding_window import SlidingWindowPartitioner\n", + "from temporalscope.core.core_utils import BACKEND_MODIN, print_divider\n", + "from temporalscope.core.temporal_data_loader import TimeFrame\n", + "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", + "\n", + "# Constants for modes\n", + "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "\n", + "# Step 1: Load the macrodata dataset and preprocess\n", + "macro_df, target_col = load_macrodata()\n", + "\n", + "# Step 2: Initialize Modin TimeFrame for Modin backend\n", + "print_divider()\n", + "print(\"Initializing TimeFrame for the Modin backend...\")\n", + "macro_modin_df = mpd.DataFrame(macro_df)\n", + "modin_tf = TimeFrame(macro_modin_df, time_col=\"ds\", target_col=target_col, dataframe_backend=BACKEND_MODIN)\n", + "\n", + "# Step 3: Preview the original data\n", + "print(\"Original DataFrame:\")\n", + "print(modin_tf.get_data().head())\n", + "print_divider()\n", + "\n", + "# Step 4: Create 2 partitions using `num_partitions=2` with train and test split\n", + "partitioner = SlidingWindowPartitioner(tf=modin_tf, num_partitions=2, train_pct=0.7, test_pct=0.3) # Set train/test split\n", + "\n", + "# Step 5: Get the partitions\n", + "partitions = list(partitioner.fit_transform())\n", + "\n", + "print_divider()\n", + "print(f\"Total partitions created: {len(partitions)}\")\n", + "for i, partition in enumerate(partitions):\n", + " print(f\"Partition {i+1} preview:\")\n", + " print(partition['partition_1']['train'].head()) # Access the train split\n", + " 
print(partition['partition_1']['test'].head()) # Access the test split\n", + " print_divider()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65b0099f-100e-4253-b45a-c93a84633447", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "shifted_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ff95236-87eb-487e-9a65-fce69340d3f6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "shifted_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f4efe10f-e4ca-4b61-821d-87959557a51e", + "metadata": {}, + "source": [ + "## Part 2: Shifting for Deep Learning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c6ef6be-d13b-4576-bdef-fa4afbb687a5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Step 5: Apply the TemporalTargetShifter in deep learning mode\n", + "MODE_DEEP_LEARNING = \"deep_learning\"\n", + "\n", + "print(f\"\\nApplying Target Shifter in {MODE_DEEP_LEARNING} mode...\")\n", + "\n", + "# Setup the TemporalTargetShifter for deep learning mode with a sequence length\n", + "sequence_length = 3 # Length of sequence for deep learning\n", + "shifter_dl = TemporalTargetShifter(n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=sequence_length, verbose=True)\n", + "\n", + "# Apply the shifter\n", + "shifted_dl_df = shifter_dl.fit_transform(modin_tf)\n", + "\n", + "# Print the shifted data with sequences\n", + "print(\"Shifted data for deep learning mode (sequences):\")\n", + "print(shifted_dl_df.head())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "369d0213-0bca-4c05-af9e-42daa260b3fe", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "shifted_dl_df" + ] + }, + { + "cell_type": "markdown", + "id": "b0cbc6e3-a665-45f2-a9aa-60b9057d5540", + "metadata": {}, + "source": [ + "## Part 4: Shifting for all backends" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "170bad23-b236-4837-b042-7218622c4e62", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import modin.pandas as mpd\n", + "import polars as pl\n", + "\n", + "from temporalscope.core.core_utils import BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS, print_divider\n", + "from temporalscope.core.temporal_data_loader import TimeFrame as tf\n", + "from temporalscope.core.temporal_target_shifter import TemporalTargetShifter\n", + "\n", + "# Constants for modes\n", + "MODE_MACHINE_LEARNING = \"machine_learning\"\n", + "MODE_DEEP_LEARNING = \"deep_learning\"\n", + "\n", + "def load_macrodata(target_col: str = \"realgdp\"):\n", + " \"\"\"Preprocess the dataset with a combined column for time & shifted target.\n", + "\n", + " :param target_col: The column to be used as the target for prediction\n", + " :type target_col: str, optional\n", + " :default target_col: 'realgdp'\n", + "\n", + " :return: Preprocessed DataFrame with shifted target\n", + " :rtype: pd.DataFrame\n", + " \"\"\"\n", + " print_divider()\n", + " print(\"Loading the 'macrodata' dataset from the open-license statsmodels package.\")\n", + " print(f\"Using '{target_col}' as the target column for future prediction.\")\n", + " print_divider()\n", + "\n", + " # Load macrodata dataset\n", + " macro_df = macrodata.load_pandas().data.copy()\n", + "\n", + " # Create 'ds' column by combining 'year' and 'quarter'\n", + " macro_df[\"ds\"] = pd.to_datetime(\n", + " macro_df[\"year\"].astype(int).astype(str)\n", + " + \"-\"\n", + " + ((macro_df[\"quarter\"] - 1) * 3 + 
1).astype(int).astype(str)\n", + " + \"-01\"\n", + " )\n", + "\n", + " # Drop the 'year' and 'quarter' columns\n", + " macro_df = macro_df.drop(columns=[\"year\", \"quarter\"])\n", + "\n", + " # Reorder columns to place 'ds' first\n", + " cols = [\"ds\"] + [col for col in macro_df.columns if col != \"ds\"]\n", + " macro_df = macro_df[cols].copy()\n", + "\n", + " return macro_df, target_col\n", + "\n", + "\n", + "def init_timeframes_for_backends(macro_df, target_col: str):\n", + " \"\"\"Initialize TimeFrame objects for all backends (Pandas, Polars, Modin) using constants.\n", + "\n", + " :param macro_df: Preprocessed macro dataset.\n", + " :type macro_df: pd.DataFrame\n", + " :param target_col: The target column for prediction.\n", + " :type target_col: str\n", + " :return: A dictionary containing TimeFrame objects for Pandas, Polars, and Modin.\n", + " :rtype: dict\n", + " \"\"\"\n", + " timeframes = {}\n", + "\n", + " # Pandas backend\n", + " macro_pandas_df = pd.DataFrame(macro_df)\n", + " timeframes[BACKEND_PANDAS] = tf(\n", + " macro_pandas_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_PANDAS\n", + " )\n", + "\n", + " # Polars backend\n", + " macro_polars_df = pl.DataFrame(macro_df)\n", + " timeframes[BACKEND_POLARS] = tf(\n", + " macro_polars_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_POLARS\n", + " )\n", + "\n", + " # Modin backend\n", + " macro_modin_df = mpd.DataFrame(macro_df)\n", + " timeframes[BACKEND_MODIN] = tf(\n", + " macro_modin_df, time_col=\"ds\", target_col=target_col, backend=BACKEND_MODIN\n", + " )\n", + "\n", + " return timeframes\n", + "\n", + "\n", + "def apply_target_shifter(tf_obj, mode: str):\n", + " \"\"\"Apply the TemporalTargetShifter in the specified mode.\n", + "\n", + " :param tf_obj: TimeFrame object to apply the shifter to.\n", + " :param mode: Mode of operation (machine_learning or deep_learning).\n", + " \"\"\"\n", + " print(f\"\\nApplying Target Shifter in {mode} mode...\")\n", + "\n", + " # Setup the TemporalTargetShifter\n", + " if mode == MODE_MACHINE_LEARNING:\n", + " shifter = TemporalTargetShifter(n_lags=1, mode=MODE_MACHINE_LEARNING, verbose=True)\n", + " elif mode == MODE_DEEP_LEARNING:\n", + " # In deep learning mode, sequence_length must be provided\n", + " shifter = TemporalTargetShifter(n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=3, verbose=True)\n", + " else:\n", + " raise ValueError(f\"Invalid mode: {mode}\")\n", + "\n", + " # Apply the shifter\n", + " shifted_df = shifter.fit_transform(tf_obj)\n", + "\n", + " # Print the result (since it's already a DataFrame, no need for get_data())\n", + " print(\"Shifted data:\")\n", + " print(shifted_df.head())\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " # Load the macrodata dataset and preprocess\n", + " macro_df, target_col = load_macrodata()\n", + "\n", + " # Initialize TimeFrame objects for various backends using constants\n", + " timeframes = init_timeframes_for_backends(macro_df, target_col)\n", + "\n", + " # Apply and demonstrate shifting for all backends\n", + " for backend, tf_obj in timeframes.items():\n", + " print_divider()\n", + " print(f\"Demonstrating Target Shifter for backend: {backend}\")\n", + " print(\"Preview of the TimeFrame DataFrame:\")\n", + " print(tf_obj.get_data().head())\n", + " print_divider()\n", + "\n", + " # Apply target shifting in machine learning mode\n", + " apply_target_shifter(tf_obj, MODE_MACHINE_LEARNING)\n", + "\n", + " # Apply target shifting in deep learning mode\n", + " apply_target_shifter(tf_obj, 
MODE_DEEP_LEARNING)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7d45fd7-3773-4d8b-ba66-af01b4aa2dd3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TemporalScope", + "language": "python", + "name": "temporalscope-kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b3320d064392bab1685bc9710179aba0833f14ad Mon Sep 17 00:00:00 2001 From: Philip Ndikum Date: Sun, 6 Oct 2024 02:05:20 +0000 Subject: [PATCH 3/6] feat(feat:-refactor-api-and-add-data-generator-for-multi-backend-support): refactor core api design and add systematic data generator - implement new core api to support multiple backends (pandas, polars, modin) - add synthetic_data_generator for systematic testing across backends - refactor core modules: core_utils, exceptions, temporal_data_loader, temporal_target_shifter - add new temporal_core_processing module - restructure and update test files to align with new api design - enhance functionality to support both single-step and multi-step operations - update pyproject.toml to reflect new structure and dependencies --- pyproject.toml | 5 +- src/temporalscope/core/core_utils.py | 460 ++++++- src/temporalscope/core/exceptions.py | 23 +- .../core/temporal_core_processing.py | 104 ++ .../core/temporal_data_loader.py | 540 +++----- .../core/temporal_target_shifter.py | 1114 ++++++++--------- src/temporalscope/datasets/datasets.py | 37 +- .../datasets/synthetic_data_generator.py | 335 +++++ test/unit/core/test_core_utils.py | 493 ++++++++ .../test_exceptions.py} | 17 +- test/unit/core/test_temporal_data_loader.py | 296 +++++ .../unit/core/test_temporal_target_shifter.py | 224 ++++ test/unit/{ => datasets}/test_datasets.py | 71 +- .../datasets/test_synthetic_data_generator.py | 107 ++ test/unit/partition/test_partition_padding.py | 362 ++++++ .../partition/test_partition_validators.py | 336 +++++ test/unit/test_core_temporal_data_loader.py | 266 ---- .../unit/test_core_temporal_target_shifter.py | 224 ---- test/unit/test_core_utils.py | 250 ---- test/unit/test_partition_padding.py | 362 ------ test/unit/test_partition_validators.py | 336 ----- 21 files changed, 3497 insertions(+), 2465 deletions(-) create mode 100644 src/temporalscope/core/temporal_core_processing.py create mode 100644 src/temporalscope/datasets/synthetic_data_generator.py create mode 100644 test/unit/core/test_core_utils.py rename test/unit/{test_core_exceptions.py => core/test_exceptions.py} (80%) create mode 100644 test/unit/core/test_temporal_data_loader.py create mode 100644 test/unit/core/test_temporal_target_shifter.py rename test/unit/{ => datasets}/test_datasets.py (60%) create mode 100644 test/unit/datasets/test_synthetic_data_generator.py create mode 100644 test/unit/partition/test_partition_padding.py create mode 100644 test/unit/partition/test_partition_validators.py delete mode 100644 test/unit/test_core_temporal_data_loader.py delete mode 100644 test/unit/test_core_temporal_target_shifter.py delete mode 100644 test/unit/test_core_utils.py delete mode 100644 test/unit/test_partition_padding.py delete mode 100644 test/unit/test_partition_validators.py diff --git a/pyproject.toml b/pyproject.toml index 2fbe849..a4bd3e4 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -91,7 +91,7 @@ readthedocs = "sphinx-build -WTb html . $READTHEDOCS_OUTPUT/html" serve = "python -m http.server --directory _build" [tool.hatch.envs.test] -extra-dependencies = ["pytest", "pytest-cov", "pytest-custom_exit_code"] +extra-dependencies = ["pytest", "pytest-cov", "pytest-custom_exit_code", "pytest-mock"] [tool.hatch.envs.test.scripts] unit = 'pytest --cov-report xml:coverage.xml --cov="temporalscope" -m "not integration" {args:test}' @@ -170,6 +170,7 @@ python_version = "3.10" ignore_missing_imports = true warn_unreachable = true exclude = 'test/*' +warn_return_any = false # Turn off MyPy warnings for missing return types [tool.bandit] exclude_dirs = ["test"] @@ -193,6 +194,7 @@ docformat = """ docformatter --check --recursive --wrap-summaries 120 --wrap-descriptions 120 src/temporalscope || \ docformatter --in-place --recursive --wrap-summaries 120 --wrap-descriptions 120 src/temporalscope """ +clear-coverage = "coverage erase" # Automated developer Q&A script quality-assurance = """ pytest && @@ -207,7 +209,6 @@ python -m ipykernel install --user --name temporalscope-kernel --display-name "T echo "Jupyter kernel 'TemporalScope' created. You can now use it in Jupyter notebooks." """ - [tool.commitizen] version = "0.1.0" update_changelog_on_bump = true diff --git a/src/temporalscope/core/core_utils.py b/src/temporalscope/core/core_utils.py index 329ac10..1683628 100644 --- a/src/temporalscope/core/core_utils.py +++ b/src/temporalscope/core/core_utils.py @@ -17,17 +17,99 @@ """TemporalScope/src/temporalscope/core/core_utils.py. -This module provides utility functions that can be used throughout the TemporalScope package. It includes methods for -printing dividers, checking for nulls and NaNs, and validating the backend. +This module provides essential utility functions for the TemporalScope package, +including support for: +- Backend validation (Pandas, Modin, Polars). +- Checking for nulls, NaNs, and handling mixed frequency issues in time series + data. +- Managing different modes (Single-step vs. Multi-step) for machine learning and + deep learning workflows. + +Engineering Design Assumptions: +------------------------------- +TemporalScope is designed around two core modes for time series workflows, based +on the assumption that users handle their own preprocessing (e.g., managing NaNs, +encoding categorical variables, scaling, etc.). + +These modes represent generalized structures that support both machine learning +and deep learning tasks, giving users the flexibility to manage their own +model-building workflows. + +1. Single-step mode: + - In this mode, each row of the data corresponds to a single time step with a + scalar target. + - Compatible with traditional machine learning frameworks (e.g., Scikit-learn, + XGBoost) as well as deep learning libraries like TensorFlow and PyTorch. + - The data is structured as a single DataFrame, where each row is an + observation, and the target is scalar. + - Example workflows include regression, classification, and survival models. + - TemporalScope allows simple shifting/lagging of variables within this mode. + - After partitioning (e.g., using a sliding window), users can convert the + data into the required format for their model. + +2. Multi-step mode: + - This mode supports tasks like sequence forecasting, where the target is a + sequence of values (multi-step). + - Data is split into two DataFrames: one for the input sequences (X) + and one for the target sequences (Y). 
+ - This mode is most commonly used in deep learning frameworks such as + TensorFlow and PyTorch, where the task involves predicting sequences of + time steps (e.g., seq2seq models). + - TemporalScope's partitioning algorithms (e.g., sliding window) can + partition this data for time series forecasting, making it ready for + sequential models. + +Core Purpose: +------------- +TemporalScope provides utility support for popular APIs such as TensorFlow, +PyTorch, Keras, and model-agnostic explainability tools (SHAP, Boruta-SHAP, LIME). +These utilities allow TemporalScope to fit seamlessly into machine learning and +deep learning workflows, while providing model-agnostic insights. + +Supported Modes: +---------------- +The following table illustrates the two core modes supported by TemporalScope. +These are generalized super-structures for time series tasks. Users are expected +to customize their workflows for specific model-building tasks (e.g., tree-based +models, neural networks, etc.): + ++----------------+-------------------------------------------------------------------+ +| Mode | Description | +| | Data Structure | ++----------------+-------------------------------------------------------------------+ +| single_step | General machine learning tasks with scalar targets. Each row is | +| | a single time step, and the target is scalar. | +| | Single DataFrame: each row is an observation. | +| | Frameworks: Scikit-learn, XGBoost, TensorFlow, PyTorch, etc. | ++----------------+-------------------------------------------------------------------+ +| multi_step | Sequential time series tasks (e.g., seq2seq) for deep learning. | +| | The data is split into sequences (input X, target Y). | +| | Two DataFrames: X for input sequences, Y for targets. | +| | Frameworks: TensorFlow, PyTorch, Keras. | ++----------------+-------------------------------------------------------------------+ + +.. note:: + + The table above is illustrative of common time series workflows that are + supported by machine learning and deep learning frameworks. Users will need + to manage their own data preprocessing (e.g., handling NaNs, scaling features, + encoding categorical variables) to ensure compatibility with these frameworks. + + TemporalScope provides tools for integrating popular model-agnostic + explainability techniques such as SHAP, Boruta-SHAP, and LIME, allowing users + to extract insights from any type of model. 
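+
+As a minimal illustration of the two layouts (a sketch only; the column names
+and toy values are illustrative, not part of the TemporalScope API):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    # Single-step mode: one DataFrame, one scalar target per row.
+    single_step_df = pd.DataFrame(
+        {
+            "time": pd.date_range("2023-01-01", periods=3),
+            "feature_1": [1.0, 2.0, 3.0],
+            "target": [10.0, 11.0, 12.0],
+        }
+    )
+
+    # Multi-step mode: input sequences (X) and target sequences (Y) live in
+    # two separate DataFrames that share the same time index.
+    X = pd.DataFrame(
+        {"time": pd.date_range("2023-01-01", periods=3), "feature_1": [1.0, 2.0, 3.0]}
+    )
+    Y = pd.DataFrame({"target_sequence": [[10.0, 11.0], [11.0, 12.0], [12.0, 13.0]]})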
""" import os -from typing import Dict, NoReturn, Optional, Union, cast +from typing import Dict, Optional, Union, cast, Callable, Type +from datetime import datetime, timedelta, date +import warnings import modin.pandas as mpd import pandas as pd import polars as pl from dotenv import load_dotenv +from temporalscope.core.exceptions import UnsupportedBackendError, MixedFrequencyWarning # Load environment variables from the .env file load_dotenv() @@ -36,8 +118,10 @@ BACKEND_POLARS = "pl" BACKEND_PANDAS = "pd" BACKEND_MODIN = "mpd" -MODE_MACHINE_LEARNING = "machine_learning" -MODE_DEEP_LEARNING = "deep_learning" + +# Modes for TemporalScope +MODE_SINGLE_STEP = "single_step" +MODE_MULTI_STEP = "multi_step" # Mapping of backend keys to their full names or module references BACKENDS = { @@ -50,6 +134,8 @@ "BACKENDS": BACKENDS, } +SUPPORTED_MULTI_STEP_BACKENDS = [BACKEND_PANDAS] + # Define a type alias for DataFrames that support Pandas, Modin, and Polars backends SupportedBackendDataFrame = Union[pd.DataFrame, mpd.DataFrame, pl.DataFrame] @@ -68,87 +154,131 @@ def validate_backend(backend: str) -> None: :param backend: The backend to validate ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin). :type backend: str - :raises ValueError: If the backend is not supported. + :raises UnsupportedBackendError: If the backend is not supported. """ if backend not in TF_DEFAULT_CFG["BACKENDS"]: - raise ValueError( - f"Unsupported backend '{backend}'. Supported backends are: " - f"{', '.join(TF_DEFAULT_CFG['BACKENDS'].keys())}." - ) + raise UnsupportedBackendError(backend) -def raise_invalid_backend(backend: str) -> NoReturn: - """Raise a ValueError for an invalid backend. +def infer_backend_from_dataframe(df: SupportedBackendDataFrame) -> str: + """Infer the backend from the DataFrame type. - :param backend: The backend to validate ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin). - :type backend: str - :raises ValueError: If the backend is not supported. + :param df: The input DataFrame. + :type df: SupportedBackendDataFrame + :return: The inferred backend ('pl', 'pd', or 'mpd'). + :rtype: str + :raises UnsupportedBackendError: If the DataFrame type is unsupported. """ - raise ValueError(f"Unsupported backend: {backend}") + if isinstance(df, pl.DataFrame): + return BACKEND_POLARS + elif isinstance(df, pd.DataFrame): + return BACKEND_PANDAS + elif isinstance(df, mpd.DataFrame): + return BACKEND_MODIN + else: + raise UnsupportedBackendError(f"Unsupported DataFrame type: {type(df)}") -def validate_input(df: SupportedBackendDataFrame, backend: str) -> None: - """Validate that the DataFrame matches the expected type for the specified backend. +def validate_mode(backend: str, mode: str) -> None: + """Validate if the backend supports the given mode. - :param df: The DataFrame to validate. - :type df: SupportedBackendDataFrame - :param backend: The backend against which to validate the DataFrame's type ('pl', 'pd', 'mpd'). - :type backend: str - :raises TypeError: If the DataFrame does not match the expected type for the backend. + :param backend: The backend type ('pl', 'pd', or 'mpd'). + :param mode: The mode type ('single_step' or 'multi_step'). + :raises NotImplementedError: If the backend does not support the requested mode. 
""" - if backend == BACKEND_POLARS and not isinstance(df, pl.DataFrame): - raise TypeError("Expected a Polars DataFrame.") - elif backend == BACKEND_PANDAS and not isinstance(df, pd.DataFrame): - raise TypeError("Expected a Pandas DataFrame.") - elif backend == BACKEND_MODIN and not isinstance(df, mpd.DataFrame): - raise TypeError("Expected a Modin DataFrame.") + if mode == MODE_MULTI_STEP and backend not in SUPPORTED_MULTI_STEP_BACKENDS: + raise NotImplementedError(f"The '{backend}' backend does not support multi-step mode.") -def validate_and_convert_input(df: SupportedBackendDataFrame, backend: str) -> SupportedBackendDataFrame: - """Validates and converts the input DataFrame to the specified backend type. +def validate_and_convert_input( + df: SupportedBackendDataFrame, + backend: str, + time_col: Optional[str] = None, + mode: str = MODE_SINGLE_STEP +) -> SupportedBackendDataFrame: + """Validates and converts the input DataFrame to the specified backend type, with optional time column casting. :param df: The input DataFrame to validate and convert. - :type df: SupportedBackendDataFrame :param backend: The desired backend type ('pl', 'pd', or 'mpd'). - :type backend: str + :param time_col: Optional; the name of the time column for casting. + :param mode: The processing mode ('single_step' or 'multi_step'). + :raises TypeError: If input DataFrame type doesn't match the specified backend or conversion fails. + :raises NotImplementedError: If multi-step mode is requested for unsupported backends or unsupported conversion to Polars. :return: The DataFrame converted to the specified backend type. - :rtype: SupportedBackendDataFrame - :raises TypeError: If the input DataFrame type doesn't match the specified backend or conversion fails. - :raises ValueError: If the backend is not supported. + + Example + ------- + Here's how you would use this function to convert a Pandas DataFrame to Polars: + + .. code-block:: python + + import pandas as pd + import polars as pl + + data = {'col1': [1, 2], 'col2': [3, 4], 'time': pd.date_range(start='1/1/2023', periods=2)} + df = pd.DataFrame(data) + + # Convert the DataFrame from Pandas to Polars, with an optional time column for casting + converted_df = validate_and_convert_input(df, 'pl', time_col='time') + print(type(converted_df)) # Output: + + # If you don't need to cast the time column, just omit the time_col argument + converted_df = validate_and_convert_input(df, 'pl') + print(type(converted_df)) # Output: + + .. note:: + - This function first converts the input DataFrame into the appropriate backend. + - If `time_col` is specified and the backend is Polars, it casts the time column to `pl.Datetime`. + - Pandas to Polars conversion is currently unsupported and raises a `NotImplementedError`. This needs to be implemented later. 
""" - validate_backend(backend) # Validates if backend is supported + # Validate the backend and mode combination + validate_backend(backend) + validate_mode(backend, mode) - # Mapping for backends and conversion functions - backend_conversion_map = { + # Backend conversion map + backend_conversion_map: Dict[ + str, Dict[Type[SupportedBackendDataFrame], Callable[[SupportedBackendDataFrame], SupportedBackendDataFrame]] + ] = { BACKEND_POLARS: { + # Polars to Polars pl.DataFrame: lambda x: x, - pd.DataFrame: pl.from_pandas, + # Pandas to Polars - currently not supported + pd.DataFrame: lambda x: (_ for _ in ()).throw(NotImplementedError("Pandas to Polars conversion is not currently supported.")), + # Modin to Polars mpd.DataFrame: lambda x: pl.from_pandas(x._to_pandas()), }, BACKEND_PANDAS: { - pd.DataFrame: lambda x: x, - pl.DataFrame: lambda x: x.to_pandas(), - mpd.DataFrame: lambda x: x._to_pandas(), + pd.DataFrame: lambda x: x, # Pandas to Pandas + pl.DataFrame: lambda x: x.to_pandas(), # Polars to Pandas + mpd.DataFrame: lambda x: x._to_pandas() if hasattr(x, "_to_pandas") else x, # Modin to Pandas }, BACKEND_MODIN: { - mpd.DataFrame: lambda x: x, - pd.DataFrame: lambda x: mpd.DataFrame(x), - pl.DataFrame: lambda x: mpd.DataFrame(x.to_pandas()), + mpd.DataFrame: lambda x: x, # Modin to Modin + pd.DataFrame: lambda x: mpd.DataFrame(x), # Pandas to Modin + pl.DataFrame: lambda x: mpd.DataFrame(x.to_pandas()), # Polars to Modin via Pandas }, } - if backend not in backend_conversion_map: - raise ValueError(f"Unsupported backend: {backend}") + # Step 1: Convert the DataFrame to the desired backend + converted_df = None + for dataframe_type, conversion_func in backend_conversion_map[backend].items(): + if isinstance(df, dataframe_type): + converted_df = conversion_func(df) + break + + if converted_df is None: + raise TypeError(f"Input DataFrame type {type(df)} does not match the specified backend '{backend}'") - conversion_map = backend_conversion_map[backend] - if not isinstance(conversion_map, dict): - raise TypeError(f"Conversion map for backend '{backend}' is not a dictionary") + # Step 2: Explicitly cast the time column to pl.Datetime if backend is Polars and the column exists + if backend == BACKEND_POLARS and time_col and time_col in converted_df.columns: + # Force cast time_col to pl.Datetime + converted_df = converted_df.with_columns(pl.col(time_col).cast(pl.Datetime)) - for dataframe_type, conversion_func in conversion_map.items(): - if isinstance(df, dataframe_type): - return conversion_func(df) + # Check the type of the column and assert it is correct + assert isinstance(converted_df[time_col][0], pl.Datetime), f"Expected a timestamp-like time column, but got {type(converted_df[time_col][0])}" + + return converted_df - raise TypeError(f"Input DataFrame type {type(df)} does not match the specified backend '{backend}'") def get_api_keys() -> Dict[str, Optional[str]]: @@ -190,20 +320,20 @@ def check_nulls(df: SupportedBackendDataFrame, backend: str) -> bool: :type backend: str :return: True if there are null values, False otherwise. :rtype: bool - :raises ValueError: If the backend is not supported. + :raises UnsupportedBackendError: If the backend is not supported. 
""" validate_backend(backend) if backend == BACKEND_PANDAS: return bool(cast(pd.DataFrame, df).isnull().values.any()) elif backend == BACKEND_POLARS: - polars_df = cast(pl.DataFrame, df) - null_count = polars_df.null_count().select(pl.col("*").sum()).to_numpy().sum() + null_count = cast(pl.DataFrame, df).null_count().select(pl.col("*").sum()).to_numpy().sum() return bool(null_count > 0) elif backend == BACKEND_MODIN: return bool(cast(mpd.DataFrame, df).isnull().values.any()) - else: - raise_invalid_backend(backend) + + # Suppress the warning since this path is unreachable due to `validate_backend` + # mypy: ignore def check_nans(df: SupportedBackendDataFrame, backend: str) -> bool: @@ -211,21 +341,221 @@ def check_nans(df: SupportedBackendDataFrame, backend: str) -> bool: :param df: The DataFrame to check for NaN values. :type df: SupportedBackendDataFrame - :param backend: The backend used for the DataFrame ('pl', 'pd', 'mpd'). + :param backend: The backend used for the DataFrame ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin'). :type backend: str :return: True if there are NaN values, False otherwise. :rtype: bool - :raises ValueError: If the backend is not supported. + :raises UnsupportedBackendError: If the backend is not supported. """ validate_backend(backend) if backend == BACKEND_PANDAS: return bool(cast(pd.DataFrame, df).isna().values.any()) elif backend == BACKEND_POLARS: - polars_df = cast(pl.DataFrame, df) - nan_count = polars_df.select((polars_df == float("nan")).sum()).to_numpy().sum() + nan_count = cast(pl.DataFrame, df).select(pl.col("*").is_nan().sum()).to_numpy().sum() return bool(nan_count > 0) elif backend == BACKEND_MODIN: return bool(cast(mpd.DataFrame, df).isna().values.any()) + + # Suppress the warning since this path is unreachable due to `validate_backend` + # mypy: ignore + + +def is_timestamp_like(df: SupportedBackendDataFrame, time_col: str) -> bool: + """Check if the specified column in the DataFrame is timestamp-like. + + This function can be used in the context of time series modeling to + validate that the time column is in an appropriate format for further + temporal operations such as sorting or windowing. + + This function assumes that the DataFrame has been pre-validated to ensure + it is using a supported backend. + + :param df: The DataFrame containing the time column. + :type df: SupportedBackendDataFrame + :param time_col: The name of the column representing time data. + :type time_col: str + :return: True if the column is timestamp-like, otherwise False. + :rtype: bool + :raises ValueError: If the time_col does not exist in the DataFrame. + + .. note:: + This function is primarily used for warning users if the time column is not + timestamp-like, but the final decision on how to handle this rests with the user. + """ + if time_col not in df.columns: + raise ValueError(f"Column '{time_col}' not found in the DataFrame.") + + time_column = df[time_col] + + if isinstance(df, (pd.DataFrame, mpd.DataFrame)): + return pd.api.types.is_datetime64_any_dtype(time_column) + elif isinstance(df, pl.DataFrame): + return time_column.dtype == pl.Datetime + + +def is_numeric(df: SupportedBackendDataFrame, time_col: str) -> bool: + """Check if the specified column in the DataFrame is numeric. + + :param df: The DataFrame containing the time column. + :type df: SupportedBackendDataFrame + :param time_col: The name of the column representing time data. + :type time_col: str + :return: True if the column is numeric, otherwise False. 
+    :rtype: bool
+    :raises ValueError: If the time_col does not exist in the DataFrame.
+    """
+    if time_col not in df.columns:
+        raise ValueError(f"Column '{time_col}' not found in the DataFrame.")
+
+    time_column = df[time_col]
+
+    # Handle empty columns for different backends
+    if isinstance(df, pl.DataFrame):
+        # Polars: Check if the DataFrame has zero rows or if the column is empty
+        if df.height == 0 or time_column.is_empty():
+            return False
+    elif isinstance(df, mpd.DataFrame):
+        # Modin: Check if the column is empty by using length
+        if len(time_column) == 0:
+            return False
+    elif isinstance(df, pd.DataFrame):
+        # Pandas: Check if the column is empty
+        if isinstance(time_column, pd.Series) and time_column.empty:
+            return False
+
+    # Check if the column is numeric based on the backend
+    if isinstance(df, (pd.DataFrame, mpd.DataFrame)):
+        return pd.api.types.is_numeric_dtype(time_column)
+    elif isinstance(df, pl.DataFrame):
+        return time_column.dtype in [pl.Int32, pl.Int64, pl.Float32, pl.Float64]
+
+    # Fallback for unsupported DataFrame types; pre-validated callers never reach this
+    return False
+
+
+def has_mixed_frequencies(df: SupportedBackendDataFrame, time_col: str, min_non_null_values: int = 3) -> bool:
+    """Check if the given time column in the DataFrame contains mixed frequencies.
+
+    This function is essential in time series data, as mixed frequencies (e.g., a mix of daily
+    and monthly data) can lead to inconsistent modeling outcomes. While some models may handle
+    mixed frequencies, others might struggle with this data structure.
+
+    The function ensures that a minimum number of non-null values are present
+    before inferring the frequency to avoid issues with small datasets.
+
+    :param df: The DataFrame containing the time column.
+    :type df: SupportedBackendDataFrame
+    :param time_col: The name of the column representing time data.
+    :type time_col: str
+    :param min_non_null_values: The minimum number of non-null values required to infer a frequency.
+                                Default is 3, which ensures enough data points for frequency inference.
+    :type min_non_null_values: int
+    :return: True if mixed frequencies are detected, otherwise False.
+    :rtype: bool
+    :raises ValueError: If the time_col does not exist in the DataFrame.
+    :raises UnsupportedBackendError: If the DataFrame is from an unsupported backend.
+    :warns MixedFrequencyWarning: If mixed timestamp frequencies are detected.
+
+    .. warning::
+        If mixed frequencies are detected, the user should be aware of potential issues in modeling. This function
+        will raise a warning but not prevent further operations, leaving it up to the user to handle.
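+
+    Example
+    -------
+    An illustrative sketch; the irregular timestamps below yield no inferable
+    frequency, so the check reports mixed frequencies:
+
+    .. code-block:: python
+
+        import pandas as pd
+
+        df = pd.DataFrame(
+            {'time': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-02-15', '2023-03-01'])}
+        )
+        has_mixed_frequencies(df, 'time')  # True, and emits MixedFrequencyWarning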
+    """
+    if time_col not in df.columns:
+        raise ValueError(f"Column '{time_col}' not found in the DataFrame.")
+
+    # Drop null values in the time column
+    if isinstance(df, pl.DataFrame):
+        time_column = df[time_col].drop_nulls()
     else:
-        raise_invalid_backend(backend)
+        time_column = df[time_col].dropna()
+
+    # Ensure there are at least min_non_null_values non-null values to infer frequency
+    if len(time_column) < min_non_null_values:
+        return False
+
+    # Infer frequency depending on backend
+    if isinstance(df, (pd.DataFrame, mpd.DataFrame)):
+        inferred_freq = pd.infer_freq(time_column)
+    elif isinstance(df, pl.DataFrame):
+        inferred_freq = pd.infer_freq(time_column.to_pandas())
+    else:
+        # Guard against unsupported DataFrame types, as documented above
+        raise UnsupportedBackendError(str(type(df)))
+
+    if inferred_freq is None:
+        warnings.warn("Mixed timestamp frequencies detected in the time column.", MixedFrequencyWarning)
+        return True
+    return False
+
+
+def sort_dataframe(
+    df: SupportedBackendDataFrame, time_col: str, backend: str, ascending: bool = True
+) -> SupportedBackendDataFrame:
+    """Sorts a DataFrame by the specified time column based on the backend.
+
+    :param df: The DataFrame to be sorted.
+    :type df: SupportedBackendDataFrame
+    :param time_col: The name of the column to sort by.
+    :type time_col: str
+    :param backend: The backend used for the DataFrame ('pl', 'pd', or 'mpd').
+    :type backend: str
+    :param ascending: If True, sort in ascending order; if False, sort in descending order. Default is True.
+    :type ascending: bool
+    :return: The sorted DataFrame.
+    :rtype: SupportedBackendDataFrame
+    :raises TypeError: If the DataFrame type does not match the backend.
+    :raises UnsupportedBackendError: If the backend is unsupported or validation fails.
+    """
+    # Validate backend
+    validate_backend(backend)
+
+    # Select backend-specific sorting logic
+    if backend == BACKEND_POLARS:
+        if not isinstance(df, pl.DataFrame):
+            raise TypeError(f"Expected Polars DataFrame but got {type(df)}")
+        return df.sort(by=time_col, descending=not ascending)
+
+    elif backend == BACKEND_PANDAS:
+        if not isinstance(df, pd.DataFrame):
+            raise TypeError(f"Expected Pandas DataFrame but got {type(df)}")
+        df.sort_values(by=time_col, ascending=ascending, inplace=True)
+        return df
+
+    elif backend == BACKEND_MODIN:
+        if not isinstance(df, mpd.DataFrame):
+            raise TypeError(f"Expected Modin DataFrame but got {type(df)}")
+        df.sort_values(by=time_col, ascending=ascending, inplace=True)
+        return df
+
+
+def check_empty_columns(df: SupportedBackendDataFrame, backend: str) -> bool:
+    """Check for empty columns in the DataFrame using the specified backend.
+
+    This function ensures that none of the columns in the DataFrame are effectively empty
+    (i.e., they contain only NaN or None values or are entirely empty).
+    It returns True if any column is found to be effectively empty, and False otherwise.
+
+    :param df: The DataFrame to check for empty columns.
+    :type df: SupportedBackendDataFrame
+    :param backend: The backend used for the DataFrame ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin).
+    :type backend: str
+    :return: True if there are any effectively empty columns, False otherwise.
+    :rtype: bool
+    :raises UnsupportedBackendError: If the backend is not supported.
+    :raises ValueError: If the DataFrame does not contain columns.
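+
+    Example
+    -------
+    A small illustrative sketch:
+
+    .. code-block:: python
+
+        import pandas as pd
+
+        df = pd.DataFrame({'a': [1.0, 2.0], 'b': [None, None]})
+        check_empty_columns(df, 'pd')  # True: column 'b' is entirely empty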
+ """ + # Validate the backend + validate_backend(backend) + + # Check for columns in the DataFrame + if df.shape[1] == 0: + raise ValueError("The DataFrame contains no columns to check.") + + # Define backend-specific logic for checking empty columns + if backend == BACKEND_PANDAS: + if any(cast(pd.DataFrame, df)[col].isnull().all() for col in df.columns): + return True + elif backend == BACKEND_POLARS: + if any(cast(pl.DataFrame, df)[col].null_count() == len(df) for col in df.columns): + return True + elif backend == BACKEND_MODIN: + if any(cast(mpd.DataFrame, df)[col].isnull().all() for col in df.columns): + return True + + # If no empty columns were found, return False + return False diff --git a/src/temporalscope/core/exceptions.py b/src/temporalscope/core/exceptions.py index ae4a58b..3095e48 100644 --- a/src/temporalscope/core/exceptions.py +++ b/src/temporalscope/core/exceptions.py @@ -66,13 +66,13 @@ class TimeFrameError(Exception): class TimeColumnError(TimeFrameError): """ Exception raised for errors related to the `time_col`. - This error is raised when the `time_col` in the TimeFrame is either - missing, contains unsupported types (non-numeric or non-timestamp), + This error is raised when the `time_col` in the TimeFrame is either + missing, contains unsupported types (non-numeric or non-timestamp), or has invalid data like null values. Attributes: message (str): Explanation of the error. - + Example Usage: -------------- .. code-block:: python @@ -138,3 +138,20 @@ class MixedFrequencyWarning(UserWarning): """ pass + + +class UnsupportedBackendError(Exception): + """Exception raised when an unsupported backend is encountered. + + This error is raised when a user attempts to use a backend that is not + supported by TemporalScope. It centralizes backend validation errors across the package. + + Attributes: + backend (str): The invalid backend that caused the error. + message (str): Explanation of the error. + """ + + def __init__(self, backend: str, message: str = "Unsupported backend"): + self.backend = backend + self.message = f"{message}: {backend}. Supported backends are 'pd', 'mpd', 'pl'." + super().__init__(self.message) diff --git a/src/temporalscope/core/temporal_core_processing.py b/src/temporalscope/core/temporal_core_processing.py new file mode 100644 index 0000000..bfc2e6e --- /dev/null +++ b/src/temporalscope/core/temporal_core_processing.py @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""TemporalScope/src/temporalscope/core/temporal_core_processing.py + +Core Dataset Conversion and Interoperability Layer + +This module provides core functionalities for dataset preparation and conversion, primarily +focused on handling multi-step workflows and ensuring interoperability between backends like +Pandas, TensorFlow, Modin, and Polars. It facilitates conversions required for downstream +tasks such as those used by the `temporal_target_shifter.py` module, ensuring multi-step +processing is smooth and integrated with deep learning and machine learning frameworks. + +The module is fully functional, avoiding object-oriented over-complication, following a +functional approach for ease of use and extensibility. + +Key Features: +------------- +- **Dataset Conversion**: Functions for converting between formats (e.g., Pandas, TensorFlow). +- **Interoperability**: Manages conversions between different backends for multi-step workflows. +- **Support for Future Extensions**: Stubbed for future implementations of key features required + by downstream tasks like multi-step target handling and TensorFlow dataset conversion. + +Example Usage: +-------------- +.. code-block:: python + + from temporal_core_processing import convert_to_tensorflow, convert_to_pandas + + # Example DataFrame + df = pd.DataFrame({ + 'time': pd.date_range(start='2023-01-01', periods=100, freq='D'), + 'feature_1': range(100), + 'target': range(100) + }) + + # Convert DataFrame to TensorFlow Dataset + tf_dataset = convert_to_tensorflow(df) + + # Convert TensorFlow Dataset back to Pandas + df_back = convert_to_pandas(tf_dataset) +""" + +from typing import Union +import pandas as pd +import polars as pl +import modin.pandas as mpd +import tensorflow as tf + +from temporalscope.core.core_utils import SupportedBackendDataFrame + + +def convert_to_tensorflow(df: SupportedBackendDataFrame) -> tf.data.Dataset: + """ + Stub: Convert a DataFrame to a TensorFlow Dataset. + + This function will convert Pandas, Modin, or Polars DataFrames into a TensorFlow Dataset + to enable compatibility with deep learning frameworks like TensorFlow. + + :param df: The input DataFrame to convert. + :return: A TensorFlow `tf.data.Dataset` object. + """ + pass + + +def convert_to_pandas(df: SupportedBackendDataFrame) -> pd.DataFrame: + """ + Stub: Convert a DataFrame or TensorFlow Dataset to a Pandas DataFrame. + + This function will handle converting Modin, Polars, or TensorFlow Datasets back to Pandas + DataFrames to ensure interoperability across backends and downstream tasks. + + :param df: The input DataFrame or TensorFlow Dataset. + :return: A Pandas DataFrame. + """ + pass + + +def handle_multi_step_conversion(df: pd.DataFrame, sequence_length: int) -> pd.DataFrame: + """ + Stub: Prepare DataFrame for multi-step forecasting. + + This function will handle the preparation of multi-step targets by expanding the target + column into sequences of the specified length, suitable for sequential models. + + :param df: The input DataFrame containing single-step targets. + :param sequence_length: The length of the target sequence for multi-step forecasting. + :return: A DataFrame with expanded target sequences. + """ + pass diff --git a/src/temporalscope/core/temporal_data_loader.py b/src/temporalscope/core/temporal_data_loader.py index 79650f5..31ded7d 100644 --- a/src/temporalscope/core/temporal_data_loader.py +++ b/src/temporalscope/core/temporal_data_loader.py @@ -25,31 +25,51 @@ Engineering Design -------------------- -.. 
note:: TemporalScope is designed with several key assumptions to ensure performance, scalability, and flexibility across a wide range of time series forecasting and XAI workflows. - 1. **Preprocessed Data Assumption**: + 1. Preprocessed Data Assumption: TemporalScope assumes that the user provides clean, preprocessed data. This includes handling categorical encoding, missing data imputation, and feature scaling prior to using TemporalScope's partitioning and explainability methods. Similar assumptions are seen in popular packages such as TensorFlow and GluonTS, which expect the user to manage data preprocessing outside of the core workflow. - 2. **Time Column Constraints**: + + 2. Time Column Constraints: The `time_col` must be either a numeric index or a timestamp. TemporalScope relies on this temporal ordering for - key operations like sliding window partitioning and temporal explainability workflows (e.g., SHAP). Packages like - **Facebook Prophet** and **Darts** also require proper temporal ordering as a baseline assumption for modeling time - series data. - 3. **Numeric Features Requirement**: + key operations like sliding window partitioning and temporal explainability workflows (e.g., SHAP). + + 3. Numeric Features Requirement: Aside from the `time_col`, all other features in the dataset must be numeric. This ensures compatibility with machine - learning and deep learning models that require numeric inputs. As seen in frameworks like TensorFlow and - Prophet, users are expected to preprocess categorical features (e.g., one-hot encoding or embeddings) before - applying modeling or partitioning algorithms. - 4. **Modular Design for Explainability**: - TemporalScope assumes a modular, window-based design that is naturally compatible with model-agnostic explainability - methods like SHAP and LIME. Features are expected to be structured in a temporal context for efficient partitioning - and explainability. This mirrors the design of frameworks like Darts, which use similar assumptions for time - series forecasting and explainability workflows. + learning and deep learning models that require numeric inputs. As seen in frameworks like TensorFlow, users are expected + to preprocess categorical features (e.g., one-hot encoding or embeddings) before applying modeling or partitioning algorithms. + + 4. Universal Model Assumption: + TemporalScope is designed with the assumption that models trained will operate on the entire dataset without + automatically applying hidden groupings or segmentations (e.g., for mixed-frequency data). This ensures that users + can leverage frameworks like SHAP, Boruta-SHAP, and LIME for model-agnostic explainability without limitations. + + 5. Supported Data Modes: + + TemporalScope also integrates seamlessly with model-agnostic explainability techniques like SHAP, LIME, and + Boruta-SHAP, allowing insights to be extracted from most machine learning and deep learning models. + + The following table illustrates the two primary modes supported by TemporalScope and their typical use cases: + + +--------------------+----------------------------------------------------+--------------------------------------------+ + | **Mode** | **Description** | **Compatible Frameworks** | + +--------------------+----------------------------------------------------+--------------------------------------------+ + | Single-step mode | Suitable for scalar target machine learning tasks. | Scikit-learn, XGBoost, LightGBM, SHAP | + | | Each row represents a single time step. 
| TensorFlow (for standard regression tasks) | + +--------------------+----------------------------------------------------+--------------------------------------------+ + | Multi-step mode | Suitable for deep learning tasks like sequence | TensorFlow, PyTorch, Keras, SHAP, LIME | + | | forecasting. Input sequences (`X`) and output | (for seq2seq models, sequence forecasting) | + | | sequences (`Y`) are handled as separate datasets. | | + +--------------------+----------------------------------------------------+--------------------------------------------+ + + + These modes follow standard assumptions in time series forecasting libraries, allowing for seamless integration + with different models while requiring the user to manage their own data preprocessing. By enforcing these constraints, TemporalScope focuses on its core purpose—time series partitioning, explainability, and scalability—while leaving more general preprocessing tasks to the user. This follows industry standards seen in @@ -57,15 +77,40 @@ .. seealso:: - 1. Van Ness, M., Shen, H., Wang, H., Jin, X., Maddix, D.C., & Gopalswamy, K. (2023). Cross-Frequency Time Series Meta-Forecasting. arXiv preprint arXiv:2302.02077. - 2. Woo, G., Liu, C., Kumar, A., Xiong, C., Savarese, S., & Sahoo, D. (2024). Unified training of universal time series forecasting transformers. arXiv preprint arXiv:2402.02592. - 3. Trirat, P., Shin, Y., Kang, J., Nam, Y., Na, J., Bae, M., Kim, J., Kim, B., & Lee, J.-G. (2024). Universal time-series representation learning: A survey. arXiv preprint arXiv:2401.03717. - 4. Xu, Q., Zhuo, X., Jiang, C. and Liu, Y., 2019. An artificial neural network for mixed frequency data. Expert Systems with Applications, 118, pp.127-139.4 - 5. Filho, L.L., de Oliveira Werneck, R., Castro, M., Ribeiro Mendes Júnior, P., Lustosa, A., Zampieri, M., Linares, O., Moura, R., Morais, E., Amaral, M. and Salavati, S., 2024. A multi-modal approach for mixed-frequency time series forecasting. Neural Computing and Applications, pp.1-25. + 1. Van Ness, M., Shen, H., Wang, H., Jin, X., Maddix, D.C., & Gopalswamy, K. + (2023). Cross-Frequency Time Series Meta-Forecasting. arXiv preprint + arXiv:2302.02077. + + 2. Woo, G., Liu, C., Kumar, A., Xiong, C., Savarese, S., & Sahoo, D. (2024). + Unified training of universal time series forecasting transformers. arXiv + preprint arXiv:2402.02592. + + 3. Trirat, P., Shin, Y., Kang, J., Nam, Y., Na, J., Bae, M., Kim, J., Kim, B., & + Lee, J.-G. (2024). Universal time-series representation learning: A survey. + arXiv preprint arXiv:2401.03717. + + 4. Xu, Q., Zhuo, X., Jiang, C., & Liu, Y. (2019). An artificial neural network + for mixed frequency data. Expert Systems with Applications, 118, pp.127-139. + + 5. Filho, L.L., de Oliveira Werneck, R., Castro, M., Ribeiro Mendes Júnior, P., + Lustosa, A., Zampieri, M., Linares, O., Moura, R., Morais, E., Amaral, M., & + Salavati, S. (2024). A multi-modal approach for mixed-frequency time series + forecasting. Neural Computing and Applications, pp.1-25. + +.. note:: + + - Multi-Step Mode Limitation: Multi-step mode is not fully supported for backends like Modin and Polars + due to their inability to handle vectorized (sequence) targets in a single cell. This limitation will require + an interoperability layer for converting datasets into compatible formats (e.g., TensorFlow's `tf.data.Dataset` + or flattening the target sequences for use in Modin/Polars). 
+ - Single-step mode: All backends (Pandas, Modin, Polars) work as expected without the need for special handling. + - Recommendation: For multi-step mode, please use Pandas for now until support is added for other backends. + Future releases will include an interoperability step to handle vectorized targets across different backends. + """ import warnings -from typing import Optional, Union +from typing import Optional, Union, cast from datetime import datetime, timedelta, date import modin.pandas as mpd @@ -75,8 +120,8 @@ from temporalscope.core.exceptions import ( TimeColumnError, MixedTypesWarning, - MixedTimezonesWarning, MixedFrequencyWarning, + UnsupportedBackendError, ) from temporalscope.core.core_utils import ( @@ -84,12 +129,18 @@ BACKEND_PANDAS, BACKEND_POLARS, SupportedBackendDataFrame, - get_default_backend_cfg, validate_and_convert_input, + infer_backend_from_dataframe, validate_backend, - validate_input, + is_numeric, + is_timestamp_like, + has_mixed_frequencies, + sort_dataframe, + check_empty_columns, + check_nulls, ) + # Define alias with forward reference TimeFrameCompatibleData = Union["TimeFrame", SupportedBackendDataFrame] @@ -250,183 +301,35 @@ def _setup_timeframe(self, df: SupportedBackendDataFrame, dataframe_backend: Opt :param dataframe_backend: The backend to use. If None, it will be inferred. :type dataframe_backend: Optional[str] - :raises ValueError: If required validations fail (e.g., missing columns, unsupported backend). - """ - - # Infer backend if not provided - self._dataframe_backend = dataframe_backend or self._infer_dataframe_backend(df) - - # Set the DataFrame - self.df = validate_and_convert_input(df, self._dataframe_backend) - - # Validate data (e.g., columns, types) - self.validate_data() - - # Sort the data if necessary - if self._sort: - self.sort_data() - - def _infer_dataframe_backend(self, df: SupportedBackendDataFrame) -> str: - """Infer the backend from the DataFrame type. - - :param df: The input DataFrame. - :type df: SupportedBackendDataFrame - :return: The inferred backend ('pl', 'pd', or 'mpd'). - :rtype: str - :raises ValueError: If the DataFrame type is unsupported. - """ - if isinstance(df, pl.DataFrame): - return BACKEND_POLARS - elif isinstance(df, pd.DataFrame): - return BACKEND_PANDAS - elif isinstance(df, mpd.DataFrame): - return BACKEND_MODIN - else: - raise ValueError(f"Unsupported DataFrame type: {type(df)}") - - def _validate_numeric_features(self) -> None: - """Validate that all features, except for the time column, are numeric. - - This method checks that all columns, other than the `time_col`, contain numeric data, which is a requirement - for machine learning and deep learning workflows. - - :raises ValueError: If any feature column is not numeric. + :raises UnsupportedBackendError: If the DataFrame's backend is unsupported. + :raises TimeColumnError: If the time column fails validation. + :raises ValueError: If other general validations (e.g., missing columns) fail. 
""" - df = self.get_data() - - # Backend-specific handling for numeric validation - if self.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: - non_numeric_columns = [ - col for col in df.columns if col != self.time_col and not pd.api.types.is_numeric_dtype(df[col]) - ] - elif self.dataframe_backend == BACKEND_POLARS: - non_numeric_columns = [col for col in df.columns if col != self.time_col and not df[col].dtype.is_numeric()] - else: - raise ValueError(f"Unsupported backend: {self.dataframe_backend}") - - if non_numeric_columns: - raise ValueError( - f"All features except `time_col` must be numeric. Found non-numeric columns: {non_numeric_columns}." - ) - - def _validate_time_column(self) -> None: - """Validate that the `time_col` in the DataFrame is either numeric or timestamp-like. - - This ensures the `time_col` can be used for temporal operations like sorting - or partitioning, which are essential for time-series forecasting. The `time_col` - must be numeric (e.g., integers) or timestamp-like (e.g., datetime). Mixed frequencies - (e.g., daily and monthly timestamps) are allowed, but mixed data types (e.g., numeric and - string) are not. String data types in `time_col` are not allowed across any backend. - - :raises TimeColumnError: If `time_col` is missing, contains unsupported types (non-numeric or non-timestamp), - or has missing values. - :warns MixedFrequencyWarning: If `time_col` contains mixed frequencies (e.g., daily and monthly timestamps). - :warns MixedTimezonesWarning: If `time_col` contains mixed timezone-aware and timezone-naive entries. - """ - df = self.get_data() - - # Ensure the time column exists - if self.time_col not in df.columns: - raise TimeColumnError(f"Missing required column: {self.time_col}") - - # Time column could be Pandas/Modin Series or Polars Series - time_col = df[self.time_col] - - # Narrowing the type to ensure type checking with MyPy - if isinstance(time_col, (pd.Series, mpd.Series)): - # Check for missing values in time_col (specific to Pandas/Modin) - if time_col.isnull().any(): - raise TimeColumnError("Missing values found in `time_col`") - - # Validate if time_col is either numeric or timestamp-like - is_numeric_col = self._is_numeric(time_col) - is_timestamp_col = self._is_timestamp_like(time_col) - - if not is_numeric_col and not is_timestamp_col: - raise TimeColumnError(f"`time_col` must be either numeric or timestamp-like, got {time_col.dtype}") - - # Raise MixedFrequencyWarning if mixed frequencies are detected - if is_timestamp_col and self._has_mixed_frequencies(time_col): - warnings.warn("`time_col` contains mixed timestamp frequencies.", MixedFrequencyWarning) - - # Raise MixedTimezonesWarning if mixed timezones are detected - if is_timestamp_col and self._has_mixed_timezones(time_col): - warnings.warn( - "`time_col` contains mixed timezone-aware and timezone-naive entries.", MixedTimezonesWarning - ) - - elif isinstance(time_col, pl.Series): - # Check for missing values in Polars - if time_col.is_null().sum() > 0: - raise TimeColumnError("Missing values found in `time_col`") - - is_numeric_col = self._is_numeric(time_col) - is_timestamp_col = self._is_timestamp_like(time_col) - - if not is_numeric_col and not is_timestamp_col: - raise TimeColumnError(f"`time_col` must be either numeric or timestamp-like, got {time_col.dtype}") - - # Raise MixedFrequencyWarning if mixed frequencies are detected - if is_timestamp_col and self._has_mixed_frequencies(time_col): - warnings.warn("`time_col` contains mixed timestamp frequencies.", 
MixedFrequencyWarning) - - # Raise MixedTimezonesWarning if mixed timezones are detected - if is_timestamp_col and self._has_mixed_timezones(time_col): - warnings.warn( - "`time_col` contains mixed timezone-aware and timezone-naive entries.", MixedTimezonesWarning - ) + try: + # Step 1: Check if a backend was provided, otherwise infer from the DataFrame + if dataframe_backend is not None: + self._dataframe_backend = dataframe_backend + else: + self._dataframe_backend = infer_backend_from_dataframe(df) - def _is_timestamp_like(self, time_col: Union[pd.Series, mpd.Series, pl.Series]) -> bool: - """Check if a time column is timestamp-like based on the backend. + # Step 2: Validate and convert the input DataFrame to the correct backend format + self.df = validate_and_convert_input(df, self._dataframe_backend) - :param time_col: The time column to check. - :return: True if the column is timestamp-like, False otherwise. - """ - if self.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: - return pd.api.types.is_datetime64_any_dtype(time_col) - elif self.dataframe_backend == BACKEND_POLARS: - return time_col.dtype == pl.Datetime - return False + # Step 3: Perform data validation + self.validate_data() - def _is_numeric(self, time_col: Union[pd.Series, mpd.Series, pl.Series]) -> bool: - """Check if a time column is numeric based on the backend. + # Step 4: Sort the DataFrame if sorting is enabled + if self._sort: + self.sort_data() - :param time_col: The time column to check. - :return: True if the column is numeric, False otherwise. - """ - if self.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: - return pd.api.types.is_numeric_dtype(time_col) - elif self.dataframe_backend == BACKEND_POLARS: - return time_col.dtype in [pl.Int32, pl.Int64, pl.Float32, pl.Float64] - return False + except UnsupportedBackendError as e: + raise UnsupportedBackendError(f"Unsupported backend: {e}") - def _has_mixed_frequencies(self, time_col: Union[pd.Series, mpd.Series, pl.Series]) -> bool: - """Check for mixed frequencies in the time column. + except TimeColumnError as e: + raise TimeColumnError(f"Time column validation failed: {e}") - :param time_col: The time column to check for mixed frequencies. - :return: True if mixed frequencies are detected, False otherwise. - """ - if isinstance(time_col, (pd.Series, mpd.Series)): - inferred_freq = pd.infer_freq(time_col.dropna()) - return inferred_freq is None - elif isinstance(time_col, pl.Series): - inferred_freq = time_col.to_pandas().infer_freq() # Converts Polars to Pandas for frequency detection - return inferred_freq is None - return False - - def _has_mixed_timezones(self, time_col: Union[pd.Series, mpd.Series, pl.Series]) -> bool: - """Check for mixed timezone-aware and naive timestamps. - - :param time_col: The time column to check for mixed timezones. - :return: True if mixed timezone-aware and naive timestamps are detected, False otherwise. - """ - if isinstance(time_col, (pd.Series, mpd.Series)): - if time_col.dt.tz is not None: - return time_col.dt.tz.hasnans - elif isinstance(time_col, pl.Series): - dtype_str = str(time_col.dtype) - return "TimeZone" in dtype_str - return False + except ValueError as e: + raise ValueError(f"General validation error: {e}") def get_data(self) -> SupportedBackendDataFrame: """Return the DataFrame in its current state. 
@@ -457,56 +360,12 @@ def get_data(self) -> SupportedBackendDataFrame: """ return self.df - def validate_data(self) -> None: - """Run validation checks on the TimeFrame data to ensure it meets the required constraints. - - This method runs all internal validation checks to ensure that: - - The `time_col` is numeric or timestamp-like. - - All features, except `time_col`, are numeric. - - There are no missing values in the `time_col` or `target_col`. - - It checks for mixed frequencies in the `time_col` and raises a warning if detected. - - :raises ValueError: If any of the validation checks fail. - - Example Usage: - -------------- - .. code-block:: python - - from temporalscope.core.temporal_data_loader import TimeFrame - import pandas as pd - - # Create a Pandas DataFrame - data = { - 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), - 'target': range(5, 0, -1) - } - df = pd.DataFrame(data) - - # Initialize a TimeFrame - tf = TimeFrame(df, time_col='time', target_col='target') - - # Validate the data - tf.validate_data() - - """ - # Centralized validation of time column - self._validate_time_column() - - # Ensure all non-time columns are numeric - self._validate_numeric_features() - - # Validate that there are no missing values in the time and target columns - self._validate_no_missing_values() - - # Check for mixed frequencies in the time column - self._check_mixed_frequencies() - - # Indicate successful validation - print("Data validation passed successfully.") - def sort_data(self, ascending: bool = True) -> None: """Sort the DataFrame by the time column in place. + This method sorts the DataFrame based on the `time_col` in ascending or descending order. + The sorting logic is handled based on the backend (Pandas, Polars, or Modin) via the `sort_dataframe` utility. + :param ascending: If True, sort in ascending order; if False, sort in descending order. :type ascending: bool :raises TypeError: If the DataFrame type does not match the backend. 
@@ -532,44 +391,41 @@ def sort_data(self, ascending: bool = True) -> None: # Sort the DataFrame in ascending order tf.sort_data(ascending=True) print(tf.df) + + # Sort the DataFrame in descending order + tf.sort_data(ascending=False) + print(tf.df) """ - # Validate the DataFrame before sorting + # Ensure the DataFrame is valid before sorting self.validate_data() - sort_key = [self._time_col] - - # Mapping of backends to their sort functions, sorting in place - if self.dataframe_backend == BACKEND_POLARS: - if isinstance(self.df, pl.DataFrame): - self.df = self.df.sort(by=sort_key, descending=not ascending) - else: - raise TypeError(f"Expected Polars DataFrame but got {type(self.df)}") - elif self.dataframe_backend == BACKEND_PANDAS: - if isinstance(self.df, pd.DataFrame): - self.df.sort_values(by=sort_key, ascending=ascending, inplace=True) - else: - raise TypeError(f"Expected Pandas DataFrame but got {type(self.df)}") - elif self.dataframe_backend == BACKEND_MODIN: - if isinstance(self.df, mpd.DataFrame): - self.df.sort_values(by=sort_key, ascending=ascending, inplace=True) - else: - raise TypeError(f"Expected Modin DataFrame but got {type(self.df)}") - else: - raise ValueError(f"Unsupported dataframe backend {self._dataframe_backend}") + # Use the utility function from core_utils to perform the sort + self.df = sort_dataframe(self.df, self._time_col, self.dataframe_backend, ascending) def update_data( - self, new_df: SupportedBackendDataFrame, time_col: Optional[str] = None, target_col: Optional[str] = None + self, + new_df: Optional[SupportedBackendDataFrame] = None, + new_target_col: Optional[Union[pl.Series, pd.Series, mpd.Series]] = None, + time_col: Optional[str] = None, + target_col: Optional[str] = None, ) -> None: - """Updates the internal DataFrame with the provided new DataFrame and ensures backend consistency. - - :param new_df: The new DataFrame to replace the existing one. - :type new_df: SupportedBackendDataFrame - :param time_col: The name of the column representing time. Should be numeric or timestamp-like for sorting. Optional. - :type time_col: Optional[str] - :param target_col: The column representing the target variable. Must be a valid column in the DataFrame. Optional. - :type target_col: Optional[str] - :raises TypeError: If the new DataFrame type does not match the backend. - :raises ValueError: If required columns are missing in the new DataFrame, or validation fails. + """Update the internal DataFrame and columns with new data. + + This method updates the internal DataFrame (`df`) and/or the `target_col` with the new data provided. + It ensures the backend remains consistent across Polars, Pandas, or Modin. It validates the input + DataFrame, checks its length, and performs safe updates. + + :param new_df: The new DataFrame to replace the existing one. Optional. + :type new_df: SupportedBackendDataFrame, optional + :param new_target_col: The new target column to replace the existing one. Optional. + :type new_target_col: Union[pl.Series, pd.Series, mpd.Series], optional + :param time_col: The name of the column representing time. Optional. + :type time_col: str, optional + :param target_col: The column representing the target variable. Optional. + :type target_col: str, optional + :raises TypeError: If the target column type does not match the backend or is not numeric. + :raises ValueError: If the length of the new target column or new DataFrame does not match the existing one. + :raises UnsupportedBackendError: If the backend is unsupported. 
 Example Usage:
         --------------
         .. code-block:: python
 
             from temporalscope.core.temporal_data_loader import TimeFrame
             import pandas as pd
 
-            # Create initial DataFrame
+            # Create a Pandas DataFrame
             df = pd.DataFrame({
                 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'),
                 'target': range(5, 0, -1)
@@ -587,41 +443,58 @@ def update_data(
             # Initialize a TimeFrame
             tf = TimeFrame(df, time_col='time', target_col='target')
 
-            # Create new DataFrame to update
-            new_df = pd.DataFrame({
-                'time': pd.date_range(start='2021-01-06', periods=5, freq='D'),
-                'target': range(1, 6)
-            })
-
-            # Update the DataFrame within TimeFrame
-            tf.update_data(new_df, time_col='time', target_col='target')
+            # Update the DataFrame and target column
+            new_target = pd.Series([1, 2, 3, 4, 5], name='target')
+            tf.update_data(new_df=None, new_target_col=new_target)
             print(tf.get_data())
         """
-        # Infer backend for the new DataFrame if needed
-        self._dataframe_backend = self._infer_dataframe_backend(new_df)
-
-        # Validate and update the time_col and target_col if provided
+        # Update time_col and target_col if provided
         if time_col:
-            if time_col not in new_df.columns:
-                raise ValueError(f"`time_col` {time_col} not found in the new DataFrame.")
             self._time_col = time_col
-
         if target_col:
-            if target_col not in new_df.columns:
-                raise ValueError(f"`target_col` {target_col} not found in the new DataFrame.")
-            self.update_target_col(new_df[target_col])
+            self._target_col = target_col
+
+        # If a new DataFrame is provided, validate and convert it
+        if new_df is not None:
+            self.df = validate_and_convert_input(new_df, self._dataframe_backend)
+
+        # If a new target column is provided, validate and update it
+        if new_target_col is not None:
+            # Ensure the new target column has the same length as the current DataFrame
+            if len(new_target_col) != len(self.df):
+                raise ValueError("The new target column must have the same number of rows as the existing DataFrame.")
+
+            # Update the target column using backend-specific logic
+            if self._dataframe_backend == BACKEND_POLARS:
+                self.df = self.df.with_columns([new_target_col.alias(self._target_col)])
+            elif self._dataframe_backend == BACKEND_PANDAS:
+                self.df[self._target_col] = new_target_col.to_numpy()  # Convert to NumPy for Pandas
+            elif self._dataframe_backend == BACKEND_MODIN:
+                self.df[self._target_col] = new_target_col.to_numpy()  # Convert to NumPy for Modin
+
+            # Validate that the updated target column is numeric
+            # (checked after the update so the new values, not the old column, are inspected)
+            if not is_numeric(self.df, self._target_col):
+                raise TypeError(f"The target column '{self._target_col}' must be numeric.")
+
+            # Perform validation of the data after updating
+            self.validate_data()
 
-        # Use _setup_timeframe to centralize backend inference, validation, and sorting
-        self._setup_timeframe(new_df, self._dataframe_backend)
+        # Sort the DataFrame if needed
+        if self._sort:
+            self.sort_data()
 
-    def update_target_col(self, new_target_col: SupportedBackendDataFrame) -> None:
-        """Updates the target column in the internal DataFrame with the provided new target column.
+    def validate_data(self) -> None:
+        """Run validation checks on the TimeFrame data to ensure it meets the required constraints.
 
-        :param new_target_col: The new target column to replace the existing one.
-        :type new_target_col: Union[pl.Series, pd.Series, mpd.Series]
-        :raises TypeError: If the target column type does not match the backend.
-        :raises ValueError: If the length of the new target column does not match the DataFrame, or validation fails.
+ This method performs the following validations: + - The `time_col` is either numeric or timestamp-like. + - All columns, except for the `time_col`, are numeric. + - There are no missing values in the `time_col` and `target_col`. + - No columns in the DataFrame are entirely empty. + + :raises ValueError: If any validation checks fail. + :raises UnsupportedBackendError: If the DataFrame backend is unsupported. Example Usage: -------------- @@ -639,51 +512,26 @@ def update_target_col(self, new_target_col: SupportedBackendDataFrame) -> None: # Initialize a TimeFrame tf = TimeFrame(df, time_col='time', target_col='target') - # Update the target column with new values - new_target = pd.Series([1, 2, 3, 4, 5], name='target') - tf.update_target_col(new_target) - print(tf.get_data()) + # Run validation on the TimeFrame + tf.validate_data() """ - # Step 1: Validate the target column type - if self._dataframe_backend == BACKEND_POLARS: - if not isinstance(new_target_col, pl.Series): - raise TypeError("Expected a Polars Series for the Polars backend.") - elif self._dataframe_backend == BACKEND_PANDAS: - if not isinstance(new_target_col, pd.Series): - raise TypeError("Expected a Pandas Series for the Pandas backend.") - elif self._dataframe_backend == BACKEND_MODIN: - if not isinstance(new_target_col, mpd.Series): - raise TypeError("Expected a Modin Series for the Modin backend.") - else: - raise ValueError(f"Unsupported dataframe_backend {self._dataframe_backend}") - - # Step 2: Check if the new target column length matches the DataFrame length - if len(new_target_col) != len(self.df): - raise ValueError("The new target column must have the same number of rows as the DataFrame.") - - # Step 3: Validate the entire DataFrame before making changes - self.validate_data() + # 1. Check if any columns are entirely empty + if check_empty_columns(self.df, self._dataframe_backend): + raise ValueError("One or more columns in the DataFrame are entirely empty (all values are NaN or None).") + + # 2. Validate `time_col` is numeric or timestamp-like + if not is_numeric(self.df, self._time_col) and not is_timestamp_like(self.df, self._time_col): + raise TimeColumnError( + f"`time_col` must be numeric or timestamp-like, found {self.df[self._time_col].dtype}" + ) + + # 3. Validate all non-time columns are numeric + non_numeric_columns = [col for col in self.df.columns if col != self._time_col and not is_numeric(self.df, col)] + if non_numeric_columns: + raise ValueError( + f"All features except `time_col` must be numeric. 
Non-numeric columns: {non_numeric_columns}"
+            )
 
-        # Step 4: If all validations pass, proceed with updating the target column
-        # Use a temporary copy of the DataFrame for update and commit only after all checks
-        temp_df = None  # Declare once without type hints
-
-        if self._dataframe_backend == BACKEND_POLARS:
-            temp_df = self.df.clone()  # Polars DataFrame uses `clone()`
-        elif self._dataframe_backend == BACKEND_PANDAS and isinstance(self.df, pd.DataFrame):
-            temp_df = self.df.copy()  # Pandas DataFrame uses `copy()`
-        elif self._dataframe_backend == BACKEND_MODIN and isinstance(self.df, mpd.DataFrame):
-            temp_df = self.df.copy()  # Modin DataFrame uses `copy()`
-        else:
-            raise ValueError(f"Unsupported dataframe_backend {self._dataframe_backend}")
-
-        # Update the target column based on the backend
-        if self._dataframe_backend == BACKEND_POLARS:
-            temp_df = temp_df.with_columns([new_target_col.alias(self._target_col)])
-        elif self._dataframe_backend == BACKEND_PANDAS:
-            temp_df[self._target_col] = new_target_col.to_numpy()  # Convert to NumPy for Pandas
-        elif self._dataframe_backend == BACKEND_MODIN:
-            temp_df[self._target_col] = new_target_col.to_numpy()  # Use .to_numpy() for Modin
-
-        # Step 5: Commit the changes by updating the internal DataFrame
-        self.df = temp_df
+        # 4. Check for missing values anywhere in the DataFrame (covers `time_col` and `target_col`)
+        if check_nulls(self.df, self._dataframe_backend):
+            raise ValueError("Missing values found in the DataFrame; `time_col` and `target_col` must be complete.")
diff --git a/src/temporalscope/core/temporal_target_shifter.py b/src/temporalscope/core/temporal_target_shifter.py
index ff3a0fe..a09be28 100644
--- a/src/temporalscope/core/temporal_target_shifter.py
+++ b/src/temporalscope/core/temporal_target_shifter.py
@@ -39,8 +39,6 @@
     BACKEND_MODIN,
     BACKEND_PANDAS,
     BACKEND_POLARS,
-    MODE_MACHINE_LEARNING,
-    MODE_DEEP_LEARNING,
     SupportedBackendDataFrame,
     validate_backend,
 )
@@ -48,559 +46,559 @@
 from temporalscope.core.temporal_data_loader import TimeFrameCompatibleData
 
 
-class TemporalTargetShifter:
-    """A class for shifting the target variable in time series data for machine learning or deep learning.
-
-    This class works with `TimeFrame` objects or raw DataFrame types (Pandas, Modin, Polars) to shift the target variable
-    by a specified number of lags (time steps). It supports multiple backends and can generate output suitable for
-    machine learning models (scalar) or deep learning models (sequences).
-
-    Design:
-    -------
-    The `TemporalTargetShifter` follows a strategy pattern, where the data format (backend) is either inferred from the
-    input or set explicitly. This enables flexible support for different DataFrame libraries. The class ensures that
-    input type consistency is maintained, and it returns the same data type that is provided. For instance, if the input
-    is a `TimeFrame`, the output will be a `TimeFrame`. If a raw DataFrame is provided, the output will be a raw
-    DataFrame of the same type.
-
-    Assumptions:
-    ------------
-    1. Time shifting is applied globally, meaning the data is not grouped by entities (e.g., tickers or SKUs). Users
-       should handle such grouping outside of this class.
-    2. The time shifting is applied to a target column, which may have varying data structures depending on the backend
-       (Polars, Pandas, Modin).
-
-    :param target_col: The column representing the target variable (mandatory).
-    :type target_col: str
-    :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1.
- :type n_lags: int - :param mode: Mode of operation: "machine_learning" for scalar or "deep_learning" for sequences. - Default is "machine_learning". - :type mode: str - :param sequence_length: (Deep Learning Mode Only) The length of the input sequences. Required if mode is "deep_learning". - :type sequence_length: Optional[int] - :param drop_target: Whether to drop the original target column after shifting. Default is True. - :type drop_target: bool - :param verbose: If True, prints information about the number of dropped rows during transformation. - :type verbose: bool - :raises ValueError: If the backend is unsupported or if validation checks fail. - - Examples - -------- - **Using TimeFrame:** - - .. code-block:: python - - from temporalscope.core.temporal_data_loader import TimeFrame - from temporalscope.core.temporal_target_shifter import TemporalTargetShifter - - # Create a sample Pandas DataFrame - data = { - 'time': pd.date_range(start='2022-01-01', periods=100), - 'target': np.random.rand(100), - 'feature_1': np.random.rand(100) - } - df = pd.DataFrame(data) - - # Create a TimeFrame object - tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") - - # Apply target shifting - shifter = TemporalTargetShifter(target_col="target", n_lags=1) - shifted_df = shifter.fit_transform(tf) - - **Using SlidingWindowPartitioner:** - - .. code-block:: python - - from temporalscope.partition.sliding_window import SlidingWindowPartitioner - from temporalscope.core.temporal_data_loader import TimeFrame - from temporalscope.core.temporal_target_shifter import TemporalTargetShifter - - # Create a sample TimeFrame - tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") - - # Create a SlidingWindowPartitioner - partitioner = SlidingWindowPartitioner(tf=tf, window_size=10, stride=1) - - # Apply TemporalTargetShifter on each partition - shifter = TemporalTargetShifter(target_col="target", n_lags=1) - for partition in partitioner.fit_transform(): - shifted_partition = shifter.fit_transform(partition) - """ - - def __init__( - self, - target_col: Optional[str] = None, - n_lags: int = 1, - mode: str = MODE_MACHINE_LEARNING, - sequence_length: Optional[int] = None, - drop_target: bool = True, - verbose: bool = False, - ): - """Initialize the TemporalTargetShifter. - - :param target_col: Column representing the target variable (mandatory). - :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1. - :param mode: Mode of operation: "machine_learning" or "deep_learning". Default is "machine_learning". - :param sequence_length: (Deep Learning Mode Only) Length of the input sequences. Required if mode is - "deep_learning". - :param drop_target: Whether to drop the original target column after shifting. Default is True. - :param verbose: Whether to print detailed information about transformations. - :raises ValueError: If the target column is not provided or if an invalid mode is selected. - - Note: - The data_format is set to None during initialization and will be inferred in the fit() method based on - the type of input data (TimeFrame or SupportedBackendDataFrame). 
- """ - # Validate the mode (should be machine learning or deep learning) - if mode not in [MODE_MACHINE_LEARNING, MODE_DEEP_LEARNING]: - raise ValueError(f"`mode` must be '{MODE_MACHINE_LEARNING}' or '{MODE_DEEP_LEARNING}'.") - - # Ensure the target column is provided - if target_col is None: - raise ValueError("`target_col` must be explicitly provided for TemporalTargetShifter.") - - # Validate n_lags (should be greater than 0) - if n_lags <= 0: - raise ValueError("`n_lags` must be greater than 0.") - - # Handle deep learning mode, ensure sequence length is set - if mode == MODE_DEEP_LEARNING and sequence_length is None: - raise ValueError("`sequence_length` must be provided when mode is 'deep_learning'.") - - # Assign instance attributes - self.target_col = target_col - self.n_lags = n_lags - self.mode = mode - self.sequence_length = sequence_length - self.drop_target = drop_target - self.verbose = verbose - - # The data format will be inferred later during fit() - self.data_format: Optional[str] = None - - # Print a verbose message if required - if verbose: - print(f"Initialized TemporalTargetShifter with target_col={target_col}, mode={mode}, n_lags={n_lags}") - - def _infer_data_format(self, df: SupportedBackendDataFrame) -> str: - """Infer the backend from the DataFrame type. - - :param df: The input DataFrame. - :type df: SupportedBackendDataFrame - :return: The inferred backend ('BACKEND_POLARS', 'BACKEND_PANDAS', or 'BACKEND_MODIN'). - :raises ValueError: If the DataFrame type is unsupported. - """ - if isinstance(df, pl.DataFrame): - return BACKEND_POLARS - elif isinstance(df, pd.DataFrame): - return BACKEND_PANDAS - elif isinstance(df, mpd.DataFrame): - return BACKEND_MODIN - else: - raise ValueError(f"Unsupported DataFrame type: {type(df)}") - - def _set_or_infer_data_format(self, tf: TimeFrameCompatibleData) -> None: - """Set or infer the data format based on the input type. - - This method checks if the input is a TimeFrame and uses its data format. - If the input is a raw DataFrame (Pandas, Modin, or Polars), it infers the data format. - """ - if isinstance(tf, TimeFrame): - self.data_format = tf.dataframe_backend - else: - # Infer the data format using the existing _infer_data_format method - self.data_format = self._infer_data_format(tf) - - if self.data_format is None: - raise ValueError("Data format could not be inferred or is not set.") - - validate_backend(self.data_format) - - def _validate_data(self, tf: TimeFrameCompatibleData) -> None: - """Validate the TimeFrame or DataFrame input for consistency. - - This method ensures that the input data is valid and non-empty, regardless of whether it is a TimeFrame or a raw DataFrame. - - :param tf: The `TimeFrame` object or a raw DataFrame (Pandas, Modin, or Polars) to be validated. - :type tf: TimeFrameCompatibleData - :raises ValueError: If the input data is empty or invalid. - """ - if isinstance(tf, TimeFrame): - df = tf.get_data() - else: - df = tf - - # Check if the DataFrame is empty - if isinstance(df, (pd.DataFrame, mpd.DataFrame)): - if df is None or df.empty: - raise ValueError("Input DataFrame is empty.") - elif isinstance(df, pl.DataFrame): - if df.is_empty(): - raise ValueError("Input DataFrame is empty.") - else: - raise ValueError("Unsupported DataFrame type.") - - def _shift_polars(self, df: pl.DataFrame, target_col: str) -> pl.DataFrame: - """Shift the target variable in a Polars DataFrame. - - :param df: The Polars DataFrame containing the time series data. 
- :type df: pl.DataFrame - :param target_col: The column representing the target variable. - :type target_col: str - :return: The Polars DataFrame with the shifted target variable. - :rtype: pl.DataFrame - :raises ValueError: If `sequence_length` or `n_lags` are not properly set. - """ - if self.mode == MODE_DEEP_LEARNING: - if not isinstance(self.sequence_length, int): - raise ValueError("`sequence_length` must be an integer.") - shifted_columns = [ - df[target_col].shift(-i).alias(f"{target_col}_shift_{i}") for i in range(self.sequence_length) - ] - df = df.with_columns(shifted_columns) - df = df.with_columns( - pl.concat_list([pl.col(f"{target_col}_shift_{i}") for i in range(self.sequence_length)]).alias( - f"{target_col}_sequence" - ) - ) - df = df.drop([f"{target_col}_shift_{i}" for i in range(self.sequence_length)]) - df = df.drop_nulls() - df = df.slice(0, len(df) - self.sequence_length + 1) - else: - df = df.with_columns(df[target_col].shift(-self.n_lags).alias(f"{target_col}_shift_{self.n_lags}")) - df = df.drop_nulls() - - if df.is_empty(): - raise ValueError("DataFrame is empty after shifting operation.") - - if self.drop_target: - df = df.drop(target_col) - - return df - - def _shift_pandas_modin( - self, df: Union[pd.DataFrame, mpd.DataFrame], target_col: str - ) -> Union[pd.DataFrame, mpd.DataFrame]: - """Shift the target variable in a Pandas or Modin DataFrame. - - :param df: The Pandas or Modin DataFrame containing the time series data. - :type df: Union[pd.DataFrame, mpd.DataFrame] - :param target_col: The column representing the target variable. - :type target_col: str - :return: The DataFrame with the shifted target variable. - :rtype: Union[pd.DataFrame, mpd.DataFrame] - :raises ValueError: If `sequence_length` or `n_lags` are not properly set. - """ - if self.mode == MODE_DEEP_LEARNING: - if not isinstance(self.sequence_length, int): - raise ValueError("`sequence_length` must be an integer.") - shifted_columns = [df[target_col].shift(-i) for i in range(self.sequence_length)] - df[f"{target_col}_sequence"] = list(zip(*shifted_columns)) - df = df.dropna() - df = df.iloc[: -self.sequence_length + 1] - else: - df[f"{target_col}_shift_{self.n_lags}"] = df[target_col].shift(-self.n_lags) - df = df.dropna() - - if df.empty: - raise ValueError("DataFrame is empty after shifting operation.") - - if self.drop_target: - df = df.drop(columns=[target_col]) - - return df - - def _transform_pandas_modin(self, df: Union[pd.DataFrame, mpd.DataFrame]) -> Union[pd.DataFrame, mpd.DataFrame]: - """Handle shifting for Pandas or Modin backends. - - :param df: The input DataFrame (Pandas or Modin). - :type df: Union[pd.DataFrame, mpd.DataFrame] - :return: The transformed DataFrame with the target column shifted. - :rtype: Union[pd.DataFrame, mpd.DataFrame] - :raises ValueError: If `target_col` is not set. - """ - # Ensure target_col is not None - if self.target_col is None: - raise ValueError("`target_col` must be set before transformation.") - - df = self._shift_pandas_modin(df, self.target_col) - - rows_before = len(df) - df = df.dropna() # Handle missing values - rows_after = len(df) - - if rows_after == 0: - raise ValueError("All rows were dropped during transformation.") - - self._print_dropped_rows(rows_before, rows_after) - return df - - def _transform_polars(self, df: pl.DataFrame) -> pl.DataFrame: - """Handle shifting for Polars backend. - - :param df: The input Polars DataFrame. - :type df: pl.DataFrame - :return: The transformed Polars DataFrame with the target column shifted. 
- :rtype: pl.DataFrame - :raises ValueError: If `target_col` is not set. - """ - # Ensure target_col is not None - if self.target_col is None: - raise ValueError("`target_col` must be set before transformation.") - - df = self._shift_polars(df, self.target_col) - - rows_before = df.shape[0] - df = df.drop_nulls() - rows_after = df.shape[0] - - if rows_after == 0: - raise ValueError("All rows were dropped during transformation.") - - self._print_dropped_rows(rows_before, rows_after) - return df - - def _print_dropped_rows(self, rows_before: int, rows_after: int) -> None: - """Print information about dropped rows if verbose mode is enabled. - - :param rows_before: Number of rows before dropping nulls. - :type rows_before: int - :param rows_after: Number of rows after dropping nulls. - :type rows_after: int - """ - if self.verbose: - rows_dropped = rows_before - rows_after - print(f"Rows before shift: {rows_before}; Rows after shift: {rows_after}; Rows dropped: {rows_dropped}") - - def fit(self, tf: TimeFrameCompatibleData) -> "TemporalTargetShifter": - """Validate and prepare the target data for transformation based on the inferred data format (backend). - - The `fit` method initializes the data format (whether it's a `TimeFrame` or a raw DataFrame) and validates the input data. - It ensures the target column is consistent with the input data and sets the backend (`data_format`), which will be used - in subsequent transformations. - - :param tf: The `TimeFrame` object or a raw DataFrame (`pandas`, `modin`, or `polars`) that contains the time series data. - The data should contain a target column that will be shifted. - :type tf: TimeFrameCompatibleData - :raises ValueError: If the target column is not provided, the data is invalid, or the backend format is unsupported. - :raises Warning: If the target column provided in `TemporalTargetShifter` differs from the one in the `TimeFrame`. - :return: The fitted `TemporalTargetShifter` instance, ready for transforming the data. - :rtype: TemporalTargetShifter - - Example Usage: - -------------- - .. 
code-block:: python - - from temporalscope.core.temporal_target_shifter import TemporalTargetShifter - from temporalscope.core.temporal_data_loader import TimeFrame - import pandas as pd - import numpy as np - - # Create a sample Pandas DataFrame - data = { - 'time': pd.date_range(start='2022-01-01', periods=100), - 'target': np.random.rand(100), - 'feature_1': np.random.rand(100) - } - df = pd.DataFrame(data) - - # Create a TimeFrame object - tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") - - # Create a TemporalTargetShifter instance - shifter = TemporalTargetShifter(n_lags=2, target_col="target") - - # Fit the shifter to the TimeFrame - shifter.fit(tf) - """ - # Validate the input data (whether it's TimeFrame or DataFrame) - self._validate_data(tf) - - # If input is a TimeFrame, set the backend using the @property method and manage the target column - if isinstance(tf, TimeFrame): - self.data_format = tf.dataframe_backend # Using the @property to access the backend - if not self.target_col: - self.target_col = tf._target_col # If target_col not set in the shifter, use TimeFrame's target_col - elif self.target_col != tf._target_col: - warnings.warn( - f"The `target_col` in TemporalTargetShifter ('{self.target_col}') differs from the TimeFrame's " - f"target_col ('{tf._target_col}').", - UserWarning, - ) - # If input is a raw DataFrame (pandas, modin, or polars), infer the backend - elif tf is not None: - self.data_format = self._infer_data_format(tf) - else: - raise ValueError("Input data is None.") - - # Return the instance after fitting - return self - - def transform(self, tf: TimeFrameCompatibleData) -> TimeFrameCompatibleData: - """Transform the input time series data by shifting the target variable according to the specified number of lags. - - The `transform` method shifts the target variable in the input data according to the `n_lags` or `sequence_length` - set during initialization. This method works directly on either a `TimeFrame` or a raw DataFrame (Pandas, Modin, - or Polars), applying the appropriate backend-specific transformation. - - Design: - ------- - The method returns the same type as the input: If a `TimeFrame` object is passed in, a `TimeFrame` object is returned. - If a raw DataFrame (Pandas, Modin, or Polars) is passed in, the same type of DataFrame is returned. This ensures that - the transformation remains consistent in pipeline workflows where the type of data object is important. - - :param tf: The `TimeFrame` object or a DataFrame (Pandas, Modin, or Polars) that contains the time series data - to be transformed. The data should contain a target column that will be shifted. - :type tf: TimeFrameCompatibleData - :raises ValueError: If the input data is invalid, unsupported, or lacks columns. - :raises ValueError: If the backend is unsupported or data validation fails. - :return: A transformed `TimeFrame` if the input was a `TimeFrame`, otherwise a DataFrame of the same type as the input. - :rtype: TimeFrameCompatibleData - - Example Usage: - -------------- - .. 
code-block:: python - - from temporalscope.core.temporal_target_shifter import TemporalTargetShifter - from temporalscope.core.temporal_data_loader import TimeFrame - import pandas as pd - - # Create a sample Pandas DataFrame - data = { - 'time': pd.date_range(start='2022-01-01', periods=100), - 'target': np.random.rand(100), - 'feature_1': np.random.rand(100) - } - df = pd.DataFrame(data) - - # Create a TimeFrame object - tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") - - # Initialize TemporalTargetShifter - shifter = TemporalTargetShifter(n_lags=2, target_col="target") - - # Fit the shifter and transform the data - shifter.fit(tf) - transformed_data = shifter.transform(tf) - - """ - # Handle TimeFrame input: sort data and retrieve the DataFrame - if isinstance(tf, TimeFrame): - tf.sort_data() # Ensure data is sorted before shifting - df = tf.get_data() - - # If target_col isn't set in the shifter, retrieve it from TimeFrame - if not self.target_col: - self.target_col = tf._target_col - - # Assign the backend from TimeFrame - self.data_format = tf.dataframe_backend - - # Handle raw DataFrame input - elif tf is not None: - df = tf - - # Infer the target column from the input if not already set - if not self.target_col: - if hasattr(df, "columns"): - self.target_col = df.columns[-1] - else: - raise ValueError("The input DataFrame does not have columns.") - - # Set or infer the backend for the DataFrame - self._set_or_infer_data_format(df) - else: - raise ValueError("Input data is None.") - - # Delegate transformation to backend-specific methods - if self.data_format == BACKEND_PANDAS or self.data_format == BACKEND_MODIN: - transformed_df = self._transform_pandas_modin(df) - elif self.data_format == BACKEND_POLARS: - transformed_df = self._transform_polars(df) - else: - raise ValueError(f"Unsupported backend: {self.data_format}") - - # If the input was a TimeFrame, return a transformed TimeFrame - if isinstance(tf, TimeFrame): - return TimeFrame( - transformed_df, - time_col=tf.time_col, - target_col=( - f"{self.target_col}_shift_{self.n_lags}" - if self.mode == MODE_MACHINE_LEARNING - else f"{self.target_col}_sequence" - ), - dataframe_backend=self.data_format, - ) - - return transformed_df - - def fit_transform(self, tf: TimeFrameCompatibleData) -> TimeFrameCompatibleData: - """Fit and transform the input data in a single step. - - This method combines the functionality of the `fit` and `transform` methods. It first validates and prepares the input - data (fitting), then applies the target variable shifting (transformation) based on the `n_lags` or `sequence_length` - specified during initialization. - - Design: - ------- - The output type mirrors the input type. If a `TimeFrame` is provided, a `TimeFrame` is returned. If a raw DataFrame - (Pandas, Modin, or Polars) is provided, the output will be a DataFrame of the same type. This ensures that the - transformation remains consistent with the input, making it easier to work with in pipeline workflows. - - :param tf: The `TimeFrame` object or a DataFrame (`pandas`, `modin`, or `polars`) to be transformed. - :type tf: TimeFrameCompatibleData - :raises ValueError: If the input data is invalid or the backend is unsupported. - :raises ValueError: If the target column is not set, or is incompatible with the data. - :return: A transformed `TimeFrame` if the input was a `TimeFrame`, otherwise a DataFrame of the same type as the input. - :rtype: TimeFrameCompatibleData - - Example Usage: - -------------- - .. 
code-block:: python - - from temporalscope.core.temporal_target_shifter import TemporalTargetShifter - from temporalscope.core.temporal_data_loader import TimeFrame - import pandas as pd - - # Create a sample Pandas DataFrame - data = { - 'time': pd.date_range(start='2022-01-01', periods=100), - 'target': np.random.rand(100), - 'feature_1': np.random.rand(100) - } - df = pd.DataFrame(data) - - # Create a TimeFrame object - tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend="pd") - - # Initialize TemporalTargetShifter - shifter = TemporalTargetShifter(n_lags=2, target_col="target") - - # Fit and transform in a single step - shifted_data = shifter.fit_transform(tf) - """ - # Fit the data (infers backend and validates input) - self.fit(tf) - - # Apply the transformation (delegates to backend-specific methods) - transformed = self.transform(tf) - - # If the input was a TimeFrame, return a new TimeFrame with the transformed DataFrame - if isinstance(tf, TimeFrame): - tf_casted = cast(TimeFrame, tf) - return TimeFrame( - transformed, # Pass the transformed DataFrame directly - time_col=tf_casted.time_col, - target_col=( - f"{self.target_col}_shift_{self.n_lags}" - if self.mode == MODE_MACHINE_LEARNING - else f"{self.target_col}_sequence" - ), - dataframe_backend=tf_casted.dataframe_backend, # Ensure we use the original backend from the input - ) - - # Otherwise, return the transformed raw DataFrame - return transformed +# class TemporalTargetShifter: +# """A class for shifting the target variable in time series data for machine learning or deep learning. + +# This class works with `TimeFrame` objects or raw DataFrame types (Pandas, Modin, Polars) to shift the target variable +# by a specified number of lags (time steps). It supports multiple backends and can generate output suitable for +# machine learning models (scalar) or deep learning models (sequences). + +# Design: +# ------- +# The `TemporalTargetShifter` follows a strategy pattern, where the data format (backend) is either inferred from the +# input or set explicitly. This enables flexible support for different DataFrame libraries. The class ensures that +# input type consistency is maintained, and it returns the same data type that is provided. For instance, if the input +# is a `TimeFrame`, the output will be a `TimeFrame`. If a raw DataFrame is provided, the output will be a raw +# DataFrame of the same type. + +# Assumptions: +# ------------ +# 1. Time shifting is applied globally, meaning the data is not grouped by entities (e.g., tickers or SKUs). Users +# should handle such grouping outside of this class. +# 2. The time shifting is applied to a target column, which may have varying data structures depending on the backend +# (Polars, Pandas, Modin). + +# :param target_col: The column representing the target variable (mandatory). +# :type target_col: str +# :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1. +# :type n_lags: int +# :param mode: Mode of operation: "machine_learning" for scalar or "deep_learning" for sequences. +# Default is "machine_learning". +# :type mode: str +# :param sequence_length: (Deep Learning Mode Only) The length of the input sequences. Required if mode is "deep_learning". +# :type sequence_length: Optional[int] +# :param drop_target: Whether to drop the original target column after shifting. Default is True. +# :type drop_target: bool +# :param verbose: If True, prints information about the number of dropped rows during transformation. 
+# :type verbose: bool +# :raises ValueError: If the backend is unsupported or if validation checks fail. + +# Examples +# -------- +# **Using TimeFrame:** + +# .. code-block:: python + +# from temporalscope.core.temporal_data_loader import TimeFrame +# from temporalscope.core.temporal_target_shifter import TemporalTargetShifter + +# # Create a sample Pandas DataFrame +# data = { +# 'time': pd.date_range(start='2022-01-01', periods=100), +# 'target': np.random.rand(100), +# 'feature_1': np.random.rand(100) +# } +# df = pd.DataFrame(data) + +# # Create a TimeFrame object +# tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") + +# # Apply target shifting +# shifter = TemporalTargetShifter(target_col="target", n_lags=1) +# shifted_df = shifter.fit_transform(tf) + +# **Using SlidingWindowPartitioner:** + +# .. code-block:: python + +# from temporalscope.partition.sliding_window import SlidingWindowPartitioner +# from temporalscope.core.temporal_data_loader import TimeFrame +# from temporalscope.core.temporal_target_shifter import TemporalTargetShifter + +# # Create a sample TimeFrame +# tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") + +# # Create a SlidingWindowPartitioner +# partitioner = SlidingWindowPartitioner(tf=tf, window_size=10, stride=1) + +# # Apply TemporalTargetShifter on each partition +# shifter = TemporalTargetShifter(target_col="target", n_lags=1) +# for partition in partitioner.fit_transform(): +# shifted_partition = shifter.fit_transform(partition) +# """ + +# def __init__( +# self, +# target_col: Optional[str] = None, +# n_lags: int = 1, +# mode: str = MODE_MACHINE_LEARNING, +# sequence_length: Optional[int] = None, +# drop_target: bool = True, +# verbose: bool = False, +# ): +# """Initialize the TemporalTargetShifter. + +# :param target_col: Column representing the target variable (mandatory). +# :param n_lags: Number of lags (time steps) to shift the target variable. Default is 1. +# :param mode: Mode of operation: "machine_learning" or "deep_learning". Default is "machine_learning". +# :param sequence_length: (Deep Learning Mode Only) Length of the input sequences. Required if mode is +# "deep_learning". +# :param drop_target: Whether to drop the original target column after shifting. Default is True. +# :param verbose: Whether to print detailed information about transformations. +# :raises ValueError: If the target column is not provided or if an invalid mode is selected. + +# Note: +# The data_format is set to None during initialization and will be inferred in the fit() method based on +# the type of input data (TimeFrame or SupportedBackendDataFrame). 
+# """ +# # Validate the mode (should be machine learning or deep learning) +# if mode not in [MODE_MACHINE_LEARNING, MODE_DEEP_LEARNING]: +# raise ValueError(f"`mode` must be '{MODE_MACHINE_LEARNING}' or '{MODE_DEEP_LEARNING}'.") + +# # Ensure the target column is provided +# if target_col is None: +# raise ValueError("`target_col` must be explicitly provided for TemporalTargetShifter.") + +# # Validate n_lags (should be greater than 0) +# if n_lags <= 0: +# raise ValueError("`n_lags` must be greater than 0.") + +# # Handle deep learning mode, ensure sequence length is set +# if mode == MODE_DEEP_LEARNING and sequence_length is None: +# raise ValueError("`sequence_length` must be provided when mode is 'deep_learning'.") + +# # Assign instance attributes +# self.target_col = target_col +# self.n_lags = n_lags +# self.mode = mode +# self.sequence_length = sequence_length +# self.drop_target = drop_target +# self.verbose = verbose + +# # The data format will be inferred later during fit() +# self.data_format: Optional[str] = None + +# # Print a verbose message if required +# if verbose: +# print(f"Initialized TemporalTargetShifter with target_col={target_col}, mode={mode}, n_lags={n_lags}") + +# def _infer_data_format(self, df: SupportedBackendDataFrame) -> str: +# """Infer the backend from the DataFrame type. + +# :param df: The input DataFrame. +# :type df: SupportedBackendDataFrame +# :return: The inferred backend ('BACKEND_POLARS', 'BACKEND_PANDAS', or 'BACKEND_MODIN'). +# :raises ValueError: If the DataFrame type is unsupported. +# """ +# if isinstance(df, pl.DataFrame): +# return BACKEND_POLARS +# elif isinstance(df, pd.DataFrame): +# return BACKEND_PANDAS +# elif isinstance(df, mpd.DataFrame): +# return BACKEND_MODIN +# else: +# raise ValueError(f"Unsupported DataFrame type: {type(df)}") + +# def _set_or_infer_data_format(self, tf: TimeFrameCompatibleData) -> None: +# """Set or infer the data format based on the input type. + +# This method checks if the input is a TimeFrame and uses its data format. +# If the input is a raw DataFrame (Pandas, Modin, or Polars), it infers the data format. +# """ +# if isinstance(tf, TimeFrame): +# self.data_format = tf.dataframe_backend +# else: +# # Infer the data format using the existing _infer_data_format method +# self.data_format = self._infer_data_format(tf) + +# if self.data_format is None: +# raise ValueError("Data format could not be inferred or is not set.") + +# validate_backend(self.data_format) + +# def _validate_data(self, tf: TimeFrameCompatibleData) -> None: +# """Validate the TimeFrame or DataFrame input for consistency. + +# This method ensures that the input data is valid and non-empty, regardless of whether it is a TimeFrame or a raw DataFrame. + +# :param tf: The `TimeFrame` object or a raw DataFrame (Pandas, Modin, or Polars) to be validated. +# :type tf: TimeFrameCompatibleData +# :raises ValueError: If the input data is empty or invalid. +# """ +# if isinstance(tf, TimeFrame): +# df = tf.get_data() +# else: +# df = tf + +# # Check if the DataFrame is empty +# if isinstance(df, (pd.DataFrame, mpd.DataFrame)): +# if df is None or df.empty: +# raise ValueError("Input DataFrame is empty.") +# elif isinstance(df, pl.DataFrame): +# if df.is_empty(): +# raise ValueError("Input DataFrame is empty.") +# else: +# raise ValueError("Unsupported DataFrame type.") + +# def _shift_polars(self, df: pl.DataFrame, target_col: str) -> pl.DataFrame: +# """Shift the target variable in a Polars DataFrame. 
+ +# :param df: The Polars DataFrame containing the time series data. +# :type df: pl.DataFrame +# :param target_col: The column representing the target variable. +# :type target_col: str +# :return: The Polars DataFrame with the shifted target variable. +# :rtype: pl.DataFrame +# :raises ValueError: If `sequence_length` or `n_lags` are not properly set. +# """ +# if self.mode == MODE_DEEP_LEARNING: +# if not isinstance(self.sequence_length, int): +# raise ValueError("`sequence_length` must be an integer.") +# shifted_columns = [ +# df[target_col].shift(-i).alias(f"{target_col}_shift_{i}") for i in range(self.sequence_length) +# ] +# df = df.with_columns(shifted_columns) +# df = df.with_columns( +# pl.concat_list([pl.col(f"{target_col}_shift_{i}") for i in range(self.sequence_length)]).alias( +# f"{target_col}_sequence" +# ) +# ) +# df = df.drop([f"{target_col}_shift_{i}" for i in range(self.sequence_length)]) +# df = df.drop_nulls() +# df = df.slice(0, len(df) - self.sequence_length + 1) +# else: +# df = df.with_columns(df[target_col].shift(-self.n_lags).alias(f"{target_col}_shift_{self.n_lags}")) +# df = df.drop_nulls() + +# if df.is_empty(): +# raise ValueError("DataFrame is empty after shifting operation.") + +# if self.drop_target: +# df = df.drop(target_col) + +# return df + +# def _shift_pandas_modin( +# self, df: Union[pd.DataFrame, mpd.DataFrame], target_col: str +# ) -> Union[pd.DataFrame, mpd.DataFrame]: +# """Shift the target variable in a Pandas or Modin DataFrame. + +# :param df: The Pandas or Modin DataFrame containing the time series data. +# :type df: Union[pd.DataFrame, mpd.DataFrame] +# :param target_col: The column representing the target variable. +# :type target_col: str +# :return: The DataFrame with the shifted target variable. +# :rtype: Union[pd.DataFrame, mpd.DataFrame] +# :raises ValueError: If `sequence_length` or `n_lags` are not properly set. +# """ +# if self.mode == MODE_DEEP_LEARNING: +# if not isinstance(self.sequence_length, int): +# raise ValueError("`sequence_length` must be an integer.") +# shifted_columns = [df[target_col].shift(-i) for i in range(self.sequence_length)] +# df[f"{target_col}_sequence"] = list(zip(*shifted_columns)) +# df = df.dropna() +# df = df.iloc[: -self.sequence_length + 1] +# else: +# df[f"{target_col}_shift_{self.n_lags}"] = df[target_col].shift(-self.n_lags) +# df = df.dropna() + +# if df.empty: +# raise ValueError("DataFrame is empty after shifting operation.") + +# if self.drop_target: +# df = df.drop(columns=[target_col]) + +# return df + +# def _transform_pandas_modin(self, df: Union[pd.DataFrame, mpd.DataFrame]) -> Union[pd.DataFrame, mpd.DataFrame]: +# """Handle shifting for Pandas or Modin backends. + +# :param df: The input DataFrame (Pandas or Modin). +# :type df: Union[pd.DataFrame, mpd.DataFrame] +# :return: The transformed DataFrame with the target column shifted. +# :rtype: Union[pd.DataFrame, mpd.DataFrame] +# :raises ValueError: If `target_col` is not set. +# """ +# # Ensure target_col is not None +# if self.target_col is None: +# raise ValueError("`target_col` must be set before transformation.") + +# df = self._shift_pandas_modin(df, self.target_col) + +# rows_before = len(df) +# df = df.dropna() # Handle missing values +# rows_after = len(df) + +# if rows_after == 0: +# raise ValueError("All rows were dropped during transformation.") + +# self._print_dropped_rows(rows_before, rows_after) +# return df + +# def _transform_polars(self, df: pl.DataFrame) -> pl.DataFrame: +# """Handle shifting for Polars backend. 
+ +# :param df: The input Polars DataFrame. +# :type df: pl.DataFrame +# :return: The transformed Polars DataFrame with the target column shifted. +# :rtype: pl.DataFrame +# :raises ValueError: If `target_col` is not set. +# """ +# # Ensure target_col is not None +# if self.target_col is None: +# raise ValueError("`target_col` must be set before transformation.") + +# df = self._shift_polars(df, self.target_col) + +# rows_before = df.shape[0] +# df = df.drop_nulls() +# rows_after = df.shape[0] + +# if rows_after == 0: +# raise ValueError("All rows were dropped during transformation.") + +# self._print_dropped_rows(rows_before, rows_after) +# return df + +# def _print_dropped_rows(self, rows_before: int, rows_after: int) -> None: +# """Print information about dropped rows if verbose mode is enabled. + +# :param rows_before: Number of rows before dropping nulls. +# :type rows_before: int +# :param rows_after: Number of rows after dropping nulls. +# :type rows_after: int +# """ +# if self.verbose: +# rows_dropped = rows_before - rows_after +# print(f"Rows before shift: {rows_before}; Rows after shift: {rows_after}; Rows dropped: {rows_dropped}") + +# def fit(self, tf: TimeFrameCompatibleData) -> "TemporalTargetShifter": +# """Validate and prepare the target data for transformation based on the inferred data format (backend). + +# The `fit` method initializes the data format (whether it's a `TimeFrame` or a raw DataFrame) and validates the input data. +# It ensures the target column is consistent with the input data and sets the backend (`data_format`), which will be used +# in subsequent transformations. + +# :param tf: The `TimeFrame` object or a raw DataFrame (`pandas`, `modin`, or `polars`) that contains the time series data. +# The data should contain a target column that will be shifted. +# :type tf: TimeFrameCompatibleData +# :raises ValueError: If the target column is not provided, the data is invalid, or the backend format is unsupported. +# :raises Warning: If the target column provided in `TemporalTargetShifter` differs from the one in the `TimeFrame`. +# :return: The fitted `TemporalTargetShifter` instance, ready for transforming the data. +# :rtype: TemporalTargetShifter + +# Example Usage: +# -------------- +# .. 
code-block:: python + +# from temporalscope.core.temporal_target_shifter import TemporalTargetShifter +# from temporalscope.core.temporal_data_loader import TimeFrame +# import pandas as pd +# import numpy as np + +# # Create a sample Pandas DataFrame +# data = { +# 'time': pd.date_range(start='2022-01-01', periods=100), +# 'target': np.random.rand(100), +# 'feature_1': np.random.rand(100) +# } +# df = pd.DataFrame(data) + +# # Create a TimeFrame object +# tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") + +# # Create a TemporalTargetShifter instance +# shifter = TemporalTargetShifter(n_lags=2, target_col="target") + +# # Fit the shifter to the TimeFrame +# shifter.fit(tf) +# """ +# # Validate the input data (whether it's TimeFrame or DataFrame) +# self._validate_data(tf) + +# # If input is a TimeFrame, set the backend using the @property method and manage the target column +# if isinstance(tf, TimeFrame): +# self.data_format = tf.dataframe_backend # Using the @property to access the backend +# if not self.target_col: +# self.target_col = tf._target_col # If target_col not set in the shifter, use TimeFrame's target_col +# elif self.target_col != tf._target_col: +# warnings.warn( +# f"The `target_col` in TemporalTargetShifter ('{self.target_col}') differs from the TimeFrame's " +# f"target_col ('{tf._target_col}').", +# UserWarning, +# ) +# # If input is a raw DataFrame (pandas, modin, or polars), infer the backend +# elif tf is not None: +# self.data_format = self._infer_data_format(tf) +# else: +# raise ValueError("Input data is None.") + +# # Return the instance after fitting +# return self + +# def transform(self, tf: TimeFrameCompatibleData) -> TimeFrameCompatibleData: +# """Transform the input time series data by shifting the target variable according to the specified number of lags. + +# The `transform` method shifts the target variable in the input data according to the `n_lags` or `sequence_length` +# set during initialization. This method works directly on either a `TimeFrame` or a raw DataFrame (Pandas, Modin, +# or Polars), applying the appropriate backend-specific transformation. + +# Design: +# ------- +# The method returns the same type as the input: If a `TimeFrame` object is passed in, a `TimeFrame` object is returned. +# If a raw DataFrame (Pandas, Modin, or Polars) is passed in, the same type of DataFrame is returned. This ensures that +# the transformation remains consistent in pipeline workflows where the type of data object is important. + +# :param tf: The `TimeFrame` object or a DataFrame (Pandas, Modin, or Polars) that contains the time series data +# to be transformed. The data should contain a target column that will be shifted. +# :type tf: TimeFrameCompatibleData +# :raises ValueError: If the input data is invalid, unsupported, or lacks columns. +# :raises ValueError: If the backend is unsupported or data validation fails. +# :return: A transformed `TimeFrame` if the input was a `TimeFrame`, otherwise a DataFrame of the same type as the input. +# :rtype: TimeFrameCompatibleData + +# Example Usage: +# -------------- +# .. 
code-block:: python + +# from temporalscope.core.temporal_target_shifter import TemporalTargetShifter +# from temporalscope.core.temporal_data_loader import TimeFrame +# import pandas as pd + +# # Create a sample Pandas DataFrame +# data = { +# 'time': pd.date_range(start='2022-01-01', periods=100), +# 'target': np.random.rand(100), +# 'feature_1': np.random.rand(100) +# } +# df = pd.DataFrame(data) + +# # Create a TimeFrame object +# tf = TimeFrame(df, time_col="time", target_col="target", backend="pd") + +# # Initialize TemporalTargetShifter +# shifter = TemporalTargetShifter(n_lags=2, target_col="target") + +# # Fit the shifter and transform the data +# shifter.fit(tf) +# transformed_data = shifter.transform(tf) + +# """ +# # Handle TimeFrame input: sort data and retrieve the DataFrame +# if isinstance(tf, TimeFrame): +# tf.sort_data() # Ensure data is sorted before shifting +# df = tf.get_data() + +# # If target_col isn't set in the shifter, retrieve it from TimeFrame +# if not self.target_col: +# self.target_col = tf._target_col + +# # Assign the backend from TimeFrame +# self.data_format = tf.dataframe_backend + +# # Handle raw DataFrame input +# elif tf is not None: +# df = tf + +# # Infer the target column from the input if not already set +# if not self.target_col: +# if hasattr(df, "columns"): +# self.target_col = df.columns[-1] +# else: +# raise ValueError("The input DataFrame does not have columns.") + +# # Set or infer the backend for the DataFrame +# self._set_or_infer_data_format(df) +# else: +# raise ValueError("Input data is None.") + +# # Delegate transformation to backend-specific methods +# if self.data_format == BACKEND_PANDAS or self.data_format == BACKEND_MODIN: +# transformed_df = self._transform_pandas_modin(df) +# elif self.data_format == BACKEND_POLARS: +# transformed_df = self._transform_polars(df) +# else: +# raise ValueError(f"Unsupported backend: {self.data_format}") + +# # If the input was a TimeFrame, return a transformed TimeFrame +# if isinstance(tf, TimeFrame): +# return TimeFrame( +# transformed_df, +# time_col=tf.time_col, +# target_col=( +# f"{self.target_col}_shift_{self.n_lags}" +# if self.mode == MODE_MACHINE_LEARNING +# else f"{self.target_col}_sequence" +# ), +# dataframe_backend=self.data_format, +# ) + +# return transformed_df + +# def fit_transform(self, tf: TimeFrameCompatibleData) -> TimeFrameCompatibleData: +# """Fit and transform the input data in a single step. + +# This method combines the functionality of the `fit` and `transform` methods. It first validates and prepares the input +# data (fitting), then applies the target variable shifting (transformation) based on the `n_lags` or `sequence_length` +# specified during initialization. + +# Design: +# ------- +# The output type mirrors the input type. If a `TimeFrame` is provided, a `TimeFrame` is returned. If a raw DataFrame +# (Pandas, Modin, or Polars) is provided, the output will be a DataFrame of the same type. This ensures that the +# transformation remains consistent with the input, making it easier to work with in pipeline workflows. + +# :param tf: The `TimeFrame` object or a DataFrame (`pandas`, `modin`, or `polars`) to be transformed. +# :type tf: TimeFrameCompatibleData +# :raises ValueError: If the input data is invalid or the backend is unsupported. +# :raises ValueError: If the target column is not set, or is incompatible with the data. +# :return: A transformed `TimeFrame` if the input was a `TimeFrame`, otherwise a DataFrame of the same type as the input. 
+# :rtype: TimeFrameCompatibleData + +# Example Usage: +# -------------- +# .. code-block:: python + +# from temporalscope.core.temporal_target_shifter import TemporalTargetShifter +# from temporalscope.core.temporal_data_loader import TimeFrame +# import pandas as pd + +# # Create a sample Pandas DataFrame +# data = { +# 'time': pd.date_range(start='2022-01-01', periods=100), +# 'target': np.random.rand(100), +# 'feature_1': np.random.rand(100) +# } +# df = pd.DataFrame(data) + +# # Create a TimeFrame object +# tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend="pd") + +# # Initialize TemporalTargetShifter +# shifter = TemporalTargetShifter(n_lags=2, target_col="target") + +# # Fit and transform in a single step +# shifted_data = shifter.fit_transform(tf) +# """ +# # Fit the data (infers backend and validates input) +# self.fit(tf) + +# # Apply the transformation (delegates to backend-specific methods) +# transformed = self.transform(tf) + +# # If the input was a TimeFrame, return a new TimeFrame with the transformed DataFrame +# if isinstance(tf, TimeFrame): +# tf_casted = cast(TimeFrame, tf) +# return TimeFrame( +# transformed, # Pass the transformed DataFrame directly +# time_col=tf_casted.time_col, +# target_col=( +# f"{self.target_col}_shift_{self.n_lags}" +# if self.mode == MODE_MACHINE_LEARNING +# else f"{self.target_col}_sequence" +# ), +# dataframe_backend=tf_casted.dataframe_backend, # Ensure we use the original backend from the input +# ) + +# # Otherwise, return the transformed raw DataFrame +# return transformed diff --git a/src/temporalscope/datasets/datasets.py b/src/temporalscope/datasets/datasets.py index 9ec527c..fad04e7 100644 --- a/src/temporalscope/datasets/datasets.py +++ b/src/temporalscope/datasets/datasets.py @@ -15,14 +15,14 @@ # specific language governing permissions and limitations # under the License. -""" TemporalScope/src/temporalscope/datasets/datasets.py +"""TemporalScope/src/temporalscope/datasets/datasets.py Utility class for loading datasets and initializing TimeFrame objects with multi-backend support. Supports Pandas, Modin, and Polars as backends for time series forecasting and analysis. This class is intended to be used for tutorials and examples that involve open-source datasets -licensed under Apache, MIT, or similar valid open-source licenses. It simplifies dataset loading -and preprocessing while providing compatibility with multiple DataFrame backends, including Pandas, +licensed under Apache, MIT, or similar valid open-source licenses. It simplifies dataset loading +and preprocessing while providing compatibility with multiple DataFrame backends, including Pandas, Modin, and Polars. The class can be easily extended to include additional datasets in the future. Example: @@ -57,11 +57,12 @@ BACKEND_MODIN, BACKEND_POLARS, SupportedBackendDataFrame, - print_divider + print_divider, ) + def _load_macrodata() -> Tuple[pd.DataFrame, str]: - """ Load and preprocess the macrodata dataset. + """Load and preprocess the macrodata dataset. Combines the 'year' and 'quarter' columns to create a datetime 'ds' column. The dataset is then returned with the 'realgdp' column as the default target. 
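
The hunk below reformats the quarter-to-datetime construction. For clarity, here is a minimal standalone
sketch of that mapping, assuming only pandas: quarter q starts at month (q - 1) * 3 + 1, so Q1 -> January,
Q2 -> April, Q3 -> July, and Q4 -> October (the year values are illustrative).

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"year": [1959, 1959, 1959, 1959], "quarter": [1, 2, 3, 4]})
    # Build a "YYYY-M-01" string for the first month of each quarter and parse it.
    df["ds"] = pd.to_datetime(
        df["year"].astype(str) + "-" + ((df["quarter"] - 1) * 3 + 1).astype(str) + "-01"
    )
    print(df["ds"].dt.month.tolist())  # [1, 4, 7, 10]
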
@@ -77,8 +78,7 @@ def _load_macrodata() -> Tuple[pd.DataFrame, str]: # Combine 'year' and 'quarter' to create a datetime 'ds' column dataset_df["ds"] = pd.to_datetime( - dataset_df["year"].astype(str) + "-" + - ((dataset_df["quarter"] - 1) * 3 + 1).astype(str) + "-01" + dataset_df["year"].astype(str) + "-" + ((dataset_df["quarter"] - 1) * 3 + 1).astype(str) + "-01" ) # Drop the 'year' and 'quarter' columns @@ -98,7 +98,7 @@ def _load_macrodata() -> Tuple[pd.DataFrame, str]: class DatasetLoader: - """ A utility class for loading datasets and initializing TimeFrame objects for multiple backends. + """A utility class for loading datasets and initializing TimeFrame objects for multiple backends. This class supports datasets that are licensed under valid open-source licenses (such as Apache and MIT). It simplifies loading and preprocessing of datasets and enables compatibility with Pandas, Modin, and Polars @@ -132,9 +132,9 @@ class DatasetLoader: """ def __init__(self, dataset_name: str = "macrodata") -> None: - """ + """ Initialize DatasetLoader with a specified dataset. - + :param dataset_name: The name of the dataset to load. Must be available in AVAILABLE_DATASETS. :raises ValueError: If the specified dataset is not available. """ @@ -145,9 +145,9 @@ def __init__(self, dataset_name: str = "macrodata") -> None: self.dataset_name = dataset_name def _load_dataset_and_target(self) -> Tuple[pd.DataFrame, str]: - """ + """ Internal method to load the dataset and its associated target column. - + :return: A tuple containing the preprocessed DataFrame and the associated target column name. :rtype: Tuple[pd.DataFrame, str] """ @@ -166,10 +166,13 @@ def _load_dataset_and_target(self) -> Tuple[pd.DataFrame, str]: return dataset_df, target_col def init_timeframes_for_backends( - self, df: pd.DataFrame, target_col: str, backends: Tuple[str, ...] = (BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS) + self, + df: pd.DataFrame, + target_col: str, + backends: Tuple[str, ...] = (BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS), ) -> Dict[str, TimeFrame]: - """ Initialize TimeFrame objects for the specified backends using the provided DataFrame. - + """Initialize TimeFrame objects for the specified backends using the provided DataFrame. + :param df: The preprocessed DataFrame to initialize TimeFrames with. :param target_col: The target column to use for TimeFrame initialization. :param backends: A tuple of supported backends to initialize. Defaults to Pandas, Modin, and Polars. @@ -215,8 +218,8 @@ def init_timeframes_for_backends( def load_and_init_timeframes( self, backends: Tuple[str, ...] = (BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS) ) -> Dict[str, TimeFrame]: - """ Load the dataset and initialize TimeFrames for the specified backends. - + """Load the dataset and initialize TimeFrames for the specified backends. + :param backends: A tuple of supported backends to initialize. Defaults to Pandas, Modin, and Polars. :return: A dictionary containing TimeFrame objects for each backend. :rtype: Dict[str, TimeFrame] diff --git a/src/temporalscope/datasets/synthetic_data_generator.py b/src/temporalscope/datasets/synthetic_data_generator.py new file mode 100644 index 0000000..9f43a1f --- /dev/null +++ b/src/temporalscope/datasets/synthetic_data_generator.py @@ -0,0 +1,335 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""TemporalScope/src/temporalscope/datasets/synthetic_data_generator.py + +This module provides utility functions for generating synthetic time series data, specifically designed to facilitate +unit testing and benchmarking of various components across the TemporalScope ecosystem. The generated data simulates +real-world time series with configurable features, ensuring comprehensive coverage of test cases for various modes, such +as single-step and multi-step target data handling. This module is intended for use within automated test pipelines, and +it plays a critical role in maintaining code stability and robustness across different test suites. + +Core Purpose: +------------- +The `synthetic_data_generator` is a centralized utility for creating synthetic time series data that enables robust testing +of TemporalScope's core modules. It supports the generation of data for various modes, ensuring that TemporalScope modules +can handle a wide range of time series data, including edge cases and anomalies. + +Supported Use Cases: +--------------------- +- Single-step mode: Generates scalar target values for tasks where each row represents a single time step. +- Multi-step mode: Produces input-output sequence data for sequence forecasting, where input sequences (`X`) and output + sequences (`Y`) are handled as part of a unified dataset but with vectorized targets. + +.. note:: + - **Batch size**: This package assumes no default batch size; batch size is typically managed by the data loader (e.g., + TensorFlow `DataLoader`, PyTorch `DataLoader`). The synthetic data generator provides the raw data structure, which is + then partitioned and batched as needed in downstream pipelines (e.g., after target shifting or partitioning). + + - **TimeFrame and Target Shape**: The TemporalScope framework checks if the target is scalar or vector (sequence). The + generated data in multi-step mode follows a unified structure, with the target represented as a sequence in the same + DataFrame. This ensures compatibility with popular machine learning libraries that are compatible with SHAP, LIME, and + other explainability methods. + +.. seealso:: + For further details on the single-step and multi-step modes, refer to the core TemporalScope documentation on data handling. + +Example Visualization: +---------------------- +Here’s a visual demonstration of the datasets generated for single-step and multi-step modes, including the shape +of input (`X`) and target (`Y`) data compatible with most popular ML frameworks like TensorFlow, PyTorch, and SHAP. 
+
+Single-step mode:
+
+ +------------+------------+------------+------------+-----------+
+ | time       | feature_1  | feature_2  | feature_3  | target    |
+ +============+============+============+============+===========+
+ | 2023-01-01 | 0.15       | 0.67       | 0.89       | 0.33      |
+ +------------+------------+------------+------------+-----------+
+ | 2023-01-02 | 0.24       | 0.41       | 0.92       | 0.28      |
+ +------------+------------+------------+------------+-----------+
+
+ Shape:
+ - `X`: (num_samples, num_features)
+ - `Y`: (num_samples, 1)  # Scalar target for each time step
+
+Multi-step mode (with vectorized targets):
+
+ +------------+------------+------------+------------+-------------+
+ | time       | feature_1  | feature_2  | feature_3  | target      |
+ +============+============+============+============+=============+
+ | 2023-01-01 | 0.15       | 0.67       | 0.89       | [0.3, 0.4]  |
+ +------------+------------+------------+------------+-------------+
+ | 2023-01-02 | 0.24       | 0.41       | 0.92       | [0.5, 0.6]  |
+ +------------+------------+------------+------------+-------------+
+
+ Shape:
+ - `X`: (num_samples, num_features)
+ - `Y`: (num_samples, sequence_length)  # Vectorized target for each input sequence
+
+Example Usage:
+--------------
+.. code-block:: python
+
+    # Generating data for single-step mode (`backend` is a required argument)
+    df = create_sample_data(backend='pd', num_samples=100, num_features=3, mode='single_step')
+    print(df.head())  # Shows the generated data with features and a scalar target.
+
+    # Generating data for multi-step mode
+    df = create_sample_data(backend='pd', num_samples=100, num_features=3, mode='multi_step')
+    print(df.head())  # Shows the generated input features (`X`) and target sequence (`Y`).
+"""
+
+from datetime import datetime
+from typing import Any, Callable, Optional, Tuple
+
+import modin.pandas as mpd
+import numpy as np
+import pandas as pd
+import polars as pl
+import pytest
+
+from temporalscope.core.core_utils import (
+    BACKEND_MODIN,
+    BACKEND_PANDAS,
+    BACKEND_POLARS,
+    MODE_MULTI_STEP,
+    MODE_SINGLE_STEP,
+    SUPPORTED_MULTI_STEP_BACKENDS,
+    SupportedBackendDataFrame,
+    validate_and_convert_input,
+    validate_backend,
+    validate_mode,
+)
+from temporalscope.core.exceptions import UnsupportedBackendError
+
+# Constants
+DEFAULT_NUM_SAMPLES = 100
+DEFAULT_NUM_FEATURES = 3
+SEED = 42
+DEFAULT_NAN_INTERVAL = 10  # Default interval for inserting NaNs
+DEFAULT_NULL_INTERVAL = 15  # Default interval for inserting nulls
+
+
+def create_sample_data(
+    backend: str,
+    num_samples: int = DEFAULT_NUM_SAMPLES,
+    num_features: int = DEFAULT_NUM_FEATURES,
+    with_nulls: bool = False,
+    with_nans: bool = False,
+    timestamp_like: bool = False,
+    numeric: bool = False,
+    mixed_frequencies: bool = False,
+    mixed_timezones: bool = False,
+    mode: str = MODE_SINGLE_STEP,
+    seed: Optional[int] = SEED,
+    nan_interval: int = DEFAULT_NAN_INTERVAL,
+    null_interval: int = DEFAULT_NULL_INTERVAL,
+) -> SupportedBackendDataFrame:
+    """Generate synthetic time series data for testing XAI workflows across the TemporalScope ecosystem.
+
+    This function generates synthetic time series data with configurable features, ensuring comprehensive test
+    coverage for various machine learning, deep learning, and survival model applications. The generated data
+    supports key model-agnostic XAI (Explainable AI) techniques, making this utility essential for testing
+    model interpretability with SHAP, LIME, and other time series XAI workflows.
+
+    For interoperability reasons, the data is first generated using Pandas and then converted to the preferred backend.
+
+    :param backend:
+        The backend to use ('pd' for Pandas, 'mpd' for Modin, 'pl' for Polars).
+    :type backend: str
+    :param num_samples:
+        Number of rows (samples) to generate. (default: 100)
+    :type num_samples: int
+    :param num_features:
+        Number of feature columns to generate. (default: 3)
+    :type num_features: int
+    :param with_nulls:
+        If True, introduces null values. (default: False)
+    :type with_nulls: bool
+    :param with_nans:
+        If True, introduces NaN values. (default: False)
+    :type with_nans: bool
+    :param timestamp_like:
+        If True, includes a timestamp-like column. (default: False)
+    :type timestamp_like: bool
+    :param numeric:
+        If True, includes a numeric 'time' column instead of a timestamp. (default: False)
+    :type numeric: bool
+    :param mixed_frequencies:
+        If True, simulates mixed time intervals in the 'time' column. (default: False)
+    :type mixed_frequencies: bool
+    :param mixed_timezones:
+        If True, generates both timezone-aware and naive time data. (default: False)
+    :type mixed_timezones: bool
+    :param mode:
+        Mode for generating the target column. Supported modes: 'single_step', 'multi_step'. (default: 'single_step')
+    :type mode: str
+    :param seed:
+        Random seed for reproducibility. (default: 42)
+    :type seed: Optional[int]
+    :param nan_interval:
+        Interval at which NaN values are inserted in the second feature. (default: 10)
+    :type nan_interval: int
+    :param null_interval:
+        Interval at which null values are inserted in the third feature. (default: 15)
+    :type null_interval: int
+
+    :return:
+        A DataFrame generated with Pandas and then converted to the specified backend.
+    :rtype: SupportedBackendDataFrame
+
+    :raises ValueError:
+        If an unsupported mode or incompatible configuration is provided, or if multi-step mode is used with unsupported backends.
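
A minimal usage sketch of the signature documented above (illustrative only; `BACKEND_PANDAS` is the 'pd'
constant imported at the top of this module, and whether a given backend supports multi-step mode is governed
by `SUPPORTED_MULTI_STEP_BACKENDS`):

.. code-block:: python

    from temporalscope.core.core_utils import BACKEND_PANDAS
    from temporalscope.datasets.synthetic_data_generator import create_sample_data

    # Single-step mode (the default): one scalar target per row.
    df = create_sample_data(backend=BACKEND_PANDAS, num_samples=50, num_features=3)
    assert df["target"].shape == (50,)

    # Inject NaNs into `feature_2` at every 5th row for robustness tests.
    df_nan = create_sample_data(backend=BACKEND_PANDAS, num_samples=50, with_nans=True, nan_interval=5)
    assert df_nan["feature_2"].isna().any()
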
+ + Example Visualization: + ---------------------- + Single-step mode: + +------------+------------+------------+------------+-----------+ + | time | feature_1 | feature_2 | feature_3 | target | + +============+============+============+============+===========+ + | 2023-01-01 | 0.15 | 0.67 | 0.89 | 0.33 | + +------------+------------+------------+------------+-----------+ + | 2023-01-02 | 0.24 | 0.41 | 0.92 | 0.28 | + +------------+------------+------------+------------+-----------+ + + Multi-step mode (with vectorized targets): + +------------+------------+------------+------------+-------------+ + | time | feature_1 | feature_2 | feature_3 | target | + +============+============+============+============+=============+ + | 2023-01-01 | 0.15 | 0.67 | 0.89 | [0.3, 0.4] | + +------------+------------+------------+------------+-------------+ + | 2023-01-02 | 0.24 | 0.41 | 0.92 | [0.5, 0.6] | + +------------+------------+------------+------------+-------------+ + + Shape: + - `X`: (num_samples, num_features) + - `Y`: (num_samples, sequence_length) # Vectorized target for each input sequence + """ + # Validate the backend and mode + validate_backend(backend) + validate_mode(backend, mode) + + # Check if multi-step mode is supported for the backend + if mode == MODE_MULTI_STEP and backend not in SUPPORTED_MULTI_STEP_BACKENDS: + raise ValueError(f"Multi-step mode is not supported for the '{backend}' backend.") + + if seed is not None: + np.random.seed(seed) + + # Generate feature columns + data = {f"feature_{i+1}": np.random.rand(num_samples) for i in range(num_features)} + + # Insert NaNs and nulls if required + if with_nans: + for i in range(0, num_samples, nan_interval): + data["feature_2"][i] = np.nan + if with_nulls: + for i in range(0, num_samples, null_interval): + data["feature_3"][i] = None + + # Handle timestamp-like or numeric columns + if timestamp_like and numeric: + raise ValueError("Cannot have both 'timestamp_like' and 'numeric' time columns.") + + if timestamp_like: + data["time"] = pd.date_range("2023-01-01", periods=num_samples, freq="D") + elif numeric: + data["time"] = np.arange(num_samples, dtype=np.float64) + + # Handle mixed frequencies or timezones + if mixed_frequencies: + data["time"] = ( + pd.date_range("2023-01-01", periods=num_samples // 2, freq="D").tolist() + + pd.date_range("2023-02-01", periods=num_samples // 2, freq="M").tolist() + ) + + if mixed_timezones: + time_with_timezones = ( + pd.date_range("2023-01-01", periods=num_samples // 2, freq="D").tz_localize("UTC").tolist() + ) + time_without_timezones = pd.date_range("2023-01-01", periods=num_samples // 2, freq="D").tolist() + data["time"] = time_with_timezones + time_without_timezones + + # Generate target based on the mode + if mode == MODE_SINGLE_STEP: + data["target"] = np.random.rand(num_samples) + elif mode == MODE_MULTI_STEP: + data["target"] = [np.random.rand(10) for _ in range(num_samples)] + else: + raise ValueError(f"Unsupported mode: {mode}") + + # Create the DataFrame using Pandas + df = pd.DataFrame(data) + + # Convert to the specified backend if required + if backend != BACKEND_PANDAS: + df = validate_and_convert_input(df, backend) + + return df + + +@pytest.fixture +def sample_df_with_conditions() -> Callable[[Optional[str], Any], Tuple[SupportedBackendDataFrame, str]]: + """Pytest fixture for creating DataFrames for each backend (Pandas, Modin, Polars) with customizable conditions. 
+
+    This fixture generates synthetic data using Pandas and leaves the conversion to the requested backend
+    to be handled by the centralized `validate_and_convert_input` function.
+
+    :return:
+        A factory function that generates a DataFrame and the backend type based on user-specified conditions.
+    :rtype: Callable[[Optional[str], Any], Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]]
+
+    Example Usage:
+    --------------
+    .. code-block:: python
+
+        df, backend = sample_df_with_conditions(backend="pd", with_nulls=True)
+        assert df.isnull().sum().sum() > 0  # Ensure nulls are present
+
+        df, backend = sample_df_with_conditions(backend="mpd", mode="multi_step")
+        assert isinstance(df["target"][0], np.ndarray)  # Multi-step mode returns sequences
+
+    """
+
+    def _create_sample_df(backend: Optional[str] = None, **kwargs: Any) -> Tuple[SupportedBackendDataFrame, str]:
+        """Internal helper function to create a sample DataFrame based on the specified backend and options.
+
+        This function generates the data with Pandas and leaves backend conversion to `validate_and_convert_input`.
+
+        :param backend:
+            The desired backend ('pd', 'mpd', or 'pl').
+        :type backend: Optional[str]
+        :param kwargs:
+            Additional options for creating the sample data (e.g., with_nans, timestamp_like).
+        :type kwargs: dict
+        :return:
+            A tuple containing the generated DataFrame and the backend type.
+        :rtype: Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]
+        """
+        # Generate the sample data using Pandas
+        df = create_sample_data(backend=BACKEND_PANDAS, **kwargs)
+
+        # Return the DataFrame and the provided backend
+        return df, backend
+
+    return _create_sample_df
diff --git a/test/unit/core/test_core_utils.py b/test/unit/core/test_core_utils.py
new file mode 100644
index 0000000..c67821a
--- /dev/null
+++ b/test/unit/core/test_core_utils.py
@@ -0,0 +1,493 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
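
Because `sample_df_with_conditions` above returns a factory rather than a fixed DataFrame, each test can
request its own data configuration. A minimal sketch of that pattern (illustrative; it mirrors the fixture's
own docstring example and the commented-out tests that follow):

.. code-block:: python

    from temporalscope.datasets.synthetic_data_generator import sample_df_with_conditions

    def test_nulls_detected(sample_df_with_conditions):
        # The factory yields a Pandas DataFrame plus the requested backend label;
        # with_nulls=True injects missing values into `feature_3` at a fixed interval.
        df, backend = sample_df_with_conditions(backend="pd", with_nulls=True)
        assert df.isnull().sum().sum() > 0
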
+
+# TemporalScope/test/unit/core/test_core_utils.py
+
+import warnings
+import pytest
+from unittest.mock import patch
+from typing import Optional, Tuple, Union
+
+import modin.pandas as mpd
+import pandas as pd
+import polars as pl
+import numpy as np
+
+# Import core utility functions
+from temporalscope.core.core_utils import (
+    check_nans,
+    check_nulls,
+    get_api_keys,
+    get_default_backend_cfg,
+    validate_and_convert_input,
+    validate_backend,
+    print_divider,
+    infer_backend_from_dataframe,
+    is_timestamp_like,
+    is_numeric,
+    has_mixed_frequencies,
+    sort_dataframe,
+    check_empty_columns
+)
+
+# Import exceptions
+from temporalscope.core.exceptions import UnsupportedBackendError, MixedFrequencyWarning, MixedTimezonesWarning
+
+# Import the sample data generator and fixture from the synthetic data generator module
+from temporalscope.datasets.synthetic_data_generator import create_sample_data, sample_df_with_conditions
+
+# # Constants
+# BACKEND_PANDAS = "pd"
+# BACKEND_MODIN = "mpd"
+# BACKEND_POLARS = "pl"
+# SUPPORTED_BACKENDS = [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]
+
+# # Mock API key constants
+# MOCK_OPENAI_API_KEY = "mock_openai_key"
+# MOCK_CLAUDE_API_KEY = "mock_claude_key"
+
+# # --- Tests with Parametrization ---
+
+# @pytest.mark.parametrize(
+#     "check_func, with_nulls, with_nans",
+#     [
+#         (check_nulls, True, False),   # Test with nulls, no NaNs
+#         (check_nulls, False, False),  # Test without nulls
+#         (check_nans, False, True),    # Test with NaNs
+#         (check_nans, False, False),   # Test without NaNs
+#     ]
+# )
+# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS)
+# def test_check_funcs(backend, sample_df_with_conditions, check_func, with_nulls, with_nans):
+#     """Test check_nulls and check_nans for both nulls and NaNs across backends."""
+#     df, _ = sample_df_with_conditions(backend=backend, with_nulls=with_nulls, with_nans=with_nans)
+#     result = validate_and_convert_input(df, backend)
+
+#     if check_func == check_nulls:
+#         # Calculate nulls for each backend
+#         if backend == BACKEND_POLARS:
+#             # Polars: Check if null count is greater than 0
+#             result_check = result.null_count().select(pl.col("*").sum()).to_numpy().sum() > 0
+#         else:
+#             # Pandas and Modin
+#             result_check = result.isnull().any().any()
+#         expected = with_nulls  # True if nulls were introduced, else False
+#     else:
+#         # Calculate NaNs for each backend
+#         if backend == BACKEND_POLARS:
+#             # Polars: Use .is_nan() on each column and sum up NaN values
+#             result_check = result.select(pl.col("*").is_nan().sum()).to_numpy().sum() > 0
+#         else:
+#             # Pandas and Modin
+#             result_check = result.isna().any().any()
+#         expected = with_nans  # True if NaNs were introduced, else False
+
+#     assert result_check == expected, f"Expected {expected} but got {result_check} for backend {backend} using {check_func.__name__}"
+
+
+# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS)
+# def test_check_nulls(backend, sample_df_with_conditions):
+#     """Test check_nulls for detecting null values across backends."""
+#     # Case 1: DataFrame with nulls
+#     df_with_nulls, _ = sample_df_with_conditions(backend=backend, with_nulls=True)
+#     result_with_nulls = check_nulls(df_with_nulls, backend)
+#     assert result_with_nulls is True, f"Expected True but got {result_with_nulls} for backend {backend} with nulls"
+
+#     # Case 2: DataFrame without nulls
+#     df_without_nulls, _ = sample_df_with_conditions(backend=backend, with_nulls=False)
+#     result_without_nulls = check_nulls(df_without_nulls, backend)
+#     assert result_without_nulls is False, f"Expected False but got 
{result_without_nulls} for backend {backend} without nulls" + + +# @pytest.mark.parametrize("unsupported_backend", ["unsupported_backend", "invalid_backend", "spark"]) +# def test_check_nulls_unsupported_backend(unsupported_backend): +# """Test that check_nulls raises UnsupportedBackendError for unsupported backends.""" +# df = pd.DataFrame({"col1": [1, 2, 3]}) # Sample DataFrame +# with pytest.raises(UnsupportedBackendError, match="Unsupported backend"): +# check_nulls(df, unsupported_backend) + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_check_nans(backend, sample_df_with_conditions): +# """Test check_nans for detecting NaN values across backends.""" +# # Case 1: DataFrame with NaNs +# df_with_nans, _ = sample_df_with_conditions(backend=backend, with_nans=True) +# result_with_nans = check_nans(df_with_nans, backend) +# assert result_with_nans is True, f"Expected True but got {result_with_nans} for backend {backend} with NaNs" + +# # Case 2: DataFrame without NaNs +# df_without_nans, _ = sample_df_with_conditions(backend=backend, with_nans=False) +# result_without_nans = check_nans(df_without_nans, backend) +# assert result_without_nans is False, f"Expected False but got {result_without_nans} for backend {backend} without NaNs" + + +# @pytest.mark.parametrize("unsupported_backend", ["unsupported_backend", "invalid_backend", "spark"]) +# def test_check_nans_unsupported_backend(unsupported_backend): +# """Test that check_nans raises UnsupportedBackendError for unsupported backends.""" +# df = pd.DataFrame({"col1": [1, 2, 3]}) # Sample DataFrame +# with pytest.raises(UnsupportedBackendError, match="Unsupported backend"): +# check_nans(df, unsupported_backend) + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_validate_backend_supported(backend): +# """Test that supported backends are validated successfully.""" +# validate_backend(backend) + + +# @pytest.mark.parametrize("invalid_backend", ["tf", "spark", "unknown"]) +# def test_validate_backend_unsupported(invalid_backend): +# """Test that unsupported backends raise an UnsupportedBackendError.""" +# with pytest.raises(UnsupportedBackendError, match="Unsupported backend"): +# validate_backend(invalid_backend) + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# @pytest.mark.parametrize("target_backend", SUPPORTED_BACKENDS) +# def test_validate_and_convert_input(sample_df_with_conditions, backend, target_backend): +# """Test that DataFrame conversion between backends works correctly.""" +# df, _ = sample_df_with_conditions(backend=backend, with_nulls=False) +# result = validate_and_convert_input(df, target_backend) + +# if target_backend == BACKEND_PANDAS: +# assert isinstance(result, pd.DataFrame), f"Expected Pandas DataFrame but got {type(result)}" +# elif target_backend == BACKEND_POLARS: +# assert isinstance(result, pl.DataFrame), f"Expected Polars DataFrame but got {type(result)}" +# elif target_backend == BACKEND_MODIN: +# assert isinstance(result, mpd.DataFrame), f"Expected Modin DataFrame but got {type(result)}" + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_validate_and_convert_input_invalid_type(backend): +# """Test that validate_and_convert_input raises TypeError when given an invalid DataFrame type.""" +# invalid_df = "This is not a DataFrame" +# with pytest.raises(TypeError, match="Input DataFrame type"): +# validate_and_convert_input(invalid_df, backend) + + +# @pytest.mark.parametrize("invalid_backend", ["unsupported_backend", "excel", 
"json", None]) +# def test_validate_and_convert_input_invalid_backend(sample_df_with_conditions, invalid_backend): +# """Test that validate_and_convert_input raises UnsupportedBackendError for invalid or None backend.""" +# df, _ = sample_df_with_conditions(backend=BACKEND_PANDAS) +# with pytest.raises(UnsupportedBackendError, match="Unsupported backend"): +# validate_and_convert_input(df, invalid_backend) + + +# def test_print_divider(capsys): +# """Test the print_divider function outputs the correct string.""" +# print_divider("-", 50) +# captured = capsys.readouterr() +# assert captured.out == "-" * 50 + "\n" + + +# def test_get_api_keys(): +# """Test that get_api_keys retrieves environment variables correctly.""" +# with patch.dict("os.environ", {"OPENAI_API_KEY": MOCK_OPENAI_API_KEY, "CLAUDE_API_KEY": MOCK_CLAUDE_API_KEY}): +# api_keys = get_api_keys() +# assert api_keys["OPENAI_API_KEY"] == MOCK_OPENAI_API_KEY +# assert api_keys["CLAUDE_API_KEY"] == MOCK_CLAUDE_API_KEY + +# with patch.dict("os.environ", {}, clear=True): +# api_keys = get_api_keys() +# assert api_keys["OPENAI_API_KEY"] is None +# assert api_keys["CLAUDE_API_KEY"] is None + + +# def test_get_default_backend_cfg(): +# """Test that the default backend configuration is returned correctly.""" +# expected_cfg = { +# "BACKENDS": { +# BACKEND_POLARS: "polars", +# BACKEND_PANDAS: "pandas", +# BACKEND_MODIN: "modin", +# } +# } +# result = get_default_backend_cfg() +# assert result == expected_cfg, f"Expected {expected_cfg} but got {result}" + + +# def test_validate_and_convert_input_modin_to_polars(sample_df_with_conditions): +# """Test Modin DataFrame conversion to Polars.""" +# # Create a sample Modin DataFrame +# df_modin, _ = sample_df_with_conditions(backend=BACKEND_MODIN) + +# # Mock the _to_pandas method to ensure it's called +# with patch.object(mpd.DataFrame, "_to_pandas", return_value=df_modin._to_pandas()) as mock_to_pandas: +# # Convert from Modin to Polars +# result = validate_and_convert_input(df_modin, BACKEND_POLARS) +# assert isinstance(result, pl.DataFrame), f"Expected Polars DataFrame but got {type(result)}" +# mock_to_pandas.assert_called_once() # Ensure _to_pandas is called + + + +# @pytest.mark.parametrize( +# "input_df, expected_backend", +# [ +# (pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}), BACKEND_PANDAS), # Pandas DataFrame +# (pl.DataFrame({'col1': [1, 2], 'col2': [3, 4]}), BACKEND_POLARS), # Polars DataFrame +# (mpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}), BACKEND_MODIN), # Modin DataFrame +# ] +# ) +# def test_infer_backend_from_dataframe(input_df, expected_backend): +# """Test the infer_backend_from_dataframe function for supported backends.""" +# assert infer_backend_from_dataframe(input_df) == expected_backend + +# def test_infer_backend_from_dataframe_unsupported(): +# """Test that infer_backend_from_dataframe raises an UnsupportedBackendError for unsupported backends.""" +# invalid_df = "This is not a DataFrame" +# with pytest.raises(UnsupportedBackendError, match="Unsupported DataFrame type"): +# infer_backend_from_dataframe(invalid_df) + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_is_timestamp_like(backend, sample_df_with_conditions): +# """Test is_timestamp_like for timestamp-like columns across backends.""" +# df, _ = sample_df_with_conditions(backend=backend, timestamp_like=True) +# result = is_timestamp_like(df, "time") +# assert result is True, f"Expected True for timestamp-like column but got {result}" + +# df, _ = 
sample_df_with_conditions(backend=backend, numeric=True) # Non-timestamp column +# result = is_timestamp_like(df, "time") +# assert result is False, f"Expected False for non-timestamp column but got {result}" + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_is_numeric(backend, sample_df_with_conditions): +# """Test is_numeric for numeric columns across backends.""" +# df, _ = sample_df_with_conditions(backend=backend, numeric=True) +# result = is_numeric(df, "time") +# assert result is True, f"Expected True for numeric column but got {result}" + +# df, _ = sample_df_with_conditions(backend=backend, timestamp_like=True) +# result = is_numeric(df, "time") +# assert result is False, f"Expected False for non-numeric column but got {result}" + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_has_mixed_frequencies(backend, sample_df_with_conditions): +# """Test has_mixed_frequencies for mixed frequency time columns across backends.""" +# df, _ = sample_df_with_conditions(backend=backend, mixed_frequencies=True) +# result = has_mixed_frequencies(df, "time") +# assert result is True, f"Expected True for mixed frequencies but got {result}" + +# df, _ = sample_df_with_conditions(backend=backend, timestamp_like=True) +# result = has_mixed_frequencies(df, "time") +# assert result is False, f"Expected False for consistent frequencies but got {result}" + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_time_column_not_found(backend, sample_df_with_conditions): +# """Test that ValueError is raised when the time column does not exist.""" +# df, _ = sample_df_with_conditions(backend=backend) # Create a sample DataFrame without the 'non_existing_time_col' +# with pytest.raises(ValueError, match="Column 'non_existing_time_col' not found"): +# is_timestamp_like(df, "non_existing_time_col") + +# with pytest.raises(ValueError, match="Column 'non_existing_time_col' not found"): +# is_numeric(df, "non_existing_time_col") + +# with pytest.raises(ValueError, match="Column 'non_existing_time_col' not found"): +# has_mixed_frequencies(df, "non_existing_time_col") + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_empty_dataframe(backend): +# """Test handling of empty DataFrames across backends.""" +# # Create an empty DataFrame based on the backend +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame({"time": []}) # Empty but with a time column +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame({"time": []}) +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame({"time": []}) + +# # Ensure the functions return False or handle empty DataFrames gracefully +# assert not is_timestamp_like(df, "time"), "Expected False for is_timestamp_like on empty DataFrame" +# assert not is_numeric(df, "time"), "Expected False for is_numeric on empty DataFrame" +# assert not has_mixed_frequencies(df, "time"), "Expected False for has_mixed_frequencies on empty DataFrame" + + +# @pytest.mark.parametrize( +# "backend, wrong_backend, expected_exception, expected_message", +# [ +# # Generalized the regex to match a rough pattern for class types, without being too specific +# (BACKEND_POLARS, BACKEND_PANDAS, TypeError, r"Expected Pandas DataFrame but got .*polars.*"), +# (BACKEND_PANDAS, BACKEND_POLARS, TypeError, r"Expected Polars DataFrame but got .*pandas.*"), +# (BACKEND_MODIN, BACKEND_PANDAS, TypeError, r"Expected Pandas DataFrame but got .*modin.*"), +# (BACKEND_MODIN, BACKEND_POLARS, TypeError, r"Expected Polars DataFrame but got 
.*modin.*"),
+#         (BACKEND_PANDAS, "unsupported_backend", UnsupportedBackendError, r"Unsupported backend: unsupported_backend\. Supported backends are 'pd', 'mpd', 'pl'\."),
+#     ]
+# )
+# def test_sort_dataframe_exceptions(sample_df_with_conditions, backend, wrong_backend, expected_exception, expected_message):
+#     """Test that sort_dataframe raises the correct exceptions for invalid DataFrame types and unsupported backends."""
+#     # Create a sample DataFrame for the correct backend
+#     df, _ = sample_df_with_conditions(backend=backend, numeric=True)
+
+#     # Try sorting the DataFrame using the wrong backend and expect exceptions
+#     with pytest.raises(expected_exception, match=expected_message):
+#         sort_dataframe(df, "time", wrong_backend)
+
+
+# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS)
+# @pytest.mark.parametrize("ascending", [True, False])
+# def test_sort_dataframe(sample_df_with_conditions, backend, ascending):
+#     """Test that sort_dataframe correctly sorts the DataFrame by time column."""
+#     # Create a sample DataFrame with a numeric time column
+#     df, _ = sample_df_with_conditions(backend=backend, numeric=True)
+
+#     # Sort the DataFrame using the utility function
+#     sorted_df = sort_dataframe(df, "time", backend, ascending)
+
+#     # Extract the time column from the sorted DataFrame
+#     sorted_time_column = sorted_df["time"].to_numpy() if backend != BACKEND_POLARS else sorted_df["time"].to_numpy().flatten()
+
+#     # Calculate the expected sorted time column
+#     expected_sorted_time_column = sorted(df["time"].to_numpy(), reverse=not ascending)
+
+#     # Ensure the time column is correctly sorted
+#     assert all(sorted_time_column == expected_sorted_time_column), f"Expected sorted time column {expected_sorted_time_column} but got {sorted_time_column} for backend {backend} with ascending={ascending}"
+
+# # --- Individual tests for Modin backend ---
+
+# @pytest.mark.parametrize(
+#     "wrong_backend, expected_exception, expected_substring",
+#     [
+#         (BACKEND_PANDAS, TypeError, "Expected Pandas DataFrame"),  # Substring matching for Pandas
+#         (BACKEND_POLARS, TypeError, "Expected Polars DataFrame"),  # Substring matching for Polars
+#         ("unsupported_backend", UnsupportedBackendError, "Unsupported backend"),  # Catch unsupported backend
+#     ]
+# )
+# def test_sort_dataframe_modin_exceptions(sample_df_with_conditions, wrong_backend, expected_exception, expected_substring):
+#     """Test that sort_dataframe raises the correct exceptions for Modin DataFrames and wrong backends."""
+#     # Create a sample DataFrame for Modin backend
+#     df, _ = sample_df_with_conditions(backend=BACKEND_MODIN, numeric=True)
+
+#     # Try sorting the Modin DataFrame using the wrong backend and expect exceptions
+#     with pytest.raises(expected_exception) as exc_info:
+#         sort_dataframe(df, "time", wrong_backend)
+
+#     # Ensure that the expected substring is in the exception message
+#     assert expected_substring in str(exc_info.value), f"Expected substring '{expected_substring}' in exception message but got: {str(exc_info.value)}"
+
+# @pytest.mark.parametrize(
+#     "wrong_df, backend, expected_exception, expected_substring",
+#     [
+#         (pd.DataFrame({"time": [1, 2, 3]}), BACKEND_MODIN, TypeError, "Expected Modin DataFrame"),  # Force the specific TypeError for Modin
+#     ]
+# )
+# def test_sort_dataframe_modin_type_error(sample_df_with_conditions, wrong_df, backend, expected_exception, expected_substring):
+#     """Test that sort_dataframe raises TypeError for non-Modin DataFrames when the backend is Modin."""
+#     # Try sorting a non-Modin DataFrame 
using the Modin backend and expect exceptions +# with pytest.raises(expected_exception) as exc_info: +# sort_dataframe(wrong_df, "time", backend) + +# # Ensure that the expected substring is in the exception message +# assert expected_substring in str(exc_info.value), f"Expected substring '{expected_substring}' in exception message but got: {str(exc_info.value)}" + + +# @pytest.mark.parametrize( +# "backend, with_empty_columns, expected_result", +# [ +# (BACKEND_PANDAS, True, True), # Test for empty columns in Pandas +# (BACKEND_PANDAS, False, False), # Test for no empty columns in Pandas +# (BACKEND_POLARS, True, True), # Test for empty columns in Polars +# (BACKEND_POLARS, False, False), # Test for no empty columns in Polars +# (BACKEND_MODIN, True, True), # Test for empty columns in Modin +# (BACKEND_MODIN, False, False), # Test for no empty columns in Modin +# ] +# ) +# def test_check_empty_columns(backend, sample_df_with_conditions, with_empty_columns, expected_result): +# """Test check_empty_columns for detecting empty columns across backends.""" + +# # Case 1: Create a sample DataFrame with or without empty columns +# if with_empty_columns: +# data = create_sample_data(num_samples=100) +# # Fill empty column with None (consistent length with other columns) +# data["empty_col"] = [None] * 100 +# else: +# data = create_sample_data(num_samples=100) + +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# # Check for empty columns +# result = check_empty_columns(df, backend) + +# # Ensure the result matches the expected outcome +# assert result == expected_result, f"Expected {expected_result} but got {result} for backend {backend} with empty columns={with_empty_columns}" + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_check_empty_columns_empty_dataframe(backend): +# """Test that check_empty_columns raises ValueError for empty DataFrames across all backends.""" + +# # Case: Empty DataFrame (no columns) +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame() # Create an empty DataFrame for Pandas +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame() # Create an empty DataFrame for Polars +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame() # Create an empty DataFrame for Modin + +# # Expect a ValueError due to the lack of columns +# with pytest.raises(ValueError, match="The DataFrame contains no columns to check."): +# check_empty_columns(df, backend) + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_check_no_empty_columns(backend): +# """Test that check_empty_columns returns False when all columns have non-empty data.""" + +# # Case: DataFrame with non-empty columns (no NaN/None values) +# data = create_sample_data(num_samples=100) + +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# # Check for empty columns, should return False since no columns are empty +# result = check_empty_columns(df, backend) + +# # Assert that the function returns False (indicating no empty columns) +# assert result is False, "Expected False when no columns are empty, but got True" + + +# @pytest.mark.parametrize("backend", SUPPORTED_BACKENDS) +# def test_check_empty_columns_no_empty(backend): +# """Test that check_empty_columns returns False when no columns are empty.""" + +# # Case: 
DataFrame with no empty columns (all columns have valid data)
+#     data = create_sample_data(num_samples=100)
+
+#     if backend == BACKEND_PANDAS:
+#         df = pd.DataFrame(data)
+#     elif backend == BACKEND_POLARS:
+#         df = pl.DataFrame(data)
+#     elif backend == BACKEND_MODIN:
+#         df = mpd.DataFrame(data)
+
+#     # Check for empty columns, should return False since no columns are empty
+#     result = check_empty_columns(df, backend)
+
+#     # Ensure that the function correctly returns False (indicating no empty columns)
+#     assert result is False, "Expected False when no columns are empty, but got True"
diff --git a/test/unit/test_core_exceptions.py b/test/unit/core/test_exceptions.py
similarity index 80%
rename from test/unit/test_core_exceptions.py
rename to test/unit/core/test_exceptions.py
index 42c1346..ffa5f97 100644
--- a/test/unit/test_core_exceptions.py
+++ b/test/unit/core/test_exceptions.py
@@ -17,8 +17,8 @@
 """
 TemporalScope/test/unit/test_core_exceptions.py
 
-This module contains unit tests for the custom exceptions and warnings defined
-in the TemporalScope package. These tests ensure that the exceptions are
+This module contains unit tests for the custom exceptions and warnings defined 
+in the TemporalScope package. These tests ensure that the exceptions are 
 raised correctly and the warnings are issued in the appropriate scenarios.
 """
 
@@ -30,9 +30,10 @@
     TimeColumnError,
     MixedTypesWarning,
     MixedTimezonesWarning,
-    MixedFrequencyWarning
+    MixedFrequencyWarning,
+    UnsupportedBackendError
 )
 
 def test_time_frame_error_inheritance():
     """Test that TimeFrameError is the base class for other exceptions."""
     with pytest.raises(TimeFrameError):
@@ -61,3 +62,9 @@ def test_mixed_frequency_warning():
     """Test that MixedFrequencyWarning is issued when mixed timestamp frequencies are detected."""
     with pytest.warns(MixedFrequencyWarning, match="Mixed timestamp frequencies"):
         warnings.warn("Mixed timestamp frequencies", MixedFrequencyWarning)
+
+
+def test_unsupported_backend_error():
+    """Test that UnsupportedBackendError is raised with the correct message."""
+    with pytest.raises(UnsupportedBackendError, match="Unsupported backend"):
+        raise UnsupportedBackendError("Unsupported backend 'invalid_backend'")
diff --git a/test/unit/core/test_temporal_data_loader.py b/test/unit/core/test_temporal_data_loader.py
new file mode 100644
index 0000000..083328a
--- /dev/null
+++ b/test/unit/core/test_temporal_data_loader.py
@@ -0,0 +1,296 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+# OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
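+
+# A minimal usage sketch of the API exercised below (hypothetical values,
+# mirroring the parametrized cases): TimeFrame wraps a backend DataFrame and
+# exposes the validated data via get_data().
+#
+#     df = pd.DataFrame({"time": pd.date_range("2021-01-01", periods=10),
+#                        "target": np.random.rand(10)})
+#     tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend="pd")
+#     assert tf.get_data().shape[0] == 10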
+ +# TemporalScope/test/unit/test_core_temporal_data_loader.py + +import pytest +from typing import Dict, Union, Optional, List +from datetime import datetime, timedelta, timezone + +import numpy as np +import pandas as pd +import polars as pl +import modin.pandas as mpd + +from temporalscope.core.temporal_data_loader import TimeFrame + +from temporalscope.core.exceptions import ( + TimeColumnError, + MixedTypesWarning, + MixedFrequencyWarning, + UnsupportedBackendError, +) + +BACKEND_POLARS = "pl" +BACKEND_PANDAS = "pd" +BACKEND_MODIN = "mpd" + + +# Utility to create sample data for various edge cases +# def create_sample_data( +# num_samples: int = 100, num_features: int = 3, missing_values: bool = False, +# mixed_frequencies: bool = False, non_numeric_time: bool = False, mixed_timezones: bool = False +# ) -> Dict[str, Union[List[datetime], List[float]]]: +# """Create sample data to test edge cases with numeric features and a time column.""" + +# start_date = datetime(2021, 1, 1) +# if non_numeric_time: +# data = {"time": ["non_numeric" for _ in range(num_samples)]} +# elif mixed_timezones: +# data = {"time": [(start_date + timedelta(days=i)).replace(tzinfo=timezone.utc if i % 2 == 0 else None) +# for i in range(num_samples)]} +# else: +# data = {"time": [start_date + timedelta(days=i) for i in range(num_samples)]} + +# for i in range(1, num_features + 1): +# data[f"feature_{i}"] = np.random.rand(num_samples).tolist() + +# if mixed_frequencies: +# data["time"] = pd.date_range(start='2021-01-01', periods=num_samples // 2, freq='D').tolist() +# data["time"] += pd.date_range(start='2021-02-01', periods=num_samples // 2, freq='M').tolist() + +# if missing_values: +# for i in range(num_samples): +# if i % 10 == 0: +# data[f"feature_1"][i] = None + +# data["target"] = np.random.rand(num_samples).tolist() +# return data + + +# @pytest.mark.parametrize( +# "backend, case_type, expected_error, match_message", +# [ +# (BACKEND_POLARS, "non_numeric_time_col", TimeColumnError, r"`time_col` must be numeric or timestamp-like"), +# (BACKEND_PANDAS, "non_numeric_time_col", TimeColumnError, r"`time_col` must be numeric or timestamp-like"), +# (BACKEND_MODIN, "non_numeric_time_col", TimeColumnError, r"`time_col` must be numeric or timestamp-like"), +# (BACKEND_PANDAS, "mixed_frequencies", None, r"Mixed timestamp frequencies detected in the time column."), # Update match message +# ] +# ) +# def test_validation_edge_cases(backend, case_type, expected_error, match_message): +# """Test validation logic for edge cases like non-numeric time columns and mixed frequencies.""" + +# if case_type == "non_numeric_time_col": +# data = create_sample_data(non_numeric_time=True) +# elif case_type == "mixed_frequencies": +# data = create_sample_data(mixed_frequencies=True) + +# if backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# if expected_error: +# with pytest.raises(expected_error, match=match_message): +# TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) +# else: +# if case_type == "mixed_frequencies": +# with pytest.warns(MixedFrequencyWarning, match=match_message): # Expect MixedFrequencyWarning +# TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) + + +# @pytest.mark.parametrize( +# "backend, time_col, target_col, expected_error, match_message, infer_backend", +# [ +# # Valid cases with explicit backend +# (BACKEND_PANDAS, 
"time", "target", None, None, False), +# (BACKEND_POLARS, "time", "target", None, None, False), +# (BACKEND_MODIN, "time", "target", None, None, False), + +# # Valid cases with inferred backend +# (BACKEND_PANDAS, "time", "target", None, None, True), +# (BACKEND_POLARS, "time", "target", None, None, True), +# (BACKEND_MODIN, "time", "target", None, None, True), + +# # Invalid `time_col` cases +# (BACKEND_PANDAS, "", "target", ValueError, "`time_col` must be a non-empty string.", False), +# (BACKEND_POLARS, "", "target", ValueError, "`time_col` must be a non-empty string.", False), +# (BACKEND_MODIN, "", "target", ValueError, "`time_col` must be a non-empty string.", False), + +# # Invalid `target_col` cases +# (BACKEND_PANDAS, "time", "", ValueError, "`target_col` must be a non-empty string.", False), +# (BACKEND_POLARS, "time", "", ValueError, "`target_col` must be a non-empty string.", False), +# (BACKEND_MODIN, "time", "", ValueError, "`target_col` must be a non-empty string.", False), + +# # Invalid backend cases +# ("invalid_backend", "time", "target", UnsupportedBackendError, "Unsupported backend", False), +# ] +# ) +# def test_timeframe_init_and_get_data(backend, time_col, target_col, expected_error, match_message, infer_backend): +# """Test initialization of TimeFrame class and `get_data` method across backends, including invalid cases.""" + +# data = create_sample_data() + +# if backend in [BACKEND_PANDAS, BACKEND_POLARS, BACKEND_MODIN]: +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) +# else: +# # Use a dummy Pandas DataFrame for unsupported backend +# df = pd.DataFrame(data) + +# if expected_error: +# with pytest.raises(expected_error, match=match_message): +# if infer_backend: +# # Don't pass the backend to trigger inference +# TimeFrame(df, time_col=time_col, target_col=target_col) +# else: +# # Pass backend explicitly (invalid backend case covered here) +# TimeFrame(df, time_col=time_col, target_col=target_col, dataframe_backend=backend) +# else: +# # Initialize the TimeFrame +# if infer_backend: +# tf = TimeFrame(df, time_col=time_col, target_col=target_col) +# else: +# tf = TimeFrame(df, time_col=time_col, target_col=target_col, dataframe_backend=backend) + +# # Ensure `get_data` returns the correct DataFrame +# result_df = tf.get_data() +# assert result_df.shape[0] == 100 # Ensure the DataFrame has the expected number of rows +# assert tf.time_col == time_col # Time column should match the expected value +# assert tf.target_col == target_col # Target column should match the expected value + +# if infer_backend: +# # Check that the backend was correctly inferred +# assert tf.dataframe_backend == backend, f"Expected inferred backend {backend}, but got {tf.dataframe_backend}" + +# @pytest.mark.parametrize( +# "backend, time_col, target_col, expected_error, match_message", +# [ +# # Missing `time_col` in DataFrame (should raise ValueError) +# (BACKEND_PANDAS, "invalid_time", "target", ValueError, "`time_col` 'invalid_time' not found"), +# (BACKEND_POLARS, "invalid_time", "target", ValueError, "`time_col` 'invalid_time' not found"), +# (BACKEND_MODIN, "invalid_time", "target", ValueError, "`time_col` 'invalid_time' not found"), + +# # Missing `target_col` in DataFrame (should raise ValueError) +# (BACKEND_PANDAS, "time", "invalid_target", ValueError, "`target_col` 'invalid_target' not found"), +# (BACKEND_POLARS, "time", "invalid_target", 
ValueError, "`target_col` 'invalid_target' not found"), +# (BACKEND_MODIN, "time", "invalid_target", ValueError, "`target_col` 'invalid_target' not found"), +# ] +# ) +# def test_timeframe_missing_columns(backend, time_col, target_col, expected_error, match_message): +# """Test that missing columns raise ValueError with the correct message.""" +# data = create_sample_data() + +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# with pytest.raises(expected_error, match=match_message): +# TimeFrame(df, time_col=time_col, target_col=target_col, dataframe_backend=backend) + + +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_POLARS, BACKEND_MODIN]) +# def test_sort_data(backend): +# """Test sorting of the DataFrame by time column using `sort_data` method.""" + +# data = create_sample_data(num_samples=10) + +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) + +# # Sort the DataFrame in descending order +# tf.sort_data(ascending=False) + +# sorted_df = tf.get_data() + +# # For Polars use .to_numpy() or .row() to access the rows +# if backend == BACKEND_POLARS: +# time_col_np = sorted_df["time"].to_numpy() +# assert time_col_np[0] > time_col_np[-1] +# else: +# assert sorted_df["time"].iloc[0] > sorted_df["time"].iloc[-1] + + + +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_POLARS, BACKEND_MODIN]) +# def test_update_target_col(backend): +# """Test `update_target_col` method across backends by updating the target column.""" + +# data = create_sample_data() + +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) + +# # New target column +# new_target_col = np.random.rand(100) + +# if backend == BACKEND_PANDAS: +# new_target_series = pd.Series(new_target_col, name="target") +# elif backend == BACKEND_POLARS: +# new_target_series = pl.Series("target", new_target_col) +# elif backend == BACKEND_MODIN: +# new_target_series = mpd.Series(new_target_col, name="target") + +# # Update the target column +# tf.update_target_col(new_target_series) + +# # Ensure the target column has been updated correctly +# updated_df = tf.get_data() +# assert np.allclose(updated_df["target"], new_target_col), "Target column update failed." 
+ + +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_POLARS, BACKEND_MODIN]) +# def test_validate_and_update_data(backend): +# """Test `update_data` method by updating the entire DataFrame.""" + +# data = create_sample_data(num_samples=100) + +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) + +# # Create a new DataFrame with 50 samples +# new_data = create_sample_data(num_samples=50) + +# if backend == BACKEND_PANDAS: +# new_df = pd.DataFrame(new_data) +# elif backend == BACKEND_POLARS: +# new_df = pl.DataFrame(new_data) +# elif backend == BACKEND_MODIN: +# new_df = mpd.DataFrame(new_data) + +# # Update the DataFrame in the TimeFrame instance +# tf.update_data(new_df) + +# # Ensure the new data has been updated correctly +# updated_df = tf.get_data() +# assert updated_df.shape[0] == 50, "DataFrame update failed. Expected 50 rows." diff --git a/test/unit/core/test_temporal_target_shifter.py b/test/unit/core/test_temporal_target_shifter.py new file mode 100644 index 0000000..3452c7d --- /dev/null +++ b/test/unit/core/test_temporal_target_shifter.py @@ -0,0 +1,224 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
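+
+# A minimal usage sketch of the shifter workflow exercised below (hypothetical
+# values): fit() infers the input data format, transform() applies the target
+# shift, and fit_transform() combines both steps.
+#
+#     shifter = TemporalTargetShifter(n_lags=1, mode=MODE_MACHINE_LEARNING, target_col="target")
+#     shifted = shifter.fit_transform(df)  # df as built in the fixtures below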
+ +# TemporalScope/test/unit/test_core_temporal_target_shifter.py + +# import modin.pandas as mpd +# import numpy as np +# import pandas as pd +# import polars as pl +# import pytest + +# from temporalscope.core.core_utils import ( +# BACKEND_MODIN, +# BACKEND_PANDAS, +# BACKEND_POLARS, +# MODE_MACHINE_LEARNING, +# MODE_DEEP_LEARNING, +# ) +# from temporalscope.core.temporal_data_loader import TimeFrame +# from temporalscope.core.temporal_target_shifter import TemporalTargetShifter + + +# # Fixture to generate sample dataframes for different data_formats +# @pytest.fixture(params=[BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +# def sample_dataframe(request): +# """Fixture to generate sample dataframes for different data_formats.""" +# data = { +# "time": pd.date_range(start="2022-01-01", periods=100), +# "target": np.random.rand(100), +# "feature_1": np.random.rand(100), +# "feature_2": np.random.rand(100), +# } +# data_format = request.param +# if data_format == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif data_format == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif data_format == BACKEND_MODIN: +# df = mpd.DataFrame(data) +# return df, data_format, "target" + + +# # Parametrized Test for data_format Inference, n_lags, and Modes +# @pytest.mark.parametrize( +# "n_lags, mode, sequence_length", +# [ +# (1, MODE_MACHINE_LEARNING, None), +# (3, MODE_MACHINE_LEARNING, None), +# (1, MODE_DEEP_LEARNING, 5), +# ], +# ) +# @pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) # Parametrizing data_formats as well +# def test_data_format_inference(data_format, n_lags, mode, sequence_length): +# """Test data_format inference and shifting functionality across all data_formats.""" +# # Generate data for the current data_format +# data = { +# "time": pd.date_range(start="2022-01-01", periods=100), +# "target": np.random.rand(100), +# "feature_1": np.random.rand(100), +# "feature_2": np.random.rand(100), +# } + +# if data_format == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif data_format == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif data_format == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# # Initialize shifter +# shifter = TemporalTargetShifter(n_lags=n_lags, mode=mode, sequence_length=sequence_length, target_col="target") + +# # Test fitting the dataframe and checking the inferred data_format +# shifter.fit(df) +# assert shifter.data_format == data_format + +# # Test transformation (ensure no crashes) +# transformed = shifter.transform(df) +# assert transformed is not None + + +# # Parametrized test for invalid data and expected errors across data_formats +# @pytest.mark.parametrize( +# "invalid_data", +# [ +# None, # Null input should raise an error +# pd.DataFrame(), # Empty DataFrame should raise an error +# ], +# ) +# @pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +# def test_invalid_data_handling(data_format, invalid_data): +# """Test invalid data handling for empty or None DataFrames across data_formats.""" +# shifter = TemporalTargetShifter(n_lags=1, target_col="target") + +# with pytest.raises(ValueError): +# shifter.fit(invalid_data) + + +# # Parametrized test for TimeFrame inputs and transformation across all data_formats +# @pytest.mark.parametrize("n_lags", [1, 2]) +# @pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +# def test_time_frame_input(data_format, n_lags): +# """Test TimeFrame input handling and transformation across all 
data_formats.""" +# # Generate data for the current data_format +# data = { +# "time": pd.date_range(start="2022-01-01", periods=100), +# "target": np.random.rand(100), +# "feature_1": np.random.rand(100), +# "feature_2": np.random.rand(100), +# } + +# if data_format == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif data_format == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif data_format == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# # Ensure TimeFrame uses dataframe_backend +# tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend=data_format) +# shifter = TemporalTargetShifter(n_lags=n_lags, target_col="target") + +# # Test fitting and transforming TimeFrame +# shifter.fit(tf) +# transformed = shifter.transform(tf) +# assert transformed is not None + + +# # Parametrized test for deep learning mode with different sequence lengths across all data_formats +# @pytest.mark.parametrize("sequence_length", [3, 5]) +# @pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +# def test_deep_learning_mode(data_format, sequence_length): +# """Test deep learning mode sequence generation across all data_formats.""" +# # Generate data for the current data_format +# data = { +# "time": pd.date_range(start="2022-01-01", periods=100), +# "target": np.random.rand(100), +# "feature_1": np.random.rand(100), +# "feature_2": np.random.rand(100), +# } + +# if data_format == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif data_format == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif data_format == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# shifter = TemporalTargetShifter( +# n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=sequence_length, target_col="target" +# ) + +# shifter.fit(df) +# transformed = shifter.transform(df) +# assert transformed is not None + + +# # Test verbose mode with stdout capture +# @pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +# def test_verbose_mode(data_format, capfd): +# """Test verbose mode output and row dropping information.""" +# # Generate data for the current data_format +# data = { +# "time": pd.date_range(start="2022-01-01", periods=100), +# "target": np.random.rand(100), +# "feature_1": np.random.rand(100), +# "feature_2": np.random.rand(100), +# } + +# if data_format == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif data_format == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif data_format == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# shifter = TemporalTargetShifter(n_lags=1, target_col="target", verbose=True) + +# shifter.fit(df) +# shifter.transform(df) + +# # Capture stdout and check for printed verbose information +# captured = capfd.readouterr() +# assert "Rows before shift" in captured.out + + +# # Parametrized test for fit_transform method for all data_formats +# @pytest.mark.parametrize("n_lags", [1, 2]) +# @pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) +# def test_fit_transform(data_format, n_lags): +# """Test fit_transform() method for all data_formats.""" +# # Generate data for the current data_format +# data = { +# "time": pd.date_range(start="2022-01-01", periods=100), +# "target": np.random.rand(100), +# "feature_1": np.random.rand(100), +# "feature_2": np.random.rand(100), +# } + +# if data_format == BACKEND_POLARS: +# df = pl.DataFrame(data) +# elif data_format == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# elif data_format == BACKEND_MODIN: +# df = mpd.DataFrame(data) + +# shifter = 
TemporalTargetShifter(n_lags=n_lags, target_col="target") + +# transformed = shifter.fit_transform(df) +# assert transformed is not None diff --git a/test/unit/test_datasets.py b/test/unit/datasets/test_datasets.py similarity index 60% rename from test/unit/test_datasets.py rename to test/unit/datasets/test_datasets.py index c2fbea9..4703c12 100644 --- a/test/unit/test_datasets.py +++ b/test/unit/datasets/test_datasets.py @@ -29,40 +29,27 @@ def dataset_loader(): return DatasetLoader(dataset_name="macrodata") -def test_load_dataset_and_target(dataset_loader): - """Test loading the dataset and its target column.""" +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +def test_init_timeframes_for_backends_parametrized(dataset_loader, backend): + """Test initializing TimeFrame objects for different backends.""" df, target_col = dataset_loader._load_dataset_and_target() - assert isinstance(df, pd.DataFrame) - assert target_col == "realgdp" - assert "ds" in df.columns - assert len(df) > 0 # Ensure the dataset is not empty + timeframes = dataset_loader.init_timeframes_for_backends(df, target_col, backends=(backend,)) -def test_init_timeframes_for_backends(dataset_loader): - """Test initializing TimeFrame objects for multiple backends.""" - df, target_col = dataset_loader._load_dataset_and_target() - - timeframes = dataset_loader.init_timeframes_for_backends(df, target_col) - - # Check if the returned TimeFrame objects for each backend are valid - assert isinstance(timeframes[BACKEND_PANDAS], TimeFrame) - assert isinstance(timeframes[BACKEND_MODIN], TimeFrame) - assert isinstance(timeframes[BACKEND_POLARS], TimeFrame) + assert isinstance(timeframes[backend], TimeFrame) - # Ensure correct data in each backend - assert timeframes[BACKEND_PANDAS].dataframe_backend == BACKEND_PANDAS - assert timeframes[BACKEND_MODIN].dataframe_backend == BACKEND_MODIN - assert timeframes[BACKEND_POLARS].dataframe_backend == BACKEND_POLARS + # Check that the backend is correct + assert timeframes[backend].dataframe_backend == backend -def test_load_and_init_timeframes(dataset_loader): - """Test loading dataset and initializing TimeFrames for all backends.""" - timeframes = dataset_loader.load_and_init_timeframes() +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +def test_load_and_init_timeframes_parametrized(dataset_loader, backend): + """Test loading dataset and initializing TimeFrames for each backend.""" + timeframes = dataset_loader.load_and_init_timeframes(backends=(backend,)) - # Check if the returned TimeFrame objects for each backend are valid - assert isinstance(timeframes[BACKEND_PANDAS], TimeFrame) - assert isinstance(timeframes[BACKEND_MODIN], TimeFrame) - assert isinstance(timeframes[BACKEND_POLARS], TimeFrame) + # Check if the returned TimeFrame object is valid for the backend + assert isinstance(timeframes[backend], TimeFrame) + assert timeframes[backend].dataframe_backend == backend def test_invalid_backend_raises_error(dataset_loader): @@ -79,15 +66,15 @@ def test_invalid_dataset_name(): DatasetLoader(dataset_name="invalid") -def test_init_timeframes_with_custom_backend(dataset_loader): - """Test initializing TimeFrames with a custom selection of backends.""" +@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +def test_init_timeframes_with_custom_backend(dataset_loader, backend): + """Test initializing TimeFrames with a custom backend selection.""" df, target_col = 
dataset_loader._load_dataset_and_target() - timeframes = dataset_loader.init_timeframes_for_backends(df, target_col, backends=(BACKEND_PANDAS,)) + timeframes = dataset_loader.init_timeframes_for_backends(df, target_col, backends=(backend,)) # Ensure only the requested backend is initialized - assert BACKEND_PANDAS in timeframes - assert BACKEND_MODIN not in timeframes - assert BACKEND_POLARS not in timeframes + assert backend in timeframes + assert isinstance(timeframes[backend], TimeFrame) def test_load_dataset_internal_call(mocker): @@ -108,3 +95,21 @@ def test_load_dataset_and_verify_time_column(dataset_loader): # Ensure 'ds' column exists and is of datetime type assert "ds" in df.columns assert pd.api.types.is_datetime64_any_dtype(df["ds"]) + +@pytest.mark.parametrize("backends", [ + (BACKEND_PANDAS,), + (BACKEND_MODIN,), + (BACKEND_POLARS,), + (BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS) +]) +def test_load_and_init_timeframes_return(dataset_loader, backends): + """Test that the returned timeframes object is a dictionary and contains the expected backends.""" + timeframes = dataset_loader.load_and_init_timeframes(backends=backends) + + # Ensure the return value is a dictionary + assert isinstance(timeframes, dict) + + # Check that the returned dictionary contains the expected backends + for backend in backends: + assert backend in timeframes + assert isinstance(timeframes[backend], TimeFrame) diff --git a/test/unit/datasets/test_synthetic_data_generator.py b/test/unit/datasets/test_synthetic_data_generator.py new file mode 100644 index 0000000..1d19e40 --- /dev/null +++ b/test/unit/datasets/test_synthetic_data_generator.py @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+# TemporalScope/test/unit/datasets/test_synthetic_data_generator.py
+
+import pytest
+import pandas as pd
+import polars as pl
+import modin.pandas as mpd
+import numpy as np
+from temporalscope.datasets.synthetic_data_generator import (
+    create_sample_data,
+    BACKEND_PANDAS,
+    BACKEND_MODIN,
+    BACKEND_POLARS,
+    MODE_SINGLE_STEP,
+    MODE_MULTI_STEP,
+)
+
+# Unsupported combinations are marked as expected failures (xfail) rather than
+# skipped: multi-step mode on Modin/Polars and the Pandas-to-Polars conversion path
+@pytest.mark.parametrize("num_samples, num_features, mode", [
+    (100, 3, MODE_SINGLE_STEP),  # Single-step mode
+    pytest.param(100, 3, MODE_MULTI_STEP, marks=pytest.mark.xfail(reason="Unsupported multi-step mode for Modin and Polars")),
+    (0, 0, MODE_SINGLE_STEP),  # Zero samples and features
+    (1000, 10, MODE_SINGLE_STEP)  # Large data
+])
+@pytest.mark.parametrize("backend", [
+    BACKEND_PANDAS,
+    BACKEND_MODIN,
+    pytest.param(BACKEND_POLARS, marks=pytest.mark.xfail(reason="Pandas to Polars conversion not supported"))
+])
+def test_create_sample_data_basic(num_samples, num_features, mode, backend):
+    """Test that data generation works for both single-step and multi-step modes."""
+
+    # Generate synthetic data
+    df = create_sample_data(backend=backend, num_samples=num_samples, num_features=num_features, mode=mode)
+
+    # Check if DataFrame is empty before accessing data
+    if num_samples == 0:
+        if backend == BACKEND_POLARS:
+            assert df.is_empty(), "DataFrame should be empty when num_samples is 0 for Polars."
+        else:
+            assert df.empty, "DataFrame should be empty when num_samples is 0."
+    else:
+        assert len(df) == num_samples, f"Mismatch in expected number of samples: {num_samples}"
+
+    # Check if target is scalar for single-step mode (guard against empty frames,
+    # where there is no first row to inspect)
+    if num_samples > 0 and mode == MODE_SINGLE_STEP:
+        if backend == BACKEND_POLARS:
+            assert isinstance(df["target"][0], float), "Single-step mode should generate scalar target values."
+        else:
+            assert np.isscalar(df["target"].iloc[0]), "Single-step mode should generate scalar target values."
+
+    # Check if target is vector for multi-step mode (same guard applies)
+    if num_samples > 0 and mode == MODE_MULTI_STEP:
+        assert isinstance(df["target"][0], (list, np.ndarray)), "Multi-step mode should generate vectorized target values."
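+
+# A minimal illustration of the shape contract checked above (assuming the
+# generator defaults used in this suite): single-step targets are scalars,
+# while multi-step targets are fixed-length sequences (length 10 in the
+# current generator).
+#
+#     df = create_sample_data(backend=BACKEND_PANDAS, num_samples=5, num_features=3, mode=MODE_MULTI_STEP)
+#     assert len(df["target"].iloc[0]) == 10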
+ + +@pytest.mark.parametrize("timestamp_like, numeric, mixed_frequencies, mixed_timezones", [ + (True, False, False, False), # Timestamp-like time column + (False, True, False, False), # Numeric time column +]) +@pytest.mark.parametrize("backend", [ + BACKEND_PANDAS, + BACKEND_MODIN, + pytest.param(BACKEND_POLARS, marks=pytest.mark.xfail(reason="Pandas to Polars conversion not supported")) +]) +def test_time_column_generation(timestamp_like, numeric, mixed_frequencies, mixed_timezones, backend): + """Test that time columns are generated with the correct type and properties.""" + + num_samples, num_features = 100, 3 + df = create_sample_data( + backend=backend, + num_samples=num_samples, + num_features=num_features, + timestamp_like=timestamp_like, + numeric=numeric, + mixed_frequencies=mixed_frequencies, + mixed_timezones=mixed_timezones + ) + + # Validate the type of the time column based on configuration + if timestamp_like: + if backend == BACKEND_POLARS: + assert isinstance(df["time"][0], pl.datatypes.Datetime), "Expected a timestamp-like time column" + else: + assert isinstance(df['time'].iloc[0], pd.Timestamp), "Expected a timestamp-like time column" + + if numeric: + if backend == BACKEND_POLARS: + assert isinstance(df["time"][0], float), "Expected a numeric time column" + else: + assert isinstance(df['time'].iloc[0], np.float64), "Expected a numeric time column" diff --git a/test/unit/partition/test_partition_padding.py b/test/unit/partition/test_partition_padding.py new file mode 100644 index 0000000..eccecf3 --- /dev/null +++ b/test/unit/partition/test_partition_padding.py @@ -0,0 +1,362 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# TemporalScope/test/unit/test_partition_padding.py + + +# import pytest +# import numpy as np +# import pandas as pd +# import modin.pandas as mpd +# import polars as pl +# from temporalscope.partition.padding import ( +# zero_pad, +# forward_fill_pad, +# backward_fill_pad, +# mean_fill_pad, +# pad_dataframe, +# sort_dataframe, +# ensure_type_consistency +# ) +# from temporalscope.core.core_utils import ( +# BACKEND_MODIN, +# BACKEND_PANDAS, +# BACKEND_POLARS, +# ) + +# from temporalscope.core.core_utils import SupportedBackendDataFrame + +# np.random.seed(42) # Set a seed for reproducibility + + + +# def generate_test_data(backend, num_samples=5): +# """Generate test data with consistent column names across all backends.""" +# start_date = pd.to_datetime("2021-01-01") +# data = { +# "feature_1": range(1, num_samples + 1), +# "feature_2": range(num_samples, 0, -1), +# "target": [i * 10 for i in range(1, num_samples + 1)], +# "ds": pd.date_range(start_date, periods=num_samples) # Ensure 'ds' is a date column +# } + +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame(data) +# df['ds'] = df['ds'].astype('datetime64[ns]') # Ensure ds is in datetime64[ns] +# return df + +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame(data) +# df['ds'] = df['ds'].astype('datetime64[ns]') # Modin relies on Pandas dtype system +# return df + +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame({ +# "feature_1": data["feature_1"], +# "feature_2": data["feature_2"], +# "target": data["target"], +# "ds": [d for d in data["ds"]] # Keep `ds` as a date column +# }) +# return df.with_columns(pl.col("ds").cast(pl.Datetime)) # Cast ds to Polars datetime + +# else: +# raise ValueError(f"Unsupported backend: {backend}") + + +# @pytest.fixture +# def test_data(): +# return { +# BACKEND_PANDAS: generate_test_data(BACKEND_PANDAS), +# BACKEND_MODIN: generate_test_data(BACKEND_MODIN), +# BACKEND_POLARS: generate_test_data(BACKEND_POLARS), +# } + + +# # Utility function to generate empty DataFrame +# def get_empty_dataframe(backend): +# if backend == BACKEND_PANDAS: +# return pd.DataFrame() +# elif backend == BACKEND_MODIN: +# return mpd.DataFrame() +# elif backend == BACKEND_POLARS: +# return pl.DataFrame() +# else: +# raise ValueError(f"Unsupported backend: {backend}") + +# def generate_mixed_data(num_samples: int = 5) -> pd.DataFrame: +# """Generates a DataFrame with mixed data types (numeric, categorical, datetime). + +# This can be used for parametrized tests to check how functions handle different +# column types. + +# :param num_samples: Number of rows to generate in the DataFrame. +# :return: A DataFrame with mixed data types. 
+# """ +# start_date = pd.to_datetime("2021-01-01") +# data = { +# "numeric_col": range(1, num_samples + 1), +# "category_col": ["A", "B", "C", "D", "E"][:num_samples], +# "datetime_col": pd.date_range(start_date, periods=num_samples), +# "mixed_col": ["A", 1, pd.NaT, None, 5][:num_samples], # Mixed types +# } +# return pd.DataFrame(data) + + + +# def check_monotonicity(df: SupportedBackendDataFrame, time_col: str, ascending: bool = True) -> bool: +# if isinstance(df, pl.DataFrame): +# # Handle Polars DataFrame +# diffs = df.select(pl.col(time_col).diff()).select(pl.col(time_col).drop_nulls()) # Handle nulls +# if ascending: +# return diffs.select(pl.col(time_col).gt(pl.lit(0))).to_series().all() # Use Polars comparison +# else: +# return diffs.select(pl.col(time_col).lt(pl.lit(0))).to_series().all() +# else: +# # Handle Pandas and Modin (already handled correctly) +# diffs = df[time_col].diff().dropna() # For Pandas/Modin, dropna() works fine +# if pd.api.types.is_timedelta64_dtype(diffs): +# zero_timedelta = pd.Timedelta(0) +# if ascending: +# return diffs.gt(zero_timedelta).all() +# else: +# return diffs.lt(zero_timedelta).all() +# else: +# if ascending: +# return diffs.gt(0).all() +# else: +# return diffs.lt(0).all() + + + +# # Parametrize tests for ascending and descending order +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# @pytest.mark.parametrize("ascending", [True, False]) +# def test_sort_dataframe(test_data, backend, ascending): +# df = test_data[backend] +# sorted_df = sort_dataframe(df, time_col="ds", ascending=ascending) + +# # Check sorting for each backend +# assert check_monotonicity(sorted_df, "ds", ascending=ascending) + + +# # Test for invalid time column in sort_dataframe +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# def test_sort_dataframe_invalid_time_col(test_data, backend): +# df = test_data[backend] +# with pytest.raises(ValueError): +# sort_dataframe(df, time_col="invalid_col") + + +# # Test sorting for empty DataFrame +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# def test_sort_dataframe_empty_dataframe(backend): +# empty_df = get_empty_dataframe(backend) +# with pytest.raises(ValueError): +# sort_dataframe(empty_df, time_col="ds") + + +# # Test raising TypeError for unsupported input type +# def test_sort_dataframe_unsupported_type(): +# with pytest.raises(TypeError, match="Unsupported DataFrame type"): +# sort_dataframe([], time_col="ds") # List is an unsupported type + + +# # Test warning when `time_col` is neither numeric nor datetime +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN]) +# def test_sort_dataframe_warning(test_data, backend): +# df = test_data[backend] +# df["non_time_col"] = ["a", "b", "c", "d", "e"] + +# # Ensure warning is raised when time_col is non-numeric and non-datetime +# with pytest.warns(UserWarning, match="is neither numeric nor datetime"): +# sort_dataframe(df, time_col="non_time_col", ascending=True) + +# # Continue with checking valid sorting after warning +# sorted_df = sort_dataframe(df, time_col="ds", ascending=True) +# assert check_monotonicity(sorted_df, "ds", ascending=True) + + + + + +# # Padding function tests with Modin and Polars compatibility +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# @pytest.mark.parametrize("padding_func", [zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad]) +# def 
test_padding_functions(test_data, backend, padding_func): +# df = test_data[backend] + +# if padding_func == zero_pad: +# padded_df = padding_func(df, target_len=7, time_col="ds", pad_value=0) +# else: +# padded_df = padding_func(df, target_len=7, end=5, reverse=False, time_col="ds") + +# assert len(padded_df) == 7 + + +# # Ensure the 'ds' column is used consistently across backends in pad_dataframe +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# @pytest.mark.parametrize("mode", ["zero", "forward_fill", "backward_fill", "mean_fill"]) +# def test_pad_dataframe(test_data, backend, mode): +# df = test_data[backend] + +# if mode == "zero": +# padded_df = pad_dataframe(df, target_len=7, mode=mode, pad_value=0, time_col="ds") +# else: +# padded_df = pad_dataframe(df, target_len=7, mode=mode, end=5, reverse=False, time_col="ds") + +# assert len(padded_df) == 7 + + +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# def test_empty_dataframe(backend): +# if backend == BACKEND_PANDAS: +# df = pd.DataFrame() +# elif backend == BACKEND_MODIN: +# df = mpd.DataFrame() +# elif backend == BACKEND_POLARS: +# df = pl.DataFrame() + +# with pytest.raises(ValueError): +# zero_pad(df, target_len=5, time_col="ds") + + +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# def test_invalid_time_col(test_data, backend): +# df = test_data[backend] + +# with pytest.raises(ValueError): +# zero_pad(df, target_len=7, time_col="invalid_col") + + +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# def test_target_len_less_than_current_len(test_data, backend): +# df = test_data[backend] + +# with pytest.raises(ValueError): +# zero_pad(df, target_len=3, time_col="ds") + + +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# def test_sort_dataframe_edge_cases(test_data, backend): +# df = test_data[backend] + +# # Add non-numeric, non-datetime column to test sorting warnings +# if backend == BACKEND_POLARS: +# df = df.with_columns(pl.Series("non_numeric", ["a", "b", "c", "d", "e"])) +# else: +# df["non_numeric"] = ["a", "b", "c", "d", "e"] + +# # Ensure warning is raised when time_col is non-numeric and non-datetime +# with pytest.warns(UserWarning, match="is neither numeric nor datetime"): +# sort_dataframe(df, time_col="non_numeric", ascending=True) + +# # Continue with existing tests +# sorted_df = sort_dataframe(df, time_col="ds", ascending=True) +# if backend == BACKEND_POLARS: +# assert sorted_df["ds"].is_sorted() +# else: +# assert sorted_df["ds"].is_monotonic_increasing + + +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# @pytest.mark.parametrize("padding_func", [zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad]) +# def test_padding_functions_with_warnings(test_data, backend, padding_func): +# df = test_data[backend] + +# # Add non-numeric columns +# if backend == BACKEND_POLARS: +# df = df.with_columns(pl.Series("non_numeric", ["a", "b", "c", "d", "e"])) +# pad_df = pad_dataframe(df, target_len=7, mode="zero", time_col="ds") # Add mode here +# pad_df = pad_df.with_columns(pl.lit(None).alias("non_numeric")) # Ensure "non_numeric" exists in pad_df +# else: +# df["non_numeric"] = ["a", "b", "c", "d", "e"] + +# if padding_func == zero_pad: +# with pytest.warns(UserWarning, match="Non-numeric columns found"): +# padded_df = padding_func(df, target_len=7, time_col="ds", 
pad_value=0) +# else: +# padded_df = padding_func(df, target_len=7, end=5, reverse=False, time_col="ds") + +# assert len(padded_df) == 7 + + +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) +# @pytest.mark.parametrize("mode", ["zero", "forward_fill", "backward_fill", "mean_fill"]) +# def test_pad_dataframe_type_consistency(test_data, backend, mode): +# df = test_data[backend] + +# # Add non-numeric column +# if backend == BACKEND_POLARS: +# df = df.with_columns(pl.Series("non_numeric", ["x", "y", "z", "w", "v"])) +# else: +# df["non_numeric"] = ["x", "y", "z", "w", "v"] + +# if mode == "zero": +# with pytest.warns(UserWarning, match="Non-numeric columns found"): +# padded_df = pad_dataframe(df, target_len=7, mode=mode, pad_value=0, time_col="ds") +# else: +# with pytest.warns(UserWarning, match="Non-numeric columns found"): +# padded_df = pad_dataframe(df, target_len=7, mode=mode, end=5, reverse=False, time_col="ds") + +# assert len(padded_df) == 7 + +# # Ensure types are consistent +# assert padded_df["feature_1"].dtype == df["feature_1"].dtype +# assert padded_df["feature_2"].dtype == df["feature_2"].dtype + +# @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN]) +# def test_pad_dataframe_boolean_to_int64(test_data, backend): +# """Test that boolean columns in the DataFrame are correctly cast to int64.""" +# df = test_data[backend] + +# # Add a boolean column to the DataFrame +# if backend == BACKEND_PANDAS: +# df["bool_col"] = [True, False, True, False, True] +# elif backend == BACKEND_MODIN: +# df["bool_col"] = mpd.Series([True, False, True, False, True]) + +# # Create a padding DataFrame with the same columns +# pad_df = pd.DataFrame({ +# "bool_col": [False, False] # Padding with False values (should become 0) +# }) + +# # Ensure type consistency (bool -> int64) +# consistent_df = ensure_type_consistency(df, pad_df) + +# # Check that the boolean column is converted to int64 +# assert consistent_df["bool_col"].dtype == "int64" +# assert (consistent_df["bool_col"] == 0).all() # All padded values should be 0 + + +# @pytest.mark.parametrize("backend", [BACKEND_MODIN]) +# def test_pad_dataframe_conversion_to_modin(test_data, backend): +# """Test that pad_df is correctly converted back to Modin after type consistency check.""" +# df = test_data[backend] + +# # Create a padding DataFrame with mismatched types +# pad_df = pd.DataFrame({ +# "feature_1": [0.0, 0.0], +# "feature_2": [0, 0], +# "target": [0, 0], +# "ds": [pd.Timestamp("1970-01-01"), pd.Timestamp("1970-01-01")] +# }) + +# # Ensure type consistency (pad_df starts as Pandas DataFrame) +# consistent_df = ensure_type_consistency(df, pad_df) + +# # Ensure pad_df is converted back to Modin if df was Modin +# assert isinstance(consistent_df, mpd.DataFrame), "pad_df should be converted back to Modin" diff --git a/test/unit/partition/test_partition_validators.py b/test/unit/partition/test_partition_validators.py new file mode 100644 index 0000000..707a811 --- /dev/null +++ b/test/unit/partition/test_partition_validators.py @@ -0,0 +1,336 @@ +# """ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# """ +# import modin.pandas as mpd +# import pandas as pd +# import polars as pl +# import pytest + +# from temporalscope.partition.partition_validators import ( +# check_binary_numerical_features, +# check_categorical_feature_cardinality, +# check_class_balance, +# check_feature_count, +# check_feature_to_sample_ratio, +# check_numerical_feature_uniqueness, +# check_sample_size, +# ) + + +# @pytest.mark.parametrize( +# "dataframe,backend,min_samples,max_samples,expected_result", +# [ +# (pd.DataFrame({"feature1": range(100)}), "pd", 3000, 50000, False), +# ( +# pl.DataFrame({"feature1": pl.Series(range(100))}), +# "pl", +# 3000, +# 50000, +# False, +# ), +# ( +# mpd.DataFrame({"feature1": range(100000)}), +# "mpd", +# 3000, +# 50000, +# False, +# ), +# ], +# ) +# def test_check_sample_size( +# dataframe, backend, min_samples, max_samples, expected_result +# ): +# """Test sample size check for various dataframes and backends.""" +# assert ( +# check_sample_size( +# dataframe, +# backend=backend, +# min_samples=min_samples, +# max_samples=max_samples, +# ) +# == expected_result +# ) + + +# @pytest.mark.parametrize( +# "dataframe,backend,min_features,expected_result", +# [ +# # Pandas DataFrame +# ( +# pd.DataFrame({"feature1": range(100)}), +# "pd", +# 4, +# False, +# ), # Too few features - Pandas +# # Polars DataFrame +# ( +# pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), +# "pl", +# 4, +# True, +# ), # Enough features - Polars +# # Modin DataFrame +# ( +# mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), +# "mpd", +# 4, +# True, +# ), # Enough features - Modin +# ], +# ) +# def test_check_feature_count(dataframe, backend, min_features, expected_result): +# """Tests check_feature_count for various dataframes and backends.""" +# assert ( +# check_feature_count(dataframe, backend=backend, min_features=min_features) +# == expected_result +# ) + + +# @pytest.mark.parametrize( +# "dataframe,backend,max_ratio,expected_result", +# [ +# ( +# pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), +# "pl", +# 0.1, +# True, +# ), +# ( +# mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), +# "mpd", +# 0.1, +# True, +# ), +# ( +# pd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), +# "pd", +# 0.1, +# True, +# ), +# ], +# ) +# def test_check_feature_to_sample_ratio(dataframe, backend, max_ratio, expected_result): +# """Tests check_feature_to_sample_ratio for various dataframes and backends.""" +# assert ( +# check_feature_to_sample_ratio(dataframe, backend=backend, max_ratio=max_ratio) +# == expected_result +# ) + + +# @pytest.mark.parametrize( +# "dataframe,backend,max_unique_values,expected_result", +# [ +# # Pandas DataFrames +# ( +# pd.DataFrame({"category1": [str(i) for i in range(25)]}), +# "pd", +# 20, +# False, +# ), # Too many unique values - Pandas +# ( +# pd.DataFrame({"category1": ["A", "B", "C"] * 100}), +# "pd", +# 20, +# True, +# ), # Normal unique values - Pandas +# # Polars DataFrames +# ( +# pl.DataFrame({"category1": pl.Series([str(i) for i in range(25)])}), +# "pl", +# 20, +# False, 
+# ), # Too many unique values - Polars +# ( +# pl.DataFrame({"category1": pl.Series(["A", "B", "C"] * 100)}), +# "pl", +# 20, +# True, +# ), # Normal unique values - Polars +# # Modin DataFrames +# ( +# mpd.DataFrame({"category1": [str(i) for i in range(25)]}), +# "mpd", +# 20, +# False, +# ), # Too many unique values - Modin +# ( +# mpd.DataFrame({"category1": ["A", "B", "C"] * 100}), +# "mpd", +# 20, +# True, +# ), # Normal unique values - Modin +# ], +# ) +# def test_check_categorical_feature_cardinality( +# dataframe, backend, max_unique_values, expected_result +# ): +# """Tests check_categorical_feature_cardinality for various dataframe backends.""" +# assert ( +# check_categorical_feature_cardinality( +# dataframe, backend=backend, max_unique_values=max_unique_values +# ) +# == expected_result +# ) + + +# @pytest.mark.parametrize( +# "dataframe,backend,min_unique_values,expected_result", +# [ +# # Pandas DataFrame +# ( +# pd.DataFrame({"feature1": range(100)}), +# "pd", +# 10, +# True, +# ), # Enough unique values - Pandas +# # Polars DataFrame +# ( +# pl.DataFrame({"feature1": pl.Series(range(100))}), +# "pl", +# 10, +# True, +# ), # Enough unique values - Polars +# # Modin DataFrame +# ( +# mpd.DataFrame({"feature1": [1, 1, 1, 2, 2, 2, 3, 3]}), +# "mpd", +# 10, +# False, +# ), # Too few unique values - Modin +# ( +# mpd.DataFrame({"feature1": range(100)}), +# "mpd", +# 10, +# True, +# ), # Enough unique values - Modin +# ], +# ) +# def test_check_numerical_feature_uniqueness( +# dataframe, backend, min_unique_values, expected_result +# ): +# """Tests check_numerical_feature_uniqueness for various dataframes and backends.""" +# assert ( +# check_numerical_feature_uniqueness( +# dataframe, backend=backend, min_unique_values=min_unique_values +# ) +# == expected_result +# ) + + +# @pytest.mark.parametrize( +# "dataframe,backend,expected_result", +# [ +# # Pandas DataFrame +# ( +# pd.DataFrame({"binary_feature": [0, 1] * 50}), +# "pd", +# False, +# ), # Binary numerical feature - Pandas +# ( +# pd.DataFrame({"feature1": range(100)}), +# "pd", +# True, +# ), # No binary feature - Pandas +# # Polars DataFrame +# ( +# pl.DataFrame({"binary_feature": pl.Series([0, 1] * 50)}), +# "pl", +# False, +# ), # Binary numerical feature - Polars +# ( +# pl.DataFrame({"feature1": pl.Series(range(100))}), +# "pl", +# True, +# ), # No binary feature - Polars +# # Modin DataFrame +# ( +# mpd.DataFrame({"binary_feature": [0, 1] * 50}), +# "mpd", +# False, +# ), # Binary numerical feature - Modin +# ( +# mpd.DataFrame({"feature1": range(100)}), +# "mpd", +# True, +# ), # No binary feature - Modin +# ], +# ) +# def test_check_binary_numerical_features(dataframe, backend, expected_result): +# """Tests check_binary_numerical_features for various dataframes and backends.""" +# assert ( +# check_binary_numerical_features(dataframe, backend=backend) == expected_result +# ) + + +# @pytest.mark.parametrize( +# "dataframe,target_col,backend,expected_result", +# [ +# ( +# pd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), +# "target", +# "pd", +# False, +# ), +# ( +# pd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), +# "target", +# "pd", +# True, +# ), +# ( +# pl.DataFrame( +# { +# "feature1": pl.Series(range(100)), +# "target": pl.Series([1] * 90 + [0] * 10), +# } +# ), +# "target", +# "pl", +# False, +# ), +# ( +# pl.DataFrame( +# { +# "feature1": pl.Series(range(100)), +# "target": pl.Series([0, 1] * 50), +# } +# ), +# "target", +# "pl", +# True, +# ), +# ( +# 
mpd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), +# "target", +# "mpd", +# False, +# ), +# ( +# mpd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), +# "target", +# "mpd", +# True, +# ), +# ], +# ) +# def test_check_class_balance(dataframe, target_col, backend, expected_result): +# """Tests check_class_balance for various dataframes and backends.""" +# result = check_class_balance(dataframe, target_col=target_col, backend=backend) +# assert ( +# result == expected_result +# ), f"Expected {expected_result}, but got {result} for backend {backend}" diff --git a/test/unit/test_core_temporal_data_loader.py b/test/unit/test_core_temporal_data_loader.py deleted file mode 100644 index 679f1fd..0000000 --- a/test/unit/test_core_temporal_data_loader.py +++ /dev/null @@ -1,266 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# TemporalScope/test/unit/test_core_temporal_data_loader.py - -import warnings -from typing import Dict, List, Union, Optional -from datetime import datetime, timedelta, date, timezone -import modin.pandas as mpd -import numpy as np - -import pandas as pd -import polars as pl -import pytest - - -from temporalscope.core.exceptions import ( - TimeColumnError, MixedTypesWarning, MixedTimezonesWarning, MixedFrequencyWarning -) - - -from temporalscope.core.core_utils import ( - BACKEND_MODIN, - BACKEND_PANDAS, - BACKEND_POLARS, -) -from temporalscope.core.temporal_data_loader import TimeFrame - - -from datetime import datetime, timedelta, timezone -from typing import Dict, List, Union, Optional -import numpy as np -import pandas as pd -import polars as pl -import modin.pandas as mpd - - -def create_sample_data( - num_samples: int = 100, - num_features: int = 3, - empty: bool = False, - missing_values: bool = False, - mixed_types: bool = False, - drop_columns: Optional[List[str]] = None, - non_numeric_time: bool = False, - empty_time: bool = False, - mixed_numeric_and_timestamp: bool = False, - date_like_string: bool = False, - object_type_time_col: bool = False, - mixed_timezones: bool = False, - polars_specific: bool = False -) -> Dict[str, Union[List[datetime], List[float], List[Optional[float]]]]: - """ Create a sample dataset for scalable unit testing, supporting various edge cases. - - This function generates sample time-series data for different unit testing scenarios, - including empty datasets, datasets with mixed data types, missing values, or different - types of time columns. It is designed to be flexible, providing various ways to test - data validation for time-series models. - - :param num_samples: Number of samples to generate. - :param num_features: Number of feature columns to generate. - :param empty: If True, generates an empty dataset. 
- :param missing_values: If True, introduces missing values into the dataset. - :param mixed_types: If True, mixes numeric and string data types in feature columns. - :param drop_columns: List of columns to drop from the dataset. - :param non_numeric_time: If True, replaces the `time_col` with non-numeric values. - :param empty_time: If True, fills the `time_col` with empty values. - :param mixed_numeric_and_timestamp: If True, mixes numeric and timestamp values in `time_col`. - :param date_like_string: If True, fills the `time_col` with date-like string values. - :param object_type_time_col: If True, inserts arrays or complex objects into the `time_col`. - :param mixed_timezones: If True, mixes timestamps with and without timezone information in `time_col`. - :param polars_specific: If True, handles edge cases specific to Polars. - :return: A dictionary containing generated data with keys 'time', 'feature_1', ..., 'feature_n', and 'target'. - """ - - if empty: - return {"time": [], "target": []} - - start_date = datetime(2021, 1, 1) - - if empty_time: - data = {"time": [None for _ in range(num_samples)]} - elif non_numeric_time: - data = {"time": ["invalid_time" for _ in range(num_samples)]} - elif mixed_numeric_and_timestamp: - if polars_specific: - data = {"time": [str(start_date + timedelta(days=i)) if i % 2 == 0 else float(i) for i in range(num_samples)]} - else: - data = {"time": [start_date + timedelta(days=i) if i % 2 == 0 else float(i) for i in range(num_samples)]} - elif date_like_string: - data = {"time": [f"2021-01-{i+1:02d}" for i in range(num_samples)]} - elif object_type_time_col: - data = {"time": [[start_date + timedelta(days=i)] for i in range(num_samples)]} - elif mixed_timezones: - data = {"time": [(start_date + timedelta(days=i)).replace(tzinfo=timezone.utc if i % 2 == 0 else None) - for i in range(num_samples)]} - else: - data = {"time": [start_date + timedelta(days=i) for i in range(num_samples)]} - - for i in range(1, num_features + 1): - if mixed_types: - data[f"feature_{i}"] = [f"str_{i}" if j % 2 == 0 else j for j in range(num_samples)] - else: - data[f"feature_{i}"] = np.random.rand(num_samples).tolist() - - if missing_values: - for i in range(num_samples): - if i % 10 == 0: - for j in range(1, num_features + 1): - data[f"feature_{j}"][i] = None - - data["target"] = [ - sum(data[f"feature_{j}"][i] for j in range(1, num_features + 1) if isinstance(data[f"feature_{j}"][i], float)) + - np.random.normal(0, 0.1) - for i in range(num_samples) - ] - - if drop_columns: - data = pd.DataFrame(data).drop(columns=drop_columns).to_dict(orient='list') - - return data - - - -@pytest.mark.parametrize( - "backend, case_type, expected_error, expected_warning, match_message", - [ - (BACKEND_POLARS, "missing_time_col", TimeColumnError, None, r"Missing required column: time"), - (BACKEND_PANDAS, "missing_time_col", TimeColumnError, None, r"Missing required column: time"), - (BACKEND_MODIN, "missing_time_col", TimeColumnError, None, r"Missing required column: time"), - (BACKEND_POLARS, "non_numeric_time_col", TimeColumnError, None, r"`time_col` must be numeric or timestamp-like"), - (BACKEND_PANDAS, "non_numeric_time_col", TimeColumnError, None, r"`time_col` must be numeric or timestamp-like"), - (BACKEND_MODIN, "non_numeric_time_col", TimeColumnError, None, r"`time_col` must be numeric or timestamp-like"), - (BACKEND_PANDAS, "empty_time_col", TimeColumnError, None, r"Missing values found in `time_col`"), - (BACKEND_POLARS, "mixed_frequencies", None, MixedFrequencyWarning, r"mixed 
timestamp frequencies"), - (BACKEND_PANDAS, "mixed_frequencies", None, MixedFrequencyWarning, r"mixed timestamp frequencies"), - (BACKEND_POLARS, "mixed_timezones", None, MixedTimezonesWarning, r"mixed timezone-aware and naive timestamps"), - (BACKEND_PANDAS, "mixed_timezones", None, MixedTimezonesWarning, r"mixed timezone-aware and naive timestamps"), - (BACKEND_POLARS, "date_like_string", TimeColumnError, None, r"`time_col` must be numeric or timestamp-like"), - (BACKEND_PANDAS, "date_like_string", TimeColumnError, None, r"`time_col` must be numeric or timestamp-like"), - ] -) -def test_validation_edge_cases(backend, case_type, expected_error, expected_warning, match_message): - """Test validation logic under different edge cases and backends.""" - - polars_specific = backend == BACKEND_POLARS - - if case_type == "missing_time_col": - data = create_sample_data(drop_columns=["time"], polars_specific=polars_specific) - elif case_type == "non_numeric_time_col": - data = create_sample_data(non_numeric_time=True, polars_specific=polars_specific) - elif case_type == "empty_time_col": - data = create_sample_data(empty_time=True, polars_specific=polars_specific) - elif case_type == "mixed_frequencies": - data = create_sample_data(mixed_frequencies=True, polars_specific=polars_specific) - elif case_type == "date_like_string": - data = create_sample_data(date_like_string=True, polars_specific=polars_specific) - elif case_type == "mixed_timezones": - data = create_sample_data(mixed_timezones=True, polars_specific=polars_specific) - - if backend == BACKEND_POLARS: - df = pl.DataFrame(data, strict=False) # Allow mixed types for Polars - elif backend == BACKEND_PANDAS: - df = pd.DataFrame(data) - elif backend == BACKEND_MODIN: - df = mpd.DataFrame(data) - - if expected_error: - with pytest.raises(expected_error, match=match_message): - TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) - elif expected_warning: - with pytest.warns(expected_warning, match=match_message if match_message else None): - TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) - - - - -# @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -# def test_sort_data(backend): -# """Test sorting method for various backends.""" -# data = create_sample_data(num_samples=100) -# if backend == BACKEND_POLARS: -# df = pl.DataFrame(data) -# elif backend == BACKEND_PANDAS: -# df = pd.DataFrame(data) -# elif backend == BACKEND_MODIN: -# df = mpd.DataFrame(data) - -# tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend, sort=False) -# # Shuffle and sort -# if backend == BACKEND_POLARS: -# shuffled_df = tf.get_data().sample(fraction=1.0) -# else: -# shuffled_df = tf.get_data().sample(frac=1).reset_index(drop=True) -# tf.update_data(shuffled_df) -# tf.sort_data(ascending=True) -# sorted_df = tf.get_data() - -# # Verify sorting -# times = sorted_df[tf.time_col].to_list() if backend == BACKEND_POLARS else sorted_df[tf.time_col].tolist() -# assert times == sorted(times) - - -# @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -# def test_update_target_col_invalid_length(backend): -# """Test updating target column with mismatched length.""" -# data = create_sample_data(num_samples=100) -# if backend == BACKEND_POLARS: -# df = pl.DataFrame(data) -# new_target = pl.Series(np.random.rand(99)) # One less than expected -# elif backend == BACKEND_PANDAS: -# df = pd.DataFrame(data) -# new_target = 
pd.Series(np.random.rand(99)) -# elif backend == BACKEND_MODIN: -# df = mpd.DataFrame(data) -# new_target = mpd.Series(np.random.rand(99)) - -# tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) -# with pytest.raises(ValueError): -# tf.update_target_col(new_target) - - -# @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -# def test_missing_columns(backend): -# """Test initialization with missing required columns.""" -# data = create_sample_data(num_samples=100) -# if backend == BACKEND_POLARS: -# df = pl.DataFrame(data).drop(["target"]) -# elif backend == BACKEND_PANDAS: -# df = pd.DataFrame(data).drop(columns=["target"]) -# elif backend == BACKEND_MODIN: -# df = mpd.DataFrame(data).drop(columns=["target"]) - -# with pytest.raises(ValueError): -# TimeFrame(df, time_col="time", target_col="target", dataframe_backend=backend) - - -# @pytest.mark.parametrize("backend", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -# def test_invalid_backend_initialization(backend): -# """Test invalid backend during initialization.""" -# data = create_sample_data(num_samples=100) -# if backend == BACKEND_POLARS: -# df = pl.DataFrame(data) -# elif backend == BACKEND_PANDAS: -# df = pd.DataFrame(data) -# elif backend == BACKEND_MODIN: -# df = mpd.DataFrame(data) - -# invalid_backend = "invalid_backend" -# with pytest.raises(ValueError): -# TimeFrame(df, time_col="time", target_col="target", dataframe_backend=invalid_backend) - diff --git a/test/unit/test_core_temporal_target_shifter.py b/test/unit/test_core_temporal_target_shifter.py deleted file mode 100644 index 427d383..0000000 --- a/test/unit/test_core_temporal_target_shifter.py +++ /dev/null @@ -1,224 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# TemporalScope/test/unit/test_core_temporal_target_shifter.py - -import modin.pandas as mpd -import numpy as np -import pandas as pd -import polars as pl -import pytest - -from temporalscope.core.core_utils import ( - BACKEND_MODIN, - BACKEND_PANDAS, - BACKEND_POLARS, - MODE_MACHINE_LEARNING, - MODE_DEEP_LEARNING, -) -from temporalscope.core.temporal_data_loader import TimeFrame -from temporalscope.core.temporal_target_shifter import TemporalTargetShifter - - -# Fixture to generate sample dataframes for different data_formats -@pytest.fixture(params=[BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -def sample_dataframe(request): - """Fixture to generate sample dataframes for different data_formats.""" - data = { - "time": pd.date_range(start="2022-01-01", periods=100), - "target": np.random.rand(100), - "feature_1": np.random.rand(100), - "feature_2": np.random.rand(100), - } - data_format = request.param - if data_format == BACKEND_POLARS: - df = pl.DataFrame(data) - elif data_format == BACKEND_PANDAS: - df = pd.DataFrame(data) - elif data_format == BACKEND_MODIN: - df = mpd.DataFrame(data) - return df, data_format, "target" - - -# Parametrized Test for data_format Inference, n_lags, and Modes -@pytest.mark.parametrize( - "n_lags, mode, sequence_length", - [ - (1, MODE_MACHINE_LEARNING, None), - (3, MODE_MACHINE_LEARNING, None), - (1, MODE_DEEP_LEARNING, 5), - ], -) -@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) # Parametrizing data_formats as well -def test_data_format_inference(data_format, n_lags, mode, sequence_length): - """Test data_format inference and shifting functionality across all data_formats.""" - # Generate data for the current data_format - data = { - "time": pd.date_range(start="2022-01-01", periods=100), - "target": np.random.rand(100), - "feature_1": np.random.rand(100), - "feature_2": np.random.rand(100), - } - - if data_format == BACKEND_POLARS: - df = pl.DataFrame(data) - elif data_format == BACKEND_PANDAS: - df = pd.DataFrame(data) - elif data_format == BACKEND_MODIN: - df = mpd.DataFrame(data) - - # Initialize shifter - shifter = TemporalTargetShifter(n_lags=n_lags, mode=mode, sequence_length=sequence_length, target_col="target") - - # Test fitting the dataframe and checking the inferred data_format - shifter.fit(df) - assert shifter.data_format == data_format - - # Test transformation (ensure no crashes) - transformed = shifter.transform(df) - assert transformed is not None - - -# Parametrized test for invalid data and expected errors across data_formats -@pytest.mark.parametrize( - "invalid_data", - [ - None, # Null input should raise an error - pd.DataFrame(), # Empty DataFrame should raise an error - ], -) -@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -def test_invalid_data_handling(data_format, invalid_data): - """Test invalid data handling for empty or None DataFrames across data_formats.""" - shifter = TemporalTargetShifter(n_lags=1, target_col="target") - - with pytest.raises(ValueError): - shifter.fit(invalid_data) - - -# Parametrized test for TimeFrame inputs and transformation across all data_formats -@pytest.mark.parametrize("n_lags", [1, 2]) -@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -def test_time_frame_input(data_format, n_lags): - """Test TimeFrame input handling and transformation across all data_formats.""" - # Generate data for the current data_format - data = { - "time": pd.date_range(start="2022-01-01", 
periods=100), - "target": np.random.rand(100), - "feature_1": np.random.rand(100), - "feature_2": np.random.rand(100), - } - - if data_format == BACKEND_POLARS: - df = pl.DataFrame(data) - elif data_format == BACKEND_PANDAS: - df = pd.DataFrame(data) - elif data_format == BACKEND_MODIN: - df = mpd.DataFrame(data) - - # Ensure TimeFrame uses dataframe_backend - tf = TimeFrame(df, time_col="time", target_col="target", dataframe_backend=data_format) - shifter = TemporalTargetShifter(n_lags=n_lags, target_col="target") - - # Test fitting and transforming TimeFrame - shifter.fit(tf) - transformed = shifter.transform(tf) - assert transformed is not None - - -# Parametrized test for deep learning mode with different sequence lengths across all data_formats -@pytest.mark.parametrize("sequence_length", [3, 5]) -@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -def test_deep_learning_mode(data_format, sequence_length): - """Test deep learning mode sequence generation across all data_formats.""" - # Generate data for the current data_format - data = { - "time": pd.date_range(start="2022-01-01", periods=100), - "target": np.random.rand(100), - "feature_1": np.random.rand(100), - "feature_2": np.random.rand(100), - } - - if data_format == BACKEND_POLARS: - df = pl.DataFrame(data) - elif data_format == BACKEND_PANDAS: - df = pd.DataFrame(data) - elif data_format == BACKEND_MODIN: - df = mpd.DataFrame(data) - - shifter = TemporalTargetShifter( - n_lags=1, mode=MODE_DEEP_LEARNING, sequence_length=sequence_length, target_col="target" - ) - - shifter.fit(df) - transformed = shifter.transform(df) - assert transformed is not None - - -# Test verbose mode with stdout capture -@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -def test_verbose_mode(data_format, capfd): - """Test verbose mode output and row dropping information.""" - # Generate data for the current data_format - data = { - "time": pd.date_range(start="2022-01-01", periods=100), - "target": np.random.rand(100), - "feature_1": np.random.rand(100), - "feature_2": np.random.rand(100), - } - - if data_format == BACKEND_POLARS: - df = pl.DataFrame(data) - elif data_format == BACKEND_PANDAS: - df = pd.DataFrame(data) - elif data_format == BACKEND_MODIN: - df = mpd.DataFrame(data) - - shifter = TemporalTargetShifter(n_lags=1, target_col="target", verbose=True) - - shifter.fit(df) - shifter.transform(df) - - # Capture stdout and check for printed verbose information - captured = capfd.readouterr() - assert "Rows before shift" in captured.out - - -# Parametrized test for fit_transform method for all data_formats -@pytest.mark.parametrize("n_lags", [1, 2]) -@pytest.mark.parametrize("data_format", [BACKEND_POLARS, BACKEND_PANDAS, BACKEND_MODIN]) -def test_fit_transform(data_format, n_lags): - """Test fit_transform() method for all data_formats.""" - # Generate data for the current data_format - data = { - "time": pd.date_range(start="2022-01-01", periods=100), - "target": np.random.rand(100), - "feature_1": np.random.rand(100), - "feature_2": np.random.rand(100), - } - - if data_format == BACKEND_POLARS: - df = pl.DataFrame(data) - elif data_format == BACKEND_PANDAS: - df = pd.DataFrame(data) - elif data_format == BACKEND_MODIN: - df = mpd.DataFrame(data) - - shifter = TemporalTargetShifter(n_lags=n_lags, target_col="target") - - transformed = shifter.fit_transform(df) - assert transformed is not None diff --git a/test/unit/test_core_utils.py b/test/unit/test_core_utils.py 
deleted file mode 100644 index 75f6d55..0000000 --- a/test/unit/test_core_utils.py +++ /dev/null @@ -1,250 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""TemporalScope/test/unit/test_core_utils.py - -TemporalScope is Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import warnings -from typing import Optional, Tuple, Union -from unittest.mock import patch - -import modin.pandas as mpd -import numpy as np -import pandas as pd -import polars as pl -import pytest - -from temporalscope.core.core_utils import ( - check_nans, - check_nulls, - get_api_keys, - get_default_backend_cfg, - print_divider, - validate_and_convert_input, - validate_backend, - validate_input, -) - -warnings.filterwarnings("ignore", message=".*defaulting to pandas.*") - -# Mock API key constants -MOCK_OPENAI_API_KEY = "mock_openai_key" -MOCK_CLAUDE_API_KEY = "mock_claude_key" - - -# --- Data Generation Functions --- -def create_sample_data(num_samples: int = 100, with_nulls=False, with_nans=False): - """Create sample data with options for introducing nulls and NaNs.""" - data = { - "feature_1": np.random.rand(num_samples).tolist(), - "feature_2": np.random.rand(num_samples).tolist(), - "feature_3": np.random.rand(num_samples).tolist(), - } - - if with_nans: - for i in range(0, num_samples, 10): - data["feature_2"][i] = float("nan") # Every 10th value is NaN - - if with_nulls: - for i in range(0, num_samples, 15): - data["feature_3"][i] = None # Every 15th value is Null - - return data - - -# Unified fixture for data with nulls and NaNs -@pytest.fixture -def sample_df_with_conditions(): - """Fixture for creating DataFrames for each backend. - - Provides a function to generate sample DataFrames with optional nulls or NaNs. - - :return: A function that generates a DataFrame and backend identifier based on the specified conditions. - :rtype: Callable[[Optional[str], bool, bool], Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]] - """ - - def _create_sample_df( - backend: Optional[str] = None, with_nulls: bool = False, with_nans: bool = False - ) -> Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]: - """Creates a sample DataFrame for the specified backend with optional nulls and NaNs. - - :param backend: The backend to use ('pd', 'pl', 'mpd'). Defaults to 'pd' if None. 
- :type backend: Optional[str] - :param with_nulls: Whether to include null values in the data. Defaults to False. - :type with_nulls: bool - :param with_nans: Whether to include NaN values in the data. Defaults to False. - :type with_nans: bool - :return: A tuple containing the DataFrame and the backend string. - :rtype: Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str] - :raises ValueError: If an unsupported backend is specified. - """ - data = create_sample_data(with_nulls=with_nulls, with_nans=with_nans) - if backend is None: - backend = "pd" # Default to pandas for backward compatibility - if backend == "pd": - return pd.DataFrame(data), "pd" - elif backend == "pl": - return pl.DataFrame(data), "pl" - elif backend == "mpd": - return mpd.DataFrame(data), "mpd" - else: - raise ValueError(f"Unsupported backend '{backend}'") - - return _create_sample_df - - -# --- Tests --- - - -def test_get_api_keys(): - """Test that get_api_keys retrieves environment variables correctly.""" - with patch.dict("os.environ", {"OPENAI_API_KEY": MOCK_OPENAI_API_KEY, "CLAUDE_API_KEY": MOCK_CLAUDE_API_KEY}): - api_keys = get_api_keys() - assert api_keys["OPENAI_API_KEY"] == MOCK_OPENAI_API_KEY - assert api_keys["CLAUDE_API_KEY"] == MOCK_CLAUDE_API_KEY - - with patch.dict("os.environ", {}, clear=True): - api_keys = get_api_keys() - assert api_keys["OPENAI_API_KEY"] is None - assert api_keys["CLAUDE_API_KEY"] is None - - -@pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -@pytest.mark.parametrize("with_nans", [True, False]) -def test_check_nans(backend, sample_df_with_conditions, with_nans): - """Test check_nans for both NaNs present and no NaNs across backends.""" - df, _ = sample_df_with_conditions(backend=backend, with_nans=with_nans) - result = check_nans(df, backend) - expected = with_nans # True if NaNs were introduced, else False - assert result == expected, f"Expected {expected} but got {result} for backend {backend}" - - -def test_get_default_backend_cfg(): - """Test that the default backend configuration is returned correctly.""" - expected_cfg = {"BACKENDS": {"pl": "polars", "pd": "pandas", "mpd": "modin"}} - result = get_default_backend_cfg() - assert result == expected_cfg - - -@pytest.mark.parametrize("backend", ["pl", "pd", "mpd"]) -def test_validate_backend_supported(backend): - """Test that supported backends are validated successfully.""" - validate_backend(backend) - - -@pytest.mark.parametrize("invalid_backend", ["tf", "spark", "unknown"]) -def test_validate_backend_unsupported(invalid_backend): - """Test that unsupported backends raise a ValueError.""" - with pytest.raises(ValueError, match="Unsupported backend"): - validate_backend(invalid_backend) - - -@pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -@pytest.mark.parametrize("target_backend", ["pl", "pd", "mpd"]) -def test_validate_and_convert_input(sample_df_with_conditions, backend, target_backend): - """Test that DataFrame conversion between backends works correctly.""" - df, _ = sample_df_with_conditions(backend=backend, with_nulls=False) - result = validate_and_convert_input(df, target_backend) - - if target_backend == "pd": - assert isinstance(result, pd.DataFrame), f"Expected Pandas DataFrame but got {type(result)}" - elif target_backend == "pl": - assert isinstance(result, pl.DataFrame), f"Expected Polars DataFrame but got {type(result)}" - elif target_backend == "mpd": - assert isinstance(result, mpd.DataFrame), f"Expected Modin DataFrame but got {type(result)}" - - -@pytest.mark.parametrize("backend", 
["pd", "pl", "mpd"]) -def test_validate_and_convert_input_invalid_type(backend): - """Test that validate_and_convert_input raises TypeError when given an invalid DataFrame type.""" - invalid_df = "This is not a DataFrame" - - with pytest.raises(TypeError, match="Input DataFrame type"): - validate_and_convert_input(invalid_df, backend) - - -def test_print_divider(capsys): - """Test the print_divider function outputs the correct string.""" - print_divider("-", 50) - captured = capsys.readouterr() - assert captured.out == "-" * 50 + "\n" - - -def test_check_nans_invalid_backend(sample_df_with_conditions): - """Test that an unsupported backend raises a ValueError in check_nans.""" - df, _ = sample_df_with_conditions(with_nans=True) - with pytest.raises(ValueError, match="Unsupported backend"): - check_nans(df, "invalid_backend") - - -@pytest.mark.parametrize( - "backend, expected_type", - [ - ("pl", pl.DataFrame), - ("pd", pd.DataFrame), - ("mpd", mpd.DataFrame), - ], -) -def test_validate_input_correct_backend(sample_df_with_conditions, backend, expected_type): - """Test that validate_input passes when the DataFrame matches the backend.""" - df, _ = sample_df_with_conditions(backend=backend, with_nulls=False) - validate_input(df, backend) - - -@pytest.mark.parametrize("df_backend", ["pd", "pl", "mpd"]) -@pytest.mark.parametrize("validate_backend", ["pd", "pl", "mpd"]) -def test_validate_input_mismatched_backend(sample_df_with_conditions, df_backend, validate_backend): - """Test that validate_input raises TypeError when the DataFrame does not match the backend.""" - df, _ = sample_df_with_conditions(backend=df_backend, with_nulls=False) - - if df_backend != validate_backend: - # Expect TypeError when backends don't match - with pytest.raises(TypeError, match="Expected a"): - validate_input(df, validate_backend) - else: - # Should pass when backends match - validate_input(df, validate_backend) - - -@pytest.mark.parametrize("backend", ["pd", "pl", "mpd"]) -@pytest.mark.parametrize("with_nulls", [True, False]) -def test_check_nulls(backend, sample_df_with_conditions, with_nulls): - """Test check_nulls for both nulls present and no nulls across backends.""" - df, _ = sample_df_with_conditions(backend=backend, with_nulls=with_nulls) - result = check_nulls(df, backend) - expected = with_nulls # True if nulls were introduced, else False - assert result == expected, f"Expected {expected} but got {result} for backend {backend}" - - -# Test for invalid backend handling -def test_check_nulls_invalid_backend(sample_df_with_conditions): - """Test that check_nulls raises ValueError when given an unsupported backend.""" - df, _ = sample_df_with_conditions(with_nulls=True) - with pytest.raises(ValueError, match="Unsupported backend"): - check_nulls(df, "invalid_backend") diff --git a/test/unit/test_partition_padding.py b/test/unit/test_partition_padding.py deleted file mode 100644 index 31f0d35..0000000 --- a/test/unit/test_partition_padding.py +++ /dev/null @@ -1,362 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# TemporalScope/test/unit/test_partition_padding.py - - -import pytest -import numpy as np -import pandas as pd -import modin.pandas as mpd -import polars as pl -from temporalscope.partition.padding import ( - zero_pad, - forward_fill_pad, - backward_fill_pad, - mean_fill_pad, - pad_dataframe, - sort_dataframe, - ensure_type_consistency -) -from temporalscope.core.core_utils import ( - BACKEND_MODIN, - BACKEND_PANDAS, - BACKEND_POLARS, -) - -from temporalscope.core.core_utils import SupportedBackendDataFrame - -np.random.seed(42) # Set a seed for reproducibility - - - -def generate_test_data(backend, num_samples=5): - """Generate test data with consistent column names across all backends.""" - start_date = pd.to_datetime("2021-01-01") - data = { - "feature_1": range(1, num_samples + 1), - "feature_2": range(num_samples, 0, -1), - "target": [i * 10 for i in range(1, num_samples + 1)], - "ds": pd.date_range(start_date, periods=num_samples) # Ensure 'ds' is a date column - } - - if backend == BACKEND_PANDAS: - df = pd.DataFrame(data) - df['ds'] = df['ds'].astype('datetime64[ns]') # Ensure ds is in datetime64[ns] - return df - - elif backend == BACKEND_MODIN: - df = mpd.DataFrame(data) - df['ds'] = df['ds'].astype('datetime64[ns]') # Modin relies on Pandas dtype system - return df - - elif backend == BACKEND_POLARS: - df = pl.DataFrame({ - "feature_1": data["feature_1"], - "feature_2": data["feature_2"], - "target": data["target"], - "ds": [d for d in data["ds"]] # Keep `ds` as a date column - }) - return df.with_columns(pl.col("ds").cast(pl.Datetime)) # Cast ds to Polars datetime - - else: - raise ValueError(f"Unsupported backend: {backend}") - - -@pytest.fixture -def test_data(): - return { - BACKEND_PANDAS: generate_test_data(BACKEND_PANDAS), - BACKEND_MODIN: generate_test_data(BACKEND_MODIN), - BACKEND_POLARS: generate_test_data(BACKEND_POLARS), - } - - -# Utility function to generate empty DataFrame -def get_empty_dataframe(backend): - if backend == BACKEND_PANDAS: - return pd.DataFrame() - elif backend == BACKEND_MODIN: - return mpd.DataFrame() - elif backend == BACKEND_POLARS: - return pl.DataFrame() - else: - raise ValueError(f"Unsupported backend: {backend}") - -def generate_mixed_data(num_samples: int = 5) -> pd.DataFrame: - """Generates a DataFrame with mixed data types (numeric, categorical, datetime). - - This can be used for parametrized tests to check how functions handle different - column types. - - :param num_samples: Number of rows to generate in the DataFrame. - :return: A DataFrame with mixed data types. 
- """ - start_date = pd.to_datetime("2021-01-01") - data = { - "numeric_col": range(1, num_samples + 1), - "category_col": ["A", "B", "C", "D", "E"][:num_samples], - "datetime_col": pd.date_range(start_date, periods=num_samples), - "mixed_col": ["A", 1, pd.NaT, None, 5][:num_samples], # Mixed types - } - return pd.DataFrame(data) - - - -def check_monotonicity(df: SupportedBackendDataFrame, time_col: str, ascending: bool = True) -> bool: - if isinstance(df, pl.DataFrame): - # Handle Polars DataFrame - diffs = df.select(pl.col(time_col).diff()).select(pl.col(time_col).drop_nulls()) # Handle nulls - if ascending: - return diffs.select(pl.col(time_col).gt(pl.lit(0))).to_series().all() # Use Polars comparison - else: - return diffs.select(pl.col(time_col).lt(pl.lit(0))).to_series().all() - else: - # Handle Pandas and Modin (already handled correctly) - diffs = df[time_col].diff().dropna() # For Pandas/Modin, dropna() works fine - if pd.api.types.is_timedelta64_dtype(diffs): - zero_timedelta = pd.Timedelta(0) - if ascending: - return diffs.gt(zero_timedelta).all() - else: - return diffs.lt(zero_timedelta).all() - else: - if ascending: - return diffs.gt(0).all() - else: - return diffs.lt(0).all() - - - -# Parametrize tests for ascending and descending order -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) -@pytest.mark.parametrize("ascending", [True, False]) -def test_sort_dataframe(test_data, backend, ascending): - df = test_data[backend] - sorted_df = sort_dataframe(df, time_col="ds", ascending=ascending) - - # Check sorting for each backend - assert check_monotonicity(sorted_df, "ds", ascending=ascending) - - -# Test for invalid time column in sort_dataframe -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) -def test_sort_dataframe_invalid_time_col(test_data, backend): - df = test_data[backend] - with pytest.raises(ValueError): - sort_dataframe(df, time_col="invalid_col") - - -# Test sorting for empty DataFrame -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) -def test_sort_dataframe_empty_dataframe(backend): - empty_df = get_empty_dataframe(backend) - with pytest.raises(ValueError): - sort_dataframe(empty_df, time_col="ds") - - -# Test raising TypeError for unsupported input type -def test_sort_dataframe_unsupported_type(): - with pytest.raises(TypeError, match="Unsupported DataFrame type"): - sort_dataframe([], time_col="ds") # List is an unsupported type - - -# Test warning when `time_col` is neither numeric nor datetime -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN]) -def test_sort_dataframe_warning(test_data, backend): - df = test_data[backend] - df["non_time_col"] = ["a", "b", "c", "d", "e"] - - # Ensure warning is raised when time_col is non-numeric and non-datetime - with pytest.warns(UserWarning, match="is neither numeric nor datetime"): - sort_dataframe(df, time_col="non_time_col", ascending=True) - - # Continue with checking valid sorting after warning - sorted_df = sort_dataframe(df, time_col="ds", ascending=True) - assert check_monotonicity(sorted_df, "ds", ascending=True) - - - - - -# Padding function tests with Modin and Polars compatibility -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) -@pytest.mark.parametrize("padding_func", [zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad]) -def test_padding_functions(test_data, backend, padding_func): - df = test_data[backend] - - if padding_func == zero_pad: 
- padded_df = padding_func(df, target_len=7, time_col="ds", pad_value=0) - else: - padded_df = padding_func(df, target_len=7, end=5, reverse=False, time_col="ds") - - assert len(padded_df) == 7 - - -# Ensure the 'ds' column is used consistently across backends in pad_dataframe -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) -@pytest.mark.parametrize("mode", ["zero", "forward_fill", "backward_fill", "mean_fill"]) -def test_pad_dataframe(test_data, backend, mode): - df = test_data[backend] - - if mode == "zero": - padded_df = pad_dataframe(df, target_len=7, mode=mode, pad_value=0, time_col="ds") - else: - padded_df = pad_dataframe(df, target_len=7, mode=mode, end=5, reverse=False, time_col="ds") - - assert len(padded_df) == 7 - - -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) -def test_empty_dataframe(backend): - if backend == BACKEND_PANDAS: - df = pd.DataFrame() - elif backend == BACKEND_MODIN: - df = mpd.DataFrame() - elif backend == BACKEND_POLARS: - df = pl.DataFrame() - - with pytest.raises(ValueError): - zero_pad(df, target_len=5, time_col="ds") - - -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) -def test_invalid_time_col(test_data, backend): - df = test_data[backend] - - with pytest.raises(ValueError): - zero_pad(df, target_len=7, time_col="invalid_col") - - -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) -def test_target_len_less_than_current_len(test_data, backend): - df = test_data[backend] - - with pytest.raises(ValueError): - zero_pad(df, target_len=3, time_col="ds") - - -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) -def test_sort_dataframe_edge_cases(test_data, backend): - df = test_data[backend] - - # Add non-numeric, non-datetime column to test sorting warnings - if backend == BACKEND_POLARS: - df = df.with_columns(pl.Series("non_numeric", ["a", "b", "c", "d", "e"])) - else: - df["non_numeric"] = ["a", "b", "c", "d", "e"] - - # Ensure warning is raised when time_col is non-numeric and non-datetime - with pytest.warns(UserWarning, match="is neither numeric nor datetime"): - sort_dataframe(df, time_col="non_numeric", ascending=True) - - # Continue with existing tests - sorted_df = sort_dataframe(df, time_col="ds", ascending=True) - if backend == BACKEND_POLARS: - assert sorted_df["ds"].is_sorted() - else: - assert sorted_df["ds"].is_monotonic_increasing - - -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS]) -@pytest.mark.parametrize("padding_func", [zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad]) -def test_padding_functions_with_warnings(test_data, backend, padding_func): - df = test_data[backend] - - # Add non-numeric columns - if backend == BACKEND_POLARS: - df = df.with_columns(pl.Series("non_numeric", ["a", "b", "c", "d", "e"])) - pad_df = pad_dataframe(df, target_len=7, mode="zero", time_col="ds") # Add mode here - pad_df = pad_df.with_columns(pl.lit(None).alias("non_numeric")) # Ensure "non_numeric" exists in pad_df - else: - df["non_numeric"] = ["a", "b", "c", "d", "e"] - - if padding_func == zero_pad: - with pytest.warns(UserWarning, match="Non-numeric columns found"): - padded_df = padding_func(df, target_len=7, time_col="ds", pad_value=0) - else: - padded_df = padding_func(df, target_len=7, end=5, reverse=False, time_col="ds") - - assert len(padded_df) == 7 - - -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, 
BACKEND_POLARS]) -@pytest.mark.parametrize("mode", ["zero", "forward_fill", "backward_fill", "mean_fill"]) -def test_pad_dataframe_type_consistency(test_data, backend, mode): - df = test_data[backend] - - # Add non-numeric column - if backend == BACKEND_POLARS: - df = df.with_columns(pl.Series("non_numeric", ["x", "y", "z", "w", "v"])) - else: - df["non_numeric"] = ["x", "y", "z", "w", "v"] - - if mode == "zero": - with pytest.warns(UserWarning, match="Non-numeric columns found"): - padded_df = pad_dataframe(df, target_len=7, mode=mode, pad_value=0, time_col="ds") - else: - with pytest.warns(UserWarning, match="Non-numeric columns found"): - padded_df = pad_dataframe(df, target_len=7, mode=mode, end=5, reverse=False, time_col="ds") - - assert len(padded_df) == 7 - - # Ensure types are consistent - assert padded_df["feature_1"].dtype == df["feature_1"].dtype - assert padded_df["feature_2"].dtype == df["feature_2"].dtype - -@pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN]) -def test_pad_dataframe_boolean_to_int64(test_data, backend): - """Test that boolean columns in the DataFrame are correctly cast to int64.""" - df = test_data[backend] - - # Add a boolean column to the DataFrame - if backend == BACKEND_PANDAS: - df["bool_col"] = [True, False, True, False, True] - elif backend == BACKEND_MODIN: - df["bool_col"] = mpd.Series([True, False, True, False, True]) - - # Create a padding DataFrame with the same columns - pad_df = pd.DataFrame({ - "bool_col": [False, False] # Padding with False values (should become 0) - }) - - # Ensure type consistency (bool -> int64) - consistent_df = ensure_type_consistency(df, pad_df) - - # Check that the boolean column is converted to int64 - assert consistent_df["bool_col"].dtype == "int64" - assert (consistent_df["bool_col"] == 0).all() # All padded values should be 0 - - -@pytest.mark.parametrize("backend", [BACKEND_MODIN]) -def test_pad_dataframe_conversion_to_modin(test_data, backend): - """Test that pad_df is correctly converted back to Modin after type consistency check.""" - df = test_data[backend] - - # Create a padding DataFrame with mismatched types - pad_df = pd.DataFrame({ - "feature_1": [0.0, 0.0], - "feature_2": [0, 0], - "target": [0, 0], - "ds": [pd.Timestamp("1970-01-01"), pd.Timestamp("1970-01-01")] - }) - - # Ensure type consistency (pad_df starts as Pandas DataFrame) - consistent_df = ensure_type_consistency(df, pad_df) - - # Ensure pad_df is converted back to Modin if df was Modin - assert isinstance(consistent_df, mpd.DataFrame), "pad_df should be converted back to Modin" diff --git a/test/unit/test_partition_validators.py b/test/unit/test_partition_validators.py deleted file mode 100644 index b0653a1..0000000 --- a/test/unit/test_partition_validators.py +++ /dev/null @@ -1,336 +0,0 @@ -# """ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# """ -import modin.pandas as mpd -import pandas as pd -import polars as pl -import pytest - -from temporalscope.partition.partition_validators import ( - check_binary_numerical_features, - check_categorical_feature_cardinality, - check_class_balance, - check_feature_count, - check_feature_to_sample_ratio, - check_numerical_feature_uniqueness, - check_sample_size, -) - - -@pytest.mark.parametrize( - "dataframe,backend,min_samples,max_samples,expected_result", - [ - (pd.DataFrame({"feature1": range(100)}), "pd", 3000, 50000, False), - ( - pl.DataFrame({"feature1": pl.Series(range(100))}), - "pl", - 3000, - 50000, - False, - ), - ( - mpd.DataFrame({"feature1": range(100000)}), - "mpd", - 3000, - 50000, - False, - ), - ], -) -def test_check_sample_size( - dataframe, backend, min_samples, max_samples, expected_result -): - """Test sample size check for various dataframes and backends.""" - assert ( - check_sample_size( - dataframe, - backend=backend, - min_samples=min_samples, - max_samples=max_samples, - ) - == expected_result - ) - - -@pytest.mark.parametrize( - "dataframe,backend,min_features,expected_result", - [ - # Pandas DataFrame - ( - pd.DataFrame({"feature1": range(100)}), - "pd", - 4, - False, - ), # Too few features - Pandas - # Polars DataFrame - ( - pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), - "pl", - 4, - True, - ), # Enough features - Polars - # Modin DataFrame - ( - mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), - "mpd", - 4, - True, - ), # Enough features - Modin - ], -) -def test_check_feature_count(dataframe, backend, min_features, expected_result): - """Tests check_feature_count for various dataframes and backends.""" - assert ( - check_feature_count(dataframe, backend=backend, min_features=min_features) - == expected_result - ) - - -@pytest.mark.parametrize( - "dataframe,backend,max_ratio,expected_result", - [ - ( - pl.DataFrame({f"feature{i}": pl.Series(range(100000)) for i in range(10)}), - "pl", - 0.1, - True, - ), - ( - mpd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), - "mpd", - 0.1, - True, - ), - ( - pd.DataFrame({f"feature{i}": range(100000) for i in range(10)}), - "pd", - 0.1, - True, - ), - ], -) -def test_check_feature_to_sample_ratio(dataframe, backend, max_ratio, expected_result): - """Tests check_feature_to_sample_ratio for various dataframes and backends.""" - assert ( - check_feature_to_sample_ratio(dataframe, backend=backend, max_ratio=max_ratio) - == expected_result - ) - - -@pytest.mark.parametrize( - "dataframe,backend,max_unique_values,expected_result", - [ - # Pandas DataFrames - ( - pd.DataFrame({"category1": [str(i) for i in range(25)]}), - "pd", - 20, - False, - ), # Too many unique values - Pandas - ( - pd.DataFrame({"category1": ["A", "B", "C"] * 100}), - "pd", - 20, - True, - ), # Normal unique values - Pandas - # Polars DataFrames - ( - pl.DataFrame({"category1": pl.Series([str(i) for i in range(25)])}), - "pl", - 20, - False, - ), # Too many unique values - Polars - ( - pl.DataFrame({"category1": pl.Series(["A", "B", "C"] * 100)}), - "pl", - 20, - True, - ), # Normal unique values - Polars - # Modin DataFrames - ( - mpd.DataFrame({"category1": [str(i) for i in range(25)]}), - "mpd", - 20, - False, - ), # Too many unique values - Modin - ( - mpd.DataFrame({"category1": ["A", "B", "C"] * 100}), - "mpd", - 20, - True, - ), # Normal unique values - Modin - ], -) -def 
test_check_categorical_feature_cardinality( - dataframe, backend, max_unique_values, expected_result -): - """Tests check_categorical_feature_cardinality for various dataframe backends.""" - assert ( - check_categorical_feature_cardinality( - dataframe, backend=backend, max_unique_values=max_unique_values - ) - == expected_result - ) - - -@pytest.mark.parametrize( - "dataframe,backend,min_unique_values,expected_result", - [ - # Pandas DataFrame - ( - pd.DataFrame({"feature1": range(100)}), - "pd", - 10, - True, - ), # Enough unique values - Pandas - # Polars DataFrame - ( - pl.DataFrame({"feature1": pl.Series(range(100))}), - "pl", - 10, - True, - ), # Enough unique values - Polars - # Modin DataFrame - ( - mpd.DataFrame({"feature1": [1, 1, 1, 2, 2, 2, 3, 3]}), - "mpd", - 10, - False, - ), # Too few unique values - Modin - ( - mpd.DataFrame({"feature1": range(100)}), - "mpd", - 10, - True, - ), # Enough unique values - Modin - ], -) -def test_check_numerical_feature_uniqueness( - dataframe, backend, min_unique_values, expected_result -): - """Tests check_numerical_feature_uniqueness for various dataframes and backends.""" - assert ( - check_numerical_feature_uniqueness( - dataframe, backend=backend, min_unique_values=min_unique_values - ) - == expected_result - ) - - -@pytest.mark.parametrize( - "dataframe,backend,expected_result", - [ - # Pandas DataFrame - ( - pd.DataFrame({"binary_feature": [0, 1] * 50}), - "pd", - False, - ), # Binary numerical feature - Pandas - ( - pd.DataFrame({"feature1": range(100)}), - "pd", - True, - ), # No binary feature - Pandas - # Polars DataFrame - ( - pl.DataFrame({"binary_feature": pl.Series([0, 1] * 50)}), - "pl", - False, - ), # Binary numerical feature - Polars - ( - pl.DataFrame({"feature1": pl.Series(range(100))}), - "pl", - True, - ), # No binary feature - Polars - # Modin DataFrame - ( - mpd.DataFrame({"binary_feature": [0, 1] * 50}), - "mpd", - False, - ), # Binary numerical feature - Modin - ( - mpd.DataFrame({"feature1": range(100)}), - "mpd", - True, - ), # No binary feature - Modin - ], -) -def test_check_binary_numerical_features(dataframe, backend, expected_result): - """Tests check_binary_numerical_features for various dataframes and backends.""" - assert ( - check_binary_numerical_features(dataframe, backend=backend) == expected_result - ) - - -@pytest.mark.parametrize( - "dataframe,target_col,backend,expected_result", - [ - ( - pd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), - "target", - "pd", - False, - ), - ( - pd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), - "target", - "pd", - True, - ), - ( - pl.DataFrame( - { - "feature1": pl.Series(range(100)), - "target": pl.Series([1] * 90 + [0] * 10), - } - ), - "target", - "pl", - False, - ), - ( - pl.DataFrame( - { - "feature1": pl.Series(range(100)), - "target": pl.Series([0, 1] * 50), - } - ), - "target", - "pl", - True, - ), - ( - mpd.DataFrame({"feature1": range(100), "target": [1] * 90 + [0] * 10}), - "target", - "mpd", - False, - ), - ( - mpd.DataFrame({"feature1": range(100), "target": [0, 1] * 50}), - "target", - "mpd", - True, - ), - ], -) -def test_check_class_balance(dataframe, target_col, backend, expected_result): - """Tests check_class_balance for various dataframes and backends.""" - result = check_class_balance(dataframe, target_col=target_col, backend=backend) - assert ( - result == expected_result - ), f"Expected {expected_result}, but got {result} for backend {backend}" From de3b2686c22b18dea956abb725b225ced72c0489 Mon Sep 17 00:00:00 
2001
From: Philip Ndikum
Date: Sun, 6 Oct 2024 04:19:19 +0000
Subject: [PATCH 4/6] feat: refactor core api and add systematic data generator

- implement new core api to support multiple backends (pandas, polars, modin)
- add synthetic_data_generator for systematic testing across backends
- refactor core modules: core_utils, exceptions, temporal_data_loader, temporal_target_shifter
- add new temporal_core_processing module
- restructure and update test files to align with new api design
- enhance functionality to support both single-step and multi-step operations
- update pyproject.toml to reflect new structure and dependencies
- fix pre-commit issues with MyPy and Ruff
- merge changes from main branch to integrate latest updates and resolve conflicts
---
 src/temporalscope/core/core_utils.py          | 76 +++++-------------
 src/temporalscope/core/exceptions.py          | 20 +++--
 .../core/temporal_core_processing.py          | 20 ++---
 .../core/temporal_data_loader.py              | 73 ++++++-----------
 .../core/temporal_target_shifter.py           | 17 ----
 src/temporalscope/datasets/datasets.py        | 29 ++++---
 .../datasets/synthetic_data_generator.py      | 41 ++++------
 src/temporalscope/partition/padding.py        | 42 +++++-----
 .../partition/partition_validators.py         |  3 +-
 src/temporalscope/partition/sliding_window.py | 36 ++++-----
 test/unit/core/test_core_utils.py             | 27 -------
 test/unit/core/test_exceptions.py             | 24 +++---
 test/unit/core/test_temporal_data_loader.py   | 18 -----
 test/unit/datasets/test_datasets.py           | 21 +++--
 .../datasets/test_synthetic_data_generator.py | 79 +++++++++++--------
 test/unit/partition/test_partition_padding.py |  6 --
 16 files changed, 198 insertions(+), 334 deletions(-)

diff --git a/src/temporalscope/core/core_utils.py b/src/temporalscope/core/core_utils.py
index 1683628..a4f0b14 100644
--- a/src/temporalscope/core/core_utils.py
+++ b/src/temporalscope/core/core_utils.py
@@ -101,15 +101,15 @@
 """

 import os
-from typing import Dict, Optional, Union, cast, Callable, Type
-from datetime import datetime, timedelta, date
 import warnings
+from typing import Callable, Dict, Optional, Type, Union, cast

 import modin.pandas as mpd
 import pandas as pd
 import polars as pl
 from dotenv import load_dotenv
-from temporalscope.core.exceptions import UnsupportedBackendError, MixedFrequencyWarning
+
+from temporalscope.core.exceptions import MixedFrequencyWarning, UnsupportedBackendError

 # Load environment variables from the .env file
 load_dotenv()
@@ -191,10 +191,7 @@ def validate_mode(backend: str, mode: str) -> None:

 def validate_and_convert_input(
-    df: SupportedBackendDataFrame,
-    backend: str,
-    time_col: Optional[str] = None,
-    mode: str = MODE_SINGLE_STEP
+    df: SupportedBackendDataFrame, backend: str, time_col: Optional[str] = None, mode: str = MODE_SINGLE_STEP
 ) -> SupportedBackendDataFrame:
     """Validates and converts the input DataFrame to the specified backend type, with optional time column casting.

@@ -203,35 +200,9 @@ def validate_and_convert_input(
     :param time_col: Optional; the name of the time column for casting.
     :param mode: The processing mode ('single_step' or 'multi_step').
     :raises TypeError: If input DataFrame type doesn't match the specified backend or conversion fails.
-    :raises NotImplementedError: If multi-step mode is requested for unsupported backends or unsupported conversion to Polars.
+    :raises NotImplementedError: If multi-step mode is requested for unsupported backends.
     :return: The DataFrame converted to the specified backend type.
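With the updated conversion map below, Pandas to Polars conversion now goes through `pl.from_pandas` instead of raising `NotImplementedError`. A minimal sketch of the new behavior (the final dtype assertion assumes the explicit `pl.Datetime` cast applied for Polars backends):

.. code-block:: python

    import pandas as pd
    import polars as pl

    from temporalscope.core.core_utils import validate_and_convert_input

    df = pd.DataFrame({"col1": [1, 2], "time": pd.date_range("2023-01-01", periods=2)})

    # Pandas -> Polars conversion; time_col is then cast to pl.Datetime
    converted = validate_and_convert_input(df, "pl", time_col="time")
    assert isinstance(converted, pl.DataFrame)
    assert converted["time"].dtype == pl.Datetime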
- - Example - ------- - Here's how you would use this function to convert a Pandas DataFrame to Polars: - - .. code-block:: python - - import pandas as pd - import polars as pl - - data = {'col1': [1, 2], 'col2': [3, 4], 'time': pd.date_range(start='1/1/2023', periods=2)} - df = pd.DataFrame(data) - - # Convert the DataFrame from Pandas to Polars, with an optional time column for casting - converted_df = validate_and_convert_input(df, 'pl', time_col='time') - print(type(converted_df)) # Output: - - # If you don't need to cast the time column, just omit the time_col argument - converted_df = validate_and_convert_input(df, 'pl') - print(type(converted_df)) # Output: - - .. note:: - - This function first converts the input DataFrame into the appropriate backend. - - If `time_col` is specified and the backend is Polars, it casts the time column to `pl.Datetime`. - - Pandas to Polars conversion is currently unsupported and raises a `NotImplementedError`. This needs to be implemented later. """ - # Validate the backend and mode combination validate_backend(backend) validate_mode(backend, mode) @@ -240,12 +211,11 @@ def validate_and_convert_input( str, Dict[Type[SupportedBackendDataFrame], Callable[[SupportedBackendDataFrame], SupportedBackendDataFrame]] ] = { BACKEND_POLARS: { - # Polars to Polars pl.DataFrame: lambda x: x, - # Pandas to Polars - currently not supported - pd.DataFrame: lambda x: (_ for _ in ()).throw(NotImplementedError("Pandas to Polars conversion is not currently supported.")), - # Modin to Polars - mpd.DataFrame: lambda x: pl.from_pandas(x._to_pandas()), + pd.DataFrame: lambda x: pl.from_pandas(x), # Use polars.from_pandas for conversion + mpd.DataFrame: lambda x: pl.from_pandas( + x._to_pandas() if hasattr(x, "_to_pandas") else x + ), # Safely handle the Modin conversion }, BACKEND_PANDAS: { pd.DataFrame: lambda x: x, # Pandas to Pandas @@ -260,27 +230,20 @@ def validate_and_convert_input( } # Step 1: Convert the DataFrame to the desired backend - converted_df = None for dataframe_type, conversion_func in backend_conversion_map[backend].items(): if isinstance(df, dataframe_type): converted_df = conversion_func(df) break - - if converted_df is None: + else: raise TypeError(f"Input DataFrame type {type(df)} does not match the specified backend '{backend}'") # Step 2: Explicitly cast the time column to pl.Datetime if backend is Polars and the column exists if backend == BACKEND_POLARS and time_col and time_col in converted_df.columns: - # Force cast time_col to pl.Datetime converted_df = converted_df.with_columns(pl.col(time_col).cast(pl.Datetime)) - # Check the type of the column and assert it is correct - assert isinstance(converted_df[time_col][0], pl.Datetime), f"Expected a timestamp-like time column, but got {type(converted_df[time_col][0])}" - return converted_df - def get_api_keys() -> Dict[str, Optional[str]]: """Retrieve API keys from environment variables. @@ -332,8 +295,7 @@ def check_nulls(df: SupportedBackendDataFrame, backend: str) -> bool: elif backend == BACKEND_MODIN: return bool(cast(mpd.DataFrame, df).isnull().values.any()) - # Suppress the warning since this path is unreachable due to `validate_backend` - # mypy: ignore + raise UnsupportedBackendError(f"Unsupported backend: {backend}") def check_nans(df: SupportedBackendDataFrame, backend: str) -> bool: @@ -341,7 +303,7 @@ def check_nans(df: SupportedBackendDataFrame, backend: str) -> bool: :param df: The DataFrame to check for NaN values. 
:type df: SupportedBackendDataFrame - :param backend: The backend used for the DataFrame ('pl' for Polars, 'pd' for Pandas, 'mpd' for Modin'). + :param backend: The backend used for the DataFrame ('pl', 'pd', 'mpd'). :type backend: str :return: True if there are NaN values, False otherwise. :rtype: bool @@ -357,8 +319,7 @@ def check_nans(df: SupportedBackendDataFrame, backend: str) -> bool: elif backend == BACKEND_MODIN: return bool(cast(mpd.DataFrame, df).isna().values.any()) - # Suppress the warning since this path is unreachable due to `validate_backend` - # mypy: ignore + raise UnsupportedBackendError(f"Unsupported backend: {backend}") def is_timestamp_like(df: SupportedBackendDataFrame, time_col: str) -> bool: @@ -393,6 +354,8 @@ def is_timestamp_like(df: SupportedBackendDataFrame, time_col: str) -> bool: elif isinstance(df, pl.DataFrame): return time_column.dtype == pl.Datetime + raise UnsupportedBackendError(f"Unsupported DataFrame type: {type(df)}") + def is_numeric(df: SupportedBackendDataFrame, time_col: str) -> bool: """Check if the specified column in the DataFrame is numeric. @@ -412,15 +375,12 @@ def is_numeric(df: SupportedBackendDataFrame, time_col: str) -> bool: # Handle empty columns for different backends if isinstance(df, pl.DataFrame): - # Polars: Check if the DataFrame has zero rows or if the column is empty if df.height == 0 or time_column.is_empty(): return False elif isinstance(df, mpd.DataFrame): - # Modin: Check if the column is empty by using length if len(time_column) == 0: return False elif isinstance(df, pd.DataFrame): - # Pandas: Check if the column is empty if isinstance(time_column, pd.Series) and time_column.empty: return False @@ -430,6 +390,8 @@ def is_numeric(df: SupportedBackendDataFrame, time_col: str) -> bool: elif isinstance(df, pl.DataFrame): return time_column.dtype in [pl.Int32, pl.Int64, pl.Float32, pl.Float64] + raise UnsupportedBackendError(f"Unsupported DataFrame type: {type(df)}") + def has_mixed_frequencies(df: SupportedBackendDataFrame, time_col: str, min_non_null_values: int = 3) -> bool: """Check if the given time column in the DataFrame contains mixed frequencies. @@ -501,10 +463,8 @@ def sort_dataframe( :raises TypeError: If the DataFrame type does not match the backend. :raises UnsupportedBackendError: If the backend is unsupported or validation fails. """ - # Validate backend validate_backend(backend) - # Select backend-specific sorting logic if backend == BACKEND_POLARS: if not isinstance(df, pl.DataFrame): raise TypeError(f"Expected Polars DataFrame but got {type(df)}") @@ -522,6 +482,8 @@ def sort_dataframe( df.sort_values(by=time_col, ascending=ascending, inplace=True) return df + raise UnsupportedBackendError(f"Unsupported backend: {backend}") + def check_empty_columns(df: SupportedBackendDataFrame, backend: str) -> bool: """Check for empty columns in the DataFrame using the specified backend. diff --git a/src/temporalscope/core/exceptions.py b/src/temporalscope/core/exceptions.py index 3095e48..50085c4 100644 --- a/src/temporalscope/core/exceptions.py +++ b/src/temporalscope/core/exceptions.py @@ -39,14 +39,13 @@ -------------- .. 
code-block:: python - from temporalscope.core.exceptions import ( - TimeColumnError, MixedTypesWarning, MixedTimezonesWarning - ) + from temporalscope.core.exceptions import TimeColumnError, MixedTypesWarning, MixedTimezonesWarning + def validate_time_column(df): - if df['time'].dtype == object: + if df["time"].dtype == object: raise TimeColumnError("Invalid time column data type.") - elif contains_mixed_types(df['time']): + elif contains_mixed_types(df["time"]): warnings.warn("Mixed numeric and timestamp types.", MixedTypesWarning) """ @@ -64,7 +63,7 @@ class TimeFrameError(Exception): class TimeColumnError(TimeFrameError): - """ Exception raised for errors related to the `time_col`. + """Exception raised for errors related to the `time_col`. This error is raised when the `time_col` in the TimeFrame is either missing, contains unsupported types (non-numeric or non-timestamp), @@ -80,6 +79,7 @@ class TimeColumnError(TimeFrameError): if not pd.api.types.is_numeric_dtype(df[time_col]) and \ not pd.api.types.is_datetime64_any_dtype(df[time_col]): raise TimeColumnError("`time_col` must be numeric or timestamp-like.") + """ pass @@ -149,9 +149,15 @@ class UnsupportedBackendError(Exception): Attributes: backend (str): The invalid backend that caused the error. message (str): Explanation of the error. + """ - def __init__(self, backend: str, message: str = "Unsupported backend"): + def __init__(self, backend, message="Unsupported backend"): + """Initialize the UnsupportedBackendError. + + :param backend: The invalid backend (e.g., 'pl', 'pd', 'mpd') that caused the error. + :param message: Optional; a custom error message. Defaults to "Unsupported backend". + """ self.backend = backend self.message = f"{message}: {backend}. Supported backends are 'pd', 'mpd', 'pl'." super().__init__(self.message) diff --git a/src/temporalscope/core/temporal_core_processing.py b/src/temporalscope/core/temporal_core_processing.py index bfc2e6e..ab5c1bc 100644 --- a/src/temporalscope/core/temporal_core_processing.py +++ b/src/temporalscope/core/temporal_core_processing.py @@ -42,11 +42,9 @@ from temporal_core_processing import convert_to_tensorflow, convert_to_pandas # Example DataFrame - df = pd.DataFrame({ - 'time': pd.date_range(start='2023-01-01', periods=100, freq='D'), - 'feature_1': range(100), - 'target': range(100) - }) + df = pd.DataFrame( + {"time": pd.date_range(start="2023-01-01", periods=100, freq="D"), "feature_1": range(100), "target": range(100)} + ) # Convert DataFrame to TensorFlow Dataset tf_dataset = convert_to_tensorflow(df) @@ -55,18 +53,14 @@ df_back = convert_to_pandas(tf_dataset) """ -from typing import Union import pandas as pd -import polars as pl -import modin.pandas as mpd import tensorflow as tf from temporalscope.core.core_utils import SupportedBackendDataFrame def convert_to_tensorflow(df: SupportedBackendDataFrame) -> tf.data.Dataset: - """ - Stub: Convert a DataFrame to a TensorFlow Dataset. + """Stub: Convert a DataFrame to a TensorFlow Dataset. This function will convert Pandas, Modin, or Polars DataFrames into a TensorFlow Dataset to enable compatibility with deep learning frameworks like TensorFlow. @@ -78,8 +72,7 @@ def convert_to_tensorflow(df: SupportedBackendDataFrame) -> tf.data.Dataset: def convert_to_pandas(df: SupportedBackendDataFrame) -> pd.DataFrame: - """ - Stub: Convert a DataFrame or TensorFlow Dataset to a Pandas DataFrame. + """Stub: Convert a DataFrame or TensorFlow Dataset to a Pandas DataFrame. 
This function will handle converting Modin, Polars, or TensorFlow Datasets back to Pandas DataFrames to ensure interoperability across backends and downstream tasks. @@ -91,8 +84,7 @@ def convert_to_pandas(df: SupportedBackendDataFrame) -> pd.DataFrame: def handle_multi_step_conversion(df: pd.DataFrame, sequence_length: int) -> pd.DataFrame: - """ - Stub: Prepare DataFrame for multi-step forecasting. + """Stub: Prepare DataFrame for multi-step forecasting. This function will handle the preparation of multi-step targets by expanding the target column into sequences of the specified length, suitable for sequential models. diff --git a/src/temporalscope/core/temporal_data_loader.py b/src/temporalscope/core/temporal_data_loader.py index 31ded7d..ad7affa 100644 --- a/src/temporalscope/core/temporal_data_loader.py +++ b/src/temporalscope/core/temporal_data_loader.py @@ -109,37 +109,29 @@ """ -import warnings -from typing import Optional, Union, cast -from datetime import datetime, timedelta, date +from typing import Optional, Union import modin.pandas as mpd import pandas as pd import polars as pl -from temporalscope.core.exceptions import ( - TimeColumnError, - MixedTypesWarning, - MixedFrequencyWarning, - UnsupportedBackendError, -) - from temporalscope.core.core_utils import ( BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS, SupportedBackendDataFrame, - validate_and_convert_input, + check_empty_columns, + check_nulls, infer_backend_from_dataframe, - validate_backend, is_numeric, is_timestamp_like, - has_mixed_frequencies, sort_dataframe, - check_empty_columns, - check_nulls, + validate_and_convert_input, +) +from temporalscope.core.exceptions import ( + TimeColumnError, + UnsupportedBackendError, ) - # Define alias with forward reference TimeFrameCompatibleData = Union["TimeFrame", SupportedBackendDataFrame] @@ -180,11 +172,9 @@ class TimeFrame: .. code-block:: python import polars as pl - data = pl.DataFrame({ - 'time': pl.date_range(start='2021-01-01', periods=100, interval='1d'), - 'value': range(100) - }) - tf = TimeFrame(data, time_col='time', target_col='value') + + data = pl.DataFrame({"time": pl.date_range(start="2021-01-01", periods=100, interval="1d"), "value": range(100)}) + tf = TimeFrame(data, time_col="time", target_col="value") print(tf.get_data().head()) .. 
seealso:: @@ -242,15 +232,11 @@ def __init__( import polars as pl from temporalscope.core.temporal_data_loader import TimeFrame - data = pl.DataFrame({ - 'time': pl.date_range(start='2021-01-01', periods=5, interval='1d'), - 'value': range(5) - }) + data = pl.DataFrame({"time": pl.date_range(start="2021-01-01", periods=5, interval="1d"), "value": range(5)}) - tf = TimeFrame(data, time_col='time', target_col='value') + tf = TimeFrame(data, time_col="time", target_col="value") print(tf.get_data().head()) """ - # Ensure time_col and target_col are valid strings if not isinstance(time_col, str) or not time_col: raise ValueError("`time_col` must be a non-empty string.") @@ -345,14 +331,11 @@ def get_data(self) -> SupportedBackendDataFrame: import pandas as pd # Create a Pandas DataFrame - data = { - 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), - 'target': range(5, 0, -1) - } + data = {"time": pd.date_range(start="2021-01-01", periods=5, freq="D"), "target": range(5, 0, -1)} df = pd.DataFrame(data) # Initialize a TimeFrame - tf = TimeFrame(df, time_col='time', target_col='target') + tf = TimeFrame(df, time_col="time", target_col="target") # Retrieve the DataFrame data = tf.get_data() @@ -379,14 +362,11 @@ def sort_data(self, ascending: bool = True) -> None: import pandas as pd # Create a Pandas DataFrame - data = { - 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), - 'target': range(5, 0, -1) - } + data = {"time": pd.date_range(start="2021-01-01", periods=5, freq="D"), "target": range(5, 0, -1)} df = pd.DataFrame(data) # Initialize a TimeFrame - tf = TimeFrame(df, time_col='time', target_col='target') + tf = TimeFrame(df, time_col="time", target_col="target") # Sort the DataFrame in ascending order tf.sort_data(ascending=True) @@ -435,20 +415,16 @@ def update_data( import pandas as pd # Create a Pandas DataFrame - df = pd.DataFrame({ - 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), - 'target': range(5, 0, -1) - }) + df = pd.DataFrame({"time": pd.date_range(start="2021-01-01", periods=5, freq="D"), "target": range(5, 0, -1)}) # Initialize a TimeFrame - tf = TimeFrame(df, time_col='time', target_col='target') + tf = TimeFrame(df, time_col="time", target_col="target") # Update the DataFrame and target column - new_target = pd.Series([1, 2, 3, 4, 5], name='target') + new_target = pd.Series([1, 2, 3, 4, 5], name="target") tf.update_data(new_df=None, new_target_col=new_target) print(tf.get_data()) """ - # Update time_col and target_col if provided if time_col: self._time_col = time_col @@ -504,13 +480,10 @@ def validate_data(self) -> None: import pandas as pd # Create a Pandas DataFrame - df = pd.DataFrame({ - 'time': pd.date_range(start='2021-01-01', periods=5, freq='D'), - 'target': range(5, 0, -1) - }) + df = pd.DataFrame({"time": pd.date_range(start="2021-01-01", periods=5, freq="D"), "target": range(5, 0, -1)}) # Initialize a TimeFrame - tf = TimeFrame(df, time_col='time', target_col='target') + tf = TimeFrame(df, time_col="time", target_col="target") # Run validation on the TimeFrame tf.validate_data() @@ -534,4 +507,4 @@ def validate_data(self) -> None: # 4. 
Check for missing values in `time_col` and `target_col` if check_nulls(self.df, self._dataframe_backend): - raise ValueError(f"Missing values found in `time_col` or `target_col`.") + raise ValueError("Missing values found in `time_col` or `target_col`.") diff --git a/src/temporalscope/core/temporal_target_shifter.py b/src/temporalscope/core/temporal_target_shifter.py index a09be28..1019619 100644 --- a/src/temporalscope/core/temporal_target_shifter.py +++ b/src/temporalscope/core/temporal_target_shifter.py @@ -28,23 +28,6 @@ 3. Tang, Y., Song, Z., Zhu, Y., Yuan, H., Hou, M., Ji, J., Tang, C., & Li, J. (2022). A survey on machine learning models for financial time series forecasting. Neurocomputing, 512, 363-380. https://doi.org/10.1016/j.neucom.2022.09.078 """ -import warnings -from typing import Optional, Union, cast - -import modin.pandas as mpd -import pandas as pd -import polars as pl - -from temporalscope.core.core_utils import ( - BACKEND_MODIN, - BACKEND_PANDAS, - BACKEND_POLARS, - SupportedBackendDataFrame, - validate_backend, -) -from temporalscope.core.temporal_data_loader import TimeFrame -from temporalscope.core.temporal_data_loader import TimeFrameCompatibleData - # class TemporalTargetShifter: # """A class for shifting the target variable in time series data for machine learning or deep learning. diff --git a/src/temporalscope/datasets/datasets.py b/src/temporalscope/datasets/datasets.py index fad04e7..458b70b 100644 --- a/src/temporalscope/datasets/datasets.py +++ b/src/temporalscope/datasets/datasets.py @@ -26,7 +26,7 @@ Modin, and Polars. The class can be easily extended to include additional datasets in the future. Example: ---------- +------- .. code-block:: python from temporalscope.datasets.datasets import DatasetLoader @@ -46,19 +46,20 @@ """ -import pandas as pd +from typing import Callable, Dict, Tuple, Union + import modin.pandas as mpd +import pandas as pd import polars as pl from statsmodels.datasets import macrodata -from typing import Tuple, Dict, Callable, Union -from temporalscope.core.temporal_data_loader import TimeFrame + from temporalscope.core.core_utils import ( - BACKEND_PANDAS, BACKEND_MODIN, + BACKEND_PANDAS, BACKEND_POLARS, - SupportedBackendDataFrame, print_divider, ) +from temporalscope.core.temporal_data_loader import TimeFrame def _load_macrodata() -> Tuple[pd.DataFrame, str]: @@ -106,17 +107,17 @@ class DatasetLoader: and demonstration of time series forecasting workflows. Attributes: - ------------ + ---------- dataset_name : str The name of the dataset to be loaded. It must be available in the `AVAILABLE_DATASETS` dictionary. Methods: - --------- + ------- load_and_init_timeframes: Load the specified dataset and initialize TimeFrame objects for multiple backends. Example: - --------- + ------- .. code-block:: python # Initialize the loader with the 'macrodata' dataset @@ -132,8 +133,7 @@ class DatasetLoader: """ def __init__(self, dataset_name: str = "macrodata") -> None: - """ - Initialize DatasetLoader with a specified dataset. + """Initialize DatasetLoader with a specified dataset. :param dataset_name: The name of the dataset to load. Must be available in AVAILABLE_DATASETS. :raises ValueError: If the specified dataset is not available. @@ -145,8 +145,7 @@ def __init__(self, dataset_name: str = "macrodata") -> None: self.dataset_name = dataset_name def _load_dataset_and_target(self) -> Tuple[pd.DataFrame, str]: - """ - Internal method to load the dataset and its associated target column. 
+ """Internal method to load the dataset and its associated target column. :return: A tuple containing the preprocessed DataFrame and the associated target column name. :rtype: Tuple[pd.DataFrame, str] @@ -181,7 +180,7 @@ def init_timeframes_for_backends( :raises ValueError: If an unsupported backend is specified. Example: - --------- + ------- .. code-block:: python from temporalscope.datasets.datasets import DatasetLoader @@ -225,7 +224,7 @@ def load_and_init_timeframes( :rtype: Dict[str, TimeFrame] Example: - --------- + ------- .. code-block:: python dataset_loader = DatasetLoader(dataset_name="macrodata") diff --git a/src/temporalscope/datasets/synthetic_data_generator.py b/src/temporalscope/datasets/synthetic_data_generator.py index 9f43a1f..8fe2664 100644 --- a/src/temporalscope/datasets/synthetic_data_generator.py +++ b/src/temporalscope/datasets/synthetic_data_generator.py @@ -85,37 +85,30 @@ .. code-block:: python # Generating data for single-step mode - df = create_sample_data(num_samples=100, num_features=3, mode='single_step') + df = create_sample_data(num_samples=100, num_features=3, mode="single_step") print(df.head()) # Shows the generated data with features and a scalar target. # Generating data for multi-step mode - df = create_sample_data(num_samples=100, num_features=3, mode='multi_step') + df = create_sample_data(num_samples=100, num_features=3, mode="multi_step") print(df.head()) # Shows the generated input sequence (`X`) and target sequence (`Y`). """ -import numpy as np -from datetime import datetime -from typing import Any, Optional, Tuple, Callable -import pytest +from typing import Any, Callable, Optional, Tuple +import numpy as np import pandas as pd -import polars as pl -import modin.pandas as mpd - +import pytest from temporalscope.core.core_utils import ( BACKEND_PANDAS, - BACKEND_MODIN, - BACKEND_POLARS, - MODE_SINGLE_STEP, MODE_MULTI_STEP, + MODE_SINGLE_STEP, SUPPORTED_MULTI_STEP_BACKENDS, + SupportedBackendDataFrame, + validate_and_convert_input, validate_backend, - validate_mode + validate_mode, ) -from temporalscope.core.exceptions import UnsupportedBackendError -from temporalscope.core.core_utils import SupportedBackendDataFrame -from temporalscope.core.core_utils import validate_backend, validate_and_convert_input, BACKEND_MODIN, BACKEND_POLARS # Constants DEFAULT_NUM_SAMPLES = 100 @@ -125,12 +118,7 @@ DEFAULT_NULL_INTERVAL = 15 # Default interval for inserting nulls -import numpy as np -from datetime import datetime -import pandas as pd -from temporalscope.core.core_utils import validate_and_convert_input - -def create_sample_data( +def create_sample_data( # noqa: PLR0912 backend: str, num_samples: int = DEFAULT_NUM_SAMPLES, num_features: int = DEFAULT_NUM_FEATURES, @@ -274,7 +262,7 @@ def create_sample_data( if mode == MODE_SINGLE_STEP: data["target"] = np.random.rand(num_samples) elif mode == MODE_MULTI_STEP: - data["target"] = [np.random.rand(10) for _ in range(num_samples)] + data["target"] = np.array([np.random.rand(10) for _ in range(num_samples)]) else: raise ValueError(f"Unsupported mode: {mode}") @@ -289,7 +277,7 @@ def create_sample_data( @pytest.fixture -def sample_df_with_conditions() -> Callable[[Optional[str], Any], Tuple[SupportedBackendDataFrame, str]]: +def sample_df_with_conditions() -> Callable[..., Tuple[SupportedBackendDataFrame, str]]: """Pytest fixture for creating DataFrames for each backend (Pandas, Modin, Polars) with customizable conditions. 
This function generates synthetic data using Pandas and leaves the conversion to the backend @@ -297,7 +285,7 @@ def sample_df_with_conditions() -> Callable[[Optional[str], Any], Tuple[Supporte :return: A function that generates a DataFrame and the backend type based on user-specified conditions. - :rtype: Callable[[Optional[str], Any], Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]] + :rtype: Callable[..., Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str]] .. example:: @@ -326,6 +314,9 @@ def _create_sample_df(backend: Optional[str] = None, **kwargs: Any) -> Tuple[Sup A tuple containing the generated DataFrame and the backend type. :rtype: Tuple[Union[pd.DataFrame, pl.DataFrame, mpd.DataFrame], str] """ + # Assign a default backend if none is provided + backend = backend or BACKEND_PANDAS + # Generate the sample data using Pandas df = create_sample_data(backend=BACKEND_PANDAS, **kwargs) diff --git a/src/temporalscope/partition/padding.py b/src/temporalscope/partition/padding.py index be0b943..6d87fe0 100644 --- a/src/temporalscope/partition/padding.py +++ b/src/temporalscope/partition/padding.py @@ -42,8 +42,8 @@ partitioning or padding utilities. This module focuses only on numerical and time columns. The only special handling occurs for the `time_col` (if specified), which can be a timestamp or a numeric column. -Examples: ---------- +Examples +-------- .. code-block:: python >>> import pandas as pd @@ -62,13 +62,16 @@ .. seealso:: 1. Dwarampudi, M. and Reddy, N.V., 2019. Effects of padding on LSTMs and CNNs. arXiv preprint arXiv:1903.07288. 2. Lafabregue, B., Weber, J., Gançarski, P. and Forestier, G., 2022. End-to-end deep representation learning for time series clustering: a comparative study. Data Mining and Knowledge Discovery, 36(1), pp.29-81. + """ import warnings -from typing import Union, Optional, cast -import pandas as pd +from typing import Optional, Union + import modin.pandas as mpd +import pandas as pd import polars as pl + from temporalscope.core.core_utils import SupportedBackendDataFrame # Define numeric types for each backend @@ -144,7 +147,7 @@ def sort_dataframe(df: SupportedBackendDataFrame, time_col: str, ascending: bool raise TypeError(f"Unsupported DataFrame type: {type(df)}") -def ensure_type_consistency( +def ensure_type_consistency( # noqa: PLR0912 df: SupportedBackendDataFrame, pad_df: SupportedBackendDataFrame ) -> SupportedBackendDataFrame: """Ensure the column types of `pad_df` match the column types of `df`. @@ -166,16 +169,10 @@ def ensure_type_consistency( from temporalscope.partition.padding import ensure_type_consistency # Original DataFrame - df = pd.DataFrame({ - "a": pd.Series([1.0, 2.0], dtype="float32"), - "b": pd.Series([3, 4], dtype="int64") - }) + df = pd.DataFrame({"a": pd.Series([1.0, 2.0], dtype="float32"), "b": pd.Series([3, 4], dtype="int64")}) # Padded DataFrame - pad_df = pd.DataFrame({ - "a": [0.0, 0.0], - "b": [0, 0] - }) + pad_df = pd.DataFrame({"a": [0.0, 0.0], "b": [0, 0]}) # Ensure type consistency between df and pad_df pad_df = ensure_type_consistency(df, pad_df) @@ -188,8 +185,8 @@ def ensure_type_consistency( - We convert Modin DataFrames to Pandas temporarily to ensure type consistency because Modin’s internal `astype()` can sometimes cause issues when working with mixed data types or `bool` columns. After consistency is ensured, we convert the DataFrame back to Modin to maintain backend consistency. 
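A minimal sketch of that round-trip, matching the behavior exercised by the padding tests removed earlier in this series (boolean columns are cast to int64, and the result is converted back to Modin when the original frame was Modin):

.. code-block:: python

    import modin.pandas as mpd
    import pandas as pd

    from temporalscope.partition.padding import ensure_type_consistency

    df = mpd.DataFrame({"flag": [True, False, True]})
    pad_df = pd.DataFrame({"flag": [False, False]})  # plain Pandas padding block

    consistent = ensure_type_consistency(df, pad_df)
    assert consistent["flag"].dtype == "int64"  # bool -> int64
    assert isinstance(consistent, mpd.DataFrame)  # converted back to Modin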
- """ + """ # If df is a Modin DataFrame, convert to Pandas if possible is_modin_df = False if isinstance(df, mpd.DataFrame): @@ -228,7 +225,7 @@ def ensure_type_consistency( raise TypeError(f"Unsupported DataFrame type: {type(df)}") -def zero_pad( +def zero_pad( # noqa: PLR0911, PLR0912 df: SupportedBackendDataFrame, target_len: int, time_col: Optional[str] = None, @@ -335,7 +332,7 @@ def zero_pad( return df -def forward_fill_pad( +def forward_fill_pad( # noqa: PLR0911, PLR0912 df: SupportedBackendDataFrame, target_len: int, end: int, @@ -375,8 +372,8 @@ def forward_fill_pad( .. note:: Forward-fill padding is useful in scenarios where missing data is best approximated by the last known valid value, such as financial data or sensor readings in IoT applications. - """ + """ # Validate the padding option if padding not in ["pre", "post"]: raise ValueError(f"Invalid padding option: {padding}. Use 'pre' or 'post'.") @@ -436,7 +433,7 @@ def forward_fill_pad( return df -def backward_fill_pad( +def backward_fill_pad( # noqa: PLR0912 df: SupportedBackendDataFrame, target_len: int, end: int, @@ -476,6 +473,7 @@ def backward_fill_pad( .. note:: Backward-fill padding is often applied when future values are unknown and it's reasonable to assume that the first valid observation represents future unknowns, which is useful in cases like predictive modeling. + """ validate_dataframe(df) @@ -532,10 +530,10 @@ def backward_fill_pad( raise ValueError(f"Invalid padding option: {padding}. Use 'pre' or 'post'.") # This line ensures that MyPy sees a return in all cases, although it's unreachable. - assert False, "This should never be reached" + raise RuntimeError("This should never be reached") -def mean_fill_pad( +def mean_fill_pad( # noqa: PLR0912 df: SupportedBackendDataFrame, target_len: int, end: int, @@ -575,6 +573,7 @@ def mean_fill_pad( .. note:: Mean-fill padding is useful when you want to fill gaps in the data with the mean of the numeric columns. It is commonly used in time-series forecasting and analytics when you want to smooth over missing values. + """ validate_dataframe(df) @@ -654,7 +653,7 @@ def mean_fill_pad( raise TypeError(f"Unsupported DataFrame type: {type(df)}") # This return statement satisfies MyPy's expectation, but should not actually be reachable. 
- assert False, "This should never be reached" + raise RuntimeError("This should never be reached") def pad_dataframe( @@ -703,6 +702,7 @@ def pad_dataframe( 1 2.0 4.0 2021-01-02 2 1.5 3.5 NaT 3 1.5 3.5 NaT + """ validate_dataframe(df) diff --git a/src/temporalscope/partition/partition_validators.py b/src/temporalscope/partition/partition_validators.py index fca36de..2f6abb1 100644 --- a/src/temporalscope/partition/partition_validators.py +++ b/src/temporalscope/partition/partition_validators.py @@ -33,8 +33,7 @@ import pandas as pd import polars as pl -from temporalscope.core.core_utils import validate_backend -from temporalscope.core.core_utils import SupportedBackendDataFrame +from temporalscope.core.core_utils import SupportedBackendDataFrame, validate_backend PandasLike = TypeVar("PandasLike", pd.DataFrame, mpd.DataFrame) diff --git a/src/temporalscope/partition/sliding_window.py b/src/temporalscope/partition/sliding_window.py index 1da5be9..35697d1 100644 --- a/src/temporalscope/partition/sliding_window.py +++ b/src/temporalscope/partition/sliding_window.py @@ -60,15 +60,13 @@ ) from temporalscope.core.temporal_data_loader import TimeFrame from temporalscope.partition.base_protocol import TemporalPartitionerProtocol +from temporalscope.partition.padding import PAD_SCHEMES, backward_fill_pad, forward_fill_pad, mean_fill_pad, zero_pad from temporalscope.partition.partition_validators import ( check_class_balance, check_feature_to_sample_ratio, check_sample_size, ) -from temporalscope.partition.padding import PAD_SCHEMES -from temporalscope.partition.padding import zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad - # Precision constant for floating-point comparisons PRECISION = 1e-6 @@ -236,7 +234,6 @@ def __init__( :param pad_scheme: The padding scheme to use for filling partitions. Defaults to 'forward_fill'. :raises ValueError: If input parameters are invalid or columns (except `time_col`) are not numeric. 
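A short sketch of the pad-scheme guard implemented just below; the exact error message wording here is illustrative:

.. code-block:: python

    from temporalscope.partition.padding import PAD_SCHEMES

    pad_scheme = "median_fill"  # hypothetical, unsupported scheme
    if pad_scheme not in PAD_SCHEMES:
        raise ValueError(f"Invalid pad_scheme: {pad_scheme}. Supported schemes: {PAD_SCHEMES}")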
""" - # Validate the backend and pad scheme validate_backend(tf.dataframe_backend) if pad_scheme not in PAD_SCHEMES: @@ -286,7 +283,7 @@ def __init__( self.pad_scheme = pad_scheme # Assign the chosen padding scheme # Precompute percentages - self.train_pct, self.test_pct, self.val_pct = self._precompute_percentages(train_pct, test_pct, val_pct) + self.train_pct, self.test_pct, self.val_pct = self.precompute_percentages(train_pct, test_pct, val_pct) # Sort the data using TimeFrame's sort_data method self.tf.sort_data(ascending=True) @@ -295,7 +292,7 @@ def __init__( self._fit_executed = False self._transform_executed = False - def _precompute_percentages( + def precompute_percentages( self, train_pct: float, test_pct: Optional[float], @@ -322,7 +319,6 @@ def _precompute_percentages( # Validate the train percentage if not (0 <= train_pct <= 1): raise ValueError("train_pct must be between 0 and 1.") - # Handle test_pct and val_pct cases explicitly if test_pct is None and val_pct is None: test_pct = 1.0 - train_pct @@ -337,18 +333,16 @@ def _precompute_percentages( test_pct = 1.0 - train_pct - val_pct else: # Both test_pct and val_pct are provided, ensure they are valid - if not (0 <= test_pct <= 1): + if test_pct is not None and not (0 <= test_pct <= 1): raise ValueError("test_pct must be between 0 and 1.") - if not (0 <= val_pct <= 1): + if val_pct is not None and not (0 <= val_pct <= 1): raise ValueError("val_pct must be between 0 and 1.") - # Ensure they sum to 1.0, handling floating-point imprecision with precision constant total_pct = train_pct + (test_pct or 0) + (val_pct or 0) if not (abs(total_pct - 1.0) < precision): # Compare with the precision constant raise ValueError("Train, test, and validation percentages must sum to 1.0.") - # Ensure test_pct and val_pct are float types, not None - return train_pct, float(test_pct), float(val_pct) + return train_pct, float(test_pct or 0), float(val_pct or 0) def _fit_pandas_modin(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: """Fit method for partitioning using TimeFrame data. @@ -474,10 +468,15 @@ def _transform_pandas_modin(self) -> Iterator[Dict[str, Dict[str, Union[pd.DataF - Ensure that the input DataFrame is not empty to avoid runtime errors. - For very large datasets, the padding process may increase memory usage. Consider using Modin when handling large datasets to take advantage of distributed processing. + """ partition_count = 1 df = self.tf.get_data() # Fetch the data from TimeFrame + # Add a type check to ensure df is a DataFrame + if not isinstance(df, (pd.DataFrame, mpd.DataFrame)): + raise TypeError("Expected df to be a pandas or modin DataFrame") + for partition in self.fit(): # Partition indices generated by fit() partitioned_data = {} @@ -565,6 +564,7 @@ def _transform_polars(self) -> Iterator[Dict[str, Dict[str, pl.DataFrame]]]: - Polars DataFrames offer better performance with large datasets, especially for complex operations. - For very large datasets, Polars DataFrames are recommended due to their lower memory footprint and faster performance when compared to Pandas. Use Polars for more efficient partitioning and transformations. + """ partition_count = 1 df = self.tf.get_data() # Fetch the data from TimeFrame @@ -655,13 +655,11 @@ def fit(self) -> Iterator[Dict[str, Dict[str, Tuple[int, int]]]]: .. seealso:: - :meth:`transform`: For generating the actual data slices corresponding to these indices. 
""" - df = self.tf.get_data() # Get the dataset from the TimeFrame - # Call backend-specific partitioning method if self.tf.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: - return self._fit_pandas_modin(df) + return self._fit_pandas_modin() # type: ignore[call-arg] elif self.tf.dataframe_backend == BACKEND_POLARS: - return self._fit_polars(df) + return self._fit_polars() # type: ignore[call-arg] else: raise ValueError(f"Unsupported backend: {self.tf.dataframe_backend}") @@ -711,9 +709,9 @@ def transform(self) -> Iterator[Dict[str, Dict[str, SupportedBackendDataFrame]]] # Call backend-specific transformation method if self.tf.dataframe_backend in [BACKEND_PANDAS, BACKEND_MODIN]: - return self._transform_pandas_modin(df) + return self._transform_pandas_modin(df) # type: ignore[call-arg] elif self.tf.dataframe_backend == BACKEND_POLARS: - return self._transform_polars(df) + return self._transform_polars(df) # type: ignore[call-arg] else: raise ValueError(f"Unsupported backend: {self.tf.dataframe_backend}") @@ -741,7 +739,7 @@ def fit_transform(self) -> Iterator[Dict[str, Dict[str, SupportedBackendDataFram Each yielded partition has the following structure: .. code-block:: python - + ` { 'partition_1': { 'full': , diff --git a/test/unit/core/test_core_utils.py b/test/unit/core/test_core_utils.py index c67821a..02af4ba 100644 --- a/test/unit/core/test_core_utils.py +++ b/test/unit/core/test_core_utils.py @@ -17,38 +17,12 @@ # TemporalScope/test/unit/test_core_utils.py -import warnings -import pytest -from unittest.mock import patch -from typing import Optional, Tuple, Union - -import modin.pandas as mpd -import pandas as pd -import polars as pl -import numpy as np # Import core utility functions -from temporalscope.core.core_utils import ( - check_nans, - check_nulls, - get_api_keys, - get_default_backend_cfg, - validate_and_convert_input, - validate_backend, - print_divider, - infer_backend_from_dataframe, - is_timestamp_like, - is_numeric, - has_mixed_frequencies, - sort_dataframe, - check_empty_columns -) # Import exceptions -from temporalscope.core.exceptions import UnsupportedBackendError, MixedFrequencyWarning, MixedTimezonesWarning # Import the sample data generation and fixture from test_data_utils -from temporalscope.datasets.synthetic_data_generator import create_sample_data, sample_df_with_conditions # # Constants # BACKEND_PANDAS = "pd" @@ -233,7 +207,6 @@ # mock_to_pandas.assert_called_once() # Ensure _to_pandas is called - # @pytest.mark.parametrize( # "input_df, expected_backend", # [ diff --git a/test/unit/core/test_exceptions.py b/test/unit/core/test_exceptions.py index ffa5f97..79d292f 100644 --- a/test/unit/core/test_exceptions.py +++ b/test/unit/core/test_exceptions.py @@ -15,29 +15,33 @@ # specific language governing permissions and limitations # under the License. -""" TemporalScope/test/unit/test_core_exceptions.py +"""TemporalScope/test/unit/test_core_exceptions.py This module contains unit tests for the custom exceptions and warnings defined in the TemporalScope package. These tests ensure that the exceptions are raised correctly and the warnings are issued in the appropriate scenarios. 
""" -import pytest import warnings +import pytest + from temporalscope.core.exceptions import ( - TimeFrameError, - TimeColumnError, - MixedTypesWarning, - MixedTimezonesWarning, MixedFrequencyWarning, - UnsupportedBackendError + MixedTimezonesWarning, + MixedTypesWarning, + TimeColumnError, + TimeFrameError, + UnsupportedBackendError, ) + def test_unsupported_backend_error(): """Test that UnsupportedBackendError is raised with the correct message.""" with pytest.raises(UnsupportedBackendError, match="Unsupported backend"): raise UnsupportedBackendError("Unsupported backend 'invalid_backend'") + + def test_time_frame_error_inheritance(): """Test that TimeFrameError is the base class for other exceptions.""" with pytest.raises(TimeFrameError): @@ -66,9 +70,3 @@ def test_mixed_frequency_warning(): """Test that MixedFrequencyWarning is issued when mixed timestamp frequencies are detected.""" with pytest.warns(MixedFrequencyWarning, match="Mixed timestamp frequencies"): warnings.warn("Mixed timestamp frequencies", MixedFrequencyWarning) - - -def test_unsupported_backend_error(): - """Test that UnsupportedBackendError is raised with the correct message.""" - with pytest.raises(UnsupportedBackendError, match="Unsupported backend"): - raise UnsupportedBackendError("Unsupported backend 'invalid_backend'") diff --git a/test/unit/core/test_temporal_data_loader.py b/test/unit/core/test_temporal_data_loader.py index 083328a..83cfb0f 100644 --- a/test/unit/core/test_temporal_data_loader.py +++ b/test/unit/core/test_temporal_data_loader.py @@ -16,23 +16,6 @@ # TemporalScope/test/unit/test_core_temporal_data_loader.py -import pytest -from typing import Dict, Union, Optional, List -from datetime import datetime, timedelta, timezone - -import numpy as np -import pandas as pd -import polars as pl -import modin.pandas as mpd - -from temporalscope.core.temporal_data_loader import TimeFrame - -from temporalscope.core.exceptions import ( - TimeColumnError, - MixedTypesWarning, - MixedFrequencyWarning, - UnsupportedBackendError, -) BACKEND_POLARS = "pl" BACKEND_PANDAS = "pd" @@ -229,7 +212,6 @@ # assert sorted_df["time"].iloc[0] > sorted_df["time"].iloc[-1] - # @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_POLARS, BACKEND_MODIN]) # def test_update_target_col(backend): # """Test `update_target_col` method across backends by updating the target column.""" diff --git a/test/unit/datasets/test_datasets.py b/test/unit/datasets/test_datasets.py index 4703c12..e70db42 100644 --- a/test/unit/datasets/test_datasets.py +++ b/test/unit/datasets/test_datasets.py @@ -15,13 +15,13 @@ # specific language governing permissions and limitations # under the License. 
+import pandas as pd
 import pytest
-from temporalscope.datasets.datasets import DatasetLoader
+
+from temporalscope.core.core_utils import BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS
 from temporalscope.core.temporal_data_loader import TimeFrame
-from temporalscope.core.core_utils import BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS
-import pandas as pd
-import modin.pandas as mpd
-import polars as pl
+from temporalscope.datasets.datasets import DatasetLoader
+

 @pytest.fixture
 def dataset_loader():
@@ -96,12 +96,11 @@ def test_load_dataset_and_verify_time_column(dataset_loader):
     assert "ds" in df.columns
     assert pd.api.types.is_datetime64_any_dtype(df["ds"])

-@pytest.mark.parametrize("backends", [
-    (BACKEND_PANDAS,),
-    (BACKEND_MODIN,),
-    (BACKEND_POLARS,),
-    (BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS)
-])
+
+@pytest.mark.parametrize(
+    "backends",
+    [(BACKEND_PANDAS,), (BACKEND_MODIN,), (BACKEND_POLARS,), (BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS)],
+)
 def test_load_and_init_timeframes_return(dataset_loader, backends):
     """Test that the returned timeframes object is a dictionary and contains the expected backends."""
     timeframes = dataset_loader.load_and_init_timeframes(backends=backends)
diff --git a/test/unit/datasets/test_synthetic_data_generator.py b/test/unit/datasets/test_synthetic_data_generator.py
index 1d19e40..4aaf826 100644
--- a/test/unit/datasets/test_synthetic_data_generator.py
+++ b/test/unit/datasets/test_synthetic_data_generator.py
@@ -17,35 +17,43 @@

 # TemporalScope/test/unit/datasets/test_synthetic_data_generator.py

-import pytest
+import numpy as np
 import pandas as pd
 import polars as pl
-import modin.pandas as mpd
-import numpy as np
+import pytest
+
 from temporalscope.datasets.synthetic_data_generator import (
-    create_sample_data,
-    BACKEND_PANDAS,
     BACKEND_MODIN,
+    BACKEND_PANDAS,
     BACKEND_POLARS,
-    MODE_SINGLE_STEP,
     MODE_MULTI_STEP,
+    MODE_SINGLE_STEP,
+    create_sample_data,
 )

+
 # Skip unsupported backends for multi-step mode and Pandas-to-Polars conversion
-@pytest.mark.parametrize("num_samples, num_features, mode", [
-    (100, 3, MODE_SINGLE_STEP),  # Single-step mode
-    pytest.param(100, 3, MODE_MULTI_STEP, marks=pytest.mark.xfail(reason="Unsupported multi-step mode for Modin and Polars")),
-    (0, 0, MODE_SINGLE_STEP),  # Zero samples and features
-    (1000, 10, MODE_SINGLE_STEP)  # Large data
-])
-@pytest.mark.parametrize("backend", [
-    BACKEND_PANDAS,
-    BACKEND_MODIN,
-    pytest.param(BACKEND_POLARS, marks=pytest.mark.xfail(reason="Pandas to Polars conversion not supported"))
-])
+@pytest.mark.parametrize(
+    "num_samples, num_features, mode",
+    [
+        (100, 3, MODE_SINGLE_STEP),  # Single-step mode
+        pytest.param(
+            100, 3, MODE_MULTI_STEP, marks=pytest.mark.xfail(reason="Unsupported multi-step mode for Modin and Polars")
+        ),
+        (0, 0, MODE_SINGLE_STEP),  # Zero samples and features
+        (1000, 10, MODE_SINGLE_STEP),  # Large data
+    ],
+)
+@pytest.mark.parametrize(
+    "backend",
+    [
+        BACKEND_PANDAS,
+        BACKEND_MODIN,
+        pytest.param(BACKEND_POLARS, marks=pytest.mark.xfail(reason="Pandas to Polars conversion not supported")),
+    ],
+)
 def test_create_sample_data_basic(num_samples, num_features, mode, backend):
     """Test that data generation works for both single-step and multi-step modes."""
-
     # Generate synthetic data
     df = create_sample_data(backend=backend, num_samples=num_samples, num_features=num_features, mode=mode)

@@ -67,21 +75,28 @@ def test_create_sample_data_basic(num_samples, num_features, mode, backend):

     # Check if target is vector for multi-step mode
     if mode == MODE_MULTI_STEP:
-        assert isinstance(df["target"][0], (list, np.ndarray)), "Multi-step mode should generate vectorized target values."
+        assert isinstance(
+            df["target"][0], (list, np.ndarray)
+        ), "Multi-step mode should generate vectorized target values."


-@pytest.mark.parametrize("timestamp_like, numeric, mixed_frequencies, mixed_timezones", [
-    (True, False, False, False),  # Timestamp-like time column
-    (False, True, False, False),  # Numeric time column
-])
-@pytest.mark.parametrize("backend", [
-    BACKEND_PANDAS,
-    BACKEND_MODIN,
-    pytest.param(BACKEND_POLARS, marks=pytest.mark.xfail(reason="Pandas to Polars conversion not supported"))
-])
+@pytest.mark.parametrize(
+    "timestamp_like, numeric, mixed_frequencies, mixed_timezones",
+    [
+        (True, False, False, False),  # Timestamp-like time column
+        (False, True, False, False),  # Numeric time column
+    ],
+)
+@pytest.mark.parametrize(
+    "backend",
+    [
+        BACKEND_PANDAS,
+        BACKEND_MODIN,
+        pytest.param(BACKEND_POLARS, marks=pytest.mark.xfail(reason="Pandas to Polars conversion not supported")),
+    ],
+)
 def test_time_column_generation(timestamp_like, numeric, mixed_frequencies, mixed_timezones, backend):
     """Test that time columns are generated with the correct type and properties."""
-
     num_samples, num_features = 100, 3
     df = create_sample_data(
         backend=backend,
@@ -90,7 +105,7 @@ def test_time_column_generation(timestamp_like, numeric, mixed_frequencies, mixe
         timestamp_like=timestamp_like,
         numeric=numeric,
         mixed_frequencies=mixed_frequencies,
-        mixed_timezones=mixed_timezones
+        mixed_timezones=mixed_timezones,
     )

     # Validate the type of the time column based on configuration
     if timestamp_like:
         if backend == BACKEND_POLARS:
             assert isinstance(df["time"][0], pl.datatypes.Datetime), "Expected a timestamp-like time column"
         else:
-            assert isinstance(df['time'].iloc[0], pd.Timestamp), "Expected a timestamp-like time column"
+            assert isinstance(df["time"].iloc[0], pd.Timestamp), "Expected a timestamp-like time column"

     if numeric:
         if backend == BACKEND_POLARS:
             assert isinstance(df["time"][0], float), "Expected a numeric time column"
         else:
-            assert isinstance(df['time'].iloc[0], np.float64), "Expected a numeric time column"
+            assert isinstance(df["time"].iloc[0], np.float64), "Expected a numeric time column"
diff --git a/test/unit/partition/test_partition_padding.py b/test/unit/partition/test_partition_padding.py
index eccecf3..94c63b2 100644
--- a/test/unit/partition/test_partition_padding.py
+++ b/test/unit/partition/test_partition_padding.py
@@ -43,7 +43,6 @@

 #     np.random.seed(42)  # Set a seed for reproducibility

-
 # def generate_test_data(backend, num_samples=5):
 #     """Generate test data with consistent column names across all backends."""
 #     start_date = pd.to_datetime("2021-01-01")
@@ -116,7 +115,6 @@

 #     return pd.DataFrame(data)

-
 # def check_monotonicity(df: SupportedBackendDataFrame, time_col: str, ascending: bool = True) -> bool:
 #     if isinstance(df, pl.DataFrame):
 #         # Handle Polars DataFrame
@@ -141,7 +139,6 @@

 #     return diffs.lt(0).all()

-
 # # Parametrize tests for ascending and descending order
 # @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS])
 # @pytest.mark.parametrize("ascending", [True, False])
@@ -190,9 +187,6 @@

 #         assert check_monotonicity(sorted_df, "ds", ascending=True)

-
-
-
 # # Padding function tests with Modin and Polars compatibility
 # @pytest.mark.parametrize("backend", [BACKEND_PANDAS, BACKEND_MODIN, BACKEND_POLARS])
# @pytest.mark.parametrize("padding_func", [zero_pad, forward_fill_pad, backward_fill_pad, mean_fill_pad]) From 0fe3ac0739e8293b5ad7effa6f13caceb8fe2154 Mon Sep 17 00:00:00 2001 From: Philip Ndikum Date: Sun, 6 Oct 2024 04:33:35 +0000 Subject: [PATCH 5/6] feat(refactor-core-api-and-add-systematic-data-generator): feat: refactor core api and add systematic data generator - implement new core api to support multiple backends (pandas, polars, modin) - add synthetic_data_generator for systematic testing across backends - refactor core modules: core_utils, exceptions, temporal_data_loader, temporal_target_shifter - add new temporal_core_processing module - restructure and update test files to align with new api design - enhance functionality to support both single-step and multi-step operations - update pyproject.toml to reflect new structure and dependencies - merged changes from main branch to integrate latest updates and resolve conflicts --- src/temporalscope/datasets/synthetic_data_generator.py | 4 +++- test/unit/datasets/test_synthetic_data_generator.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/temporalscope/datasets/synthetic_data_generator.py b/src/temporalscope/datasets/synthetic_data_generator.py index 8fe2664..af584e9 100644 --- a/src/temporalscope/datasets/synthetic_data_generator.py +++ b/src/temporalscope/datasets/synthetic_data_generator.py @@ -101,6 +101,8 @@ from temporalscope.core.core_utils import ( BACKEND_PANDAS, + BACKEND_MODIN, + BACKEND_POLARS, MODE_MULTI_STEP, MODE_SINGLE_STEP, SUPPORTED_MULTI_STEP_BACKENDS, @@ -110,7 +112,7 @@ validate_mode, ) -# Constants +# Constants defined locally in this file DEFAULT_NUM_SAMPLES = 100 DEFAULT_NUM_FEATURES = 3 SEED = 42 diff --git a/test/unit/datasets/test_synthetic_data_generator.py b/test/unit/datasets/test_synthetic_data_generator.py index 4aaf826..e52d5aa 100644 --- a/test/unit/datasets/test_synthetic_data_generator.py +++ b/test/unit/datasets/test_synthetic_data_generator.py @@ -22,16 +22,18 @@ import polars as pl import pytest -from temporalscope.datasets.synthetic_data_generator import ( +from temporalscope.core.core_utils import ( BACKEND_MODIN, BACKEND_PANDAS, BACKEND_POLARS, MODE_MULTI_STEP, MODE_SINGLE_STEP, - create_sample_data, ) +from temporalscope.datasets.synthetic_data_generator import create_sample_data + + # Skip unsupported backends for multi-step mode and Pandas-to-Polars conversion @pytest.mark.parametrize( "num_samples, num_features, mode", From 6041fa656273f15186256fa9d1f7f2e048d7519c Mon Sep 17 00:00:00 2001 From: Philip Ndikum Date: Sun, 6 Oct 2024 04:40:12 +0000 Subject: [PATCH 6/6] feat(refactor-core-api-and-add-systematic-data-generator): Added ruff unused import fixes --- src/temporalscope/datasets/synthetic_data_generator.py | 2 -- test/unit/datasets/test_synthetic_data_generator.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/temporalscope/datasets/synthetic_data_generator.py b/src/temporalscope/datasets/synthetic_data_generator.py index af584e9..f6fb0f1 100644 --- a/src/temporalscope/datasets/synthetic_data_generator.py +++ b/src/temporalscope/datasets/synthetic_data_generator.py @@ -101,8 +101,6 @@ from temporalscope.core.core_utils import ( BACKEND_PANDAS, - BACKEND_MODIN, - BACKEND_POLARS, MODE_MULTI_STEP, MODE_SINGLE_STEP, SUPPORTED_MULTI_STEP_BACKENDS, diff --git a/test/unit/datasets/test_synthetic_data_generator.py b/test/unit/datasets/test_synthetic_data_generator.py index e52d5aa..df441ac 100644 --- 
a/test/unit/datasets/test_synthetic_data_generator.py +++ b/test/unit/datasets/test_synthetic_data_generator.py @@ -29,8 +29,6 @@ MODE_MULTI_STEP, MODE_SINGLE_STEP, ) - - from temporalscope.datasets.synthetic_data_generator import create_sample_data
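
Applied in order, patches 5/6 and 6/6 leave create_sample_data importable from
temporalscope.datasets.synthetic_data_generator, with the backend and mode constants
sourced from temporalscope.core.core_utils. A minimal sketch of how the generator and
TimeFrame fit together outside the test suite, assuming only the names and column
conventions ("time", "target") visible in the diffs above; the TimeFrame keyword
arguments are illustrative assumptions, not the fixed API:

    import pandas as pd

    from temporalscope.core.core_utils import BACKEND_PANDAS, MODE_SINGLE_STEP
    from temporalscope.core.temporal_data_loader import TimeFrame
    from temporalscope.datasets.synthetic_data_generator import create_sample_data

    # Generate a small single-step dataset on the pandas backend, mirroring the
    # (100, 3, MODE_SINGLE_STEP) case exercised by test_create_sample_data_basic.
    df = create_sample_data(backend=BACKEND_PANDAS, num_samples=100, num_features=3, mode=MODE_SINGLE_STEP)

    # Single-step mode yields one scalar target per row, keyed by a "time" column.
    assert isinstance(df, pd.DataFrame)
    assert {"time", "target"}.issubset(df.columns)

    # Wrap the synthetic frame in a TimeFrame so downstream components (e.g. the
    # TemporalTargetShifter) can consume it; keyword names here are assumed.
    tf = TimeFrame(df, time_col="time", target_col="target", backend=BACKEND_PANDAS)

Because create_sample_data is parametrized over backends in the tests, the same sketch
should carry over to Modin by swapping the backend constant, while the Polars path is
marked xfail in the suite above and may need a native Polars frame instead.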