Fix #400: Re-merge YAML & CLI #403

Merged · 189 commits · Jan 16, 2024
c5987e1
First stab at porting various functions over to polars... lots to go
idiom-bytes Nov 22, 2023
c1fedf7
TOHLCV df initialization and type checking added. 2/8 pdutil tests pa…
idiom-bytes Nov 22, 2023
2a0a94b
black formatted
idiom-bytes Nov 22, 2023
a579c4d
Fixing initialization and improving test. datetime is not generated i…
idiom-bytes Nov 22, 2023
c5efe9f
Restructured pdutil a bit to reduce DRY and utilize schema more stric…
idiom-bytes Nov 22, 2023
6c1f3ec
test initializing the df and datetime
idiom-bytes Nov 22, 2023
b7a60b2
improve init test to show exception without timestamp
idiom-bytes Nov 22, 2023
754aaee
fixing test_concat such that it verifies that schemas must match, and…
idiom-bytes Nov 22, 2023
fce6718
saving parquet enforces datetime and transform. updated test_load_app…
idiom-bytes Nov 22, 2023
f9e6cb8
black formatted
idiom-bytes Nov 22, 2023
d0dccaf
data_eng tests are passing
idiom-bytes Nov 22, 2023
026ad39
initial data_eng tests are passing w/ black, mypy, and pylint.
idiom-bytes Nov 22, 2023
2006f1a
_merge_parquet_dfs updated and create_xy test_1 is passing. all data_…
idiom-bytes Nov 23, 2023
933f1d8
2exch_2coins_2signals is passing
idiom-bytes Nov 23, 2023
6264c61
Added polars support for fill_nans, has_nans, and create_xy__handle_n…
idiom-bytes Nov 23, 2023
1b5b665
Starting to deprecate references to pandas and csv in data_factory.
idiom-bytes Nov 23, 2023
a511bd5
Black formatted
idiom-bytes Nov 23, 2023
444862b
Deprecated csv logic in DataFactory and created tests around get_hist…
idiom-bytes Nov 23, 2023
6666618
All tests should be passing.
idiom-bytes Nov 24, 2023
4e06884
Fix #370: YAML & CLI (#371)
trentmc Nov 24, 2023
80faf81
Update CI to use pdr instead of scripts/ (#399)
trizin Nov 24, 2023
25bfa3e
Replace long try/except with _safe*() function; rename pdutil -> plut…
trentmc Nov 27, 2023
148cb94
Update entrypoint script to use pdr cli (#406)
trizin Nov 27, 2023
6d7661a
Add main.py back (#404)
trizin Nov 27, 2023
43a43df
Merge from issue388-refactor-csvs-pandas, plus many changes
trentmc Nov 27, 2023
fa3b672
Merge branch 'yaml-cli2' of https://github.com/oceanprotocol/pdr-back…
trentmc Nov 27, 2023
f7bbca4
make black happy
trentmc Nov 27, 2023
27ac78e
small bug fix
trentmc Nov 27, 2023
97370c7
many bug fixes. Still >=1 left
trentmc Nov 27, 2023
036ca3e
fix warning
trentmc Nov 27, 2023
c96464d
Add support for polars where needed
trentmc Nov 27, 2023
6b4383f
tweak docstring
trentmc Nov 27, 2023
6ff502d
Fix #408: test_sim_engine failing in yaml-cli2, bc hist_df is s not m…
trentmc Nov 27, 2023
1b6556c
BaseContract tests that Web3PP type is input
trentmc Nov 27, 2023
4c78f72
goes with previous commit
trentmc Nov 27, 2023
24e4345
tweak - lowercase
trentmc Nov 27, 2023
9ca66e7
Bug fix - fix failing tests
trentmc Nov 27, 2023
e24cfcb
Remove unwanted file
trentmc Nov 28, 2023
9e9abc5
(a) better organize ppss.yaml for usability (b) ensure user isn't ann…
trentmc Nov 28, 2023
7855bef
add a more precise test for modeling
trentmc Nov 28, 2023
8c7d0c4
make black happy
trentmc Nov 28, 2023
516d6e9
Small refactor: make transform_df() part of helper routine
trentmc Nov 28, 2023
1dc2836
Fix #414: Split data_factory into (1) CEX -> parquet -> df (2) df -> …
trentmc Nov 28, 2023
4dc204c
Fix #415: test_cli_do_dfbuyer.py is hanging #415
trentmc Nov 28, 2023
67c2b96
test create_xy() even more. Clarify the order of timestamps
trentmc Nov 28, 2023
5ba906c
Add a model-building test, using data shaped like data from test_mode…
trentmc Nov 28, 2023
06aa946
Fix #416: [YAML branch] No Feeds Found - data_pp.py changes pair stan…
trentmc Nov 29, 2023
995df8b
For barge#391: update to *not* use barge's predictoor branch
trentmc Nov 29, 2023
97512a6
Merge branch 'main' into yaml-cli2
trentmc Nov 29, 2023
a1db778
Merge branch 'main' into yaml-cli2
trentmc Nov 30, 2023
4d6e8ff
Update vps.md: nicer order of operations
trentmc Nov 30, 2023
21a65d4
For #417, #418 in yaml-cli2 branch. publisher TUSD -> USDT
trentmc Nov 30, 2023
7fd76e3
Merge branch 'main' into yaml-cli2
trentmc Nov 30, 2023
50ab141
Merge branch 'main' into yaml-cli2
trentmc Nov 30, 2023
5a47226
Merge branch 'main' into yaml-cli2
trentmc Nov 30, 2023
49d546e
remove default_network from ppss.yaml (obsolete)
trentmc Nov 30, 2023
610062d
Fix #427 - time now
trentmc Nov 30, 2023
8156b30
Fix #428: test_get_hist_df - FileNotFoundError. Includes lots of extr…
trentmc Dec 1, 2023
bedf654
remove dependency that we don't need, which caused problems
trentmc Dec 1, 2023
151e79c
Merge branch 'main' into yaml-cli2
trentmc Dec 1, 2023
851dab0
Fix #421: Add cli + logic to calculate and plot traction metrics (PR …
idiom-bytes Dec 1, 2023
2d95c57
bug fix: YAML_FILE
trentmc Dec 1, 2023
ede6efe
fix breaking test; clean it up too
trentmc Dec 1, 2023
0e7b021
add barge-calls.md
trentmc Dec 1, 2023
7dd9d0f
Fix #433. Calculate metrics and draw plots for epoch-based stats (PR …
idiom-bytes Dec 4, 2023
de9b650
git merge main
trentmc Dec 5, 2023
1785713
Tweak barge-calls.md
trentmc Dec 6, 2023
9905f60
Tweak barge-calls.md: more compactly show RPC_URL calc
trentmc Dec 6, 2023
af6ce32
update stake_token
trentmc Dec 6, 2023
a3789f9
Merge branch 'yaml-cli2' of https://github.com/oceanprotocol/pdr-back…
trentmc Dec 6, 2023
59f3462
bug fix
trentmc Dec 6, 2023
206428a
Update release-process.md: bug fix
trentmc Dec 6, 2023
8e571d6
Tweak barge-calls.md
trentmc Dec 6, 2023
482e9fe
git merge main
trentmc Dec 6, 2023
909a5ab
Tune #405 (PR #406): Update entrypoint.sh script to use pdr CLI
trentmc Dec 6, 2023
9168f78
Update vps.md: docker doesn't need to prompt to delete
trentmc Dec 6, 2023
f5be9ac
Update vps.md: add docker-stop instrs
trentmc Dec 6, 2023
ac7670e
allow CLI to have NETWORK_OVERRIDE, for more flexibility from barge
trentmc Dec 6, 2023
784ed60
Merge branch 'yaml-cli2' of https://github.com/oceanprotocol/pdr-back…
trentmc Dec 6, 2023
ef2d6b2
fix pylint issue
trentmc Dec 6, 2023
52c11f3
Update barge-calls.md: link to barge.md
trentmc Dec 6, 2023
8e4bb22
Update release-process.md: fix typo
trentmc Dec 6, 2023
823c02f
touch
trentmc Dec 6, 2023
c73502a
Update vps.md: more instrs around waiting for barge to be ready
trentmc Dec 6, 2023
d53fbb0
add unit tests for cli_module
trentmc Dec 7, 2023
699fb1c
Merge branch 'yaml-cli2' of https://github.com/oceanprotocol/pdr-back…
trentmc Dec 7, 2023
5362ac2
Towards #437: [YAML] Publisher error 'You must set RPC_URL environmen…
trentmc Dec 7, 2023
93720d1
Bug fixes
trentmc Dec 7, 2023
f66cee2
refactor tweaks to predictoor and trader
trentmc Dec 7, 2023
3988f29
Clean up some envvar stuff. Document ppss vars better.
trentmc Dec 7, 2023
8f2f2d4
publish_assets.py now supports barge-pytest and barge-predictoor-bot
trentmc Dec 7, 2023
8addd57
bug fix
trentmc Dec 7, 2023
1dffc03
bug fix the previous 'bug fix'
trentmc Dec 7, 2023
720aa64
Clean up how dfbuyer/predictoor/trader agents get feeds: web3_pp.quer…
trentmc Dec 8, 2023
75b943b
fix breaking subgraph tests. Still breakage in trader & dfbuyer (that…
trentmc Dec 9, 2023
5b19f0b
Fix failing tests in trader, dfbuyer. And greatly speed up the tests…
trentmc Dec 10, 2023
711905e
Fix bugs for failing tests of https://github.com/oceanprotocol/pdr-ba…
trentmc Dec 10, 2023
f667f46
fix tmpdir bug
trentmc Dec 10, 2023
9f112fd
Fix (hopefully) failing unit test - restricted region in querying bin…
trentmc Dec 10, 2023
c9bbf41
consolidate gas_price setting, make it consistent; set gas_price to 0…
trentmc Dec 10, 2023
94c74b3
fix linter complaints
trentmc Dec 10, 2023
ae9fe21
Fix remaining failing unit tests for predictoor_batcher
trentmc Dec 10, 2023
d1eea21
Finish the consolidation of gas pricing. All tests pass
trentmc Dec 10, 2023
96d157f
Merge branch 'main' into yaml-cli2
trentmc Dec 11, 2023
63d9de3
Update vps.md: add debugging info
trentmc Dec 11, 2023
0053961
add to/from wei utility. Copied from ocean.py
trentmc Dec 11, 2023
c118763
tweak docs in conftest_ganache
trentmc Dec 11, 2023
21b8b2f
tweaks from black for wei
trentmc Dec 11, 2023
8f21c7b
Make fixed_rate.py and its test easier to understand via better var n…
trentmc Dec 11, 2023
21d0d89
Make predictoor_contract.py easier to understand via better var nami…
trentmc Dec 11, 2023
b0dc81d
test fixed_rate calcBaseInGivenOutDT
trentmc Dec 11, 2023
ffb693a
Refactor predictoor_contract: push utility methods out of the class, …
trentmc Dec 11, 2023
535de12
Tweak docstrings for fixed_rate.py
trentmc Dec 11, 2023
3f6d60f
Improve DX: show dev what the parameters are. Improve UX: print when …
trentmc Dec 11, 2023
f88a958
Improve DX & UX for predictoor_contract
trentmc Dec 11, 2023
bb5ab0f
Tweak UX (prints)
trentmc Dec 11, 2023
1df1da3
Update vps.md: export PATH
trentmc Dec 11, 2023
5524b1f
Logging for predictoor is way better: more calm yet more informative.…
trentmc Dec 11, 2023
ef02b65
Merge branch 'yaml-cli2' of https://github.com/oceanprotocol/pdr-back…
trentmc Dec 11, 2023
11b6bd0
TraderAgent -> BaseTraderAgent
trentmc Dec 12, 2023
4460259
Rename parquet_dfs -> rawohlcv_dfs; hist_df -> mergedohlcv_df; update…
trentmc Dec 12, 2023
568f93e
apply black to test_plutil.py
trentmc Dec 12, 2023
055f0c7
apply black to test_model_data_factory.py
trentmc Dec 12, 2023
e564f75
apply black to ohlcv_data_factory.py
trentmc Dec 12, 2023
1559ec3
refactor test_ohlcv_data_factory: cleanup mocks; remove redundant tes…
trentmc Dec 12, 2023
f11d881
Fix #443: [YAML] yaml timescale is 5m, yet predictoor logs s_per_epoc…
trentmc Dec 12, 2023
833ec16
Update feed str() to give full address; and order to be similar to pr…
trentmc Dec 12, 2023
d45cd34
Small bug fix: not printing properly
trentmc Dec 12, 2023
e0de941
Tweak: logging in predictoor_contract.py
trentmc Dec 12, 2023
c3fe7b9
Tweak: logging in trueval_agent_single.py
trentmc Dec 12, 2023
20171b6
Two bug fixes: pass in web3_pp not web3_config to PredictoorContract …
trentmc Dec 12, 2023
a3948cc
enhance typechecking
trentmc Dec 12, 2023
54a2c96
tweak payout.py: make args passed more obvious
trentmc Dec 12, 2023
626e773
fix broken unit test
trentmc Dec 12, 2023
c67fd2d
make black happy
trentmc Dec 12, 2023
9644408
fix breaking unit test
trentmc Dec 12, 2023
3e045c0
Tweak predictoor_contract DX & UX
trentmc Dec 12, 2023
9bfb404
Improve trueval: Have fewer layers of try/except, better DX via docst…
trentmc Dec 12, 2023
9d620cf
Rename TruevalAgentBase -> BaseTruevalAgent
trentmc Dec 12, 2023
0137395
(a) Fix #445: merge 3 trueval agent files into 1. (b) Fix #448 contra…
trentmc Dec 13, 2023
9aa6c49
Fix #450: test_contract_main[barge-pytest] fails
trentmc Dec 13, 2023
addf921
renaming pq_data_factory to ohlcv_data_factory
idiom-bytes Dec 14, 2023
6874ead
Removing all TODOs
idiom-bytes Dec 14, 2023
422b12e
Fix #452: Add clean code guidelines README
trentmc Dec 14, 2023
8bca663
Merge branch 'yaml-cli2' of https://github.com/oceanprotocol/pdr-back…
trentmc Dec 14, 2023
7e8a87e
removing dangling _ppss() inside predictoor_agent_runner.py
idiom-bytes Dec 14, 2023
e4fadc6
Fixing linter
idiom-bytes Dec 14, 2023
9f33dc2
Fix #454: Refactor: Rename MEXCOrder -> MexcOrder, ERC721Factory
trentmc Dec 15, 2023
67a8cde
Fix #455: Cyclic import issue
trentmc Dec 15, 2023
8fe215d
Fix #454 redux: the first commit introduced a bug, this one fixes the…
trentmc Dec 15, 2023
d3029a7
Fix #436 - Implement GQL data factory (PR #438)
idiom-bytes Dec 15, 2023
d759284
Fix #350: [Sim] Tweaks to plot title
trentmc Dec 17, 2023
f70aece
Merge branch 'main' into yaml-cli2
trentmc Dec 17, 2023
3c0b3e3
make black happy
trentmc Dec 17, 2023
974799a
Fix #446: [YAML] Rename/move files & dirs for proper separation among…
trentmc Dec 17, 2023
bbfef0d
Fix #459: In CI, polars error: col timestamp_right already exists (#460)
trentmc Dec 18, 2023
a4c117b
Fix #397: Remove need to specify 'stake_token' in ppss.yaml (#461)
trentmc Dec 18, 2023
aa999b7
Docs fixes (#456)
calina-c Dec 19, 2023
80e9fb6
Make Feeds objects instead of tuples. (#464)
calina-c Dec 20, 2023
b5496c6
Move and rename utils (#467)
calina-c Dec 20, 2023
1dec231
Objectify pairstr. (#470)
calina-c Dec 21, 2023
bd01d71
Towards #462: Separate lake and aimodel SS, lake command (#473)
calina-c Dec 29, 2023
a0244c9
[Lake] integrate pdr_subscriptions into GQL Data Factory (#469)
idiom-bytes Jan 4, 2024
80725f8
Improve DRY (#475)
calina-c Jan 4, 2024
1fb88ac
Add Code climate. (#484)
calina-c Jan 5, 2024
243ae7e
Adds manual trigger to pytest workflow.
calina-c Jan 5, 2024
3a5c639
issue483: move the logic from subgraph_slot.py (#489)
kdetry Jan 9, 2024
56392e4
Add some test coverage (#488)
calina-c Jan 9, 2024
27f94d2
git merge main
trentmc Jan 10, 2024
cd69814
Fix #501: ModuleNotFoundError: No module named 'flask' (PR #504)
trentmc Jan 10, 2024
77d7b4c
Fix #509: Refactor test_update_rawohlcv_files (PR #508)
trentmc Jan 10, 2024
ff66cbf
Merge branch 'main' into yaml-cli2
trizin Jan 10, 2024
69d3c8e
Fix #505: polars.exceptions.ComputeError: datatypes of join keys don'…
trentmc Jan 10, 2024
4959b5d
Fix #517: aimodel_data_factory.py missing data: binance:BTC/USDT:None…
trentmc Jan 11, 2024
474ec9c
Towards #494: Improve coverage 2 (#498)
calina-c Jan 11, 2024
3f24fdd
Fix #519: aimodel_data_factory.py missing data col: binance:ETH/USDT:…
trentmc Jan 12, 2024
e21f9a2
Replace `dftool` with `pdr` (#522)
trizin Jan 12, 2024
413d065
Fix #525: Plots pop up unwanted in tests. (PR #528)
calina-c Jan 13, 2024
fdb4545
Issue 519 feed dependencies (#529)
calina-c Jan 13, 2024
63891cd
Update to #519: remove do_verify, it's redundant (#532)
trentmc Jan 13, 2024
3eae232
Fix #507: fix asyncio issues (PR #531)
calina-c Jan 13, 2024
35a2799
#413 - YAML thorough system level tests (#527)
trizin Jan 15, 2024
11f05d2
Adds incremental waiting for subgraph tries. (#534)
calina-c Jan 15, 2024
9402e5f
Add publisher feeds filtering. (#533)
calina-c Jan 15, 2024
995a103
Pass the ppss.web3_pp instead of web3_config into WrappedToken class …
trizin Jan 15, 2024
9422c82
Fix #542: Add code climate usage to developer flow READMEs
trentmc Jan 16, 2024
d14c702
#538 - check network main subgraph query fails (#539)
trizin Jan 16, 2024
11b7244
#540 - YAML CLI topup and check network actions require address file …
trizin Jan 16, 2024
a851f79
Remove predictoor2 ref from pytest
trizin Jan 16, 2024
Fix #408: test_sim_engine failing in yaml-cli2, bc hist_df is s not ms. Proper testing and documentation were added as part of the fix.
trentmc committed Nov 27, 2023
commit 6ff502d865b12a63e44730eb86e6c23dd33f05aa
95 changes: 71 additions & 24 deletions pdr_backend/data_eng/data_factory.py
@@ -1,6 +1,6 @@
import os
import sys
from typing import Dict, List, Union
from typing import Dict, List, Tuple, Union

from enforce_typing import enforce_types
import numpy as np
@@ -32,18 +32,59 @@

@enforce_types
class DataFactory:
"""
Roles:
- From each CEX API, fill >=1 parquet_dfs -> parquet files data lake
- From parquet_dfs, fill 1 hist_df -- historical data across all CEXes
- From hist_df, create (X, y, x_df) -- for model building

Where:
parquet_dfs -- dict of [exch_str][pair_str] : df
And df has columns of: "open", "high", .., "volume", "datetime"
(and index = timestamp)

hist_df -- polars DataFrame with cols like:
"timestamp",
"binanceus:ETH-USDT:open",
"binanceus:ETH-USDT:high",
"binanceus:ETH-USDT:low",
"binanceus:ETH-USDT:close",
"binanceus:ETH-USDT:volume",
...
"datetime",
(and no index)

And:
X -- 2d array of [sample_i, var_i] : value -- inputs for model
y -- 1d array of [sample_i] -- target outputs for model

x_df -- *pandas* DataFrame with cols like:
"binanceus:ETH-USDT:open:t-3",
"binanceus:ETH-USDT:open:t-2",
"binanceus:ETH-USDT:open:t-1",
"binanceus:ETH-USDT:high:t-3",
"binanceus:ETH-USDT:high:t-2",
"binanceus:ETH-USDT:high:t-1",
...
"datetime",
(and index = 0, 1, .. -- nothing special)

Finally:
- "timestamp" values are ut: int is unix time, UTC, in ms (not s)
- "datetime" values are python datetime.datetime, UTC
"""
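The "Finally" convention in the docstring above (ut = unix time as int, UTC, in ms) is easy to get wrong by a factor of 1000. A stdlib-only sketch of the conversion; `ut_to_dt` / `dt_to_ut` are illustrative helper names, not functions from this repo:

```python
from datetime import datetime, timezone

def ut_to_dt(ut_ms: int) -> datetime:
    # "timestamp" values are unix time in ms (not s), UTC
    return datetime.fromtimestamp(ut_ms / 1000, tz=timezone.utc)

def dt_to_ut(dt: datetime) -> int:
    # inverse: python datetime (UTC) -> unix time in ms
    return int(dt.timestamp() * 1000)

ut = 1686805500000  # a "timestamp" value like those in the tests below
dt = ut_to_dt(ut)
print(dt.isoformat())      # 2023-06-15T05:05:00+00:00
assert dt_to_ut(dt) == ut  # round-trips exactly
```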

def __init__(self, pp: DataPP, ss: DataSS):
self.pp = pp
self.ss = ss

def get_hist_df(self) -> pd.DataFrame:
def get_hist_df(self) -> pl.DataFrame:
"""
@description
Get historical dataframe, across many exchanges & pairs.

@return
hist_df -- df w/ cols={exchange_str}:{pair_str}:{signal}+"datetime",
and index=timestamp
hist_df -- *polars* Dataframe. See class docstring
"""
print("Get historical data, across many exchanges & pairs: begin.")

@@ -60,7 +101,10 @@ def get_hist_df(self) -> pd.DataFrame:
hist_df = self._merge_parquet_dfs(parquet_dfs)

print("Get historical data, across many exchanges & pairs: done.")
return hist_df.to_pandas()

# postconditions
assert isinstance(hist_df, pl.DataFrame)
return hist_df

def _update_parquet(self, fin_ut: int):
print(" Update parquet.")
@@ -203,11 +247,10 @@ def _load_parquet(self, fin_ut: int) -> Dict[str, Dict[str, pl.DataFrame]]:
def _merge_parquet_dfs(self, parquet_dfs: dict) -> pl.DataFrame:
"""
@arguments
parquet_dfs -- dict [exch_str][pair_str] : df
where df has cols={signal_str}+"datetime", and index=timestamp
parquet_dfs -- see class docstring

@return
hist_df -- df w/ cols={exch_str}:{pair_str}:{signal_str}+"datetime",
and index=timestamp
hist_df -- see class docstring
"""
# init hist_df such that it can do basic operations
print(" Merge parquet DFs.")
@@ -257,31 +300,34 @@ def _merge_parquet_dfs(self, parquet_dfs: dict) -> pl.DataFrame:
# TO DO: Move to model_factory/model + use generic df<=>serialize<=>parquet
def create_xy(
self,
hist_df: pd.DataFrame, # not a pl.DataFrame, by design
hist_df: pl.DataFrame,
testshift: int,
do_fill_nans: bool = True,
):
) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
"""
@arguments
hist_df -- df w cols={exch_str}:{pair_str}:{signal_str}+"datetime",
and index=timestamp
hist_df -- *polars* DataFrame. See class docstring
testshift -- to simulate across historical test data
do_fill_nans -- if any values are nan, fill them? (Via interpolation)
If you turn this off and hist_df has nans, then X/y/etc gets nans

@return --
X -- 2d array of [sample_i, var_i] : value
y -- 1d array of [sample_i]
x_df -- df w/ cols={exch_str}:{pair_str}:{signal}:t-{x} + "datetime"
index=0,1,.. (nothing special)
X -- 2d array of [sample_i, var_i] : value -- inputs for model
y -- 1d array of [sample_i] -- target outputs for model
x_df -- *pandas* DataFrame. See class docstring.
"""
if not isinstance(hist_df, pd.DataFrame):
raise ValueError("hist_df should be a pd.DataFrame")
# preconditions
assert isinstance(hist_df, pl.DataFrame), pl.__class__
assert "timestamp" in hist_df.columns
assert "datetime" in hist_df.columns

# condition inputs
if do_fill_nans and has_nan(hist_df):
hist_df = fill_nans(hist_df)

ss = self.ss
x_df = pd.DataFrame()

# main work
x_df = pd.DataFrame() # build this up

target_hist_cols = [
f"{exch_str}:{pair_str}:{signal_str}"
Expand All @@ -290,7 +336,7 @@ def create_xy(

for hist_col in target_hist_cols:
assert hist_col in hist_df.columns, f"missing data col: {hist_col}"
z = hist_df[hist_col].tolist() # [..., z(t-3), z(t-2), z(t-1)]
z = hist_df[hist_col].to_list() # [..., z(t-3), z(t-2), z(t-1)]
maxshift = testshift + ss.autoregressive_n
N_train = min(ss.max_n_train, len(z) - maxshift - 1)
if N_train <= 0:
@@ -315,13 +361,14 @@
# eg y = [BinEthC_-1, BinEthC_-2, ..., BinEthC_-450, BinEthC_-451]
pp = self.pp
hist_col = f"{pp.exchange_str}:{pp.pair_str}:{pp.signal_str}"
z = hist_df[hist_col].tolist()
z = hist_df[hist_col].to_list()
y = np.array(_slice(z, -testshift - N_train - 1, -testshift))

# postconditions
assert X.shape[0] == y.shape[0]
assert X.shape[0] <= (ss.max_n_train + 1)
assert X.shape[1] == ss.n
assert isinstance(x_df, pd.DataFrame)

# return
return X, y, x_df
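The windowing in create_xy() above (each series z sliced into t-n..t-1 input columns, plus a target shifted back by testshift to simulate historical test data) can be sketched in plain numpy. `make_lagged_xy` and `slice_neg` are illustrative stand-ins mirroring `_slice()`, not the repo's exact helpers:

```python
import numpy as np

def slice_neg(z: list, st: int, fin: int) -> list:
    # like the diff's _slice(): negative indices, fin == 0 means "through the end"
    return [z[i] for i in range(st, fin)]

def make_lagged_xy(z: list, ar_n: int, testshift: int, max_n_train: int):
    """Build X rows of [t-ar_n, ..., t-1] values and y targets at t,
    all shifted back `testshift` steps to simulate historical test data."""
    maxshift = testshift + ar_n
    N_train = min(max_n_train, len(z) - maxshift - 1)
    assert N_train > 0, "not enough data"
    X = np.column_stack([
        slice_neg(z, -(testshift + d) - N_train - 1, -(testshift + d))
        for d in range(ar_n, 0, -1)  # columns ordered t-ar_n, ..., t-1
    ])
    y = np.array(slice_neg(z, -testshift - N_train - 1, -testshift))
    return X, y

z = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]  # newest value last
X, y = make_lagged_xy(z, ar_n=3, testshift=0, max_n_train=5)
print(X[-1].tolist(), y[-1])  # [4, 3, 2] 1  -- mirrors the tests below
X, y = make_lagged_xy(z, ar_n=3, testshift=1, max_n_train=5)
print(X[-1].tolist(), y[-1])  # [5, 4, 3] 2
```

Shifting testshift by 1 slides every window one step into the past, which is exactly how the tests below probe different test points.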
@@ -366,7 +413,7 @@ def safe_fetch_ohlcv(
exch -- eg ccxt.binanceus()
symbol -- eg "BTC/USDT". NOT "BTC-USDT"
timeframe -- eg "1h", "1m"
since -- Timestamp of first candle. In unix time (in ms)
since -- timestamp of first candle. In unix time (in ms)
limit -- max # candles to retrieve

@return
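The safe_fetch_ohlcv() contract documented above (fetch candles, never raise) can be sketched as below. The stub exchange and the exact error behavior (print a warning, return None) are assumptions for illustration; a real `exch` would be e.g. `ccxt.binanceus()`:

```python
def safe_fetch_ohlcv(exch, symbol: str, timeframe: str, since: int, limit: int):
    """Return a list of [ut_ms, open, high, low, close, volume] candles,
    or None if the exchange call raises."""
    try:
        return exch.fetch_ohlcv(symbol=symbol, timeframe=timeframe,
                                since=since, limit=limit)
    except Exception as e:
        print(f"  **WARNING: exchange error {e}; returning None")
        return None

class StubExchange:
    """Stands in for a ccxt exchange object in this sketch."""
    def fetch_ohlcv(self, symbol, timeframe, since, limit):
        if "/" not in symbol:
            raise ValueError(f"bad symbol {symbol}")  # wants "BTC/USDT", NOT "BTC-USDT"
        return [[since, 9.0, 11.0, 8.0, 10.0, 100.0]]

print(safe_fetch_ohlcv(StubExchange(), "BTC/USDT", "5m", 1686805500000, 1))
print(safe_fetch_ohlcv(StubExchange(), "BTC-USDT", "5m", 1686805500000, 1))  # warns, then None
```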
93 changes: 53 additions & 40 deletions pdr_backend/data_eng/test/test_data_factory.py
@@ -62,7 +62,7 @@ def _data_ss(parquet_dir, input_feeds, st_timestr=None, fin_timestr=None):


@enforce_types
def _assert_shapes(ss: DataSS, X: np.ndarray, y: np.ndarray, x_df: pd.DataFrame):
def _assert_pd_df_shape(ss: DataSS, X: np.ndarray, y: np.ndarray, x_df: pd.DataFrame):
assert X.shape[0] == y.shape[0]
assert X.shape[0] == (ss.max_n_train + 1) # 1 for test, rest for train
assert X.shape[1] == ss.n
@@ -108,10 +108,14 @@ def test_update_parquet5(tmpdir):

@enforce_types
def _test_update_parquet(st_timestr: str, fin_timestr: str, tmpdir, n_uts):
"""n_uts -- expected # timestamps. Typically int. If '>1K', expect >1000"""
"""
@arguments
n_uts -- expected # timestamps. Typically int. If '>1K', expect >1000
"""

# setup: uts helpers
def _calc_ut(since: int, i: int) -> int:
"""Return a ut : unix time, in ms, in UTC time zone"""
return since + i * MS_PER_5M_EPOCH

def _uts_in_range(st_ut: int, fin_ut: int) -> List[int]:
@@ -226,20 +230,40 @@ def _addval(DATA: list, val: float) -> list:
}


@enforce_types
def test_hist_df_shape(tmpdir):
_, _, data_factory = _data_pp_ss_1feed(tmpdir, "binanceus h ETH-USDT")
hist_df = data_factory._merge_parquet_dfs(ETHUSDT_PARQUET_DFS)
assert isinstance(hist_df, pl.DataFrame)
assert hist_df.columns == [
"timestamp",
"binanceus:ETH-USDT:open",
"binanceus:ETH-USDT:high",
"binanceus:ETH-USDT:low",
"binanceus:ETH-USDT:close",
"binanceus:ETH-USDT:volume",
"datetime",
]
assert hist_df.shape == (12, 7)
assert len(hist_df["timestamp"]) == 12
assert ( # pylint: disable=unsubscriptable-object
hist_df["timestamp"][0] == 1686805500000
)


@enforce_types
def test_create_xy__input_type(tmpdir):
# hist_df should be pl
_, _, data_factory = _data_pp_ss_1feed(tmpdir, "binanceus h ETH-USDT")
hist_df = data_factory._merge_parquet_dfs(ETHUSDT_PARQUET_DFS)
assert not isinstance(hist_df, pd.DataFrame)
assert isinstance(hist_df, pl.DataFrame)

# create_xy() input should be pd
data_factory.create_xy(hist_df.to_pandas(), testshift=0)
# create_xy() input should be pl
data_factory.create_xy(hist_df, testshift=0)

# create_xy() inputs shouldn't be pl
with pytest.raises(ValueError):
data_factory.create_xy(hist_df, testshift=0)
# create_xy() inputs shouldn't be pd
with pytest.raises(AssertionError):
data_factory.create_xy(hist_df.to_pandas(), testshift=0)


@enforce_types
@@ -248,9 +272,8 @@ def test_create_xy__1exchange_1coin_1signal(tmpdir):
hist_df = data_factory._merge_parquet_dfs(ETHUSDT_PARQUET_DFS)

# =========== initial testshift (0)
# At model level, we use pandas not polars. Hence "to_pandas()"
X, y, x_df = data_factory.create_xy(hist_df.to_pandas(), testshift=0)
_assert_shapes(ss, X, y, x_df)
X, y, x_df = data_factory.create_xy(hist_df, testshift=0)
_assert_pd_df_shape(ss, X, y, x_df)

assert X[-1, :].tolist() == [4, 3, 2] and y[-1] == 1
assert X[-2, :].tolist() == [5, 4, 3] and y[-2] == 2
@@ -269,9 +292,9 @@ def test_create_xy__1exchange_1coin_1signal(tmpdir):
assert x_df["binanceus:ETH-USDT:high:t-2"].tolist() == [9, 8, 7, 6, 5, 4, 3, 2]
assert X[:, 2].tolist() == [9, 8, 7, 6, 5, 4, 3, 2]

# =========== now have a different testshift (1 not 0). Note "to_pandas()"
X, y, x_df = data_factory.create_xy(hist_df.to_pandas(), testshift=1)
_assert_shapes(ss, X, y, x_df)
# =========== now have a different testshift (1 not 0)
X, y, x_df = data_factory.create_xy(hist_df, testshift=1)
_assert_pd_df_shape(ss, X, y, x_df)

assert X[-1, :].tolist() == [5, 4, 3] and y[-1] == 2
assert X[-2, :].tolist() == [6, 5, 4] and y[-2] == 3
@@ -290,11 +313,11 @@ def test_create_xy__1exchange_1coin_1signal(tmpdir):
assert x_df["binanceus:ETH-USDT:high:t-2"].tolist() == [10, 9, 8, 7, 6, 5, 4, 3]
assert X[:, 2].tolist() == [10, 9, 8, 7, 6, 5, 4, 3]

# =========== now have a different max_n_train. Note "to_pandas()"
# =========== now have a different max_n_train
ss.d["max_n_train"] = 5

X, y, x_df = data_factory.create_xy(hist_df.to_pandas(), testshift=0)
_assert_shapes(ss, X, y, x_df)
X, y, x_df = data_factory.create_xy(hist_df, testshift=0)
_assert_pd_df_shape(ss, X, y, x_df)

assert X.shape[0] == 5 + 1 # +1 for one test point
assert y.shape[0] == 5 + 1
@@ -330,8 +353,8 @@ def test_create_xy__2exchanges_2coins_2signals(tmpdir):

data_factory = DataFactory(pp, ss)
hist_df = data_factory._merge_parquet_dfs(parquet_dfs)
X, y, x_df = data_factory.create_xy(hist_df.to_pandas(), testshift=0)
_assert_shapes(ss, X, y, x_df)
X, y, x_df = data_factory.create_xy(hist_df, testshift=0)
_assert_pd_df_shape(ss, X, y, x_df)

found_cols = x_df.columns.tolist()
target_cols = [
@@ -405,23 +428,19 @@ def test_create_xy__handle_nan(tmpdir):
# =========== initial testshift (0)
# run create_xy() and force the nans to stick around
# -> we want to ensure that we're building X/y with risk of nan
X, y, x_df = data_factory.create_xy(
hist_df.to_pandas(), testshift=0, do_fill_nans=False
)
X, y, x_df = data_factory.create_xy(hist_df, testshift=0, do_fill_nans=False)
assert has_nan(X) and has_nan(y) and has_nan(x_df)

# nan approach 1: fix externally
hist_df2 = fill_nans(hist_df)
assert not has_nan(hist_df2)

# nan approach 2: explicitly tell create_xy to fill nans
X, y, x_df = data_factory.create_xy(
hist_df.to_pandas(), testshift=0, do_fill_nans=True
)
X, y, x_df = data_factory.create_xy(hist_df, testshift=0, do_fill_nans=True)
assert not has_nan(X) and not has_nan(y) and not has_nan(x_df)

# nan approach 3: create_xy fills nans by default (best)
X, y, x_df = data_factory.create_xy(hist_df.to_pandas(), testshift=0)
X, y, x_df = data_factory.create_xy(hist_df, testshift=0)
assert not has_nan(X) and not has_nan(y) and not has_nan(x_df)
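All three nan-handling paths above bottom out in fill-by-interpolation. A stdlib-only sketch of what that means for one series; `fill_nans_1d` is an illustrative stand-in, not the repo's polars-based fill_nans():

```python
import math

def fill_nans_1d(vals):
    """Linearly interpolate interior NaNs; copy edge values outward."""
    out = list(vals)
    known = [i for i, v in enumerate(out) if not math.isnan(v)]
    assert known, "an all-NaN series cannot be filled"
    for i in range(len(out)):
        if not math.isnan(out[i]):
            continue
        prev = max((k for k in known if k < i), default=None)
        nxt = min((k for k in known if k > i), default=None)
        if prev is None:       # leading NaNs: back-fill
            out[i] = out[nxt]
        elif nxt is None:      # trailing NaNs: forward-fill
            out[i] = out[prev]
        else:                  # interior NaNs: linear interpolation
            frac = (i - prev) / (nxt - prev)
            out[i] = out[prev] + frac * (out[nxt] - out[prev])
    return out

nan = float("nan")
print(fill_nans_1d([nan, 1.0, nan, nan, 4.0, nan]))
```

For the input above this yields approximately [1.0, 1.0, 2.0, 3.0, 4.0, 4.0]: edges copied, interior values interpolated between the known points 1.0 and 4.0.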


Expand Down Expand Up @@ -451,7 +470,7 @@ def mock_merge_parquet_dfs(*args, **kwargs): # pylint: disable=unused-argument

# call and assert
hist_df = data_factory.get_hist_df()
assert isinstance(hist_df, pd.DataFrame)
assert isinstance(hist_df, pl.DataFrame)
assert len(hist_df) == 3

assert mock_update_parquet.called
@@ -481,7 +500,6 @@ def mock_merge_parquet_dfs(*args, **kwargs):  # pylint: disable=unused-argument

# call and assert
hist_df = data_factory.get_hist_df()
assert isinstance(hist_df, pd.DataFrame)
assert len(hist_df) == 3

assert mock_update_parquet.called
@@ -503,21 +521,18 @@ def test_get_hist_df(tmpdir):
)
data_factory = DataFactory(pp, ss)

hist_df = data_factory.get_hist_df()

# call and assert
hist_df = data_factory.get_hist_df()
assert isinstance(hist_df, pd.DataFrame)

# 289 records created
assert len(hist_df) == 289

# binanceus is returning valid data
assert hist_df["binanceus:BTC-USDT:high"].isna().sum() == 0
assert hist_df["binanceus:ETH-USDT:high"].isna().sum() == 0
assert not has_nan(hist_df["binanceus:BTC-USDT:high"])
assert not has_nan(hist_df["binanceus:ETH-USDT:high"])

# kraken is returning nans
assert hist_df["kraken:BTC-USDT:high"].isna().sum() == 289
assert has_nan(hist_df["kraken:BTC-USDT:high"])

# assert head is oldest
head_timestamp = hist_df.head(1)["timestamp"].to_list()[0]
Expand All @@ -537,16 +552,14 @@ def test_exchange_hist_overlap(tmpdir):

# call and assert
hist_df = data_factory.get_hist_df()
assert isinstance(hist_df, pd.DataFrame)

# 289 records created
assert len(hist_df) == 289

# assert head is oldest and tail is latest
assert (
hist_df.head(1)["timestamp"].to_list()[0]
< hist_df.tail(1)["timestamp"].to_list()[0]
)
# assert head is oldest
head_timestamp = hist_df.head(1)["timestamp"].to_list()[0]
tail_timestamp = hist_df.tail(1)["timestamp"].to_list()[0]
assert head_timestamp < tail_timestamp

# let's get more data from exchange with overlap
_, _, data_factory2 = _data_pp_ss_1feed(