FEAT-modin-project#1222: Implement DataFrame.asof() without Pandas fallback (modin-project#1989)

Signed-off-by: Itamar Turner-Trauring <itamar@itamarst.org>
aregm · Sep 16, 2020 · 07fbd71 · 07fbd71
1 parent 81a299f
commit 07fbd71
Show file tree

Hide file tree

Showing 6 changed files with 162 additions and 34 deletions.
diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst
@@ -47,7 +47,7 @@ default to pandas.
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``asfreq``                 | `asfreq`_                 | D                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
-| ``asof``                   | `asof`_                   | D                      |                                                    |
+| ``asof``                   | `asof`_                   | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``assign``                 | `assign`_                 | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+

diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst
@@ -54,7 +54,7 @@ the related section on `Defaulting to pandas`_.
 +-----------------------------+---------------------------------+
 | ``asobject``                | D                               |
 +-----------------------------+---------------------------------+
-| ``asof``                    | D                               |
+| ``asof``                    | Y                               |
 +-----------------------------+---------------------------------+
 | ``astype``                  | Y                               |
 +-----------------------------+---------------------------------+

diff --git a/modin/pandas/base.py b/modin/pandas/base.py
@@ -625,7 +625,29 @@ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None):
         )
 
     def asof(self, where, subset=None):
-        return self._default_to_pandas("asof", where, subset=subset)
+        scalar = not is_list_like(where)
+        if isinstance(where, pandas.Index):
+            # Prevent accidental mutation of original:
+            where = where.copy()
+        else:
+            if scalar:
+                where = [where]
+            where = pandas.Index(where)
+
+        if subset is None:
+            data = self
+        else:
+            # Only relevant for DataFrames:
+            data = self[subset]
+        no_na_index = data.dropna().index
+        new_index = pandas.Index([no_na_index.asof(i) for i in where])
+        result = self.reindex(new_index)
+        result.index = where
+
+        if scalar:
+            # Need to return a Series:
+            result = result.squeeze()
+        return result
 
     def astype(self, dtype, copy=True, errors="raise"):
         col_dtypes = {}

diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py
@@ -72,23 +72,6 @@ def test_asfreq():
         df.asfreq(freq="30S")
 
 
-def test_asof():
-    df = pd.DataFrame(
-        {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]},
-        index=pd.DatetimeIndex(
-            [
-                "2018-02-27 09:01:00",
-                "2018-02-27 09:02:00",
-                "2018-02-27 09:03:00",
-                "2018-02-27 09:04:00",
-                "2018-02-27 09:05:00",
-            ]
-        ),
-    )
-    with pytest.warns(UserWarning):
-        df.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"]))
-
-
 def test_assign():
     data = test_data_values[0]
     modin_df = pd.DataFrame(data)

diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py
@@ -20,6 +20,7 @@
 import sys
 
 from modin.pandas.test.utils import (
+    NROWS,
     RAND_LOW,
     RAND_HIGH,
     df_equals,
@@ -41,6 +42,93 @@
 matplotlib.use("Agg")
 
 
+@pytest.mark.parametrize(
+    "dates",
+    [
+        ["2018-02-27 09:03:30", "2018-02-27 09:04:30"],
+        ["2018-02-27 09:03:00", "2018-02-27 09:05:00"],
+    ],
+)
+@pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None])
+def test_asof_with_nan(dates, subset):
+    data = {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]}
+    index = pd.DatetimeIndex(
+        [
+            "2018-02-27 09:01:00",
+            "2018-02-27 09:02:00",
+            "2018-02-27 09:03:00",
+            "2018-02-27 09:04:00",
+            "2018-02-27 09:05:00",
+        ]
+    )
+    modin_where = pd.DatetimeIndex(dates)
+    pandas_where = pandas.DatetimeIndex(dates)
+    compare_asof(data, index, modin_where, pandas_where, subset)
+
+
+@pytest.mark.parametrize(
+    "dates",
+    [
+        ["2018-02-27 09:03:30", "2018-02-27 09:04:30"],
+        ["2018-02-27 09:03:00", "2018-02-27 09:05:00"],
+    ],
+)
+@pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None])
+def test_asof_without_nan(dates, subset):
+    data = {"a": [10, 20, 30, 40, 50], "b": [70, 600, 30, -200, 500]}
+    index = pd.DatetimeIndex(
+        [
+            "2018-02-27 09:01:00",
+            "2018-02-27 09:02:00",
+            "2018-02-27 09:03:00",
+            "2018-02-27 09:04:00",
+            "2018-02-27 09:05:00",
+        ]
+    )
+    modin_where = pd.DatetimeIndex(dates)
+    pandas_where = pandas.DatetimeIndex(dates)
+    compare_asof(data, index, modin_where, pandas_where, subset)
+
+
+@pytest.mark.parametrize(
+    "lookup",
+    [
+        [60, 70, 90],
+        [60.5, 70.5, 100],
+    ],
+)
+@pytest.mark.parametrize("subset", ["col2", "col1", ["col1", "col2"], None])
+def test_asof_large(lookup, subset):
+    data = test_data["float_nan_data"]
+    index = list(range(NROWS))
+    modin_where = pd.Index(lookup)
+    pandas_where = pandas.Index(lookup)
+    compare_asof(data, index, modin_where, pandas_where, subset)
+
+
+def compare_asof(
+    data, index, modin_where: pd.Index, pandas_where: pandas.Index, subset
+):
+    modin_df = pd.DataFrame(data, index=index)
+    pandas_df = pandas.DataFrame(data, index=index)
+    df_equals(
+        modin_df.asof(modin_where, subset=subset),
+        pandas_df.asof(pandas_where, subset=subset),
+    )
+    df_equals(
+        modin_df.asof(modin_where.values, subset=subset),
+        pandas_df.asof(pandas_where.values, subset=subset),
+    )
+    df_equals(
+        modin_df.asof(list(modin_where.values), subset=subset),
+        pandas_df.asof(list(pandas_where.values), subset=subset),
+    )
+    df_equals(
+        modin_df.asof(modin_where.values[0], subset=subset),
+        pandas_df.asof(pandas_where.values[0], subset=subset),
+    )
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_first_valid_index(data):
     modin_df = pd.DataFrame(data)

diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
@@ -845,21 +845,56 @@ def test_asfreq():
         series.asfreq(freq="30S")
 
 
-def test_asof():
-    series = pd.Series(
-        [10, 20, 30, 40, 50],
-        index=pd.DatetimeIndex(
-            [
-                "2018-02-27 09:01:00",
-                "2018-02-27 09:02:00",
-                "2018-02-27 09:03:00",
-                "2018-02-27 09:04:00",
-                "2018-02-27 09:05:00",
-            ]
-        ),
+@pytest.mark.parametrize(
+    "where",
+    [
+        20,
+        30,
+        [10, 40],
+        [20, 30],
+        [20],
+        25,
+        [25, 45],
+        [25, 30],
+        pandas.Index([20, 30]),
+        pandas.Index([10]),
+    ],
+)
+def test_asof(where):
+    # With NaN:
+    values = [1, 2, np.nan, 4]
+    index = [10, 20, 30, 40]
+    modin_series, pandas_series = pd.Series(values, index=index), pandas.Series(
+        values, index=index
     )
-    with pytest.warns(UserWarning):
-        series.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"]))
+    df_equals(modin_series.asof(where), pandas_series.asof(where))
+
+    # No NaN:
+    values = [1, 2, 7, 4]
+    modin_series, pandas_series = pd.Series(values, index=index), pandas.Series(
+        values, index=index
+    )
+    df_equals(modin_series.asof(where), pandas_series.asof(where))
+
+
+@pytest.mark.parametrize(
+    "where",
+    [
+        20,
+        30,
+        [10.5, 40.5],
+        [10],
+        pandas.Index([20, 30]),
+        pandas.Index([10.5]),
+    ],
+)
+def test_asof_large(where):
+    values = test_data["float_nan_data"]["col1"]
+    index = list(range(len(values)))
+    modin_series, pandas_series = pd.Series(values, index=index), pandas.Series(
+        values, index=index
+    )
+    df_equals(modin_series.asof(where), pandas_series.asof(where))
 
 
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)