From 07fbd711fc556107d20383ba05a72e03fe070b28 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Fri, 4 Sep 2020 12:35:38 -0400 Subject: [PATCH] FEAT-#1222: Implement DataFrame.asof() without Pandas fallback (#1989) Signed-off-by: Itamar Turner-Trauring --- docs/supported_apis/dataframe_supported.rst | 2 +- docs/supported_apis/series_supported.rst | 2 +- modin/pandas/base.py | 24 +++++- modin/pandas/test/dataframe/test_default.py | 17 ---- modin/pandas/test/dataframe/test_indexing.py | 88 ++++++++++++++++++++ modin/pandas/test/test_series.py | 63 ++++++++++---- 6 files changed, 162 insertions(+), 34 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index fc364e540bf..133f3527764 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -47,7 +47,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``asfreq`` | `asfreq`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``asof`` | `asof`_ | D | | +| ``asof`` | `asof`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``assign`` | `assign`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst index de87ea616d6..b8996ad74e3 100644 --- a/docs/supported_apis/series_supported.rst +++ b/docs/supported_apis/series_supported.rst @@ -54,7 +54,7 @@ the related section on `Defaulting to pandas`_. +-----------------------------+---------------------------------+ | ``asobject`` | D | +-----------------------------+---------------------------------+ -| ``asof`` | D | +| ``asof`` | Y | +-----------------------------+---------------------------------+ | ``astype`` | Y | +-----------------------------+---------------------------------+ diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 4124b1e1190..e3c2ad0b3b1 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -625,7 +625,29 @@ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): ) def asof(self, where, subset=None): - return self._default_to_pandas("asof", where, subset=subset) + scalar = not is_list_like(where) + if isinstance(where, pandas.Index): + # Prevent accidental mutation of original: + where = where.copy() + else: + if scalar: + where = [where] + where = pandas.Index(where) + + if subset is None: + data = self + else: + # Only relevant for DataFrames: + data = self[subset] + no_na_index = data.dropna().index + new_index = pandas.Index([no_na_index.asof(i) for i in where]) + result = self.reindex(new_index) + result.index = where + + if scalar: + # Need to return a Series: + result = result.squeeze() + return result def astype(self, dtype, copy=True, errors="raise"): col_dtypes = {} diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index bd486163653..83f36d23ae2 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -72,23 +72,6 @@ def test_asfreq(): df.asfreq(freq="30S") -def test_asof(): - df = pd.DataFrame( - {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]}, - index=pd.DatetimeIndex( - [ - "2018-02-27 09:01:00", - "2018-02-27 09:02:00", - "2018-02-27 09:03:00", - "2018-02-27 09:04:00", - "2018-02-27 09:05:00", - ] - ), - ) - with pytest.warns(UserWarning): - df.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"])) - - def test_assign(): data = test_data_values[0] modin_df = pd.DataFrame(data) diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py index 4a669949ddc..a89c0d538f9 100644 --- a/modin/pandas/test/dataframe/test_indexing.py +++ b/modin/pandas/test/dataframe/test_indexing.py @@ -20,6 +20,7 @@ import sys from modin.pandas.test.utils import ( + NROWS, RAND_LOW, RAND_HIGH, df_equals, @@ -41,6 +42,93 @@ matplotlib.use("Agg") +@pytest.mark.parametrize( + "dates", + [ + ["2018-02-27 09:03:30", "2018-02-27 09:04:30"], + ["2018-02-27 09:03:00", "2018-02-27 09:05:00"], + ], +) +@pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None]) +def test_asof_with_nan(dates, subset): + data = {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]} + index = pd.DatetimeIndex( + [ + "2018-02-27 09:01:00", + "2018-02-27 09:02:00", + "2018-02-27 09:03:00", + "2018-02-27 09:04:00", + "2018-02-27 09:05:00", + ] + ) + modin_where = pd.DatetimeIndex(dates) + pandas_where = pandas.DatetimeIndex(dates) + compare_asof(data, index, modin_where, pandas_where, subset) + + +@pytest.mark.parametrize( + "dates", + [ + ["2018-02-27 09:03:30", "2018-02-27 09:04:30"], + ["2018-02-27 09:03:00", "2018-02-27 09:05:00"], + ], +) +@pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None]) +def test_asof_without_nan(dates, subset): + data = {"a": [10, 20, 30, 40, 50], "b": [70, 600, 30, -200, 500]} + index = pd.DatetimeIndex( + [ + "2018-02-27 09:01:00", + "2018-02-27 09:02:00", + "2018-02-27 09:03:00", + "2018-02-27 09:04:00", + "2018-02-27 09:05:00", + ] + ) + modin_where = pd.DatetimeIndex(dates) + pandas_where = pandas.DatetimeIndex(dates) + compare_asof(data, index, modin_where, pandas_where, subset) + + +@pytest.mark.parametrize( + "lookup", + [ + [60, 70, 90], + [60.5, 70.5, 100], + ], +) +@pytest.mark.parametrize("subset", ["col2", "col1", ["col1", "col2"], None]) +def test_asof_large(lookup, subset): + data = test_data["float_nan_data"] + index = list(range(NROWS)) + modin_where = pd.Index(lookup) + pandas_where = pandas.Index(lookup) + compare_asof(data, index, modin_where, pandas_where, subset) + + +def compare_asof( + data, index, modin_where: pd.Index, pandas_where: pandas.Index, subset +): + modin_df = pd.DataFrame(data, index=index) + pandas_df = pandas.DataFrame(data, index=index) + df_equals( + modin_df.asof(modin_where, subset=subset), + pandas_df.asof(pandas_where, subset=subset), + ) + df_equals( + modin_df.asof(modin_where.values, subset=subset), + pandas_df.asof(pandas_where.values, subset=subset), + ) + df_equals( + modin_df.asof(list(modin_where.values), subset=subset), + pandas_df.asof(list(pandas_where.values), subset=subset), + ) + df_equals( + modin_df.asof(modin_where.values[0], subset=subset), + pandas_df.asof(pandas_where.values[0], subset=subset), + ) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_first_valid_index(data): modin_df = pd.DataFrame(data) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 468da3c8925..e6fc6daf369 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -845,21 +845,56 @@ def test_asfreq(): series.asfreq(freq="30S") -def test_asof(): - series = pd.Series( - [10, 20, 30, 40, 50], - index=pd.DatetimeIndex( - [ - "2018-02-27 09:01:00", - "2018-02-27 09:02:00", - "2018-02-27 09:03:00", - "2018-02-27 09:04:00", - "2018-02-27 09:05:00", - ] - ), +@pytest.mark.parametrize( + "where", + [ + 20, + 30, + [10, 40], + [20, 30], + [20], + 25, + [25, 45], + [25, 30], + pandas.Index([20, 30]), + pandas.Index([10]), + ], +) +def test_asof(where): + # With NaN: + values = [1, 2, np.nan, 4] + index = [10, 20, 30, 40] + modin_series, pandas_series = pd.Series(values, index=index), pandas.Series( + values, index=index ) - with pytest.warns(UserWarning): - series.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"])) + df_equals(modin_series.asof(where), pandas_series.asof(where)) + + # No NaN: + values = [1, 2, 7, 4] + modin_series, pandas_series = pd.Series(values, index=index), pandas.Series( + values, index=index + ) + df_equals(modin_series.asof(where), pandas_series.asof(where)) + + +@pytest.mark.parametrize( + "where", + [ + 20, + 30, + [10.5, 40.5], + [10], + pandas.Index([20, 30]), + pandas.Index([10.5]), + ], +) +def test_asof_large(where): + values = test_data["float_nan_data"]["col1"] + index = list(range(len(values))) + modin_series, pandas_series = pd.Series(values, index=index), pandas.Series( + values, index=index + ) + df_equals(modin_series.asof(where), pandas_series.asof(where)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)