Skip to content

Commit

Permalink
FEAT-modin-project#1222: Implement DataFrame.asof() without Pandas fallback (modin-project#1989)
Browse files: browse the repository at this point in the history

Signed-off-by: Itamar Turner-Trauring <itamar@itamarst.org>
  • Loading branch information
itamarst authored and aregm committed Sep 16, 2020
1 parent 81a299f commit 07fbd71
Show file tree
Hide file tree
Showing 6 changed files with 162 additions and 34 deletions.
2 changes: 1 addition & 1 deletion docs/supported_apis/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ default to pandas.
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``asfreq`` | `asfreq`_ | D | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``asof`` | `asof`_ | D | |
| ``asof`` | `asof`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``assign`` | `assign`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
Expand Down
2 changes: 1 addition & 1 deletion docs/supported_apis/series_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ the related section on `Defaulting to pandas`_.
+-----------------------------+---------------------------------+
| ``asobject`` | D |
+-----------------------------+---------------------------------+
| ``asof`` | D |
| ``asof`` | Y |
+-----------------------------+---------------------------------+
| ``astype`` | Y |
+-----------------------------+---------------------------------+
Expand Down
24 changes: 23 additions & 1 deletion modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,29 @@ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None):
)

def asof(self, where, subset=None):
    """
    Return the last row(s) without any NaNs at or before `where`.

    Parameters
    ----------
    where : scalar or list-like (including ``pandas.Index``)
        Label(s) to look up in this object's index.
    subset : label or list of labels, optional
        Only relevant for DataFrames: the column(s) inspected for NaNs.
        When omitted, every column is inspected.

    Returns
    -------
    scalar, Series or DataFrame
        * scalar ``where`` on a Series -> scalar
        * scalar ``where`` on a DataFrame -> Series (one entry per column)
        * list-like ``where`` -> same kind as `self`, indexed by `where`
    """
    scalar = not is_list_like(where)
    if isinstance(where, pandas.Index):
        # Prevent accidental mutation of original:
        where = where.copy()
    else:
        where = pandas.Index([where] if scalar else where)

    # `subset` narrows the NaN check only; the values returned below are
    # still taken from the full object.
    data = self if subset is None else self[subset]
    no_na_index = data.dropna().index
    # For every requested label, find the closest NaN-free label at or
    # before it (``Index.asof`` yields NaN when there is none).
    new_index = pandas.Index([no_na_index.asof(i) for i in where])
    result = self.reindex(new_index)
    result.index = where

    if scalar:
        # Collapse the row axis only. A bare ``squeeze()`` would also
        # collapse the column axis of a one-column DataFrame and hand back
        # a scalar where a Series is expected.
        result = result.squeeze(axis=0)
    return result

def astype(self, dtype, copy=True, errors="raise"):
col_dtypes = {}
Expand Down
17 changes: 0 additions & 17 deletions modin/pandas/test/dataframe/test_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,23 +72,6 @@ def test_asfreq():
df.asfreq(freq="30S")


def test_asof():
    """``DataFrame.asof`` with a ``DatetimeIndex`` lookup must emit a ``UserWarning``."""
    timestamps = [
        "2018-02-27 09:01:00",
        "2018-02-27 09:02:00",
        "2018-02-27 09:03:00",
        "2018-02-27 09:04:00",
        "2018-02-27 09:05:00",
    ]
    columns = {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]}
    frame = pd.DataFrame(columns, index=pd.DatetimeIndex(timestamps))
    with pytest.warns(UserWarning):
        # The lookup index is built inside the ``warns`` scope, as before.
        frame.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"]))


def test_assign():
data = test_data_values[0]
modin_df = pd.DataFrame(data)
Expand Down
88 changes: 88 additions & 0 deletions modin/pandas/test/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import sys

from modin.pandas.test.utils import (
NROWS,
RAND_LOW,
RAND_HIGH,
df_equals,
Expand All @@ -41,6 +42,93 @@
matplotlib.use("Agg")


@pytest.mark.parametrize(
    "dates",
    [
        ["2018-02-27 09:03:30", "2018-02-27 09:04:30"],
        ["2018-02-27 09:03:00", "2018-02-27 09:05:00"],
    ],
)
@pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None])
def test_asof_with_nan(dates, subset):
    """Compare modin and pandas ``asof`` on a frame whose ``b`` column is mostly missing."""
    frame_data = {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]}
    timestamps = [
        "2018-02-27 09:01:00",
        "2018-02-27 09:02:00",
        "2018-02-27 09:03:00",
        "2018-02-27 09:04:00",
        "2018-02-27 09:05:00",
    ]
    # The same lookup dates, once as a modin index and once as a pandas index.
    compare_asof(
        frame_data,
        pd.DatetimeIndex(timestamps),
        pd.DatetimeIndex(dates),
        pandas.DatetimeIndex(dates),
        subset,
    )


@pytest.mark.parametrize(
    "dates",
    [
        ["2018-02-27 09:03:30", "2018-02-27 09:04:30"],
        ["2018-02-27 09:03:00", "2018-02-27 09:05:00"],
    ],
)
@pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None])
def test_asof_without_nan(dates, subset):
    """Compare modin and pandas ``asof`` on a frame that has no missing values."""
    frame_data = {"a": [10, 20, 30, 40, 50], "b": [70, 600, 30, -200, 500]}
    timestamps = [
        "2018-02-27 09:01:00",
        "2018-02-27 09:02:00",
        "2018-02-27 09:03:00",
        "2018-02-27 09:04:00",
        "2018-02-27 09:05:00",
    ]
    # The same lookup dates, once as a modin index and once as a pandas index.
    compare_asof(
        frame_data,
        pd.DatetimeIndex(timestamps),
        pd.DatetimeIndex(dates),
        pandas.DatetimeIndex(dates),
        subset,
    )


@pytest.mark.parametrize(
    "lookup",
    [
        [60, 70, 90],
        [60.5, 70.5, 100],
    ],
)
@pytest.mark.parametrize("subset", ["col2", "col1", ["col1", "col2"], None])
def test_asof_large(lookup, subset):
    """Compare modin and pandas ``asof`` on the shared ``float_nan_data`` fixture."""
    row_labels = list(range(NROWS))
    compare_asof(
        test_data["float_nan_data"],
        row_labels,
        pd.Index(lookup),
        pandas.Index(lookup),
        subset,
    )


def compare_asof(
    data, index, modin_where: pd.Index, pandas_where: pandas.Index, subset
):
    """
    Assert that modin and pandas ``DataFrame.asof`` agree for several forms of `where`.

    Both frames are built from the same `data` and `index`. The lookup is then
    passed, in order, as the index object itself, as its ``.values`` array, as
    a plain list of those values, and as the first value alone.
    """
    modin_df = pd.DataFrame(data, index=index)
    pandas_df = pandas.DataFrame(data, index=index)

    where_forms = (
        lambda idx: idx,
        lambda idx: idx.values,
        lambda idx: list(idx.values),
        lambda idx: idx.values[0],
    )
    for as_form in where_forms:
        df_equals(
            modin_df.asof(as_form(modin_where), subset=subset),
            pandas_df.asof(as_form(pandas_where), subset=subset),
        )


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_first_valid_index(data):
modin_df = pd.DataFrame(data)
Expand Down
63 changes: 49 additions & 14 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,21 +845,56 @@ def test_asfreq():
series.asfreq(freq="30S")


@pytest.mark.parametrize(
    "where",
    [
        20,
        30,
        [10, 40],
        [20, 30],
        [20],
        25,
        [25, 45],
        [25, 30],
        pandas.Index([20, 30]),
        pandas.Index([10]),
    ],
)
def test_asof(where):
    """Compare modin and pandas ``Series.asof`` for scalar, list and Index lookups."""
    index = [10, 20, 30, 40]
    value_sets = (
        [1, 2, np.nan, 4],  # With NaN
        [1, 2, 7, 4],  # No NaN
    )
    for values in value_sets:
        modin_series = pd.Series(values, index=index)
        pandas_series = pandas.Series(values, index=index)
        df_equals(modin_series.asof(where), pandas_series.asof(where))


@pytest.mark.parametrize(
    "where",
    [
        20,
        30,
        [10.5, 40.5],
        [10],
        pandas.Index([20, 30]),
        pandas.Index([10.5]),
    ],
)
def test_asof_large(where):
    """Compare modin and pandas ``Series.asof`` on ``col1`` of the ``float_nan_data`` fixture."""
    values = test_data["float_nan_data"]["col1"]
    index = list(range(len(values)))
    modin_series = pd.Series(values, index=index)
    pandas_series = pandas.Series(values, index=index)
    df_equals(modin_series.asof(where), pandas_series.asof(where))


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
Expand Down

0 comments on commit 07fbd71

Please sign in to comment.