Skip to content

Support duck arrays by default #132

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ci/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ dependencies:
- dask-core
- pip
- xarray
- numpy>=1.20
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Illviljan i'm bumping minimum numpy here. So PRs to improve the typing using the new numpy types would be very welcome!

- numpydoc
- numpy_groupies
- toolz
Expand Down
1 change: 1 addition & 0 deletions ci/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ dependencies:
- dask-core
- netcdf4
- pandas
- numpy>=1.20
- pip
- pytest
- pytest-cov
Expand Down
3 changes: 2 additions & 1 deletion ci/minimal-requirements.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ dependencies:
- pytest
- pytest-cov
- pytest-xdist
- numpy_groupies>=0.9.15
- numpy==1.20
- numpy_groupies==0.9.15
- pandas
- pooch
- toolz
1 change: 1 addition & 0 deletions ci/no-dask.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ dependencies:
- codecov
- netcdf4
- pandas
- numpy>=1.20
- pip
- pytest
- pytest-cov
Expand Down
1 change: 1 addition & 0 deletions ci/no-xarray.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ dependencies:
- codecov
- netcdf4
- pandas
- numpy>=1.20
- pip
- pytest
- pytest-cov
Expand Down
27 changes: 20 additions & 7 deletions flox/aggregate_flox.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,21 @@
from .xrutils import isnull


def _prepare_for_flox(group_idx, array):
"""
Sort the input array once to save time.
"""
assert array.shape[-1] == group_idx.shape[0]
issorted = (group_idx[:-1] <= group_idx[1:]).all()
if issorted:
ordered_array = array
else:
perm = group_idx.argsort(kind="stable")
group_idx = group_idx[..., perm]
ordered_array = array[..., perm]
return group_idx, ordered_array


def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dtype=None, out=None):
"""
most of this code is from shoyer's gist
Expand All @@ -13,7 +28,7 @@ def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dt
# assumes input is sorted, which I do in core._prepare_for_flox
aux = group_idx

flag = np.concatenate(([True], aux[1:] != aux[:-1]))
flag = np.concatenate((np.array([True], like=array), aux[1:] != aux[:-1]))
uniques = aux[flag]
(inv_idx,) = flag.nonzero()

Expand All @@ -25,11 +40,11 @@ def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dt
if out is None:
out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype)

if (len(uniques) == size) and (uniques == np.arange(size)).all():
if (len(uniques) == size) and (uniques == np.arange(size, like=array)).all():
# The previous version of this if condition
# ((uniques[1:] - uniques[:-1]) == 1).all():
# does not work when group_idx is [1, 2] for e.g.
# This happens during binning
# This happens during binning
op.reduceat(array, inv_idx, axis=axis, dtype=dtype, out=out)
else:
out[..., uniques] = op.reduceat(array, inv_idx, axis=axis, dtype=dtype)
Expand Down Expand Up @@ -91,16 +106,14 @@ def nanlen(group_idx, array, *args, **kwargs):
def mean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None):
if fill_value is None:
fill_value = 0
out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype)
sum(group_idx, array, axis=axis, size=size, dtype=dtype, out=out)
out = sum(group_idx, array, axis=axis, size=size, dtype=dtype, fill_value=fill_value)
out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0)
return out


def nanmean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None):
if fill_value is None:
fill_value = 0
out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype)
nansum(group_idx, array, size=size, axis=axis, dtype=dtype, out=out)
out = nansum(group_idx, array, size=size, axis=axis, dtype=dtype, fill_value=fill_value)
out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0)
return out
2 changes: 2 additions & 0 deletions flox/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ def generic_aggregate(
f"Expected engine to be one of ['flox', 'numpy', 'numba']. Received {engine} instead."
)

group_idx = np.asarray(group_idx, like=array)

return method(
group_idx, array, axis=axis, size=size, fill_value=fill_value, dtype=dtype, **kwargs
)
Expand Down
23 changes: 5 additions & 18 deletions flox/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import toolz as tlz

from . import xrdtypes
from .aggregate_flox import _prepare_for_flox
from .aggregations import (
Aggregation,
_atleast_1d,
Expand Down Expand Up @@ -44,21 +45,6 @@ def _is_arg_reduction(func: str | Aggregation) -> bool:
return False


def _prepare_for_flox(group_idx, array):
    """
    Sort the input array once to save time.

    Returns ``(group_idx, ordered_array)`` with both permuted so that equal
    group labels are contiguous along the last axis; already-sorted input is
    passed through unchanged.
    """
    # Last axis of ``array`` must line up one-to-one with the group labels.
    assert array.shape[-1] == group_idx.shape[0]
    issorted = (group_idx[:-1] <= group_idx[1:]).all()
    if issorted:
        # No copy needed in the common already-sorted case.
        ordered_array = array
    else:
        # Stable sort preserves the relative order of equal labels.
        perm = group_idx.argsort(kind="stable")
        group_idx = group_idx[..., perm]
        ordered_array = array[..., perm]
    return group_idx, ordered_array


def _get_expected_groups(by, sort, *, raise_if_dask=True) -> pd.Index | None:
if is_duck_dask_array(by):
if raise_if_dask:
Expand Down Expand Up @@ -1367,7 +1353,7 @@ def groupby_reduce(
min_count: int | None = None,
split_out: int = 1,
method: str = "map-reduce",
engine: str = "flox",
engine: str = "numpy",
reindex: bool | None = None,
finalize_kwargs: Mapping | None = None,
) -> tuple[DaskArray, np.ndarray | DaskArray]:
Expand Down Expand Up @@ -1434,13 +1420,14 @@ def groupby_reduce(
and is identical to xarray's default strategy.
engine : {"flox", "numpy", "numba"}, optional
Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk:
* ``"numpy"``:
Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``.
This is the default choice because it works for most array types.
* ``"flox"``:
Use an internal implementation where the data is sorted so that
all members of a group occur sequentially, and then numpy.ufunc.reduceat
is used for the reduction. This will fall back to ``numpy_groupies.aggregate_numpy``
for a reduction that is not yet implemented.
* ``"numpy"``:
Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``.
* ``"numba"``:
Use the implementations in ``numpy_groupies.aggregate_numba``.
reindex : bool, optional
Expand Down
7 changes: 4 additions & 3 deletions flox/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def xarray_reduce(
split_out: int = 1,
fill_value=None,
method: str = "map-reduce",
engine: str = "flox",
engine: str = "numpy",
keep_attrs: bool | None = True,
skipna: bool | None = None,
min_count: int | None = None,
Expand Down Expand Up @@ -125,13 +125,14 @@ def xarray_reduce(
and is identical to xarray's default strategy.
engine : {"flox", "numpy", "numba"}, optional
Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk:
* ``"numpy"``:
Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``.
This is the default choice because it works for most array types.
* ``"flox"``:
Use an internal implementation where the data is sorted so that
all members of a group occur sequentially, and then numpy.ufunc.reduceat
is used for the reduction. This will fall back to ``numpy_groupies.aggregate_numpy``
for a reduction that is not yet implemented.
* ``"numpy"``:
Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``.
* ``"numba"``:
Use the implementations in ``numpy_groupies.aggregate_numba``.
keep_attrs : bool, optional
Expand Down
3 changes: 2 additions & 1 deletion flox/xrutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ def is_scalar(value: Any, include_0d: bool = True) -> bool:


def isnull(data):
data = np.asarray(data)
if not is_duck_array(data):
data = np.asarray(data)
scalar_type = data.dtype.type
if issubclass(scalar_type, (np.datetime64, np.timedelta64)):
# datetime types use NaT for null
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ include_package_data = True
python_requires = >=3.8
install_requires =
pandas
numpy >= '1.20'
numpy_groupies >= '0.9.15'
toolz

Expand Down