diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 36147c64c03..57498fa5700 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -22,6 +22,7 @@ dependencies: - nc-time-axis - netcdf4 - numba + - numexpr - numpy - pandas - pint diff --git a/doc/api.rst b/doc/api.rst index 7e13c2feefe..4af9bb01208 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -138,6 +138,7 @@ Indexing Dataset.set_index Dataset.reset_index Dataset.reorder_levels + Dataset.query Missing value handling ---------------------- @@ -321,6 +322,7 @@ Indexing DataArray.set_index DataArray.reset_index DataArray.reorder_levels + DataArray.query Missing value handling ---------------------- diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e0c11fefd36..fbd2617b16f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,10 @@ v0.17.1 (unreleased) New Features ~~~~~~~~~~~~ + +- Add :py:meth:`Dataset.query` and :py:meth:`DataArray.query` which enable indexing + of datasets and data arrays by evaluating query expressions against the values of the + data variables (:pull:`4984`). By `Alistair Miles `_. - Allow passing ``combine_attrs`` to :py:meth:`Dataset.merge` (:pull:`4895`). By `Justus Magin `_. - Support for `dask.graph_manipulation diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 5f944354901..b2fc14f4ba7 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4354,6 +4354,70 @@ def argmax( else: return self._replace_maybe_drop_dims(result) + def query( + self, + queries: Mapping[Hashable, Any] = None, + parser: str = "pandas", + engine: str = None, + missing_dims: str = "raise", + **queries_kwargs: Any, + ) -> "DataArray": + """Return a new data array indexed along the specified + dimension(s), where the indexers are given as strings containing + Python expressions to be evaluated against the values in the array. + + Parameters + ---------- + queries : dict, optional + A dict with keys matching dimensions and values given by strings + containing Python expressions to be evaluated against the data variables + in the dataset. The expressions will be evaluated using the pandas + eval() function, and can contain any valid Python expressions but cannot + contain any Python statements. + parser : {"pandas", "python"}, default: "pandas" + The parser to use to construct the syntax tree from the expression. + The default of 'pandas' parses code slightly different than standard + Python. Alternatively, you can parse an expression using the 'python' + parser to retain strict Python semantics. + engine: {"python", "numexpr", None}, default: None + The engine used to evaluate the expression. Supported engines are: + - None: tries to use numexpr, falls back to python + - "numexpr": evaluates expressions using numexpr + - "python": performs operations as if you had eval’d in top level python + missing_dims : {"raise", "warn", "ignore"}, default: "raise" + What to do if dimensions that should be selected from are not present in the + Dataset: + - "raise": raise an exception + - "warning": raise a warning, and ignore the missing dimensions + - "ignore": ignore the missing dimensions + **queries_kwargs : {dim: query, ...}, optional + The keyword arguments form of ``queries``. + One of queries or queries_kwargs must be provided. + + Returns + ------- + obj : DataArray + A new DataArray with the same contents as this dataset, indexed by + the results of the appropriate queries. + + See Also + -------- + DataArray.isel + Dataset.query + pandas.eval + + """ + + ds = self._to_dataset_whole(shallow_copy=True) + ds = ds.query( + queries=queries, + parser=parser, + engine=engine, + missing_dims=missing_dims, + **queries_kwargs, + ) + return ds[self.name] + # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = utils.UncachedAccessor(StringAccessor) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d76d136be8b..19b3eec5f07 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7001,5 +7001,78 @@ def argmax(self, dim=None, **kwargs): "Dataset.argmin() with a sequence or ... for dim" ) + def query( + self, + queries: Mapping[Hashable, Any] = None, + parser: str = "pandas", + engine: str = None, + missing_dims: str = "raise", + **queries_kwargs: Any, + ) -> "Dataset": + """Return a new dataset with each array indexed along the specified + dimension(s), where the indexers are given as strings containing + Python expressions to be evaluated against the data variables in the + dataset. + + Parameters + ---------- + queries : dict, optional + A dict with keys matching dimensions and values given by strings + containing Python expressions to be evaluated against the data variables + in the dataset. The expressions will be evaluated using the pandas + eval() function, and can contain any valid Python expressions but cannot + contain any Python statements. + parser : {"pandas", "python"}, default: "pandas" + The parser to use to construct the syntax tree from the expression. + The default of 'pandas' parses code slightly different than standard + Python. Alternatively, you can parse an expression using the 'python' + parser to retain strict Python semantics. + engine: {"python", "numexpr", None}, default: None + The engine used to evaluate the expression. Supported engines are: + - None: tries to use numexpr, falls back to python + - "numexpr": evaluates expressions using numexpr + - "python": performs operations as if you had eval’d in top level python + missing_dims : {"raise", "warn", "ignore"}, default: "raise" + What to do if dimensions that should be selected from are not present in the + Dataset: + - "raise": raise an exception + - "warning": raise a warning, and ignore the missing dimensions + - "ignore": ignore the missing dimensions + **queries_kwargs : {dim: query, ...}, optional + The keyword arguments form of ``queries``. + One of queries or queries_kwargs must be provided. + + Returns + ------- + obj : Dataset + A new Dataset with the same contents as this dataset, except each + array and dimension is indexed by the results of the appropriate + queries. + + See Also + -------- + Dataset.isel + pandas.eval + + """ + + # allow queries to be given either as a dict or as kwargs + queries = either_dict_or_kwargs(queries, queries_kwargs, "query") + + # check queries + for dim, expr in queries.items(): + if not isinstance(expr, str): + msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) + + # evaluate the queries to create the indexers + indexers = { + dim: pd.eval(expr, resolvers=[self], parser=parser, engine=engine) + for dim, expr in queries.items() + } + + # apply the selection + return self.isel(indexers, missing_dims=missing_dims) + ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 4b47e1d2c7e..aebcb0f2b8d 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -83,6 +83,7 @@ def LooseVersion(vstring): has_cartopy, requires_cartopy = _importorskip("cartopy") # Need Pint 0.15 for __dask_tokenize__ tests for Quantity wrapped Dask Arrays has_pint_0_15, requires_pint_0_15 = _importorskip("pint", minversion="0.15") +has_numexpr, requires_numexpr = _importorskip("numexpr") # some special cases has_scipy_or_netCDF4 = has_scipy or has_netCDF4 diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 259e91083a7..c38c3656eaf 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd import pytest +from pandas.core.computation.ops import UndefinedVariableError from pandas.tseries.frequencies import to_offset import xarray as xr @@ -39,6 +40,7 @@ requires_dask, requires_iris, requires_numbagg, + requires_numexpr, requires_scipy, requires_sparse, source_ndarray, @@ -4620,6 +4622,74 @@ def test_pad_reflect(self, mode, reflect_type): assert actual.shape == (7, 4, 9) assert_identical(actual, expected) + @pytest.mark.parametrize("parser", ["pandas", "python"]) + @pytest.mark.parametrize( + "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])] + ) + @pytest.mark.parametrize( + "backend", ["numpy", pytest.param("dask", marks=[requires_dask])] + ) + def test_query(self, backend, engine, parser): + """Test querying a dataset.""" + + # setup test data + np.random.seed(42) + a = np.arange(0, 10, 1) + b = np.random.randint(0, 100, size=10) + c = np.linspace(0, 1, 20) + d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( + object + ) + if backend == "numpy": + aa = DataArray(data=a, dims=["x"], name="a") + bb = DataArray(data=b, dims=["x"], name="b") + cc = DataArray(data=c, dims=["y"], name="c") + dd = DataArray(data=d, dims=["z"], name="d") + + elif backend == "dask": + import dask.array as da + + aa = DataArray(data=da.from_array(a, chunks=3), dims=["x"], name="a") + bb = DataArray(data=da.from_array(b, chunks=3), dims=["x"], name="b") + cc = DataArray(data=da.from_array(c, chunks=7), dims=["y"], name="c") + dd = DataArray(data=da.from_array(d, chunks=12), dims=["z"], name="d") + + # query single dim, single variable + actual = aa.query(x="a > 5", engine=engine, parser=parser) + expect = aa.isel(x=(a > 5)) + assert_identical(expect, actual) + + # query single dim, single variable, via dict + actual = aa.query(dict(x="a > 5"), engine=engine, parser=parser) + expect = aa.isel(dict(x=(a > 5))) + assert_identical(expect, actual) + + # query single dim, single variable + actual = bb.query(x="b > 50", engine=engine, parser=parser) + expect = bb.isel(x=(b > 50)) + assert_identical(expect, actual) + + # query single dim, single variable + actual = cc.query(y="c < .5", engine=engine, parser=parser) + expect = cc.isel(y=(c < 0.5)) + assert_identical(expect, actual) + + # query single dim, single string variable + if parser == "pandas": + # N.B., this query currently only works with the pandas parser + # xref https://github.com/pandas-dev/pandas/issues/40436 + actual = dd.query(z='d == "bar"', engine=engine, parser=parser) + expect = dd.isel(z=(d == "bar")) + assert_identical(expect, actual) + + # test error handling + with pytest.raises(ValueError): + aa.query("a > 5") # must be dict or kwargs + with pytest.raises(ValueError): + aa.query(x=(a > 5)) # must be query string + with pytest.raises(UndefinedVariableError): + aa.query(x="spam > 50") # name not present + class TestReduce: @pytest.fixture(autouse=True) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index df7f54ed30a..52df7603034 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import pytest +from pandas.core.computation.ops import UndefinedVariableError from pandas.core.indexes.datetimes import DatetimeIndex from pandas.tseries.frequencies import to_offset @@ -45,6 +46,7 @@ requires_cftime, requires_dask, requires_numbagg, + requires_numexpr, requires_scipy, requires_sparse, source_ndarray, @@ -5807,6 +5809,135 @@ def test_astype_attrs(self): assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, keep_attrs=False).var1.attrs + @pytest.mark.parametrize("parser", ["pandas", "python"]) + @pytest.mark.parametrize( + "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])] + ) + @pytest.mark.parametrize( + "backend", ["numpy", pytest.param("dask", marks=[requires_dask])] + ) + def test_query(self, backend, engine, parser): + """Test querying a dataset.""" + + # setup test data + np.random.seed(42) + a = np.arange(0, 10, 1) + b = np.random.randint(0, 100, size=10) + c = np.linspace(0, 1, 20) + d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( + object + ) + e = np.arange(0, 10 * 20).reshape(10, 20) + f = np.random.normal(0, 1, size=(10, 20, 30)) + if backend == "numpy": + ds = Dataset( + { + "a": ("x", a), + "b": ("x", b), + "c": ("y", c), + "d": ("z", d), + "e": (("x", "y"), e), + "f": (("x", "y", "z"), f), + } + ) + elif backend == "dask": + ds = Dataset( + { + "a": ("x", da.from_array(a, chunks=3)), + "b": ("x", da.from_array(b, chunks=3)), + "c": ("y", da.from_array(c, chunks=7)), + "d": ("z", da.from_array(d, chunks=12)), + "e": (("x", "y"), da.from_array(e, chunks=(3, 7))), + "f": (("x", "y", "z"), da.from_array(f, chunks=(3, 7, 12))), + } + ) + + # query single dim, single variable + actual = ds.query(x="a > 5", engine=engine, parser=parser) + expect = ds.isel(x=(a > 5)) + assert_identical(expect, actual) + + # query single dim, single variable, via dict + actual = ds.query(dict(x="a > 5"), engine=engine, parser=parser) + expect = ds.isel(dict(x=(a > 5))) + assert_identical(expect, actual) + + # query single dim, single variable + actual = ds.query(x="b > 50", engine=engine, parser=parser) + expect = ds.isel(x=(b > 50)) + assert_identical(expect, actual) + + # query single dim, single variable + actual = ds.query(y="c < .5", engine=engine, parser=parser) + expect = ds.isel(y=(c < 0.5)) + assert_identical(expect, actual) + + # query single dim, single string variable + if parser == "pandas": + # N.B., this query currently only works with the pandas parser + # xref https://github.com/pandas-dev/pandas/issues/40436 + actual = ds.query(z='d == "bar"', engine=engine, parser=parser) + expect = ds.isel(z=(d == "bar")) + assert_identical(expect, actual) + + # query single dim, multiple variables + actual = ds.query(x="(a > 5) & (b > 50)", engine=engine, parser=parser) + expect = ds.isel(x=((a > 5) & (b > 50))) + assert_identical(expect, actual) + + # query single dim, multiple variables with computation + actual = ds.query(x="(a * b) > 250", engine=engine, parser=parser) + expect = ds.isel(x=(a * b) > 250) + assert_identical(expect, actual) + + # check pandas query syntax is supported + if parser == "pandas": + actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) + expect = ds.isel(x=((a > 5) & (b > 50))) + assert_identical(expect, actual) + + # query multiple dims via kwargs + actual = ds.query(x="a > 5", y="c < .5", engine=engine, parser=parser) + expect = ds.isel(x=(a > 5), y=(c < 0.5)) + assert_identical(expect, actual) + + # query multiple dims via kwargs + if parser == "pandas": + actual = ds.query( + x="a > 5", y="c < .5", z="d == 'bar'", engine=engine, parser=parser + ) + expect = ds.isel(x=(a > 5), y=(c < 0.5), z=(d == "bar")) + assert_identical(expect, actual) + + # query multiple dims via dict + actual = ds.query(dict(x="a > 5", y="c < .5"), engine=engine, parser=parser) + expect = ds.isel(dict(x=(a > 5), y=(c < 0.5))) + assert_identical(expect, actual) + + # query multiple dims via dict + if parser == "pandas": + actual = ds.query( + dict(x="a > 5", y="c < .5", z="d == 'bar'"), + engine=engine, + parser=parser, + ) + expect = ds.isel(dict(x=(a > 5), y=(c < 0.5), z=(d == "bar"))) + assert_identical(expect, actual) + + # test error handling + with pytest.raises(ValueError): + ds.query("a > 5") # must be dict or kwargs + with pytest.raises(ValueError): + ds.query(x=(a > 5)) # must be query string + with pytest.raises(IndexError): + ds.query(y="a > 5") # wrong length dimension + with pytest.raises(IndexError): + ds.query(x="c < .5") # wrong length dimension + with pytest.raises(IndexError): + ds.query(x="e > 100") # wrong number of dimensions + with pytest.raises(UndefinedVariableError): + ds.query(x="spam > 50") # name not present + # Py.test tests