forked from pydata/xarray

Merge remote-tracking branch 'upstream/master' into dataset/quiver
* upstream/master:
  speed up the repr for big MultiIndex objects (pydata#4846)
  dim -> coord in DataArray.integrate (pydata#3993)
  WIP: backend interface, now it uses subclassing  (pydata#4836)
  weighted: small improvements (pydata#4818)
  Update related-projects.rst (pydata#4844)
  iris update doc url (pydata#4845)
  Faster unstacking (pydata#4746)
  Allow swap_dims to take kwargs (pydata#4841)
  Move skip ci instructions to contributing guide (pydata#4829)
  fix issues in drop_sel and drop_isel (pydata#4828)
  Bugfix in list_engine (pydata#4811)
  Add drop_isel (pydata#4819)
  Fix RST.
  Remove the references to `_file_obj` outside low level code paths, change to `_close` (pydata#4809)
dcherian committed Jan 29, 2021
2 parents e795672 + 39048f9 commit e0f227f
Showing 34 changed files with 1,065 additions and 601 deletions.
8 changes: 0 additions & 8 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -5,11 +5,3 @@
- [ ] Passes `pre-commit run --all-files`
- [ ] User visible changes (including notable bug fixes) are documented in `whats-new.rst`
- [ ] New functions/methods are listed in `api.rst`


<sub>
<h3>
Overriding CI behaviors
</h3>
By default, the upstream dev CI is disabled on pull request and push events. You can override this behavior per commit by adding a <tt>[test-upstream]</tt> tag to the first line of the commit message. For documentation-only commits, you can skip the CI per commit by adding a <tt>[skip-ci]</tt> tag to the first line of the commit message
</sub>
18 changes: 18 additions & 0 deletions asv_bench/benchmarks/repr.py
@@ -0,0 +1,18 @@
import pandas as pd

import xarray as xr


class ReprMultiIndex:
def setup(self, key):
index = pd.MultiIndex.from_product(
[range(10000), range(10000)], names=("level_0", "level_1")
)
series = pd.Series(range(100000000), index=index)
self.da = xr.DataArray(series)

def time_repr(self):
repr(self.da)

def time_repr_html(self):
self.da._repr_html_()
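
A smaller, runnable version of the object this benchmark exercises (illustrative sketch, not part of the diff):

```python
import pandas as pd

import xarray as xr

# Same construction as ReprMultiIndex.setup, at a size that is quick to build.
index = pd.MultiIndex.from_product(
    [range(100), range(100)], names=("level_0", "level_1")
)
series = pd.Series(range(10_000), index=index)
da = xr.DataArray(series)

repr(da)          # plain-text repr, the target of pydata#4846
da._repr_html_()  # HTML repr used by Jupyter notebooks
```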
15 changes: 10 additions & 5 deletions asv_bench/benchmarks/unstacking.py
@@ -7,18 +7,23 @@

class Unstacking:
def setup(self):
data = np.random.RandomState(0).randn(1, 1000, 500)
self.ds = xr.DataArray(data).stack(flat_dim=["dim_1", "dim_2"])
data = np.random.RandomState(0).randn(500, 1000)
self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
self.da_missing = self.da_full[:-1]
self.df_missing = self.da_missing.to_pandas()

def time_unstack_fast(self):
self.ds.unstack("flat_dim")
self.da_full.unstack("flat_dim")

def time_unstack_slow(self):
self.ds[:, ::-1].unstack("flat_dim")
self.da_missing.unstack("flat_dim")

def time_unstack_pandas_slow(self):
self.df_missing.unstack()


class UnstackingDask(Unstacking):
def setup(self, *args, **kwargs):
requires_dask()
super().setup(**kwargs)
self.ds = self.ds.chunk({"flat_dim": 50})
self.da_full = self.da_full.chunk({"flat_dim": 50})
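
For reference, a minimal round-trip of the pattern these benchmarks time (illustrative sketch, not part of the diff): stacking all dimensions with ``flat_dim=[...]`` and then dropping an element forces ``unstack`` down the missing-value path.

```python
import numpy as np

import xarray as xr

data = np.random.RandomState(0).randn(500, 1000)
da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])

# Dropping the last element leaves a hole in the MultiIndex, so unstacking
# has to fill the missing (a, b) combination, here with NaN.
da_missing = da_full[:-1]
unstacked = da_missing.unstack("flat_dim")

assert unstacked.shape == (500, 1000)
assert int(unstacked.isnull().sum()) == 1
```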
2 changes: 2 additions & 0 deletions doc/api.rst
@@ -126,6 +126,7 @@ Indexing
Dataset.isel
Dataset.sel
Dataset.drop_sel
Dataset.drop_isel
Dataset.head
Dataset.tail
Dataset.thin
@@ -308,6 +309,7 @@ Indexing
DataArray.isel
DataArray.sel
DataArray.drop_sel
DataArray.drop_isel
DataArray.head
DataArray.tail
DataArray.thin
2 changes: 1 addition & 1 deletion doc/conf.py
@@ -411,7 +411,7 @@
intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
"iris": ("https://scitools.org.uk/iris/docs/latest", None),
"iris": ("https://scitools-iris.readthedocs.io/en/latest", None),
"numpy": ("https://numpy.org/doc/stable", None),
"scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
"numba": ("https://numba.pydata.org/numba-doc/latest", None),
1 change: 1 addition & 0 deletions doc/contributing.rst
@@ -836,6 +836,7 @@ PR checklist
- Write new tests if needed. See `"Test-driven development/code writing" <https://xarray.pydata.org/en/stable/contributing.html#test-driven-development-code-writing>`_.
- Test the code using `Pytest <http://doc.pytest.org/en/latest/>`_. Running all tests (type ``pytest`` in the root directory) takes a while, so feel free to only run the tests you think are needed based on your PR (example: ``pytest xarray/tests/test_dataarray.py``). CI will catch any failing tests.
- By default, the upstream dev CI is disabled on pull request and push events. You can override this behavior per commit by adding a ``[test-upstream]`` tag to the first line of the commit message. For documentation-only commits, you can skip the CI per commit by adding a ``[skip-ci]`` tag to the first line of the commit message.
- **Properly format your code** and verify that it passes the formatting guidelines set by `Black <https://black.readthedocs.io/en/stable/>`_ and `Flake8 <http://flake8.pycqa.org/en/latest/>`_. See `"Code formatting" <https://xarray.pydata.org/en/stable/contributing.html#code-formatting>`_. You can use `pre-commit <https://pre-commit.com/>`_ to run these automatically on each commit.
2 changes: 1 addition & 1 deletion doc/faq.rst
@@ -166,7 +166,7 @@ different approaches to handling metadata: Iris strictly interprets
`CF conventions`_. Iris particularly shines at mapping, thanks to its
integration with Cartopy_.

.. _Iris: http://scitools.org.uk/iris/
.. _Iris: https://scitools-iris.readthedocs.io/en/stable/
.. _Cartopy: http://scitools.org.uk/cartopy/docs/latest/

`UV-CDAT`__ is another Python library that implements in-memory netCDF-like
1 change: 1 addition & 0 deletions doc/related-projects.rst
@@ -15,6 +15,7 @@ Geosciences
- `aospy <https://aospy.readthedocs.io>`_: Automated analysis and management of gridded climate data.
- `climpred <https://climpred.readthedocs.io>`_: Analysis of ensemble forecast models for climate prediction.
- `geocube <https://corteva.github.io/geocube>`_: Tool to convert geopandas vector data into rasterized xarray data.
- `GeoWombat <https://github.com/jgrss/geowombat>`_: Utilities for analysis of remotely sensed and gridded raster data at scale (easily tame Landsat, Sentinel, Quickbird, and PlanetScope).
- `infinite-diff <https://github.com/spencerahill/infinite-diff>`_: xarray-based finite-differencing, focused on gridded climate/meteorology data
- `marc_analysis <https://github.com/darothen/marc_analysis>`_: Analysis package for CESM/MARC experiments and output.
- `MetPy <https://unidata.github.io/MetPy/dev/index.html>`_: A collection of tools in Python for reading, visualizing, and performing calculations with weather data.
25 changes: 22 additions & 3 deletions doc/whats-new.rst
@@ -17,7 +17,7 @@ What's New
.. _whats-new.0.16.3:

v0.16.3 (unreleased)
v0.17.0 (unreleased)
--------------------

Breaking changes
@@ -39,16 +39,32 @@ Breaking changes
always be set such that ``int64`` values can be used. In the past, no units
finer than "seconds" were chosen, which would sometimes mean that ``float64``
values were required, which would lead to inaccurate I/O round-trips.
- remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull: `4725`).
By `Aureliana Barghini <https://github.com/aurghs>`_
- remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull:`4725`).
By `Aureliana Barghini <https://github.com/aurghs>`_.

Deprecations
~~~~~~~~~~~~

- ``dim`` argument to :py:meth:`DataArray.integrate` is being deprecated in
favour of a ``coord`` argument, for consistency with :py:meth:`Dataset.integrate`.
For now using ``dim`` issues a ``FutureWarning``. By `Tom Nicholas <https://github.com/TomNicholas>`_.
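
A minimal sketch of the renamed keyword (illustrative only, not part of the diff; assumes a 1-D array with an ``x`` coordinate):

```python
import numpy as np

import xarray as xr

da = xr.DataArray(
    np.arange(5.0), dims="x", coords={"x": np.linspace(0.0, 1.0, 5)}
)

da.integrate(coord="x")  # new spelling, consistent with Dataset.integrate
da.integrate(dim="x")    # still accepted for now, but issues a FutureWarning
```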


New Features
~~~~~~~~~~~~
- Significantly higher ``unstack`` performance on numpy-backed arrays which
contain missing values; 8x faster in our benchmark, and 2x faster than pandas.
(:pull:`4746`);
By `Maximilian Roos <https://github.com/max-sixty>`_.

- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables.
By `Deepak Cherian <https://github.com/dcherian>`_
- Add :py:meth:`Dataset.plot.quiver` for quiver plots with :py:class:`Dataset` variables.
By `Deepak Cherian <https://github.com/dcherian>`_
By `Deepak Cherian <https://github.com/dcherian>`_.
- :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims
in the form of kwargs as well as a dict, like most similar methods.
By `Maximilian Roos <https://github.com/max-sixty>`_.
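
A hedged sketch of the two additions above (illustrative only, not part of the diff; the quiver keyword names ``x``/``y``/``u``/``v`` follow the ``dataset/quiver`` branch and may change before release):

```python
import numpy as np

import xarray as xr

ds = xr.Dataset(
    {
        "u": (("y", "x"), np.random.rand(4, 5)),
        "v": (("y", "x"), np.random.rand(4, 5)),
    },
    coords={
        "x": np.arange(5),
        "y": np.arange(4),
        "lon": ("x", np.linspace(0.0, 10.0, 5)),
    },
)

# swap_dims now accepts keyword arguments as well as a dict
assert ds.swap_dims(x="lon").equals(ds.swap_dims({"x": "lon"}))

# quiver plot built from two Dataset variables (requires matplotlib)
ds.plot.quiver(x="x", y="y", u="u", v="v")
```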

Bug fixes
~~~~~~~~~
@@ -82,6 +98,7 @@ Bug fixes
- Expand user directory paths (e.g. ``~/``) in :py:func:`open_mfdataset` and
:py:meth:`Dataset.to_zarr` (:issue:`4783`, :pull:`4795`).
By `Julien Seguinot <https://github.com/juseg>`_.
- Add :py:meth:`Dataset.drop_isel` and :py:meth:`DataArray.drop_isel` (:issue:`4658`, :pull:`4819`). By `Daniel Mesejo <https://github.com/mesejo>`_.
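
A minimal sketch of the new method (illustrative only, not part of the diff): unlike ``drop_sel``, which needs coordinate labels, ``drop_isel`` drops entries by integer position and therefore also works on dimensions without coordinates.

```python
import numpy as np

import xarray as xr

ds = xr.Dataset({"var": (("x", "y"), np.arange(12).reshape(4, 3))})

ds.drop_isel(x=[0, 2])   # drop the first and third positions along "x"
ds.drop_isel({"y": 1})   # dict form, mirroring drop_sel
```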

Documentation
~~~~~~~~~~~~~
@@ -110,6 +127,8 @@ Internal Changes
By `Maximilian Roos <https://github.com/max-sixty>`_.
- Speed up attribute style access (e.g. ``ds.somevar`` instead of ``ds["somevar"]``) and tab completion
in ipython (:issue:`4741`, :pull:`4742`). By `Richard Kleijn <https://github.com/rhkleijn>`_.
- Added the ``set_close`` method to ``Dataset`` and ``DataArray`` for backends to specify how to voluntarily release
all resources (:pull:`4809`). By `Alessandro Amici <https://github.com/alexamici>`_.

.. _whats-new.0.16.2:

25 changes: 9 additions & 16 deletions xarray/backends/api.py
@@ -522,7 +522,7 @@ def maybe_decode_store(store, chunks):

else:
ds2 = ds
ds2._file_obj = ds._file_obj
ds2.set_close(ds._close)
return ds2

filename_or_obj = _normalize_path(filename_or_obj)
@@ -701,7 +701,7 @@ def open_dataarray(
else:
(data_array,) = dataset.data_vars.values()

data_array._file_obj = dataset._file_obj
data_array.set_close(dataset._close)

# Reset names if they were changed during saving
# to ensure that we can 'roundtrip' perfectly
@@ -715,17 +715,6 @@
return data_array


class _MultiFileCloser:
__slots__ = ("file_objs",)

def __init__(self, file_objs):
self.file_objs = file_objs

def close(self):
for f in self.file_objs:
f.close()


def open_mfdataset(
paths,
chunks=None,
@@ -918,14 +907,14 @@ def open_mfdataset(
getattr_ = getattr

datasets = [open_(p, **open_kwargs) for p in paths]
file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
closers = [getattr_(ds, "_close") for ds in datasets]
if preprocess is not None:
datasets = [preprocess(ds) for ds in datasets]

if parallel:
# calling compute here will return the datasets/file_objs lists,
# the underlying datasets will still be stored as dask arrays
datasets, file_objs = dask.compute(datasets, file_objs)
datasets, closers = dask.compute(datasets, closers)

# Combine all datasets, closing them in case of a ValueError
try:
@@ -963,7 +952,11 @@ def open_mfdataset(
ds.close()
raise

combined._file_obj = _MultiFileCloser(file_objs)
def multi_file_closer():
for closer in closers:
closer()

combined.set_close(multi_file_closer)

# read global attributes from the attrs_file or from the first dataset
if attrs_file is not None:
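A minimal sketch (not part of the diff) of the pattern that replaces ``_MultiFileCloser``: each dataset's private ``_close`` callback is collected, and one closure registered via ``set_close`` releases everything when the combined dataset is closed.

```python
import xarray as xr

datasets = [xr.Dataset({"a": ("x", [float(i)])}) for i in range(3)]

# For file-backed datasets these would be the store-closing callables;
# purely in-memory datasets have no closer, hence the None check below.
closers = [ds._close for ds in datasets]

combined = xr.concat(datasets, dim="x")

def multi_file_closer():
    for closer in closers:
        if closer is not None:
            closer()

combined.set_close(multi_file_closer)
combined.close()  # invokes multi_file_closer
```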
2 changes: 1 addition & 1 deletion xarray/backends/apiv2.py
@@ -90,7 +90,7 @@ def _dataset_from_backend_dataset(
**extra_tokens,
)

ds._file_obj = backend_ds._file_obj
ds.set_close(backend_ds._close)

# Ensure source filename always stored in dataset object (GH issue #2550)
if "source" not in ds.encoding:
124 changes: 68 additions & 56 deletions xarray/backends/cfgrib_.py
@@ -5,9 +5,22 @@
from ..core import indexing
from ..core.utils import Frozen, FrozenDict, close_on_error
from ..core.variable import Variable
from .common import AbstractDataStore, BackendArray, BackendEntrypoint
from .common import (
BACKEND_ENTRYPOINTS,
AbstractDataStore,
BackendArray,
BackendEntrypoint,
)
from .locks import SerializableLock, ensure_lock
from .store import open_backend_dataset_store
from .store import StoreBackendEntrypoint

try:
import cfgrib

has_cfgrib = True
except ModuleNotFoundError:
has_cfgrib = False


# FIXME: Add a dedicated lock, even if ecCodes is supposed to be thread-safe
# in most circumstances. See:
@@ -38,7 +51,6 @@ class CfGribDataStore(AbstractDataStore):
"""

def __init__(self, filename, lock=None, **backend_kwargs):
import cfgrib

if lock is None:
lock = ECCODES_LOCK
@@ -74,58 +86,58 @@ def get_encoding(self):
return encoding


def guess_can_open_cfgrib(store_spec):
try:
_, ext = os.path.splitext(store_spec)
except TypeError:
return False
return ext in {".grib", ".grib2", ".grb", ".grb2"}


def open_backend_dataset_cfgrib(
filename_or_obj,
*,
mask_and_scale=True,
decode_times=None,
concat_characters=None,
decode_coords=None,
drop_variables=None,
use_cftime=None,
decode_timedelta=None,
lock=None,
indexpath="{path}.{short_hash}.idx",
filter_by_keys={},
read_keys=[],
encode_cf=("parameter", "time", "geography", "vertical"),
squeeze=True,
time_dims=("time", "step"),
):

store = CfGribDataStore(
class CfgribfBackendEntrypoint(BackendEntrypoint):
def guess_can_open(self, store_spec):
try:
_, ext = os.path.splitext(store_spec)
except TypeError:
return False
return ext in {".grib", ".grib2", ".grb", ".grb2"}

def open_dataset(
self,
filename_or_obj,
indexpath=indexpath,
filter_by_keys=filter_by_keys,
read_keys=read_keys,
encode_cf=encode_cf,
squeeze=squeeze,
time_dims=time_dims,
lock=lock,
)

with close_on_error(store):
ds = open_backend_dataset_store(
store,
mask_and_scale=mask_and_scale,
decode_times=decode_times,
concat_characters=concat_characters,
decode_coords=decode_coords,
drop_variables=drop_variables,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
*,
mask_and_scale=True,
decode_times=None,
concat_characters=None,
decode_coords=None,
drop_variables=None,
use_cftime=None,
decode_timedelta=None,
lock=None,
indexpath="{path}.{short_hash}.idx",
filter_by_keys={},
read_keys=[],
encode_cf=("parameter", "time", "geography", "vertical"),
squeeze=True,
time_dims=("time", "step"),
):

store = CfGribDataStore(
filename_or_obj,
indexpath=indexpath,
filter_by_keys=filter_by_keys,
read_keys=read_keys,
encode_cf=encode_cf,
squeeze=squeeze,
time_dims=time_dims,
lock=lock,
)
return ds


cfgrib_backend = BackendEntrypoint(
open_dataset=open_backend_dataset_cfgrib, guess_can_open=guess_can_open_cfgrib
)
store_entrypoint = StoreBackendEntrypoint()
with close_on_error(store):
ds = store_entrypoint.open_dataset(
store,
mask_and_scale=mask_and_scale,
decode_times=decode_times,
concat_characters=concat_characters,
decode_coords=decode_coords,
drop_variables=drop_variables,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
)
return ds


if has_cfgrib:
BACKEND_ENTRYPOINTS["cfgrib"] = CfgribfBackendEntrypoint
