From 0b6045b77bc8092f940186db1ee24fa3bcb806c0 Mon Sep 17 00:00:00 2001
From: dcherian
Date: Mon, 23 Sep 2019 16:20:04 -0600
Subject: [PATCH] small changes.

---
 doc/dask.rst           | 16 ++++++++--------
 doc/index.rst          |  6 +++---
 xarray/core/dataset.py |  4 ++--
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/doc/dask.rst b/doc/dask.rst
index 1cee6b28335..bc94012b561 100644
--- a/doc/dask.rst
+++ b/doc/dask.rst
@@ -87,10 +87,8 @@ for the full disclaimer). By default, :py:meth:`~xarray.open_mfdataset` will chu
 netCDF file into a single Dask array; again, supply the ``chunks`` argument to
 control the size of the resulting Dask arrays. In more complex cases, you can
 open each file individually using :py:meth:`~xarray.open_dataset` and merge the result, as
-described in :ref:`combining data`. If you have a distributed cluster running,
-passing the keyword argument ``parallel=True`` to :py:meth:`~xarray.open_mfdataset`
-will speed up the reading of large multi-file datasets by executing those read tasks
-in parallel using ``dask.delayed``.
+described in :ref:`combining data`. Passing the keyword argument ``parallel=True`` to :py:meth:`~xarray.open_mfdataset` will speed up the reading of large multi-file datasets by
+executing those read tasks in parallel using ``dask.delayed``.
 
 You'll notice that printing a dataset still shows a preview of array values,
 even if they are actually Dask arrays. We can do this quickly with Dask because
@@ -157,6 +155,12 @@ explicit conversion step. One notable exception is indexing operations: to
 enable label based indexing, xarray will automatically load coordinate labels
 into memory.
 
+.. tip::
+
+   By default, dask uses its multi-threaded scheduler, which distributes work across
+   multiple cores and allows for processing some datasets that do not fit into memory.
+   For running across a cluster, `set up the distributed scheduler <https://docs.dask.org/en/latest/setup.html>`_.
+
 The easiest way to convert an xarray data structure from lazy Dask arrays into
 *eager*, in-memory NumPy arrays is to use the
 :py:meth:`~xarray.Dataset.load` method:
@@ -417,7 +421,3 @@ With analysis pipelines involving both spatial subsetting and temporal resamplin
 
 6. The dask `diagnostics <https://docs.dask.org/en/latest/diagnostics.html>`_ can be
    useful in identifying performance bottlenecks.
-
-7. Installing the optional `bottleneck <https://github.com/pydata/bottleneck>`_ library
-   will result in greatly reduced memory usage when using :py:meth:`~xarray.Dataset.rolling`
-   on dask arrays,
diff --git a/doc/index.rst b/doc/index.rst
index 46d6029397c..03fa7127ee7 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -11,11 +11,11 @@ intuitive, more concise, and less error-prone developer experience.
 The package includes a large and growing library of domain-agnostic functions
 for advanced analytics and visualization with these data structures.
 
-Xarray is particularly tailored to working with netCDF_ files, which were the
+Xarray is inspired by and borrows heavily from pandas_, the popular data
+analysis package focused on labelled tabular data.
+It is particularly tailored to working with netCDF_ files, which were the
 source of xarray's data model, and integrates tightly with dask_ for parallel
 computing.
-It is inspired by and borrows heavily from pandas_, the popular data
-analysis package focused on labelled tabular data.
 
 .. _NumPy: http://www.numpy.org
 .. _pandas: http://pandas.pydata.org
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 5f6adf5eff4..310b9a1afff 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -613,7 +613,7 @@ def sizes(self) -> Mapping[Hashable, int]:
         """
         return self.dims
 
-    def load(self: T, **kwargs) -> T:
+    def load(self, **kwargs) -> "Dataset":
         """Manually trigger loading and/or computation of this dataset's data
         from disk or a remote source into memory and return this dataset.
         Unlike compute, the original dataset is modified and returned.
@@ -771,7 +771,7 @@ def _dask_postpersist(dsk, info, *args):
 
         return Dataset._construct_direct(variables, *args)
 
-    def compute(self: T, **kwargs) -> T:
+    def compute(self, **kwargs) -> "Dataset":
        """Manually trigger loading and/or computation of this dataset's data
        from disk or a remote source into memory and return a new dataset.
        Unlike load, the original dataset is left unaltered.
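
Note for reviewers: a minimal sketch of the usage the revised doc/dask.rst text
describes. The file pattern and chunk sizes here are hypothetical examples, not
part of this patch.

    import xarray as xr

    # With parallel=True, open_mfdataset wraps each per-file open in
    # dask.delayed, so the reads of a large multi-file dataset execute
    # in parallel.
    ds = xr.open_mfdataset(
        "example_*.nc",        # hypothetical file pattern
        combine="by_coords",
        parallel=True,
        chunks={"time": 100},  # hypothetical chunk size
    )

    # Both methods now carry a plain "Dataset" return annotation:
    # compute() returns a new, in-memory dataset and leaves ds lazy;
    # load() evaluates in place and returns the same (now loaded) dataset.
    eager = ds.compute()
    ds.load()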