API for N-dimensional combine #2616

Merged · 111 commits · Jun 25, 2019

Changes from 98 commits

Commits
88ee12a
concatenates along a single dimension
TomNicholas Nov 5, 2018
1aaa075
Wrote function to find correct tile_IDs from nested list of datasets
TomNicholas Nov 6, 2018
dbb371d
Wrote function to check that combined_tile_ids structure is valid
TomNicholas Nov 7, 2018
cc4d743
Added test of 2d-concatenation
TomNicholas Nov 7, 2018
d2fc7e7
Tests now check that dataset ordering is correct
TomNicholas Nov 8, 2018
e3f3699
Test concatenation along a new dimension
TomNicholas Nov 8, 2018
55bf685
Started generalising auto_combine to N-D by integrating the N-D conca…
TomNicholas Nov 9, 2018
845206c
All unit tests now passing
TomNicholas Nov 9, 2018
fb66626
Merge branch 'real_master' into feature/nd_combine
TomNicholas Nov 10, 2018
f4e9aad
Fixed a failing test which I didn't notice because I don't have pseud…
TomNicholas Nov 10, 2018
00004a1
Began updating open_mfdataset to handle N-D input
TomNicholas Nov 14, 2018
b41e374
Refactored to remove duplicate logic in open_mfdataset & auto_combine
TomNicholas Nov 14, 2018
8672a79
Implemented Shoyer's suggestion in #2553 to rewrite the recursive nest…
TomNicholas Nov 14, 2018
4f56b24
--amend
TomNicholas Nov 14, 2018
4cfaf2e
Now raises ValueError if input not ordered correctly before concatena…
TomNicholas Nov 14, 2018
9fd1413
Added some more prototype tests defining desired behaviour more clearly
TomNicholas Nov 22, 2018
8ad0121
Now raises informative errors on invalid forms of input
TomNicholas Nov 24, 2018
4b2c544
Refactoring to also merge along each dimension
TomNicholas Nov 25, 2018
3d0061e
Refactored to literally just apply the old auto_combine along each di…
TomNicholas Nov 25, 2018
60c93ba
Added unit tests for open_mfdataset
TomNicholas Nov 26, 2018
1824538
Removed TODOs
TomNicholas Nov 26, 2018
d380815
Removed format strings
TomNicholas Nov 30, 2018
c4bb8d0
test_get_new_tile_ids now doesn't assume dicts are ordered
TomNicholas Nov 30, 2018
6b7f889
Fixed failing tests on python3.5 caused by accidentally assuming dict…
TomNicholas Nov 30, 2018
58a3648
Test for getting new tile id
TomNicholas Nov 30, 2018
a12a34a
Fixed itertoolz import so that it's compatible with older versions
TomNicholas Nov 30, 2018
ada1f4a
Increased test coverage
TomNicholas Dec 1, 2018
ef0a30e
Added toolz as an explicit dependency to pass tests on python2.7
TomNicholas Dec 1, 2018
3be70bc
Updated 'what's new'
TomNicholas Dec 1, 2018
f266bc3
No longer attempts to shortcut all concatenation at once if concat_di…
TomNicholas Dec 1, 2018
cf49c2b
Merge branch 'master' into feature/nd_combine
TomNicholas Dec 1, 2018
878e1f9
Rewrote using itertools.groupby instead of toolz.itertoolz.groupby to…
TomNicholas Dec 1, 2018
7dea14f
Merged changes from master
TomNicholas Dec 1, 2018
e6f25a3
Fixed erroneous removal of utils import
TomNicholas Dec 1, 2018
f856485
Updated docstrings to include an example of multidimensional concaten…
TomNicholas Dec 2, 2018
6305d83
Clarified auto_combine docstring for N-D behaviour
TomNicholas Dec 5, 2018
ce59da1
Added unit test for nested list of Datasets with different variables
TomNicholas Dec 10, 2018
9fb34cf
Minor spelling and pep8 fixes
TomNicholas Dec 10, 2018
83dedb3
Started working on a new api with both auto_combine and manual_combine
TomNicholas Dec 11, 2018
de199a0
Merged master
TomNicholas Dec 17, 2018
3e64a83
Wrote basic function to infer concatenation order from coords.
TomNicholas Jan 3, 2019
963c794
Attempt at finalised version of public-facing API.
TomNicholas Jan 4, 2019
1a66530
No longer uses entire old auto_combine internally, only concat or merge
TomNicholas Jan 4, 2019
38d265e
Merged v0.11.1 and v0.11.2 changes
TomNicholas Jan 4, 2019
7525b23
Updated what's new
TomNicholas Jan 4, 2019
92e120a
Removed unneeded addition to what's new for old release
TomNicholas Jan 4, 2019
13a7f75
Fixed incomplete merge in docstring for open_mfdataset
TomNicholas Jan 4, 2019
b76e681
Tests for manual combine passing
TomNicholas Jan 6, 2019
c09df8b
Tests for auto_combine now passing
TomNicholas Jan 6, 2019
953d572
xfailed weird behaviour with manual_combine trying to determine conca…
TomNicholas Jan 6, 2019
b7bf1ad
Add auto_combine and manual_combine to API page of docs
TomNicholas Jan 6, 2019
855d819
Tests now passing for open_mfdataset
TomNicholas Jan 6, 2019
de7965e
Attempted to merge master in, but #2648 has stumped me
TomNicholas Jan 6, 2019
bfcb4e3
Completed merge so that #2648 is respected, and added tests.
TomNicholas Jan 7, 2019
eb053cc
Separated the tests for concat and both combines
TomNicholas Jan 7, 2019
97e508c
Some PEP8 fixes
TomNicholas Jan 7, 2019
410b138
Pre-empting a test which will fail with opening uamiv format
TomNicholas Jan 7, 2019
02b6d05
Satisfy pep8speaks bot
TomNicholas Jan 7, 2019
0d6f13a
Python 3.5 compatible after changing some error string formatting
TomNicholas Jan 7, 2019
18e0074
Order coords using pandas.Index objects
TomNicholas Jan 7, 2019
67f11f3
Fixed performance bug from GH #2662
TomNicholas Jan 15, 2019
3b843f5
Removed ToDos about natural sorting of string coords
TomNicholas Jan 23, 2019
540d3d4
Merged master into branch
TomNicholas Jan 23, 2019
bb98d54
Generalized auto_combine to handle monotonically-decreasing coords too
TomNicholas Jan 24, 2019
e3f7523
Added more examples to docstring for manual_combine
TomNicholas Jan 28, 2019
fc36b74
Merged master - includes py2 deprecation
TomNicholas Jan 28, 2019
d96595e
Added note about globbing aspect of open_mfdataset
TomNicholas Jan 28, 2019
79f09c0
Removed auto-inferring of concatenation dimension in manual_combine
TomNicholas Jan 28, 2019
e32adb3
Added example to docstring for auto_combine
TomNicholas Jan 28, 2019
da4d605
Minor correction to docstring
TomNicholas Jan 28, 2019
c4fe22c
Another very minor docstring correction
TomNicholas Jan 28, 2019
66b4c4f
Added test to guard against issue #2777
TomNicholas Feb 27, 2019
90f0c1d
Started deprecation cycle for auto_combine
TomNicholas Mar 2, 2019
0990dd4
Fully reverted open_mfdataset tests
TomNicholas Mar 3, 2019
d6277be
Updated what's new to match deprecation cycle
TomNicholas Mar 3, 2019
b81e77a
Merge branch 'real_master' into feature/nd_combine_new_api
TomNicholas Mar 3, 2019
bf7d549
Reverted uamiv test
TomNicholas Mar 3, 2019
f00770f
Removed dependency on itertools
TomNicholas Mar 3, 2019
c7c1746
Deprecation tests fixed
TomNicholas Mar 3, 2019
f6192ca
Satisfy pycodestyle
TomNicholas Mar 3, 2019
88f089e
Started deprecation cycle of auto_combine
TomNicholas Mar 18, 2019
2849559
merged changes from master for v0.12
TomNicholas Mar 18, 2019
535bc31
Added specific error for edge case combine_manual can't handle
TomNicholas Mar 18, 2019
5d818e0
Check that global coordinates are monotonic
TomNicholas Mar 18, 2019
42cd05d
Highlighted weird behaviour when concatenating with no data variables
TomNicholas Mar 18, 2019
8a83814
Added test for impossible-to-auto-combine coordinates
TomNicholas Mar 18, 2019
e4acbdc
Removed unneeded test
TomNicholas Mar 18, 2019
8e767e2
Satisfy linter
TomNicholas Mar 18, 2019
3d04112
Added airspeedvelocity benchmark for combining functions
TomNicholas Mar 18, 2019
06ecef6
Benchmark will take longer now
TomNicholas Mar 18, 2019
513764f
Updated version numbers in deprecation warnings to fit with recent re…
TomNicholas Mar 18, 2019
13364ff
Updated api docs for new function names
TomNicholas May 18, 2019
ddfc6dd
Fixed docs build failure
TomNicholas May 18, 2019
e471a42
Revert "Fixed docs build failure"
TomNicholas May 19, 2019
2d5b90f
Updated documentation with section explaining new functions
TomNicholas May 19, 2019
8cbf5e1
Merged master
TomNicholas May 19, 2019
9ead34e
Suppressed deprecation warnings in test suite
TomNicholas May 20, 2019
fab3586
Resolved ToDo by pointing to issue with concat, see #2975
TomNicholas May 20, 2019
9d5e29f
Various docs fixes
TomNicholas May 20, 2019
9a33ac6
Merged master, resolving conflicts with #2964
TomNicholas May 28, 2019
ae7b811
Slightly renamed tests to match new name of tested function
TomNicholas May 28, 2019
f4fc03d
Included minor suggestions from shoyer
TomNicholas May 28, 2019
917ebee
Removed trailing whitespace
TomNicholas May 28, 2019
1e537ba
Simplified error message for case combine_manual can't handle
TomNicholas May 29, 2019
7d6845b
Removed filter for deprecation warnings, and added test for if user d…
TomNicholas May 29, 2019
5083471
Simple fixes suggested by shoyer
TomNicholas Jun 21, 2019
4cc70ae
Change deprecation warning behaviour
TomNicholas Jun 21, 2019
537c405
Merged in recent changes to master
TomNicholas Jun 21, 2019
2f54127
Merge branch 'master' into feature/nd_combine_new_api
dcherian Jun 25, 2019
357531f
linting
TomNicholas Jun 25, 2019
e006875
Merge branch 'feature/nd_combine_new_api' of https://github.com/TomNi…
TomNicholas Jun 25, 2019
37 changes: 37 additions & 0 deletions asv_bench/benchmarks/combine.py
@@ -0,0 +1,37 @@
import numpy as np
import xarray as xr


class Combine:
    """Benchmark concatenating and merging large datasets"""

    def setup(self):
        """Create 4 datasets with two different variables"""

        t_size, x_size, y_size = 100, 900, 800
        t, x, y = np.arange(t_size), np.arange(x_size), np.arange(y_size)
        data = np.random.randn(t_size, x_size, y_size)

        self.dsA0 = xr.Dataset(
            {'A': xr.DataArray(data, coords={'T': t},
                               dims=('T', 'X', 'Y'))})
        self.dsA1 = xr.Dataset(
            {'A': xr.DataArray(data, coords={'T': t + t_size},
                               dims=('T', 'X', 'Y'))})
        self.dsB0 = xr.Dataset(
            {'B': xr.DataArray(data, coords={'T': t},
                               dims=('T', 'X', 'Y'))})
        self.dsB1 = xr.Dataset(
            {'B': xr.DataArray(data, coords={'T': t + t_size},
                               dims=('T', 'X', 'Y'))})

    def time_combine_manual(self):
        datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]]

        # merge along the outer (variable) axis, concat along existing 'T'
        xr.combine_manual(datasets, concat_dim=[None, 'T'])

    def time_auto_combine(self):
        """Also has to load and arrange the 'T' coordinate"""
        datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1]

        xr.combine_auto(datasets)
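
For reference, these benchmark methods can also be run by hand outside the
airspeed velocity runner; a minimal sketch using the ``Combine`` class
defined above:

    bench = Combine()
    bench.setup()
    bench.time_combine_manual()
    bench.time_auto_combine()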
3 changes: 3 additions & 0 deletions doc/api.rst
@@ -19,6 +19,9 @@ Top-level functions
broadcast
concat
merge
auto_combine
combine_auto
combine_manual
where
set_options
full_like
66 changes: 64 additions & 2 deletions doc/combining.rst
@@ -11,9 +11,10 @@ Combining data
import xarray as xr
np.random.seed(123456)

* For combining datasets or data arrays along a dimension, see concatenate_.
* For combining datasets or data arrays along a single dimension, see concatenate_.
* For combining datasets with different variables, see merge_.
* For combining datasets or data arrays with different indexes or missing values, see combine_.
* For combining datasets or data arrays along multiple dimensions, see combining.multi_.

.. _concatenate:

@@ -77,7 +78,7 @@ Merge
~~~~~

To combine variables and coordinates between multiple ``DataArray`` and/or
``Dataset`` object, use :py:func:`~xarray.merge`. It can merge a list of
``Dataset`` objects, use :py:func:`~xarray.merge`. It can merge a list of
``Dataset``, ``DataArray`` or dictionaries of objects convertible to
``DataArray`` objects:

@@ -237,3 +238,64 @@ coordinates as long as any non-missing values agree or are disjoint:
Note that due to the underlying representation of missing values as floating
point numbers (``NaN``), variable data type is not always preserved when merging
in this manner.
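
For instance, merging integer-typed variables whose indexes are disjoint
upcasts the result to ``float64`` (a minimal sketch; the variable names are
illustrative):

.. ipython:: python

    ints1 = xr.Dataset({'v': ('x', np.array([1, 2], dtype='int64'))},
                       coords={'x': [0, 1]})
    ints2 = xr.Dataset({'v': ('x', np.array([3, 4], dtype='int64'))},
                       coords={'x': [2, 3]})
    # outer-join alignment inserts NaN, so the int64 data becomes float64
    xr.merge([ints1, ints2])['v'].dtype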

.. _combining.multi:

Combining along multiple dimensions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

For combining many objects along multiple dimensions, xarray provides
``combine_manual`` and ``combine_auto``. These functions use a combination of
``concat`` and ``merge`` across different variables to combine many objects
into one.

``combine_manual`` requires specifying the order in which the objects should be
combined, while ``combine_auto`` attempts to infer this ordering automatically
from the coordinates in the data.

``combine_manual`` is useful when you know the spatial relationship between
each object in advance. A common task is collecting data from a parallelized
simulation where each processor wrote out data to a separate file. A domain
which was decomposed into 4 parts, 2 each along both the x and y axes,
requires organising the datasets into a doubly-nested list, e.g.:

.. ipython:: python

arr = xr.DataArray(name='temperature', data=np.random.randint(5, size=(2, 2)), dims=['x', 'y'])
arr
ds_grid = [[arr, arr], [arr, arr]]
xr.combine_manual(ds_grid, concat_dim=['x', 'y'])

``combine_manual`` can also be used to explicitly merge datasets with
different variables. For example, if we have 4 datasets, which are divided
between two time periods and contain two different variables, we can pass
``None`` to ``concat_dim`` to specify the dimension of the nested list over
which we wish to use ``merge`` instead of ``concat``:

.. ipython:: python

temp = xr.DataArray(name='temperature', data=np.random.randn(2), dims=['t'])
precip = xr.DataArray(name='precipitation', data=np.random.randn(2), dims=['t'])
ds_grid = [[temp, precip], [temp, precip]]
xr.combine_manual(ds_grid, concat_dim=['t', None])

``combine_auto`` is for combining objects which have dimension coordinates
that specify their position and order relative to one another, for example a
linearly-increasing 'time' dimension coordinate.

Here we combine two datasets using their common dimension coordinates. Notice
they are concatenated in order based on the values in their dimension
coordinates, not on their position in the list passed to ``combine_auto``.

.. ipython:: python
:okwarning:

x1 = xr.DataArray(name='foo', data=np.random.randn(3), coords=[('x', [0, 1, 2])])
x2 = xr.DataArray(name='foo', data=np.random.randn(3), coords=[('x', [3, 4, 5])])
xr.combine_auto([x2, x1])

These functions can be used by ``open_mfdataset`` to open many files as one
dataset. The particular function used is specified by setting the argument
``combine`` to ``'auto'`` or ``'manual'``. This is useful for situations
where your data is split across many files in multiple locations, which have
some known relationship between one another.
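
For example, four files from a hypothetical 2x2 domain decomposition (the
file names below are illustrative assumptions, not real data) could be opened
as one dataset with::

    paths = [['x0y0.nc', 'x0y1.nc'],
             ['x1y0.nc', 'x1y1.nc']]
    ds = xr.open_mfdataset(paths, combine='manual', concat_dim=['x', 'y'])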
8 changes: 6 additions & 2 deletions doc/io.rst
@@ -766,7 +766,10 @@ Combining multiple files

NetCDF files are often encountered in collections, e.g., with different files
corresponding to different model runs. xarray can straightforwardly combine such
files into a single Dataset by making use of :py:func:`~xarray.concat`.
files into a single Dataset by making use of :py:func:`~xarray.concat`,
:py:func:`~xarray.merge`, :py:func:`~xarray.combine_manual` and
:py:func:`~xarray.combine_auto`. For details on the differences between these
functions, see :ref:`combining data`.

.. note::

@@ -779,7 +782,8 @@ files into a single Dataset by making use of :py:func:`~xarray.concat`.
This function automatically concatenates and merges multiple files into a
single xarray dataset.
It is the recommended way to open multiple files with xarray.
For more details, see :ref:`dask.io` and a `blog post`_ by Stephan Hoyer.
For more details, see :ref:`combining.multi`, :ref:`dask.io` and a
`blog post`_ by Stephan Hoyer.

.. _dask: http://dask.pydata.org
.. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/
19 changes: 19 additions & 0 deletions doc/whats-new.rst
@@ -149,6 +149,25 @@ Other enhancements
report showing what exactly differs between the two objects (dimensions /
coordinates / variables / attributes) (:issue:`1507`).
By `Benoit Bovy <https://github.com/benbovy>`_.
- Combining datasets along N dimensions:
Datasets can now be combined along any number of dimensions,
instead of just a one-dimensional list of datasets.

The new ``combine_manual`` will accept the datasets as a nested
list-of-lists, and combine by applying a series of concat and merge
operations.

``open_mfdataset`` can use ``combine_manual`` to combine datasets along
multiple dimensions, by specifying ``combine='manual'``.

Some combinations of datasets will now issue a ``FutureWarning``. To avoid
this, switch to using ``combine_manual`` (or ``combine='manual'`` in
``open_mfdataset``).
(:issue:`2159`) By `Tom Nicholas <http://github.com/TomNicholas>`_.
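
As a quick sketch of the migration (the ``datasets`` list and the dimension
name ``'t'`` are illustrative)::

    # Deprecated: ordering along 't' is handled by the old auto_combine
    combined = xr.auto_combine(datasets, concat_dim='t')

    # Preferred: the order of `datasets` is respected explicitly
    combined = xr.combine_manual(datasets, concat_dim='t')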

- Resampling of standard and non-standard calendars indexed by
:py:class:`~xarray.CFTimeIndex` is now possible. (:issue:`2191`).
By `Jwen Fai Low <https://github.com/jwenfai>`_ and
`Spencer Clark <https://github.com/spencerkclark>`_.
- Add ``tolerance`` option to ``resample()`` methods ``bfill``, ``pad``,
``nearest``. (:issue:`2695`)
By `Hauke Schulz <https://github.com/observingClouds>`_.
3 changes: 2 additions & 1 deletion xarray/__init__.py
@@ -6,7 +6,8 @@

from .core.alignment import align, broadcast, broadcast_arrays
from .core.common import full_like, zeros_like, ones_like
from .core.combine import concat, auto_combine
from .core.concat import concat
from .core.combine import combine_auto, combine_manual, auto_combine
from .core.computation import apply_ufunc, dot, where
from .core.extensions import (register_dataarray_accessor,
register_dataset_accessor)
133 changes: 76 additions & 57 deletions xarray/backends/api.py
@@ -7,10 +7,11 @@

import numpy as np

from .. import Dataset, backends, conventions
from .. import Dataset, DataArray, backends, conventions
from ..core import indexing
from ..core.combine import (
_CONCAT_DIM_DEFAULT, _auto_combine, _infer_concat_order_from_positions)
from .. import auto_combine
from ..core.combine import (combine_auto, _manual_combine,
_infer_concat_order_from_positions)
from ..core.utils import close_on_error, is_grib_path, is_remote_uri
from .common import ArrayWriter
from .locks import _get_scheduler
@@ -591,38 +592,51 @@ def close(self):
f.close()


def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',
compat='no_conflicts', preprocess=None, engine=None,
lock=None, data_vars='all', coords='different',
autoclose=None, parallel=False, **kwargs):
combine='_old_auto', autoclose=None, parallel=False,
**kwargs):
"""Open multiple files as a single dataset.

If combine='auto' then the function ``combine_auto`` is used to combine the
datasets into one before returning the result, and if combine='manual' then
``combine_manual`` is used. The filepaths must be structured according to
which combining function is used, the details of which are given in the
documentation for ``combine_auto`` and ``combine_manual``.
By default the old (now deprecated) ``auto_combine`` will be used; please
specify either ``combine='auto'`` or ``combine='manual'`` in the future.
Requires dask to be installed. See documentation for details on dask [1].
Attributes from the first dataset file are used for the combined dataset.

Parameters
----------
paths : str or sequence
Either a string glob in the form "path/to/my/files/*.nc" or an explicit
list of files to open. Paths can be given as strings or as pathlib
Paths.
list of files to open. Paths can be given as strings or as pathlib
Paths. If concatenation along more than one dimension is desired, then
``paths`` must be a nested list-of-lists (see ``combine_manual`` for
details). (A string glob will be expanded to a 1-dimensional list.)
chunks : int or dict, optional
Dictionary with keys given by dimension names and values given by chunk
sizes. In general, these should divide the dimensions of each dataset.
If int, chunk each dimension by ``chunks``.
By default, chunks will be chosen to load entire input files into
memory at once. This has a major impact on performance: please see the
full documentation for more details [2].
concat_dim : None, str, DataArray or Index, optional
Dimension to concatenate files along. This argument is passed on to
:py:func:`xarray.auto_combine` along with the dataset objects. You only
need to provide this argument if the dimension along which you want to
concatenate is not a dimension in the original datasets, e.g., if you
want to stack a collection of 2D arrays along a third dimension.
By default, xarray attempts to infer this argument by examining
component files. Set ``concat_dim=None`` explicitly to disable
concatenation.
compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional
concat_dim : str, or list of str, DataArray, Index or None, optional
Dimensions to concatenate files along. You only
need to provide this argument if any of the dimensions along which you
want to concatenate is not a dimension in the original datasets, e.g.,
if you want to stack a collection of 2D arrays along a third dimension.
Set ``concat_dim=[..., None, ...]`` explicitly to
disable concatenation along a particular dimension.
combine : {'auto', 'manual'}, optional
Whether ``xarray.combine_auto`` or ``xarray.combine_manual`` is used to
combine all the data. The default is to use the old ``xarray.auto_combine``,
but this function has been deprecated.
compat : {'identical', 'equals', 'broadcast_equals',
'no_conflicts'}, optional
String indicating how to compare variables of the same name for
potential conflicts when merging:
* 'broadcast_equals': all values must be equal when variables are
@@ -649,20 +663,18 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
active dask scheduler.
data_vars : {'minimal', 'different', 'all' or list of str}, optional
These data variables will be concatenated together:

* 'minimal': Only data variables in which the dimension already
appears are included.
* 'different': Data variables which are not equal (ignoring
attributes) across all datasets are also concatenated (as well as
all for which dimension already appears). Beware: this option may
load the data payload of data variables into memory if they are not
already loaded.
* 'all': All data variables will be concatenated.
* list of str: The listed data variables will be concatenated, in
addition to the 'minimal' data variables.
coords : {'minimal', 'different', 'all' o list of str}, optional
* 'minimal': Only data variables in which the dimension already
appears are included.
* 'different': Data variables which are not equal (ignoring
attributes) across all datasets are also concatenated (as well as
all for which dimension already appears). Beware: this option may
load the data payload of data variables into memory if they are not
already loaded.
* 'all': All data variables will be concatenated.
* list of str: The listed data variables will be concatenated, in
addition to the 'minimal' data variables.
coords : {'minimal', 'different', 'all' or list of str}, optional
These coordinate variables will be concatenated together:

* 'minimal': Only coordinates in which the dimension already appears
are included.
* 'different': Coordinates which are not equal (ignoring attributes)
@@ -693,6 +705,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,

See Also
--------
combine_auto
combine_manual
auto_combine
open_dataset

@@ -715,22 +729,17 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
if not paths:
raise IOError('no files to open')

# Coerce 1D input into ND to maintain backwards-compatible API until API
# for N-D combine decided
# (see https://github.com/pydata/xarray/pull/2553/#issuecomment-445892746)
if concat_dim is None or concat_dim is _CONCAT_DIM_DEFAULT:
concat_dims = concat_dim
elif not isinstance(concat_dim, list):
concat_dims = [concat_dim]
else:
concat_dims = concat_dim
infer_order_from_coords = False

# If infer_order_from_coords=True then this is unnecessary, but quick.
# If infer_order_from_coords=False then this creates a flat list which is
# easier to iterate over, while saving the originally-supplied structure
combined_ids_paths, concat_dims = _infer_concat_order_from_positions(
paths, concat_dims)
# If combine='auto' then this is unnecessary, but quick.
# If combine='manual' then this creates a flat list which is easier to
# iterate over, while saving the originally-supplied structure as "ids"
if combine == 'manual':
if concat_dim == '_not_supplied':
raise ValueError("Must supply concat_dim when using manual "
"combine")
else:
if isinstance(concat_dim, (str, DataArray)) or concat_dim is None:
concat_dim = [concat_dim]
combined_ids_paths = _infer_concat_order_from_positions(paths)
ids, paths = (
list(combined_ids_paths.keys()), list(combined_ids_paths.values()))

@@ -758,18 +767,28 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
# the underlying datasets will still be stored as dask arrays
datasets, file_objs = dask.compute(datasets, file_objs)

# Close datasets in case of a ValueError
# Combine all datasets, closing them in case of a ValueError
try:
if infer_order_from_coords:
# Discard ordering because it should be redone from coordinates
ids = False

combined = _auto_combine(
datasets, concat_dims=concat_dims,
compat=compat,
data_vars=data_vars, coords=coords,
infer_order_from_coords=infer_order_from_coords,
ids=ids)
if combine == '_old_auto':
# Use the old auto_combine for now
# Remove this after deprecation cycle from #2616 is complete
combined = auto_combine(datasets, concat_dim=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords)
elif combine == 'manual':
# Combine the nested list by successive concat and merge operations
# along each dimension, using the structure given by "ids"
combined = _manual_combine(datasets, concat_dims=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords, ids=ids)
elif combine == 'auto':
# Redo ordering from coordinates, ignoring how they were ordered
# previously
combined = combine_auto(datasets, compat=compat,
data_vars=data_vars, coords=coords)
else:
raise ValueError("{} is an invalid option for the keyword argument "
"``combine``".format(combine))
except ValueError:
for ds in datasets:
ds.close()