Skip to content

Commit 6d5ad44

Browse files
author
Joe Hamman
authored
Dataset.encoding and unlimited dimensions for to_netcdf (#1170)
* initial hack at enabling unlimited dims in to_netcdf * unlimited dims for netcdf4, still working on scipy * fix two bugs in h5netcdf tests * fix failing tests, try workaround for scipy/scipy#6880 * cleanup * simple slice in scipy workaround * initial fixes after @shoyer's review * fix failing test by passing unlimited_dims through to in memory store * remove encoding from dataset constructor * more tests for unlimited_dims and update whats-new * refactor unlimited dimensions / dataset encoding to avoid using DataStore statespace, respond to a few of @shoyer's comments * raise user warning if unlimited dims is used with h5netcdf * cleanup backends after unlimited_dims changes
1 parent c5146e8 commit 6d5ad44

File tree

14 files changed

+151
-52
lines changed

14 files changed

+151
-52
lines changed

doc/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ Attributes
4646
Dataset.data_vars
4747
Dataset.coords
4848
Dataset.attrs
49+
Dataset.encoding
4950
Dataset.indexes
5051
Dataset.get_index
5152

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,10 @@ Enhancements
209209
and attributes. The method prints to a buffer (e.g. ``stdout``) with output
210210
similar to what the command line utility ``ncdump -h`` produces (:issue:`1150`).
211211
By `Joe Hamman <https://github.com/jhamman>`_.
212+
- Added the ability to write unlimited netCDF dimensions with the ``scipy`` and
213+
``netcdf4`` backends via the new :py:attr:`~xray.Dataset.encoding` attribute
214+
or via the ``unlimited_dims`` argument to :py:meth:`~xray.Dataset.to_netcdf`.
215+
By `Joe Hamman <https://github.com/jhamman>`_.
212216
- New :py:meth:`~DataArray.quantile` method to calculate quantiles from
213217
DataArray objects (:issue:`1187`).
214218
By `Joe Hamman <https://github.com/jhamman>`_.

xarray/backends/api.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
522522

523523

524524
def to_netcdf(dataset, path=None, mode='w', format=None, group=None,
525-
engine=None, writer=None, encoding=None):
525+
engine=None, writer=None, encoding=None, unlimited_dims=None):
526526
"""This function creates an appropriate datastore for writing a dataset to
527527
disk as a netCDF file
528528
@@ -561,8 +561,12 @@ def to_netcdf(dataset, path=None, mode='w', format=None, group=None,
561561
sync = writer is None
562562

563563
store = store_cls(path, mode, format, group, writer)
564+
565+
if unlimited_dims is None:
566+
unlimited_dims = dataset.encoding.get('unlimited_dims', None)
564567
try:
565-
dataset.dump_to_store(store, sync=sync, encoding=encoding)
568+
dataset.dump_to_store(store, sync=sync, encoding=encoding,
569+
unlimited_dims=unlimited_dims)
566570
if isinstance(path, BytesIO):
567571
return path.getvalue()
568572
finally:

xarray/backends/common.py

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import logging
66
import time
77
import traceback
8-
import threading
98
from collections import Mapping
109
from distutils.version import StrictVersion
1110

@@ -84,6 +83,9 @@ def get_attrs(self): # pragma: no cover
8483
def get_variables(self): # pragma: no cover
8584
raise NotImplementedError
8685

86+
def get_encoding(self):
87+
return {}
88+
8789
def load(self):
8890
"""
8991
This loads the variables and attributes simultaneously.
@@ -105,8 +107,8 @@ def load(self):
105107
This function will be called anytime variables or attributes
106108
are requested, so care should be taken to make sure its fast.
107109
"""
108-
variables = FrozenOrderedDict((_decode_variable_name(k), v)
109-
for k, v in iteritems(self.get_variables()))
110+
variables = FrozenOrderedDict((_decode_variable_name(k), v) for k, v in
111+
iteritems(self.get_variables()))
110112
attributes = FrozenOrderedDict(self.get_attrs())
111113
return variables, attributes
112114

@@ -152,7 +154,11 @@ def add(self, source, target):
152154
self.sources.append(source)
153155
self.targets.append(target)
154156
else:
155-
target[...] = source
157+
try:
158+
target[...] = source
159+
except TypeError:
160+
# workaround for GH: scipy/scipy#6880
161+
target[:] = source
156162

157163
def sync(self):
158164
if self.sources:
@@ -191,34 +197,42 @@ def store_dataset(self, dataset):
191197
# dataset.variables
192198
self.store(dataset, dataset.attrs)
193199

194-
def store(self, variables, attributes, check_encoding_set=frozenset()):
200+
def store(self, variables, attributes, check_encoding_set=frozenset(),
201+
unlimited_dims=None):
195202
self.set_attributes(attributes)
196-
self.set_variables(variables, check_encoding_set)
203+
self.set_variables(variables, check_encoding_set,
204+
unlimited_dims=unlimited_dims)
197205

198206
def set_attributes(self, attributes):
199207
for k, v in iteritems(attributes):
200208
self.set_attribute(k, v)
201209

202-
def set_variables(self, variables, check_encoding_set):
210+
def set_variables(self, variables, check_encoding_set,
211+
unlimited_dims=None):
203212
for vn, v in iteritems(variables):
204213
name = _encode_variable_name(vn)
205214
check = vn in check_encoding_set
206-
target, source = self.prepare_variable(name, v, check)
215+
target, source = self.prepare_variable(
216+
name, v, check, unlimited_dims=unlimited_dims)
207217
self.writer.add(source, target)
208218

209-
def set_necessary_dimensions(self, variable):
219+
def set_necessary_dimensions(self, variable, unlimited_dims=None):
220+
if unlimited_dims is None:
221+
unlimited_dims = set()
210222
for d, l in zip(variable.dims, variable.shape):
211223
if d not in self.dimensions:
224+
if d in unlimited_dims:
225+
l = None
212226
self.set_dimension(d, l)
213227

214228

215229
class WritableCFDataStore(AbstractWritableDataStore):
216-
def store(self, variables, attributes, check_encoding_set=frozenset()):
230+
def store(self, variables, attributes, *args, **kwargs):
217231
# All NetCDF files get CF encoded by default, without this attempting
218232
# to write times, for example, would fail.
219233
cf_variables, cf_attrs = cf_encoder(variables, attributes)
220234
AbstractWritableDataStore.store(self, cf_variables, cf_attrs,
221-
check_encoding_set)
235+
*args, **kwargs)
222236

223237

224238
class DataStorePickleMixin(object):

xarray/backends/h5netcdf_.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,16 @@
22
from __future__ import division
33
from __future__ import print_function
44
import functools
5+
import warnings
56

67
from .. import Variable
78
from ..core import indexing
89
from ..core.utils import FrozenOrderedDict, close_on_error, Frozen
910
from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict
1011

1112
from .common import WritableCFDataStore, DataStorePickleMixin
12-
from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype, _extract_nc4_encoding,
13-
BaseNetCDF4Array)
13+
from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype,
14+
_extract_nc4_variable_encoding, BaseNetCDF4Array)
1415

1516

1617
def maybe_decode_bytes(txt):
@@ -33,7 +34,7 @@ def _read_attributes(h5netcdf_var):
3334
return attrs
3435

3536

36-
_extract_h5nc_encoding = functools.partial(_extract_nc4_encoding,
37+
_extract_h5nc_encoding = functools.partial(_extract_nc4_variable_encoding,
3738
lsd_okay=False, backend='h5netcdf')
3839

3940

@@ -92,15 +93,20 @@ def set_dimension(self, name, length):
9293
def set_attribute(self, key, value):
9394
self.ds.setncattr(key, value)
9495

95-
def prepare_variable(self, name, variable, check_encoding=False):
96+
def prepare_variable(self, name, variable, check_encoding=False,
97+
unlimited_dims=None):
9698
import h5py
9799

98100
attrs = variable.attrs.copy()
99101
variable, dtype = _nc4_values_and_dtype(variable)
100102
if dtype is str:
101103
dtype = h5py.special_dtype(vlen=unicode_type)
102104

103-
self.set_necessary_dimensions(variable)
105+
if unlimited_dims is not None:
106+
warnings.warn('h5netcdf does not support unlimited dimensions, '
107+
'got: %s.' % unlimited_dims)
108+
unlimited_dims = None
109+
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
104110

105111
fill_value = attrs.pop('_FillValue', None)
106112
if fill_value in ['\x00']:

xarray/backends/memory.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def get_attrs(self):
2929
def get_variables(self):
3030
return self._variables
3131

32-
def prepare_variable(self, k, v, check_encoding=False):
32+
def prepare_variable(self, k, v, *args, **kwargs):
3333
new_var = Variable(v.dims, np.empty_like(v), v.attrs)
3434
# we copy the variable and stuff all encodings in the
3535
# attributes to imitate what happens when writing to disk.

xarray/backends/netCDF4_.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import numpy as np
88

99
from .. import Variable
10-
from ..conventions import pop_to, cf_encoder
10+
from ..conventions import pop_to
1111
from ..core import indexing
1212
from ..core.utils import (FrozenOrderedDict, NDArrayMixin,
1313
close_on_error, is_remote_uri)
@@ -138,13 +138,13 @@ def _force_native_endianness(var):
138138
# check to see if encoding has a value for endian its 'native'
139139
if not var.encoding.get('endian', 'native') is 'native':
140140
raise NotImplementedError("Attempt to write non-native endian type, "
141-
"this is not supported by the netCDF4 python "
142-
"library.")
141+
"this is not supported by the netCDF4 "
142+
"python library.")
143143
return var
144144

145145

146-
def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True,
147-
backend='netCDF4'):
146+
def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
147+
lsd_okay=True, backend='netCDF4'):
148148
encoding = variable.encoding.copy()
149149

150150
safe_to_drop = set(['source', 'original_shape'])
@@ -154,9 +154,8 @@ def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True,
154154
valid_encodings.add('least_significant_digit')
155155

156156
if (encoding.get('chunksizes') is not None and
157-
(encoding.get('original_shape', variable.shape)
158-
!= variable.shape) and
159-
not raise_on_invalid):
157+
(encoding.get('original_shape', variable.shape) !=
158+
variable.shape) and not raise_on_invalid):
160159
del encoding['chunksizes']
161160

162161
for k in safe_to_drop:
@@ -251,6 +250,12 @@ def get_dimensions(self):
251250
return FrozenOrderedDict((k, len(v))
252251
for k, v in iteritems(self.ds.dimensions))
253252

253+
def get_encoding(self):
254+
encoding = {}
255+
encoding['unlimited_dims'] = {
256+
k for k, v in self.ds.dimensions.items() if v.isunlimited()}
257+
return encoding
258+
254259
def set_dimension(self, name, length):
255260
self.ds.createDimension(name, size=length)
256261

@@ -259,7 +264,8 @@ def set_attribute(self, key, value):
259264
value = encode_nc3_attr_value(value)
260265
self.ds.setncattr(key, value)
261266

262-
def prepare_variable(self, name, variable, check_encoding=False):
267+
def prepare_variable(self, name, variable, check_encoding=False,
268+
unlimited_dims=None):
263269
attrs = variable.attrs.copy()
264270

265271
variable = _force_native_endianness(variable)
@@ -270,16 +276,16 @@ def prepare_variable(self, name, variable, check_encoding=False):
270276
variable = encode_nc3_variable(variable)
271277
datatype = variable.dtype
272278

273-
self.set_necessary_dimensions(variable)
279+
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
274280

275281
fill_value = attrs.pop('_FillValue', None)
276282
if fill_value in ['', '\x00']:
277283
# these are equivalent to the default FillValue, but netCDF4
278284
# doesn't like setting fill_value to an empty string
279285
fill_value = None
280286

281-
encoding = _extract_nc4_encoding(variable,
282-
raise_on_invalid=check_encoding)
287+
encoding = _extract_nc4_variable_encoding(
288+
variable, raise_on_invalid=check_encoding)
283289
nc4_var = self.ds.createVariable(
284290
varname=name,
285291
datatype=datatype,

xarray/backends/pynio_.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,5 +57,11 @@ def get_attrs(self):
5757
def get_dimensions(self):
5858
return Frozen(self.ds.dimensions)
5959

60+
def get_encoding(self):
61+
encoding = {}
62+
encoding['unlimited_dims'] = set(
63+
[k for k in self.ds.dimensions if self.ds.unlimited(k)])
64+
return encoding
65+
6066
def close(self):
6167
self.ds.close()

xarray/backends/scipy_.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import warnings
99

1010
from .. import Variable
11-
from ..core.pycompat import iteritems, basestring, OrderedDict
11+
from ..core.pycompat import iteritems, OrderedDict
1212
from ..core.utils import Frozen, FrozenOrderedDict
1313
from ..core.indexing import NumpyIndexingAdapter
1414

@@ -119,6 +119,12 @@ def get_attrs(self):
119119
def get_dimensions(self):
120120
return Frozen(self.ds.dimensions)
121121

122+
def get_encoding(self):
123+
encoding = {}
124+
encoding['unlimited_dims'] = {
125+
k for k, v in self.ds.dimensions.items() if v is None}
126+
return encoding
127+
122128
def set_dimension(self, name, length):
123129
if name in self.dimensions:
124130
raise ValueError('%s does not support modifying dimensions'
@@ -134,13 +140,17 @@ def set_attribute(self, key, value):
134140
value = encode_nc3_attr_value(value)
135141
setattr(self.ds, key, value)
136142

137-
def prepare_variable(self, name, variable, check_encoding=False):
143+
def prepare_variable(self, name, variable, check_encoding=False,
144+
unlimited_dims=None):
138145
variable = encode_nc3_variable(variable)
139146
if check_encoding and variable.encoding:
140147
raise ValueError('unexpected encoding for scipy backend: %r'
141148
% list(variable.encoding))
142149

143-
self.set_necessary_dimensions(variable)
150+
if unlimited_dims is not None and len(unlimited_dims) > 1:
151+
raise ValueError('NETCDF3 only supports one unlimited dimension')
152+
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
153+
144154
data = variable.data
145155
# nb. this still creates a numpy array in all memory, even though we
146156
# don't write the data yet; scipy.io.netcdf does not not support

xarray/conventions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -937,10 +937,12 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
937937
attrs = obj.attrs
938938
extra_coords = set(obj.coords)
939939
file_obj = obj._file_obj
940+
encoding = obj.encoding
940941
elif isinstance(obj, AbstractDataStore):
941942
vars, attrs = obj.load()
942943
extra_coords = set()
943944
file_obj = obj
945+
encoding = obj.get_encoding()
944946
else:
945947
raise TypeError('can only decode Dataset or DataStore objects')
946948

@@ -950,6 +952,8 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
950952
ds = Dataset(vars, attrs=attrs)
951953
ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
952954
ds._file_obj = file_obj
955+
ds.encoding = encoding
956+
953957
return ds
954958

955959

0 commit comments

Comments
 (0)